In [96]:
import numpy as np

In [10]:
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

In [104]:
def read_ec_data(filename):
    """Parse a tab-separated EC forecast text file into a wide DataFrame.

    Every non-blank line is split on tabs: field 0 is a timestamp in
    ``%Y%m%d%H`` form, field 1 is a variable name, and fields 2..37 are
    36 values converted to ``float``. Lines whose variable name is
    ``'SLP'`` are skipped.

    Rows of the result are keyed by ``timestamp + 12 hours``; columns are
    named ``'<variable>.<idx>'`` for ``idx`` in -12..23 (presumably the hour
    offset relative to the row key -- confirm against the data provider).

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per shifted timestamp, one column per ``variable.idx`` pair.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than 38 tab-separated fields.
    ValueError
        If the timestamp or one of the values cannot be parsed.
    """
    data_dct = defaultdict(dict)
    with open(filename) as fid:
        for line in fid:
            # Blank lines (e.g. a stray empty line at the end of the file)
            # carry no data; without this guard fields[1] raises IndexError.
            if not line.strip():
                continue
            fields = line.strip('\n').split('\t')
            if fields[1] == 'SLP':
                continue
            ec_time = datetime.strptime(fields[0], '%Y%m%d%H')
            forecast_time = ec_time + timedelta(hours=12)
            row = data_dct[forecast_time]
            for idx in range(-12, 24):
                # idx -12 maps to fields[2]; the first two fields are the
                # timestamp and the variable name.
                row[f'{fields[1]}.{idx}'] = float(fields[idx + 12 + 2])
    return pd.DataFrame(data_dct).transpose()

In [44]:
def read_obs(filename):
    """Read a two-column observation file and pivot it to a date x hour table.

    The file is tab-separated with no header; the columns are read as
    ``time`` and ``obs``. ``time`` is treated as an integer from which the
    date is ``time // 10000`` (parsed as ``%Y%m%d``) and the hour is
    ``time // 100 % 100`` -- i.e. presumably a ``YYYYMMDDHHMM`` stamp; confirm
    against the data files.

    Parameters
    ----------
    filename : str or path-like
        Path of the observation file.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date``, with one column per ``hour`` value and the
        ``obs`` readings as cell values.
    """
    records = pd.read_csv(filename, sep='\t', header=None, names=['time', 'obs'])
    stamp = records['time']
    decoded = records.assign(
        date=pd.to_datetime(stamp // 10000, format='%Y%m%d'),
        hour=stamp // 100 % 100,
    )
    return decoded.pivot(index='date', columns='hour', values='obs')

In [99]:
# Parse both EC forecast files, then stack them row-wise into a single frame.
ec1, ec2 = (
    read_ec_data(path)
    for path in (
        'data/ec_fcst_2018030112_2018103112.txt',
        'data/ec_fcst_2018110112_2018123012.txt',
    )
)
ec = pd.concat([ec1, ec2])

In [83]:
# Parse both observation files for site 01 and stack them row-wise.
obs_p1, obs_p2 = (
    read_obs(path)
    for path in (
        'data/obs_2018030112_2018103112_site_01.txt',
        'data/obs_2018110112_2018123012_site_01.txt',
    )
)
# Daily resampling yields one row per calendar day between the first and last
# date (all-NaN rows for absent days), presumably so that the later shift(1)
# really refers to the previous calendar day.
obs = pd.concat([obs_p1, obs_p2]).resample('1D').mean()

In [84]:
# Move every row down by one so each date carries the previous row's
# observations, and keep only the hour columns 12..23.
yesterday_obs = obs.shift(1)[list(range(12, 24))]

In [85]:
# Relabel the previous day's hours 12..23 as -12..-1.
# The frame is rebuilt from `obs` here so that re-running this cell is
# idempotent; relabelling the existing `yesterday_obs` in place would subtract
# another 24 from the column labels on every execution.
yesterday_obs = obs.shift(1)[list(range(12, 24))]
yesterday_obs.columns = [hour - 24 for hour in yesterday_obs.columns]

In [86]:
# Place the relabelled previous-day columns next to the same-day columns,
# then prefix every column label with 'obs.'.
obs_mat = pd.concat((yesterday_obs, obs), axis='columns')
obs_mat.columns = [f'obs.{label}' for label in obs_mat.columns]

In [100]:
# Index-on-index inner join: only timestamps present in both the forecast
# frame and the observation matrix are kept.
raw_data = pd.merge(ec, obs_mat, how='inner', left_index=True, right_index=True)

### 特征加工

In [101]:
# Derived per-offset features for idx in -12..23:
#   ws.<idx>       = sqrt(U10^2 + V10^2)
#   rh_delta.<idx> = T - RH
#   bias.<idx>     = obs - ws
# NOTE(review): `rh_delta` is computed as T minus RH; confirm that this is the
# intended quantity and that the two columns share compatible units.
#
# The new columns are collected first and attached with a single concat.
# Inserting 108 columns one at a time fragments the frame and makes recent
# pandas versions emit a PerformanceWarning.
derived_cols = {}
for idx in range(-12, 24):
    wind_speed = np.sqrt(raw_data[f'U10.{idx}'] ** 2 + raw_data[f'V10.{idx}'] ** 2)
    derived_cols[f'ws.{idx}'] = wind_speed
    derived_cols[f'rh_delta.{idx}'] = raw_data[f'T.{idx}'] - raw_data[f'RH.{idx}']
    derived_cols[f'bias.{idx}'] = raw_data[f'obs.{idx}'] - wind_speed
# Drop any columns left over from a previous run of this cell so that
# re-executing it replaces them instead of duplicating the labels.
raw_data = pd.concat(
    [raw_data.drop(columns=list(derived_cols), errors='ignore'), pd.DataFrame(derived_cols)],
    axis=1,
)

In [105]:
# Differences over 1, 3, 6 and 12 steps for PSFC, T and ws at idx 0..23:
#   <var>_<span>d.<idx> = <var>.<idx> - <var>.<idx - span>
# The smallest index referenced is 0 - 12 = -12, which is the first offset
# the earlier cells create columns for.
#
# The 288 new columns are collected first and attached with a single concat;
# inserting them one at a time fragments the frame and makes recent pandas
# versions emit a PerformanceWarning.
lagged_cols = {}
for idx in range(0, 24):
    for span in (1, 3, 6, 12):
        for var in ('PSFC', 'T', 'ws'):
            lagged_cols[f'{var}_{span}d.{idx}'] = (
                raw_data[f'{var}.{idx}'] - raw_data[f'{var}.{idx - span}']
            )
# Drop any columns left over from a previous run of this cell so that
# re-executing it replaces them instead of duplicating the labels.
raw_data = pd.concat(
    [raw_data.drop(columns=list(lagged_cols), errors='ignore'), pd.DataFrame(lagged_cols)],
    axis=1,
)