In [54]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

In [55]:
import xgboost as xgb

In [56]:
import matplotlib.pyplot as plt
%matplotlib inline

In [57]:
from bayes_opt import BayesianOptimization

In [58]:
# Station identifier; interpolated into the observation file names that are
# loaded and into the name of the forecast file written at the end.
site = 'site_MID3'

In [59]:
def read_ec_data(filename):
    """Parse a tab-separated EC forecast file into a wide DataFrame.

    Every non-blank line is expected to hold, in order: an initialisation
    time formatted as ``%Y%m%d%H``, a variable name, and at least 36 numeric
    values. The row key is the initialisation time plus 12 hours, and the 36
    values become columns named ``<variable>.<offset>`` for offsets -12..23.

    Lines whose variable name is ``SLP`` are skipped. Blank or
    whitespace-only lines are skipped as well; previously they raised
    ``IndexError`` when the variable name was looked up.

    Args:
        filename: Path of the forecast text file.

    Returns:
        DataFrame with one row per forecast time and one column per
        ``<variable>.<offset>`` pair.

    Raises:
        ValueError: If a time stamp or a value cannot be parsed.
        IndexError: If a non-blank line has fewer fields than required.
    """
    data_dct = defaultdict(dict)
    with open(filename) as fid:
        for line in fid:
            if not line.strip():
                # Tolerate empty lines instead of failing on fields[1].
                continue
            fields = line.strip('\n').split('\t')
            if fields[1] == 'SLP':
                continue
            ec_time = datetime.strptime(fields[0], '%Y%m%d%H')
            forecast_time = ec_time + timedelta(hours=12)
            for idx in range(-12, 24):
                # Values start at field 2, so offset -12 maps to fields[2].
                data_dct[forecast_time][f'{fields[1]}.{idx}'] = float(fields[idx + 12 + 2])
    # Outer dict keys become columns, hence the transpose to get one row per time.
    return pd.DataFrame(data_dct).transpose()

In [60]:
def read_obs(filename):
    """Load a two-column observation file as a date x hour table.

    The file is tab-separated without a header: an integer-coded time stamp
    followed by the observed value. The date is taken from ``time // 10000``
    (parsed as ``%Y%m%d``) and the hour from ``time // 100 % 100``.
    NOTE(review): this arithmetic implies 12-digit ``YYYYMMDDHHMM`` stamps;
    confirm against the data files.

    Returns:
        DataFrame indexed by ``date`` with one column per ``hour`` holding
        the ``obs`` values.
    """
    frame = pd.read_csv(filename, sep='\t', header=None, names=['time', 'obs'])
    stamp = frame['time']
    frame = frame.assign(
        date=pd.to_datetime(stamp // 10000, format='%Y%m%d'),
        hour=stamp // 100 % 100,
    )
    return frame.pivot(index='date', columns='hour', values='obs')

In [12]:
def read_obs2(filename):
    """Load a two-column observation file in long format.

    The file is tab-separated without a header. The first column is parsed
    as a ``%Y%m%d%H%M%S`` time stamp, the second is the observed value.

    Returns:
        DataFrame with columns ``time`` (datetime) and ``obs``.
    """
    raw = pd.read_csv(filename, sep='\t', names=['time', 'obs'], header=None)
    return raw.assign(time=pd.to_datetime(raw['time'], format='%Y%m%d%H%M%S'))

In [61]:
# Load the two EC forecast files and stack their rows into a single frame.
ec1 = read_ec_data('data/ec_fcst_2018030112_2018103112.txt')
ec2 = read_ec_data('data/ec_fcst_2018110112_2018123012.txt')
ec = pd.concat([ec1, ec2], axis=0)

In [62]:
# NOTE(review): this long-format load is overwritten by the
# `obs_p1 = read_obs(...)` assignment in the following cell, so its result
# is never used by the rest of the notebook.
obs_p1 = read_obs2(f'data/obs_2018030112_2018103112_{site}.txt')

In [63]:
# Load both observation files as date x hour tables and stack them.
# resample('1D').mean() then produces exactly one row per calendar day:
# rows sharing a date are averaged column-wise, and days without any row
# appear as all-NaN rows.
obs_p1 = read_obs(f'data/obs_2018030112_2018103112_{site}.txt')
obs_p2 = read_obs(f'data/obs_2018110112_2018123012_{site}.txt')
obs = pd.concat([obs_p1, obs_p2], axis=0).resample('1D').mean()

In [36]:
# NOTE(review): yesterday2 is not referenced anywhere else in the visible
# notebook; candidate for removal.
yesterday2 = obs.copy()

In [64]:
# Previous-day observations: keep only the hour 12..23 columns of `obs` and
# move every row one day forward, so the row labelled D holds the values
# observed on D-1.
yesterday_obs = obs[list(range(12, 24))].copy()
yesterday_obs.index = yesterday_obs.index + timedelta(days=1)

In [65]:
# Preview the shifted previous-day table (hour columns 12..23).
yesterday_obs.head()

hour,12,13,14,15,16,17,18,19,20,21,22,23
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-03-03,1.66183,2.3615,1.20217,0.85583,0.9185,1.23767,1.07617,1.04233,1.29867,1.788,2.33733,2.1555
2018-03-04,0.65183,1.3185,1.111,0.8125,1.12833,3.43183,3.531,3.5745,4.248,3.805,1.7455,2.23333
2018-03-05,1.3105,1.59017,1.23883,1.51267,1.21433,1.20367,1.23967,1.37083,0.875,0.63733,0.66083,0.94167
2018-03-06,4.1455,2.46467,1.70583,2.0345,2.54333,1.71533,0.71733,0.94167,1.33633,1.1285,0.98383,1.10117
2018-03-07,4.97083,3.81583,3.43417,3.09383,2.13833,1.69967,2.03017,1.5915,1.70967,1.77683,1.498,1.89833


In [66]:
# Preview the full date x hour observation table.
obs.head()

hour,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-03-02,1.78583,1.503,1.52333,1.76267,1.94383,2.5645,2.75383,2.20017,2.05617,1.014,...,1.20217,0.85583,0.9185,1.23767,1.07617,1.04233,1.29867,1.788,2.33733,2.1555
2018-03-03,2.36883,1.34183,1.633,1.4215,1.35967,1.57433,2.43417,1.86583,1.076,1.08633,...,1.111,0.8125,1.12833,3.43183,3.531,3.5745,4.248,3.805,1.7455,2.23333
2018-03-04,3.08933,4.24467,4.6285,4.85583,4.97117,5.06967,4.51117,4.30467,3.78883,3.43317,...,1.23883,1.51267,1.21433,1.20367,1.23967,1.37083,0.875,0.63733,0.66083,0.94167
2018-03-05,1.32933,2.37083,3.50717,4.22017,4.82467,5.25833,5.87817,5.74383,6.61217,6.11533,...,1.70583,2.0345,2.54333,1.71533,0.71733,0.94167,1.33633,1.1285,0.98383,1.10117
2018-03-06,2.0725,3.40067,3.14533,3.409,3.1275,3.34433,4.61917,4.791,4.80767,5.279,...,3.43417,3.09383,2.13833,1.69967,2.03017,1.5915,1.70967,1.77683,1.498,1.89833


In [67]:
# Relabel the previous-day hour columns 12..23 as -12..-1 so they stay
# distinct from the same-day hour columns 0..23 once both tables are joined.
# Labels that are already negative are left untouched, which makes this cell
# safe to re-run (an unconditional `x - 24` would shift them a second time).
yesterday_obs.columns = [x - 24 if x >= 0 else x for x in yesterday_obs.columns]

In [68]:
# Put previous-day (-12..-1) and same-day (0..23) observations side by side,
# aligned on the date index, then prefix every column label with 'obs.'.
obs_mat = pd.concat([yesterday_obs, obs], axis=1)
obs_mat.columns = [f'obs.{x}' for x in obs_mat.columns]

In [69]:
# Keep only the dates whose 'obs.-12' value (first previous-day column) is present.
obs_mat = obs_mat.loc[obs_mat['obs.-12'].notnull()]

In [70]:
# Join forecasts and observations on their indexes. merge() defaults to an
# inner join, so only index values present in both frames are kept.
raw_data = ec.merge(obs_mat, left_index=True, right_index=True)

In [71]:
# Display the latest date in obs_mat minus one day.
obs_mat.index.max() - timedelta(days=1)

Timestamp('2018-12-30 00:00:00')

### 特征加工

In [73]:
# Derived features for every offset -12..23:
#   ws       - wind speed, the magnitude of the (U10, V10) vector
#   rh_delta - T minus RH (original note: difference between wet-bulb
#              temperature and air temperature)
#   bias     - forecast wind-speed error: observed value minus forecast ws
# NOTE(review): the column name 'RH' suggests relative humidity, whereas the
# original note describes a wet-bulb temperature - confirm what RH holds.
for idx in range(-12, 24):
    raw_data[f'ws.{idx}'] = np.sqrt(raw_data[f'U10.{idx}'] ** 2 + raw_data[f'V10.{idx}'] ** 2)
    raw_data[f'rh_delta.{idx}'] = raw_data[f'T.{idx}'] - raw_data[f'RH.{idx}']
    raw_data[f'bias.{idx}'] = raw_data[f'obs.{idx}'] - raw_data[f'ws.{idx}']

In [74]:
# Pressure, temperature and wind-speed changes: for each offset 0..23, the
# value at that offset minus the value `span` offsets earlier (1, 3, 6, 12).
for idx in range(0, 24):
    for span in (1, 3, 6, 12):
        # Same insertion order as writing the three assignments out by hand.
        for var in ('PSFC', 'T', 'ws'):
            current = raw_data[f'{var}.{idx}']
            earlier = raw_data[f'{var}.{idx-span}']
            raw_data[f'{var}_{span}d.{idx}'] = current - earlier

In [75]:
# Chronological split on the raw_data index (boolean masks):
#   train: before 2018-10-01
#   eval : 2018-10-01 up to, not including, 2018-10-18 (early-stopping set)
#   test : 2018-10-18 up to, not including, 2018-11-03
#   prod : 2018-11-03 onwards
is_train = raw_data.index < datetime(2018, 10, 1)
is_eval = (raw_data.index >= datetime(2018, 10, 1)) & (raw_data.index < datetime(2018, 10, 18))
is_test = (raw_data.index >= datetime(2018, 10, 18)) & (raw_data.index < datetime(2018, 11, 3))
is_prod = raw_data.index >= datetime(2018, 11, 3)

In [17]:
# NOTE(review): disabled alternative split. Its is_train mask (everything
# before 2018-11-03) would contain the eval and test windows listed right
# below it, so scores on those windows would be in-sample if this variant
# were enabled.
# is_train = raw_data.index < datetime(2018, 11, 3)
# is_eval = (raw_data.index >= datetime(2018, 10, 1)) & (raw_data.index < datetime(2018, 10, 18))
# is_test = (raw_data.index >= datetime(2018, 10, 18)) & (raw_data.index < datetime(2018, 11, 3))
# is_prod = raw_data.index >= datetime(2018, 11, 3)

In [76]:
# Number of rows selected by each split mask.
is_train.sum(), is_eval.sum(), is_test.sum(), is_prod.sum()

(212, 17, 16, 30)

### 训练模型

In [19]:
def rmse(y_arr):
    """Return the square root of the mean of the squared entries of ``y_arr``.

    ``y_arr`` is expected to already hold errors (e.g. prediction minus
    observation) and must support ``** 2`` and ``.mean()``, as NumPy arrays do.
    """
    mean_square = (y_arr ** 2).mean()
    return np.sqrt(mean_square)

In [77]:
def get_dataset(raw_data, fc_hr, is_train, is_eval, is_test, is_prod):
    """Slice ``raw_data`` into feature/target pairs for one target hour.

    Features, in order: ``U10`` and ``V10`` for offsets -12..23, ``bias`` for
    offsets -12..-1, ``ws`` and ``rh_delta`` for offsets -12..23, then the
    ``PSFC``, ``T`` and ``ws`` change columns (spans 1, 3, 6, 12) at ``fc_hr``.
    The target is the ``bias.<fc_hr>`` column.

    Args:
        raw_data: Frame containing all feature and target columns.
        fc_hr: Target hour used for the change features and the target column.
        is_train, is_eval, is_test, is_prod: Row selectors passed to ``.loc``.

    Returns:
        Tuple ``(x_train, y_train, x_eval, y_eval, x_test, y_test, x_prod,
        y_prod)``.
    """
    all_offsets = range(-12, 24)
    spans = (1, 3, 6, 12)

    feat_list = []
    feat_list += [f'U10.{x}' for x in all_offsets]
    feat_list += [f'V10.{x}' for x in all_offsets]
    feat_list += [f'bias.{x}' for x in range(-12, 0)]
    feat_list += [f'ws.{x}' for x in all_offsets]
    feat_list += [f'rh_delta.{x}' for x in all_offsets]
    for prefix in ('PSFC', 'T', 'ws'):
        feat_list += [f'{prefix}_{span}d.{fc_hr}' for span in spans]

    target = f'bias.{fc_hr}'
    parts = []
    for mask in (is_train, is_eval, is_test, is_prod):
        parts.append(raw_data.loc[mask, feat_list])
        parts.append(raw_data.loc[mask, target])
    return tuple(parts)

In [85]:
# Train one regressor per target hour 0..23 on the bias target returned by
# get_dataset, and collect its predictions for the eval, test and prod splits
# as Series keyed 'pred_<hour>' and indexed like the matching target.
eval_dct = {}
test_dct = {}
prod_dct = {}
for fc_hr in range(0, 24):
    print(f'Hour: {fc_hr}')
    x_train, y_train, x_eval, y_eval, x_test, y_test, x_prod, y_prod = get_dataset(
        raw_data, fc_hr, is_train, is_eval, is_test, is_prod)
    clf = xgb.XGBRegressor(booster='dart', learning_rate=0.04, n_estimators=600, 
                           subsample=0.3, colsample_bytree=0.35, max_depth=3, seed=42)
    # Early stopping watches RMSE on the eval split, so scores computed on
    # that same split are optimistic; the test split is the untouched one.
    # NOTE(review): whether predict() below uses the best iteration or every
    # trained round after early stopping depends on the xgboost version - confirm.
    clf.fit(x_train, y_train, eval_set=[(x_eval, y_eval)], eval_metric='rmse', verbose=30, early_stopping_rounds=20)
    y_pred_eval = clf.predict(x_eval)
    y_pred_eval = pd.Series(y_pred_eval, index=y_eval.index)
    eval_dct[f'pred_{fc_hr}'] = y_pred_eval
    y_pred_test = clf.predict(x_test)
    y_pred_test = pd.Series(y_pred_test, index=y_test.index)
    test_dct[f'pred_{fc_hr}'] = y_pred_test
    y_pred_prod = clf.predict(x_prod)
    y_pred_prod = pd.Series(y_pred_prod, index=y_prod.index)
    prod_dct[f'pred_{fc_hr}'] = y_pred_prod

Hour: 0
[0]	validation_0-rmse:0.877241
Will train until validation_0-rmse hasn't improved in 20 rounds.
[30]	validation_0-rmse:0.548401
[60]	validation_0-rmse:0.50021
[90]	validation_0-rmse:0.47248
[120]	validation_0-rmse:0.450698
Stopping. Best iteration:
[120]	validation_0-rmse:0.450698

Hour: 1
[0]	validation_0-rmse:0.990305
Will train until validation_0-rmse hasn't improved in 20 rounds.
[30]	validation_0-rmse:0.719553
[60]	validation_0-rmse:0.637152
[90]	validation_0-rmse:0.590424
Stopping. Best iteration:
[86]	validation_0-rmse:0.586484



In [87]:
# Best iteration recorded by early stopping for the most recently fitted model.
clf.best_iteration

86

In [84]:
# NOTE(review): get_params is referenced without being called, so this shows
# the bound method's repr rather than the parameter dict.
clf.get_params

<bound method XGBModel.get_params of XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=0.35, gamma=0, learning_rate=0.04,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=600, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=42, silent=True, subsample=0.3)>

In [22]:
# Eval split: corrected wind speed = predicted bias + EC wind speed, for
# hours 0..23. The addition is positional (.values), so the rows of df_eval
# must be in the same order as raw_data.loc[is_eval].
df_eval = pd.DataFrame(eval_dct)[[f'pred_{x}' for x in range(0, 24)]]
ws_ec = raw_data.loc[is_eval, [f'ws.{x}' for x in range(0, 24)]].values
ws_pred = df_eval.values + ws_ec
ws_obs = raw_data.loc[is_eval, [f'obs.{x}' for x in range(0, 24)]].values

In [23]:
# Flatten the (day, hour) arrays to 1-D and score only the samples whose
# observed value is at least 4.
ws_ec0 = ws_ec.reshape((-1,))
ws_pred0 = ws_pred.reshape((-1,))
ws_obs0 = ws_obs.reshape((-1,))
gt4 = ws_obs0>=4
print('EC: ', rmse(ws_ec0[gt4]-ws_obs0[gt4]))
print('Fcst: ', rmse(ws_pred0[gt4]-ws_obs0[gt4]))

EC:  2.974839506132978
Fcst:  0.29439775167146315


In [24]:
# RMSE over all flattened samples: raw EC wind speed vs. corrected forecast.
print('EC: ', rmse(ws_ec.reshape((-1,)) - ws_obs.reshape((-1,))))
print('Fcst: ', rmse(ws_pred.reshape((-1,)) - ws_obs.reshape((-1,))))

EC:  1.5125954376597022
Fcst:  0.22161216821062463


In [25]:
# Test split: same reconstruction as for the eval split. This rebinds
# ws_ec / ws_pred / ws_obs, so the scoring cell that follows reports test numbers.
df_test = pd.DataFrame(test_dct)[[f'pred_{x}' for x in range(0, 24)]]
ws_ec = raw_data.loc[is_test, [f'ws.{x}' for x in range(0, 24)]].values
ws_pred = df_test.values + ws_ec
ws_obs = raw_data.loc[is_test, [f'obs.{x}' for x in range(0, 24)]].values

In [26]:
# RMSE over all flattened samples of whichever split ws_ec / ws_pred / ws_obs
# currently hold.
print('EC: ', rmse(ws_ec.reshape((-1,)) - ws_obs.reshape((-1,))))
print('Fcst: ', rmse(ws_pred.reshape((-1,)) - ws_obs.reshape((-1,))))

EC:  1.732663717448987
Fcst:  0.2448067551837043


In [27]:
# Prod split: ws_ec and ws_obs stay DataFrames here (no .values) because the
# index of ws_ec supplies the forecast start times when the output series is
# assembled; ws_pred is a plain array added positionally.
df_prod = pd.DataFrame(prod_dct)[[f'pred_{x}' for x in range(0, 24)]]
ws_ec = raw_data.loc[is_prod, [f'ws.{x}' for x in range(0, 24)]]
ws_pred = df_prod.values + ws_ec.values
ws_obs = raw_data.loc[is_prod, [f'obs.{x}' for x in range(0, 24)]]

In [28]:
# Turn each row of ws_pred (24 values) into an hourly Series starting at the
# matching entry of ws_ec.index, then chain all days into one long Series.
ws_pred_list = [
    pd.Series(day_values, index=pd.date_range(fcst_time, freq='1H', periods=24))
    for fcst_time, day_values in zip(ws_ec.index, ws_pred)
]
ws_pred_arr = pd.concat(ws_pred_list)

In [29]:
# Write the forecast as tab-separated text without a header row; the index
# time stamps are formatted as %Y%m%d%H%M.
ws_pred_arr.to_csv(f'forecast_{site}', sep='\t', header=False, date_format='%Y%m%d%H%M')