In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict
import warnings

In [2]:
import xgboost as xgb

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
def read_ec_data(filename):
    """Load an EC forecast text file into a DataFrame indexed by forecast time.

    Every non-blank line is tab-separated: an initialisation time in
    ``%Y%m%d%H`` format, a variable name, then 36 numeric values. The values
    are stored under the columns ``<name>.-12`` ... ``<name>.23`` of the row
    keyed by the initialisation time plus 12 hours. Lines whose variable name
    is ``SLP`` are ignored, and blank lines (e.g. a trailing empty line) are
    skipped instead of raising ``IndexError``.

    Parameters
    ----------
    filename : str or path-like
        Path of the tab-separated forecast file.

    Returns
    -------
    pandas.DataFrame
        One row per forecast time, one column per ``<name>.<offset>`` pair.

    Raises
    ------
    IndexError
        If a non-blank line holds fewer than 38 tab-separated fields.
    ValueError
        If the time stamp or one of the values cannot be parsed.
    """
    data_dct = defaultdict(dict)
    with open(filename) as fid:
        for line in fid:
            line = line.strip('\n')
            if not line.strip():
                # Tolerate empty / whitespace-only lines.
                continue
            fields = line.split('\t')
            name = fields[1]
            if name == 'SLP':
                continue
            ec_time = datetime.strptime(fields[0], '%Y%m%d%H')
            forecast_time = ec_time + timedelta(hours=12)
            row = data_dct[forecast_time]
            for idx in range(-12, 24):
                # Offset -12 is the first value, stored right after the
                # two leading time/name fields.
                row[f'{name}.{idx}'] = float(fields[idx + 12 + 2])
    return pd.DataFrame(data_dct).transpose()

In [5]:
def read_obs(filename):
    """Read a two-column observation file and pivot it to a date x hour table.

    The file is tab-separated without a header: an integer time stamp and an
    observed value. The date is taken from ``time // 10000`` (parsed as
    ``%Y%m%d``) and the hour from ``time // 100 % 100``.

    Returns a DataFrame indexed by date with one column per hour.
    """
    table = pd.read_csv(filename, sep='\t', header=None, names=['time', 'obs'])
    stamp = table['time']
    table['date'] = pd.to_datetime(stamp // 10000, format='%Y%m%d')
    table['hour'] = stamp // 100 % 100
    return table.pivot(index='date', columns='hour', values='obs')

In [6]:
# Load both EC forecast periods and stack them row-wise.
ec1, ec2 = (
    read_ec_data(path)
    for path in (
        'data/ec_fcst_2018030112_2018103112.txt',
        'data/ec_fcst_2018110112_2018123012.txt',
    )
)
ec = pd.concat([ec1, ec2])

In [7]:
# Load both observation periods for site 01.
obs_p1, obs_p2 = (
    read_obs(path)
    for path in (
        'data/obs_2018030112_2018103112_site_01.txt',
        'data/obs_2018110112_2018123012_site_01.txt',
    )
)
# Stack the periods and put them on a regular daily index.
obs = (
    pd.concat([obs_p1, obs_p2])
    .resample('1D')
    .mean()
)

In [8]:
# Previous row's observations, restricted to the hour columns 12..23.
yesterday_obs = obs.shift(1)[list(range(12, 24))]

In [9]:
# Relabel the previous day's hours 12..23 as -12..-1.
# Only non-negative labels are shifted, so re-running this cell is a no-op
# instead of subtracting 24 a second time.
yesterday_obs.columns = [x - 24 if x >= 0 else x for x in yesterday_obs.columns]

In [10]:
# Place the relabelled previous-day columns next to the current-day ones,
# then prefix every column label with "obs.".
obs_mat = pd.concat((yesterday_obs, obs), axis=1)
obs_labels = [f'obs.{x}' for x in obs_mat.columns]
obs_mat.columns = obs_labels

In [11]:
# Keep only the rows that have a value in the 'obs.-12' column.
obs_mat = obs_mat.dropna(subset=['obs.-12'])

In [12]:
# Inner-join forecasts and observations on their shared index.
raw_data = ec.merge(
    obs_mat,
    how='inner',
    left_index=True,
    right_index=True,
)

### 特征加工

In [13]:
# Wind speed, T minus RH difference, and forecast wind-speed error.
# NOTE(review): the original comment calls rh_delta the difference between
# wet-bulb temperature and air temperature; confirm what the RH field holds.
for idx in range(-12, 24):
    u_comp = raw_data[f'U10.{idx}']
    v_comp = raw_data[f'V10.{idx}']
    wind_speed = np.sqrt(u_comp ** 2 + v_comp ** 2)
    raw_data[f'ws.{idx}'] = wind_speed
    raw_data[f'rh_delta.{idx}'] = raw_data[f'T.{idx}'] - raw_data[f'RH.{idx}']
    # Observed value minus the forecast wind speed.
    raw_data[f'bias.{idx}'] = raw_data[f'obs.{idx}'] - wind_speed

In [14]:
# Pressure, temperature and wind-speed change over 1, 3, 6 and 12 steps.
for idx in range(0, 24):
    for span in (1, 3, 6, 12):
        for var in ('PSFC', 'T', 'ws'):
            current = raw_data[f'{var}.{idx}']
            earlier = raw_data[f'{var}.{idx - span}']
            raw_data[f'{var}_{span}d.{idx}'] = current - earlier

In [15]:
# Chronological split boundaries: train | eval | test | prod.
eval_start = datetime(2018, 10, 1)
test_start = datetime(2018, 10, 18)
prod_start = datetime(2018, 11, 3)

sample_time = raw_data.index
is_train = sample_time < eval_start
is_eval = (sample_time >= eval_start) & (sample_time < test_start)
is_test = (sample_time >= test_start) & (sample_time < prod_start)
is_prod = sample_time >= prod_start

In [16]:
# Number of samples in each split: (train, eval, test, prod).
is_train.sum(), is_eval.sum(), is_test.sum(), is_prod.sum()

(212, 17, 16, 29)

### 训练模型

每个预报时次独立训练。预测对象为实际风速与EC风速的差值。预报结果再叠加上EC风速作为最终的预报结果。

In [17]:
def rmse(y_arr):
    """Return the root of the mean of the squared entries of ``y_arr``.

    ``y_arr`` must support ``** 2`` and ``.mean()`` (NumPy array or pandas
    Series); pass a residual vector to obtain a root-mean-square error.
    """
    squared = y_arr ** 2
    mean_square = squared.mean()
    return np.sqrt(mean_square)

In [18]:
# Forecast-hour offset being modelled: selects the `bias.{fc_hr}` target
# and the `*_{span}d.{fc_hr}` change features used below.
fc_hr = 12

In [20]:
# feat_list = [f'U10.{x}' for x in range(-12, 0)] + [f'V10.{x}' for x in range(-12, 0)] + \
#     [f'bias.{x}' for x in range(-12, 0)] + [f'ws.{x}' for x in range(-12, 0)] + \
#     [f'rh_delta.{x}' for x in range(-12, 0)] + [f'U10.{fc_hr}' + f'V10.{fc_hr}' + f'ws.{fc_hr}' + f'rh_delta.{fc_hr}'] + \
#     [f'PSFC_{span}d.{fc_hr}' for span in (1, 3, 6, 12)] + [f'T_{span}d.{fc_hr}' for span in (1, 3, 6, 12)] + \
#     [f'ws_{span}d.{fc_hr}' for span in (1, 3, 6, 12)]

In [19]:
# Feature columns: per-offset fields over their available windows (bias
# only for the negative offsets), followed by the change features at the
# target offset.
feat_list = [
    f'{var}.{x}'
    for var, offsets in (
        ('U10', range(-12, 24)),
        ('V10', range(-12, 24)),
        ('bias', range(-12, 0)),
        ('ws', range(-12, 24)),
        ('rh_delta', range(-12, 24)),
    )
    for x in offsets
] + [
    f'{var}_{span}d.{fc_hr}'
    for var in ('PSFC', 'T', 'ws')
    for span in (1, 3, 6, 12)
]

In [20]:
# Feature matrices for the train / eval / test splits.
x_train, x_eval, x_test = (
    raw_data.loc[mask, feat_list]
    for mask in (is_train, is_eval, is_test)
)

In [21]:
# Targets: the bias column at the modelled forecast hour, per split.
y_train, y_eval, y_test = (
    raw_data.loc[mask, f'bias.{fc_hr}']
    for mask in (is_train, is_eval, is_test)
)

In [22]:
# RMSE of the raw target in each split, i.e. the error obtained when the
# predicted correction is always zero.
print(rmse(y_train), rmse(y_eval), rmse(y_test))

1.2292265630625003 1.2818262517841104 1.9991384964855663


In [23]:
# Total number of feature columns.
len(feat_list)

168

### 贝叶斯调参

需要多次迭代，以及计算贝叶斯概率。耗时会成倍增加

In [24]:
from bayes_opt import BayesianOptimization

In [25]:
def bo_result_to_xgb(bo_res):
    """Convert a dict of optimiser-space parameters into XGBoost arguments.

    Works on a copy of ``bo_res`` and applies, for the keys that are present:

    * ``log_gamma`` is replaced by ``gamma = 10 ** log_gamma``;
    * ``max_depth`` and ``max_delta_step`` are rounded and cast to ``int``;
    * ``subsample`` and ``colsample_bytree`` are clamped to ``[0, 1]``.

    All other keys are passed through unchanged.
    """
    params = bo_res.copy()

    if 'log_gamma' in params:
        params['gamma'] = 10 ** params.pop('log_gamma')

    for int_key in ('max_depth', 'max_delta_step'):
        if int_key in params:
            params[int_key] = int(np.round(params[int_key]))

    for ratio_key in ('subsample', 'colsample_bytree'):
        if ratio_key in params:
            params[ratio_key] = max(min(params[ratio_key], 1), 0)

    return params

In [26]:
def xgb_model(**kwargs):
    """Objective for the Bayesian search: negated validation RMSE.

    The keyword arguments are optimiser-space hyper-parameters; they are
    converted with ``bo_result_to_xgb`` and passed to an ``XGBRegressor``
    with 400 trees. The model is fitted on ``x_train`` / ``y_train`` and
    scored on ``x_eval`` / ``y_eval`` (module-level names).

    Returns
    -------
    float
        ``-rmse`` on the validation split after the last boosting round,
        negated because the optimiser maximises its target.
    """
    xgb_params = bo_result_to_xgb(kwargs)
    clf = xgb.XGBRegressor(booster='gbtree', n_estimators=400, verbosity=0, n_jobs=16, seed=42,
                           reg_alpha=0.1, reg_lambda=0.1, **xgb_params)
    # Only the validation split is monitored: the training-set metric was
    # never used, and this mirrors the eval_set of the final fit.
    clf.fit(x_train, y_train, eval_set=[(x_eval, y_eval)], eval_metric='rmse',
            verbose=False)
    dev_rmse = clf.evals_result()['validation_0']['rmse'][-1]
    return -dev_rmse

In [27]:
# Search space for the Bayesian optimiser as (lower, upper) bounds.
# `log_gamma` is searched on a log10 scale and `max_depth` /
# `max_delta_step` are rounded to integers by bo_result_to_xgb.
# A fixed random_state makes the search reproducible across kernel restarts.
xgb_bayes = BayesianOptimization(
    xgb_model,
    {
        'learning_rate': (0.02, 0.06),
        'max_depth': (3, 7),
        'log_gamma': (-3, 1),
        'min_child_weight': (0, 20),
        'max_delta_step': (0, 10),
        'subsample': (0.3, 0.9),
        'colsample_bytree': (0.3, 0.9)
    },
    random_state=42,
)

In [28]:
# Run the search with init_points=15 and n_iter=25.
# All warnings are silenced for the duration of the search only.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    xgb_bayes.maximize(init_points=15, n_iter=25)

|   iter    |  target   | colsam... | learni... | log_gamma | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.8628  [0m | [0m 0.5074  [0m | [0m 0.04001 [0m | [0m-1.818   [0m | [0m 3.14    [0m | [0m 5.057   [0m | [0m 5.451   [0m | [0m 0.4073  [0m |
| [0m 2       [0m | [0m-0.8736  [0m | [0m 0.6125  [0m | [0m 0.03991 [0m | [0m-1.475   [0m | [0m 4.43    [0m | [0m 3.641   [0m | [0m 18.42   [0m | [0m 0.4361  [0m |
| [0m 3       [0m | [0m-0.9278  [0m | [0m 0.7991  [0m | [0m 0.05466 [0m | [0m 0.8298  [0m | [0m 6.068   [0m | [0m 6.104   [0m | [0m 10.66   [0m | [0m 0.8847  [0m |
| [95m 4       [0m | [95m-0.791   [0m | [95m 0.6825  [0m | [95m 0.03344 [0m | [95m 0.4346  [0m | [95m 7.158   [0m | [95m 6.868   [0m | [95m 11.15   [0m | [95m 0.7191  [0m |
| [0m 5       [0m | [0m-0.9043  [0m | 

In [32]:
# Best point found by the optimiser, converted into XGBoost keyword arguments.
best_params = xgb_bayes.max['params']
xgb_params = bo_result_to_xgb(best_params)

In [33]:
# Final regressor: same fixed settings as in the search objective, plus the
# tuned hyper-parameters.
clf = xgb.XGBRegressor(
    booster='gbtree',
    n_estimators=400,
    verbosity=0,
    n_jobs=16,
    seed=42,
    reg_alpha=0.1,
    reg_lambda=0.1,
    **xgb_params
)

In [34]:
# Fit on the training split and log the validation RMSE every 20 rounds.
clf.fit(x_train, y_train, eval_set=[(x_eval, y_eval)], eval_metric='rmse', verbose=20)

[0]	validation_0-rmse:1.52806
[20]	validation_0-rmse:1.32578
[40]	validation_0-rmse:1.16027
[60]	validation_0-rmse:1.03994
[80]	validation_0-rmse:0.980547
[100]	validation_0-rmse:0.921509
[120]	validation_0-rmse:0.888566
[140]	validation_0-rmse:0.865691
[160]	validation_0-rmse:0.843067
[180]	validation_0-rmse:0.822843
[200]	validation_0-rmse:0.812894
[220]	validation_0-rmse:0.797213
[240]	validation_0-rmse:0.795344
[260]	validation_0-rmse:0.788397
[280]	validation_0-rmse:0.787793
[300]	validation_0-rmse:0.780009
[320]	validation_0-rmse:0.78097
[340]	validation_0-rmse:0.781492
[360]	validation_0-rmse:0.774001
[380]	validation_0-rmse:0.77949
[399]	validation_0-rmse:0.779566


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.3, gamma=0.001, learning_rate=0.02,
       max_delta_step=10, max_depth=7, min_child_weight=20.0, missing=None,
       n_estimators=400, n_jobs=16, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=1,
       seed=42, silent=True, subsample=0.9, verbosity=0)

验证集最终的指标与贝叶斯优化时最佳指标理应一致

In [35]:
# Validation-split predictions, aligned to the target's index.
y_pred_eval = pd.Series(clf.predict(x_eval), index=y_eval.index)

In [36]:
# RMSE between predicted and actual target on the validation split.
rmse(y_pred_eval - y_eval)

0.7795664854072166

In [37]:
# Test-split predictions, aligned to the target's index.
y_test_eval = pd.Series(clf.predict(x_test), index=y_test.index)

In [38]:
# RMSE between actual and predicted target on the held-out test split.
rmse(y_test - y_test_eval)

1.370139985013581