In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict
import warnings

In [2]:
import xgboost as xgb

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
def read_ec_data(filename):
    """Load a tab-separated EC forecast file into a wide DataFrame.

    Every non-empty line is expected to hold ``<init time, YYYYmmddHH>``,
    ``<variable name>`` and then 36 numeric values. The values are stored, in
    file order, under the column names ``<variable>.-12`` ... ``<variable>.23``
    (offsets are presumably hours relative to the row's index time -- confirm
    against the data provider). Lines whose variable is ``SLP`` are ignored.

    Args:
        filename: Path of the forecast text file.

    Returns:
        pandas.DataFrame indexed by ``init time + 12 hours`` with one column
        per ``<variable>.<offset>`` pair.

    Raises:
        ValueError: If a time stamp or a value cannot be parsed.
        IndexError: If a non-empty line has fewer than 38 tab-separated fields.
    """
    data_dct = defaultdict(dict)
    with open(filename) as fid:
        for line in fid:
            # Blank (typically trailing) lines carry no record; without this
            # guard the ``fields[1]`` lookup below raises IndexError.
            if not line.strip():
                continue
            fields = line.strip('\n').split('\t')
            if fields[1] == 'SLP':
                continue
            ec_time = datetime.strptime(fields[0], '%Y%m%d%H')
            forecast_time = ec_time + timedelta(hours=12)
            row = data_dct[forecast_time]
            for idx in range(-12, 24):
                # Fields 0 and 1 are time and variable; values start at field 2.
                row[f'{fields[1]}.{idx}'] = float(fields[idx + 12 + 2])
    return pd.DataFrame(data_dct).transpose()

In [5]:
def read_obs(filename):
    """Read a tab-separated observation file into a date x hour table.

    The file has no header and two columns: an integer time stamp and the
    observed value. Dropping the last four digits of the time stamp must leave
    ``YYYYmmdd``; the two digits before the last two are taken as the hour.

    Args:
        filename: Path of the observation text file.

    Returns:
        pandas.DataFrame with one row per date, one column per hour and the
        observation as the cell value.
    """
    frame = pd.read_csv(filename, sep='\t', header=None, names=['time', 'obs'])
    stamp = frame['time']
    frame = frame.assign(
        date=pd.to_datetime(stamp // 10000, format='%Y%m%d'),
        hour=stamp // 100 % 100,
    )
    return frame.pivot(index='date', columns='hour', values='obs')

In [6]:
# Read the two EC forecast files and stack them row-wise into one table.
ec1, ec2 = (
    read_ec_data(path)
    for path in (
        'data/ec_fcst_2018030112_2018103112.txt',
        'data/ec_fcst_2018110112_2018123012.txt',
    )
)
ec = pd.concat([ec1, ec2], axis=0)

In [7]:
# Read both site-01 observation files, stack them, and put the result on a
# regular daily index (dates without data become all-NaN rows) so that a
# one-row shift later means "previous day".
obs_p1, obs_p2 = (
    read_obs(path)
    for path in (
        'data/obs_2018030112_2018103112_site_01.txt',
        'data/obs_2018110112_2018123012_site_01.txt',
    )
)
obs = (
    pd.concat([obs_p1, obs_p2], axis=0)
    .resample('1D')
    .mean()
)

In [8]:
# Move every row down by one day so each date carries the previous day's
# observations, then keep only the hour columns 12..23.
yesterday_obs = obs.shift(1)[list(range(12, 24))]

In [9]:
# Relabel yesterday's hours 12..23 as offsets -12..-1.
# Only non-negative labels are shifted, so re-running this cell on columns that
# were already relabelled is a no-op instead of moving them by another -24.
yesterday_obs.columns = [x - 24 if x >= 0 else x for x in yesterday_obs.columns]

In [10]:
# Put yesterday's offsets (-12..-1) next to today's hours and name every
# column 'obs.<offset>'.
obs_mat = pd.concat([yesterday_obs, obs], axis=1)
obs_mat.columns = ['obs.{}'.format(offset) for offset in obs_mat.columns]

In [11]:
# Keep only the rows that have an observation at offset -12.
obs_mat = obs_mat.dropna(subset=['obs.-12'])

In [12]:
# Inner join on the index: keep only the dates present in both the forecast
# table and the observation table.
raw_data = pd.merge(ec, obs_mat, how='inner', left_index=True, right_index=True)

### 特征加工

In [13]:
# Derived features for every offset -12..23:
#   ws       - wind speed, the magnitude of the (U10, V10) vector
#   rh_delta - T minus RH
#   bias     - observation minus forecast wind speed (the forecast error)
# NOTE(review): the original comment described rh_delta as the difference
# between wet-bulb temperature and air temperature; the code subtracts the RH
# column from T -- confirm what the RH field actually holds.
for idx in range(-12, 24):
    raw_data[f'ws.{idx}'] = np.sqrt(raw_data[f'U10.{idx}'] ** 2 + raw_data[f'V10.{idx}'] ** 2)
    raw_data[f'rh_delta.{idx}'] = raw_data[f'T.{idx}'] - raw_data[f'RH.{idx}']
    raw_data[f'bias.{idx}'] = raw_data[f'obs.{idx}'] - raw_data[f'ws.{idx}']

In [14]:
# Pressure change, temperature change and wind-speed change: value at offset
# idx minus the value `span` offsets earlier, for spans of 1, 3, 6 and 12.
# idx - span is never below -12 (idx >= 0, span <= 12), so the earlier column
# always lies inside the -12..23 range built above.
for idx in range(0, 24):
    for span in (1, 3, 6, 12):
        raw_data[f'PSFC_{span}d.{idx}'] = raw_data[f'PSFC.{idx}'] - raw_data[f'PSFC.{idx-span}']
        raw_data[f'T_{span}d.{idx}'] = raw_data[f'T.{idx}'] - raw_data[f'T.{idx-span}']
        raw_data[f'ws_{span}d.{idx}'] = raw_data[f'ws.{idx}'] - raw_data[f'ws.{idx-span}']

In [15]:
# Chronological, non-overlapping row masks over raw_data's datetime index:
#   train: before 2018-10-01
#   eval : 2018-10-01 up to (not including) 2018-10-18
#   test : 2018-10-18 up to (not including) 2018-11-03
#   prod : 2018-11-03 onwards
is_train = raw_data.index < datetime(2018, 10, 1)
is_eval = (raw_data.index >= datetime(2018, 10, 1)) & (raw_data.index < datetime(2018, 10, 18))
is_test = (raw_data.index >= datetime(2018, 10, 18)) & (raw_data.index < datetime(2018, 11, 3))
is_prod = raw_data.index >= datetime(2018, 11, 3)

In [16]:
# Number of rows in each split, in the order (train, eval, test, prod).
is_train.sum(), is_eval.sum(), is_test.sum(), is_prod.sum()

(212, 17, 16, 29)

### 训练模型

In [17]:
def rmse(y_arr):
    """Return the root of the mean of the squared entries of ``y_arr``.

    Pass an error/residual array to obtain its root-mean-square error.
    """
    squared = y_arr ** 2
    return np.sqrt(squared.mean())

In [18]:
# print(list(raw_data.columns))

In [19]:
# Offset (column suffix) being modelled: 'bias.<fc_hr>' is the prediction
# target, and the PSFC/T/ws change features are taken at this offset.
fc_hr = 12

In [20]:
# feat_list = [f'U10.{x}' for x in range(-12, 0)] + [f'V10.{x}' for x in range(-12, 0)] + \
#     [f'bias.{x}' for x in range(-12, 0)] + [f'ws.{x}' for x in range(-12, 0)] + \
#     [f'rh_delta.{x}' for x in range(-12, 0)] + [f'U10.{fc_hr}' + f'V10.{fc_hr}' + f'ws.{fc_hr}' + f'rh_delta.{fc_hr}'] + \
#     [f'PSFC_{span}d.{fc_hr}' for span in (1, 3, 6, 12)] + [f'T_{span}d.{fc_hr}' for span in (1, 3, 6, 12)] + \
#     [f'ws_{span}d.{fc_hr}' for span in (1, 3, 6, 12)]

In [21]:
# Model inputs, in column order:
#   - U10, V10, ws and rh_delta at every offset -12..23
#   - bias only at offsets -12..-1 (bias at offset fc_hr is the target)
#   - PSFC / T / ws changes over spans 1, 3, 6, 12 at the target offset fc_hr
feat_list = (
    [f'U10.{hr}' for hr in range(-12, 24)]
    + [f'V10.{hr}' for hr in range(-12, 24)]
    + [f'bias.{hr}' for hr in range(-12, 0)]
    + [f'ws.{hr}' for hr in range(-12, 24)]
    + [f'rh_delta.{hr}' for hr in range(-12, 24)]
    + [f'PSFC_{step}d.{fc_hr}' for step in (1, 3, 6, 12)]
    + [f'T_{step}d.{fc_hr}' for step in (1, 3, 6, 12)]
    + [f'ws_{step}d.{fc_hr}' for step in (1, 3, 6, 12)]
)

In [22]:
# Feature matrices for the train / eval / test splits.
x_train, x_eval, x_test = (
    raw_data.loc[mask, feat_list] for mask in (is_train, is_eval, is_test)
)

In [23]:
# Targets for the same splits: the forecast bias at the target offset.
y_train, y_eval, y_test = (
    raw_data.loc[mask, f'bias.{fc_hr}'] for mask in (is_train, is_eval, is_test)
)

In [24]:
# Baseline: RMSE of the uncorrected bias (observation minus forecast wind
# speed) on the train, eval and test splits.
print(rmse(y_train), rmse(y_eval), rmse(y_test))

1.2292265630625003 1.2818262517841104 1.9991384964855663


In [25]:
# Number of feature columns passed to the model.
len(feat_list)

168

### 贝叶斯调参

In [26]:
from bayes_opt import BayesianOptimization

In [51]:
def bo_result_to_xgb(bo_res):
    """Translate optimiser search-space values into XGBoost parameters.

    Works on a copy; ``bo_res`` itself is left untouched.

    * ``log_gamma`` is replaced by ``gamma = 10 ** log_gamma``.
    * ``max_depth`` and ``max_delta_step`` are rounded to ``int``.
    * ``subsample`` and ``colsample_bytree`` are clamped to ``[0, 1]``.

    Keys that are absent are skipped; all other keys pass through unchanged.

    Args:
        bo_res: Mapping of parameter name to value.

    Returns:
        A new mapping ready to be passed as keyword arguments.
    """
    params = bo_res.copy()

    if 'log_gamma' in params:
        params['gamma'] = 10 ** params.pop('log_gamma')

    for int_key in ('max_depth', 'max_delta_step'):
        if int_key in params:
            params[int_key] = int(np.round(params[int_key]))

    for frac_key in ('subsample', 'colsample_bytree'):
        if frac_key in params:
            params[frac_key] = max(min(params[frac_key], 1), 0)

    return params

In [67]:
# NOTE(review): in the visible notebook `clf` is first assigned in a cell
# further down, so this cell raises NameError on Restart & Run All; it relies
# on out-of-order execution. Move it below the model fit or delete it.
eval_result = clf.evals_result()

In [69]:
# Last recorded RMSE for the first eval_set entry ('validation_0').
eval_result['validation_0']['rmse'][-1]

0.817205

In [77]:
def xgb_model(**kwargs):
    """Objective for the Bayesian optimiser: negated RMSE on the eval split.

    Fits an ``XGBRegressor`` with 400 trees on ``(x_train, y_train)`` and
    scores it on ``(x_eval, y_eval)``.

    Args:
        **kwargs: Raw search-space values; they are converted to XGBoost
            keyword arguments by ``bo_result_to_xgb``.

    Returns:
        The negative of the eval-split RMSE after the last boosting round
        (negated because the optimiser maximises its target).
    """
    xgb_params = bo_result_to_xgb(kwargs)
    clf = xgb.XGBRegressor(booster='gbtree', n_estimators=400, verbosity=0, n_jobs=16, seed=42,
                           reg_alpha=0.1, reg_lambda=0.1, **xgb_params)
    # Only the eval split is monitored. The training-set metric used to be
    # computed every round as well, but nothing read it any more.
    clf.fit(x_train, y_train, eval_set=[(x_eval, y_eval)], eval_metric='rmse',
            verbose=False)
    # 'validation_0' is the first (here: only) eval_set entry.
    dev_rmse = clf.evals_result()['validation_0']['rmse'][-1]
    return -dev_rmse

In [84]:
# Search space for the Bayesian optimiser; every entry is (lower, upper).
# log_gamma, max_depth and max_delta_step are continuous here and converted by
# bo_result_to_xgb before they reach XGBoost.
# random_state pins the optimiser's random sampling so that a re-run explores
# the same points.
# NOTE(review): a recorded run returned learning_rate, max_depth,
# min_child_weight, subsample and colsample_bytree all at their upper bounds
# (and log_gamma at its lower bound) -- consider widening the ranges.
xgb_bayes = BayesianOptimization(xgb_model, {
    'learning_rate': (0.02, 0.06),
    'max_depth': (3, 7),
    'log_gamma': (-3, 1),
    'min_child_weight': (0, 20),
    'max_delta_step': (0, 10),
    'subsample': (0.3, 0.9),
    'colsample_bytree': (0.3, 0.9)
}, random_state=42)

In [85]:
# Run the search: 15 initial points, then 25 optimisation iterations.
# Every warning raised while searching is silenced, but only inside this block.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    xgb_bayes.maximize(init_points=15, n_iter=25)

|   iter    |  target   | colsam... | learni... | log_gamma | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
N trees: 400, train RMSE: 0.409916, eval RMSE: 0.794613
eval RMSE: 0.794613, {'colsample_bytree': 0.8509024010331165, 'learning_rate': 0.02434365408310157, 'max_delta_step': 4, 'max_depth': 4, 'min_child_weight': 10.720115438221413, 'subsample': 0.7790867761935246, 'gamma': 1.9594703655176307}
| [0m 1       [0m | [0m-0.7946  [0m | [0m 0.8509  [0m | [0m 0.02434 [0m | [0m 0.2921  [0m | [0m 3.636   [0m | [0m 4.185   [0m | [0m 10.72   [0m | [0m 0.7791  [0m |
N trees: 400, train RMSE: 0.399945, eval RMSE: 0.920521
eval RMSE: 0.920521, {'colsample_bytree': 0.4184904926610765, 'learning_rate': 0.024844414056574383, 'max_delta_step': 3, 'max_depth': 7, 'min_child_weight': 9.562199136177592, 'subsample': 0.35179382511078694, 'gamma': 0.03472756171433207}
| [0m

In [86]:
# Take the best point found by the optimiser and convert it from search-space
# values to XGBoost keyword arguments.
best_params = xgb_bayes.max['params']
xgb_params = bo_result_to_xgb(best_params)

In [87]:
# Show the converted hyper-parameters used for the final model.
xgb_params

{'colsample_bytree': 0.9,
 'learning_rate': 0.06,
 'max_delta_step': 6,
 'max_depth': 7,
 'min_child_weight': 20.0,
 'subsample': 0.9,
 'gamma': 0.001}

In [201]:
# clf = xgb.XGBRegressor(booster='dart', learning_rate=0.04, n_estimators=400, subsample=0.3, colsample_bytree=0.35, max_depth=3)

In [88]:
# Final model: same fixed settings as inside xgb_model, plus the tuned
# hyper-parameters.
clf = xgb.XGBRegressor(
    booster='gbtree',
    n_estimators=400,
    verbosity=0,
    n_jobs=16,
    seed=42,
    reg_alpha=0.1,
    reg_lambda=0.1,
    **xgb_params
)

In [89]:
# Fit on the training split; report the eval-split RMSE every 20 boosting rounds.
clf.fit(x_train, y_train, eval_set=[(x_eval, y_eval)], eval_metric='rmse', verbose=20)

[0]	validation_0-rmse:1.48941
[20]	validation_0-rmse:1.01156
[40]	validation_0-rmse:0.818318
[60]	validation_0-rmse:0.761551
[80]	validation_0-rmse:0.733081
[100]	validation_0-rmse:0.754318
[120]	validation_0-rmse:0.732402
[140]	validation_0-rmse:0.7363
[160]	validation_0-rmse:0.743342
[180]	validation_0-rmse:0.734701
[200]	validation_0-rmse:0.729753
[220]	validation_0-rmse:0.732188
[240]	validation_0-rmse:0.728837
[260]	validation_0-rmse:0.7264
[280]	validation_0-rmse:0.726965
[300]	validation_0-rmse:0.725898
[320]	validation_0-rmse:0.727147
[340]	validation_0-rmse:0.725759
[360]	validation_0-rmse:0.724078
[380]	validation_0-rmse:0.724918
[399]	validation_0-rmse:0.725349


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0.001, learning_rate=0.06,
       max_delta_step=6, max_depth=7, min_child_weight=20.0, missing=None,
       n_estimators=400, n_jobs=16, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0.1, reg_lambda=0.1, scale_pos_weight=1,
       seed=42, silent=True, subsample=0.9, verbosity=0)

In [203]:
# Predicted bias on the eval split, aligned to the eval index.
y_pred_eval = pd.Series(clf.predict(x_eval), index=y_eval.index)

In [204]:
# RMSE of the bias prediction on the eval split.
# NOTE(review): the recorded output of this cell (about 1.17) does not match
# the final eval RMSE logged by the fit cell (about 0.725), and its execution
# count is far higher than the fit cell's -- the stored output presumably
# comes from a different model/run. Re-run the notebook top to bottom.
rmse(y_pred_eval - y_eval)

1.172432708131372

In [205]:
# Predicted bias on the test split, aligned to the test index.
y_test_eval = pd.Series(clf.predict(x_test), index=y_test.index)

In [206]:
# RMSE left after subtracting the predicted bias on the test split; compare
# with the uncorrected baseline rmse(y_test) printed earlier.
rmse(y_test - y_test_eval)

1.5934879159175075