In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict

In [2]:
import xgboost as xgb

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
def read_ec_data(filename):
    """Load an EC forecast text file into a wide DataFrame.

    Every non-blank line is tab-separated: field 0 is the model run time
    formatted as ``YYYYMMDDHH``, field 1 is the variable name and fields
    2..37 hold 36 numeric values.  The values are stored under the column
    labels ``'<variable>.<idx>'`` with ``idx`` running from -12 to 23, and
    the row label is the run time plus 12 hours.  Lines whose variable is
    ``'SLP'`` are ignored.

    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of raising ``IndexError``.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ``run time + 12 h`` timestamp and one column per
        ``'<variable>.<idx>'`` label.

    Raises
    ------
    ValueError
        If a run time does not match ``%Y%m%d%H`` or a value is not numeric.
    IndexError
        If a non-blank line has fewer than 38 tab-separated fields.
    """
    data_dct = defaultdict(dict)
    with open(filename) as fid:
        for line in fid:
            # Tolerate empty / whitespace-only lines rather than crashing on fields[1].
            if not line.strip():
                continue
            fields = line.strip('\n').split('\t')
            if fields[1] == 'SLP':
                continue
            ec_time = datetime.strptime(fields[0], '%Y%m%d%H')
            forecast_time = ec_time + timedelta(hours=12)
            row = data_dct[forecast_time]
            # fields[2] belongs to offset -12, fields[37] to offset 23.
            for idx in range(-12, 24):
                row[f'{fields[1]}.{idx}'] = float(fields[idx + 12 + 2])
    # Outer keys are timestamps, so transpose to get one row per timestamp.
    return pd.DataFrame(data_dct).transpose()

In [5]:
def read_obs(filename):
    """Read a two-column observation file and pivot it to date x hour.

    The file is tab-separated without a header; the first column is an
    integer timestamp whose leading digits are ``YYYYMMDD`` followed by a
    two-digit hour and two further digits, the second column is the
    observed value.

    Returns a DataFrame indexed by calendar date (index name ``'date'``)
    with one column per hour of day (column name ``'hour'``) holding the
    observations.
    """
    records = pd.read_csv(filename, sep='\t', header=None, names=['time', 'obs'])
    stamp = records['time']
    records = records.assign(
        # Drop the last four digits to keep YYYYMMDD.
        date=pd.to_datetime(stamp // 10000, format='%Y%m%d'),
        # Drop the last two digits, then keep the two-digit hour.
        hour=stamp // 100 % 100,
    )
    return records.pivot(index='date', columns='hour', values='obs')

In [6]:
# Parse both forecast files, then stack them row-wise into one frame.
ec1, ec2 = (
    read_ec_data(path)
    for path in (
        'data/ec_fcst_2018030112_2018103112.txt',
        'data/ec_fcst_2018110112_2018123012.txt',
    )
)
ec = pd.concat([ec1, ec2])

In [7]:
# Parse both observation files (site 01) and stack them row-wise.
obs_p1, obs_p2 = (
    read_obs(path)
    for path in (
        'data/obs_2018030112_2018103112_site_01.txt',
        'data/obs_2018110112_2018123012_site_01.txt',
    )
)
# Put the date index on a regular daily grid (daily mean per hour column);
# presumably so that a one-row shift always means "previous calendar day".
obs = pd.concat([obs_p1, obs_p2]).resample('1D').mean()

In [8]:
# Move every row down by one so each date carries the previous row's
# observations, and keep only the hour columns 12..23 of that previous row.
yesterday_obs = obs.shift(1)[list(range(12, 24))]

In [9]:
# Relabel the previous day's hours 12..23 as offsets -12..-1 relative to the
# current day.  Only non-negative labels are shifted, so re-running this cell
# does not subtract 24 a second time.
yesterday_obs.columns = [hour - 24 if hour >= 0 else hour for hour in yesterday_obs.columns]

In [10]:
# Place the previous-day columns next to the current-day columns, then
# prefix every hour label with 'obs.'.
obs_mat = pd.concat((yesterday_obs, obs), axis=1)
obs_mat.columns = [f'obs.{hour}' for hour in obs_mat.columns]

In [11]:
# Keep only the rows that have an observation at offset -12.
obs_mat = obs_mat.dropna(subset=['obs.-12'])

In [12]:
# Index-on-index inner join: keep only timestamps present in both the
# forecast frame and the observation matrix.
raw_data = pd.merge(ec, obs_mat, how='inner', left_index=True, right_index=True)

### 特征加工

In [35]:
# Derived per-offset features: wind speed, a temperature/humidity difference,
# and the forecast wind-speed error.
for idx in range(-12, 24):
    # Wind-speed magnitude from the U10 and V10 components.
    raw_data[f'ws.{idx}'] = np.sqrt(raw_data[f'U10.{idx}'] ** 2 + raw_data[f'V10.{idx}'] ** 2)
    # NOTE(review): the original comment called this "wet-bulb temperature minus
    # air temperature", but the code computes T - RH -- confirm what the RH field holds.
    raw_data[f'rh_delta.{idx}'] = raw_data[f'T.{idx}'] - raw_data[f'RH.{idx}']
    # Observation minus forecast wind speed (obs is presumably observed wind speed).
    raw_data[f'bias.{idx}'] = raw_data[f'obs.{idx}'] - raw_data[f'ws.{idx}']

In [36]:
# Changes in pressure, temperature and wind speed over several look-back spans.
# For offsets 0..23 and spans 1/3/6/12 the earlier offset idx - span is never
# below -12, so every referenced column exists.
for idx in range(0, 24):
    for span in (1, 3, 6, 12):
        # Value at this offset minus the value `span` offsets earlier.
        raw_data[f'PSFC_{span}d.{idx}'] = raw_data[f'PSFC.{idx}'] - raw_data[f'PSFC.{idx-span}']
        raw_data[f'T_{span}d.{idx}'] = raw_data[f'T.{idx}'] - raw_data[f'T.{idx-span}']
        raw_data[f'ws_{span}d.{idx}'] = raw_data[f'ws.{idx}'] - raw_data[f'ws.{idx-span}']

In [31]:
# Chronological, non-overlapping boolean row masks over raw_data's index.
# Training: everything before 2018-10-01.
is_train = raw_data.index < datetime(2018, 10, 1)
# Evaluation (used as the eval_set during fitting): 2018-10-01 up to, not including, 2018-10-18.
is_eval = (raw_data.index >= datetime(2018, 10, 1)) & (raw_data.index < datetime(2018, 10, 18))
# Test: 2018-10-18 up to, not including, 2018-11-03.
is_test = (raw_data.index >= datetime(2018, 10, 18)) & (raw_data.index < datetime(2018, 11, 3))
# "Production" period: 2018-11-03 onwards.
is_prod = raw_data.index >= datetime(2018, 11, 3)

In [32]:
# Number of rows selected by each split mask (train, eval, test, prod).
tuple(mask.sum() for mask in (is_train, is_eval, is_test, is_prod))

(212, 17, 16, 29)

### 训练模型

In [48]:
def rmse(y_arr):
    """Return the root of the mean of the squared entries of ``y_arr``.

    ``y_arr`` is expected to already hold errors (or any values whose
    root-mean-square is wanted); it must support ``** 2`` and ``.mean()``,
    e.g. a NumPy array or a pandas Series.
    """
    mean_square = (y_arr ** 2).mean()
    return np.sqrt(mean_square)

In [37]:
# Dump every column label of raw_data for a quick visual check.
print(raw_data.columns.tolist())

['PSFC.-1', 'PSFC.-10', 'PSFC.-11', 'PSFC.-12', 'PSFC.-2', 'PSFC.-3', 'PSFC.-4', 'PSFC.-5', 'PSFC.-6', 'PSFC.-7', 'PSFC.-8', 'PSFC.-9', 'PSFC.0', 'PSFC.1', 'PSFC.10', 'PSFC.11', 'PSFC.12', 'PSFC.13', 'PSFC.14', 'PSFC.15', 'PSFC.16', 'PSFC.17', 'PSFC.18', 'PSFC.19', 'PSFC.2', 'PSFC.20', 'PSFC.21', 'PSFC.22', 'PSFC.23', 'PSFC.3', 'PSFC.4', 'PSFC.5', 'PSFC.6', 'PSFC.7', 'PSFC.8', 'PSFC.9', 'RH.-1', 'RH.-10', 'RH.-11', 'RH.-12', 'RH.-2', 'RH.-3', 'RH.-4', 'RH.-5', 'RH.-6', 'RH.-7', 'RH.-8', 'RH.-9', 'RH.0', 'RH.1', 'RH.10', 'RH.11', 'RH.12', 'RH.13', 'RH.14', 'RH.15', 'RH.16', 'RH.17', 'RH.18', 'RH.19', 'RH.2', 'RH.20', 'RH.21', 'RH.22', 'RH.23', 'RH.3', 'RH.4', 'RH.5', 'RH.6', 'RH.7', 'RH.8', 'RH.9', 'T.-1', 'T.-10', 'T.-11', 'T.-12', 'T.-2', 'T.-3', 'T.-4', 'T.-5', 'T.-6', 'T.-7', 'T.-8', 'T.-9', 'T.0', 'T.1', 'T.10', 'T.11', 'T.12', 'T.13', 'T.14', 'T.15', 'T.16', 'T.17', 'T.18', 'T.19', 'T.2', 'T.20', 'T.21', 'T.22', 'T.23', 'T.3', 'T.4', 'T.5', 'T.6', 'T.7', 'T.8', 'T.9', 'U10.-1', 'U

In [195]:
# Hour offset being modelled: it selects the target column bias.{fc_hr} and
# the *_{span}d.{fc_hr} change features used below.
fc_hr = 23

In [171]:
# Earlier, narrower feature set (superseded by the active feat_list definition below; kept for reference):
# feat_list = [f'U10.{x}' for x in range(-12, 0)] + [f'V10.{x}' for x in range(-12, 0)] + \
#     [f'bias.{x}' for x in range(-12, 0)] + [f'ws.{x}' for x in range(-12, 0)] + \
#     [f'rh_delta.{x}' for x in range(-12, 0)] + [f'U10.{fc_hr}', f'V10.{fc_hr}', f'ws.{fc_hr}', f'rh_delta.{fc_hr}'] + \
#     [f'PSFC_{span}d.{fc_hr}' for span in (1, 3, 6, 12)] + [f'T_{span}d.{fc_hr}' for span in (1, 3, 6, 12)] + \
#     [f'ws_{span}d.{fc_hr}' for span in (1, 3, 6, 12)]

In [196]:
# Model inputs, in a fixed order:
#   * U10, V10, ws and rh_delta at every offset -12..23,
#   * bias only at the negative offsets -12..-1,
#   * the 1/3/6/12-span changes of PSFC, T and ws at the target offset fc_hr.
feat_list = [
    *(f'U10.{hr}' for hr in range(-12, 24)),
    *(f'V10.{hr}' for hr in range(-12, 24)),
    *(f'bias.{hr}' for hr in range(-12, 0)),
    *(f'ws.{hr}' for hr in range(-12, 24)),
    *(f'rh_delta.{hr}' for hr in range(-12, 24)),
    *(f'PSFC_{span}d.{fc_hr}' for span in (1, 3, 6, 12)),
    *(f'T_{span}d.{fc_hr}' for span in (1, 3, 6, 12)),
    *(f'ws_{span}d.{fc_hr}' for span in (1, 3, 6, 12)),
]

In [197]:
# Feature matrices for the train / eval / test splits.
x_train, x_eval, x_test = (
    raw_data.loc[mask, feat_list] for mask in (is_train, is_eval, is_test)
)

In [198]:
# Targets for the train / eval / test splits: the bias column at offset fc_hr.
y_train, y_eval, y_test = (
    raw_data.loc[mask, f'bias.{fc_hr}'] for mask in (is_train, is_eval, is_test)
)

In [199]:
# Baseline: root-mean-square of the raw bias per split, i.e. the error of
# applying no correction at all.
print(*(rmse(target) for target in (y_train, y_eval, y_test)))

1.240661642136676 1.2907025076117304 1.627885835786151


In [200]:
# Sanity check: number of input features handed to the model.
len(feat_list)

168

In [201]:
# Gradient-boosted regressor for the bias target: DART booster, 400 rounds of
# depth-3 trees, with the row and per-tree column sampling fractions set below.
clf = xgb.XGBRegressor(
    booster='dart',
    max_depth=3,
    n_estimators=400,
    learning_rate=0.04,
    subsample=0.3,
    colsample_bytree=0.35,
)

In [202]:
# Fit on the training split and log the validation metric every 20 rounds.
# No `eval_metric` is passed to fit(): RMSE is already the default metric for
# the regression objective, and the fit-time `eval_metric` argument is
# deprecated and later removed in newer XGBoost releases.
clf.fit(x_train, y_train, eval_set=[(x_eval, y_eval)], verbose=20)

[0]	validation_0-rmse:1.2439
[20]	validation_0-rmse:1.26196
[40]	validation_0-rmse:1.2631
[60]	validation_0-rmse:1.25236
[80]	validation_0-rmse:1.24536
[100]	validation_0-rmse:1.25308
[120]	validation_0-rmse:1.22191
[140]	validation_0-rmse:1.20644
[160]	validation_0-rmse:1.1886
[180]	validation_0-rmse:1.18484
[200]	validation_0-rmse:1.19757
[220]	validation_0-rmse:1.21202
[240]	validation_0-rmse:1.19653
[260]	validation_0-rmse:1.19826
[280]	validation_0-rmse:1.2047
[300]	validation_0-rmse:1.18969
[320]	validation_0-rmse:1.18852
[340]	validation_0-rmse:1.18081
[360]	validation_0-rmse:1.17738
[380]	validation_0-rmse:1.16872
[399]	validation_0-rmse:1.17243


XGBRegressor(base_score=0.5, booster='dart', colsample_bylevel=1,
       colsample_bytree=0.35, gamma=0, learning_rate=0.04,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=400, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.3)

In [203]:
# Predicted bias on the evaluation split, aligned to the target's index so the
# two Series can be subtracted label-by-label.
y_pred_eval = pd.Series(clf.predict(x_eval), index=y_eval.index)

In [204]:
# RMSE of the predicted bias against the actual bias on the evaluation split.
rmse(y_pred_eval - y_eval)

1.172432708131372

In [205]:
# Predicted bias on the test split, aligned to the target's index.
y_test_eval = pd.Series(clf.predict(x_test), index=y_test.index)

In [206]:
# RMSE of the actual bias against the predicted bias on the test split
# (the sign of the difference does not matter once squared).
rmse(y_test - y_test_eval)

1.5934879159175075