In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import warnings
from collections import defaultdict

In [2]:
import xgboost as xgb

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from src.apis import load_awos_by_point, load_ec_by_airport, load_wrf_by_airport
from src.model_opt import get_best_params_by_bo

In [5]:
airport = 'ZBAA'

In [37]:
site = '01'

In [38]:
start_time = datetime(2018, 11, 2)
end_time = datetime(2019, 3, 1)

In [39]:
df_wrf = load_wrf_by_airport(site, start_time-timedelta(hours=12), end_time-timedelta(hours=12), start_point=12)

In [40]:
obs_df = load_awos_by_point(airport, site, start_time, end_time=end_time)

In [46]:
obs_df.head()

Unnamed: 0,obs_wd,obs_ws
2018-11-02 00:00:00,351.0,3.64
2018-11-02 01:00:00,348.0,2.18
2018-11-02 02:00:00,345.0,3.23
2018-11-02 03:00:00,336.0,2.2
2018-11-02 04:00:00,333.0,2.21


In [41]:
def pivot_arr_by_date(arr, tag):
    meta_df = arr.to_frame(name='obs')
    meta_df['date'] = arr.index.floor('d')
    meta_df['hour'] = arr.index.hour
    new_df = meta_df.pivot(columns='hour', index='date', values='obs')
    yesterday_df = new_df.copy()
    yesterday_df.index = yesterday_df.index + timedelta(days=1)
    yesterday_df.columns = [x - 24 for x in yesterday_df.columns]
    obs_mat = pd.concat([yesterday_df, new_df], axis=1)
    obs_mat.columns = [f'obs_{tag}.{x}' for x in obs_mat.columns]
    return obs_mat

In [42]:
obs_wd_mat = pivot_arr_by_date(obs_df['obs_wd'], 'wd')

In [43]:
raw_data = pd.concat([obs_wd_mat, df_ec], axis=1)

### 特征加工

In [44]:
raw_data['obs_wd.3'].describe()

count    119.000000
mean     226.075630
std      141.829759
min        1.000000
25%       44.500000
50%      311.000000
75%      341.000000
max      358.000000
Name: obs_wd.3, dtype: float64

In [45]:
raw_data['DIR10.3'].describe()

count    119.000000
mean     198.202045
std      122.114227
min        1.657090
25%       77.290275
50%      196.855900
75%      318.426300
max      354.570310
Name: DIR10.3, dtype: float64

In [34]:
raw_data[['obs_wd.3', 'DIR10.3']]

Unnamed: 0_level_0,obs_wd.3,DIR10.3
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-11-02,352.0,97.24548
2018-11-03,357.0,130.61084
2018-11-04,10.0,318.53870
2018-11-05,333.0,76.97540
2018-11-06,347.0,25.02905
2018-11-07,1.0,217.52946
2018-11-08,352.0,75.81147
2018-11-09,340.0,321.38275
2018-11-10,345.0,45.67772
2018-11-11,23.0,166.78967


In [25]:
# 风向，湿球温度与气温差值，预报风速误差
for idx in range(-12, 24):
    raw_data[f'rh_delta.{idx}'] = raw_data[f'T2.{idx}'] - raw_data[f'TD2.{idx}']
    raw_data[f'bias.{idx}'] = (raw_data[f'obs_wd.{idx}'] - raw_data[f'DIR10.{idx}'] + 180) % 360 - 180

In [26]:
# 气压变，温度变，风速变，风向变
for idx in range(0, 24):
    for span in (1, 3, 6, 12):
        raw_data[f'PSFC_{span}d.{idx}'] = raw_data[f'PSFC.{idx}'] - raw_data[f'PSFC.{idx-span}']
        raw_data[f'T2_{span}d.{idx}'] = raw_data[f'T2.{idx}'] - raw_data[f'T2.{idx-span}']
        raw_data[f'SPD10_{span}d.{idx}'] = raw_data[f'SPD10.{idx}'] - raw_data[f'SPD10.{idx-span}']
        raw_data[f'wd_{span}d.{idx}'] = (raw_data[f'DIR10.{idx}'] - raw_data[f'DIR10.{idx-span}'] + 180) % 360 - 180

In [27]:
is_train = raw_data.index <= datetime(2019, 2, 1)
is_eval = raw_data.index > datetime(2019, 2, 1)

In [28]:
is_train.sum(), is_eval.sum()

(92, 28)

### 训练模型

In [29]:
def rmse(y_arr):
    return np.sqrt((y_arr ** 2).mean())

In [30]:
fc_hr = 6

In [31]:
feat_list = [f'U10.{x}' for x in range(-12, 24)] + [f'V10.{x}' for x in range(-12, 24)] + \
    [f'bias.{x}' for x in range(-12, 0)] + [f'SPD10.{x}' for x in range(-12, 24)] + \
    [f'DIR10.{x}' for x in range(-12, 24)] + \
    [f'rh_delta.{x}' for x in range(-12, 24)] + \
    [f'PSFC_{span}d.{fc_hr}' for span in (1, 3, 6, 12)] + [f'T2_{span}d.{fc_hr}' for span in (1, 3, 6, 12)] + \
    [f'SPD10_{span}d.{fc_hr}' for span in (1, 3, 6, 12)] + [f'DIR10_{span}d.{fc_hr}' for span in (1, 3, 6, 12)]
x_train = raw_data.loc[is_train, feat_list]
x_eval = raw_data.loc[is_eval, feat_list]
y_train = raw_data.loc[is_train, f'bias.{fc_hr}']
y_eval = raw_data.loc[is_eval, f'bias.{fc_hr}']

In [32]:
clf = xgb.XGBRegressor(booster='gbtree', learning_rate=0.04, n_estimators=300, verbosity=0, n_jobs=16, seed=42,
                       reg_alpha=0.1, reg_lambda=0.1, colsample_bytree=0.6, max_depth=6, subsample=0.5)
clf.fit(x_train, y_train)
y_pred_eval = clf.predict(x_eval)
y_pred_eval = pd.Series(y_pred_eval, index=x_eval.index)
# pred_one = yhat + x_eval[f'SPD10.{fc_hr}']
# pred_one.index = pred_one.index + timedelta(hours=fc_hr)
# pred_one.loc[pred_one < 0.52] = 0.52
# pred_list.append(pred_one)

In [33]:
rmse(y_eval), rmse(y_pred_eval - y_eval)

(115.20649171574216, 129.20999995613641)