# 版本说明v2 #
1. 使用了所有的feature
2. 使用了XGBoost
3. 使用了归一化

# 读取数据 #
训练集的大小是(4320,27)，因为前三列分别是“日期”、“测站”和“测项”，去掉后实际数据大小为(4320,24)，即每天24小时的观测值。  
其中，4320 = 12\*20\*18，分别对应“月份”（12个月）、“每月天数”（20天）和“测项”（18项）。

In [1]:
import pandas as pd
import numpy as np

# Load the raw train/test CSVs and reduce them to purely numeric float arrays.
filepath_train = 'datas/train.csv'  # path to the training set
filepath_test = 'datas/test.csv'    # path to the test set

# Both files are Big5-encoded.
data_train_orig = pd.read_csv(filepath_train, encoding='big5')
# The test file has no header row; its first column is used as the index.
data_test_orig = pd.read_csv(filepath_test, encoding='big5',
                             header=None,
                             index_col=0)

index_pm25 = 9  # PM2.5 is the 10th row within each day's block of measurements

# Drop the three leading descriptor columns, then map the rainfall marker 'NR'
# to zero. Replacing with the string '0' keeps every column homogeneous so the
# explicit float conversion works for both object and string dtypes, and any
# other non-numeric cell fails loudly here instead of further downstream.
data_train_orig = (
    data_train_orig.iloc[:, 3:]
    .replace('NR', '0')
    .astype(float)
    .to_numpy()
)

# Drop the first remaining column (the measurement-item label) and clean the
# test data the same way.
data_test_orig = data_test_orig.iloc[:, 1:].replace('NR', '0').astype(float)
data_test_numpy = data_test_orig.to_numpy()

# 转化每月数据 #
把数据(12\*20\*18,24)转化成(12,18,24\*20)的数据格式，对应(月份，测项，每天小时数\*每月天数)

In [2]:
# Rearrange the flat (month * day * item, hour) table into one continuous
# hourly timeline per month: data_train_month[month, item, hour_of_month].
n_months, n_days, n_items, n_hours = 12, 20, 18, 24

data_train_month = (
    data_train_orig[:n_months * n_days * n_items]
    .reshape(n_months, n_days, n_items, n_hours)   # split rows into (month, day, item)
    .transpose(0, 2, 1, 3)                         # reorder to (month, item, day, hour)
    .reshape(n_months, n_items, n_days * n_hours)  # lay each month's days end to end
    .astype(float)                                 # float64, like a default np.empty buffer
)

# 生成样本数据 #
因为是用连续九小时的数据预测第十小时的PM2.5，所以把连续九小时的数据（18个测项）合并成一条样本，X_train_full的维度为(12\*471,18\*9)

In [3]:
# Build supervised samples: each sample is a 9-hour window of all 18 items,
# flattened to 18 * 9 features; the target is PM2.5 in the hour right after it.
sample_number_month = 24 * 20 - 10 + 1  # 10-hour windows (9 inputs + 1 target) per month

total_samples = sample_number_month * 12
X_train_full = np.empty([total_samples, 18 * 9])
y_train_full = np.empty([total_samples, 1])

for row in range(total_samples):
    # Windows never cross a month boundary: recover (month, offset) from the row.
    month, first_hour = divmod(row, sample_number_month)
    target_hour = first_hour + 9
    window = data_train_month[month, :, first_hour:target_hour]
    X_train_full[row] = window.ravel()  # item-major flattening
    y_train_full[row, 0] = data_train_month[month, index_pm25, target_hour]

# Test set: every consecutive group of 18 rows forms one sample's feature window.
test_number = data_test_numpy.shape[0] // 18
X_test = (
    data_test_numpy[:test_number * 18]
    .reshape(test_number, 18 * 9)
    .astype(float)
)

# 拆分训练集和验证集 #

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a validation set. A fixed random_state makes
# the split, and therefore the reported RMSE values, reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full,
                                                      y_train_full,
                                                      test_size=0.2,
                                                      random_state=42)

# 归一化 #

In [5]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The statistics used for the train/validation pair
# come from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

# Refit the same scaler on the complete training set; the test features are
# scaled with those full-data statistics.
scaler.fit(X_train_full)
X_train_full = scaler.transform(X_train_full)
X_test = scaler.transform(X_test)

# 训练模型 #

In [6]:
from xgboost import XGBRegressor

# Train with early stopping on the validation split.
# early_stopping_rounds is an estimator parameter in current XGBoost; passing
# it to fit() was deprecated in 1.6 and removed in 2.0.
model = XGBRegressor(n_estimators=250, learning_rate=0.05,
                     early_stopping_rounds=5)
model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          verbose=False)

from sklearn.metrics import mean_squared_error
# mean_squared_error expects (y_true, y_pred); the square root gives the RMSE.
print("训练集的RMSE:{:.5f}".format(mean_squared_error(y_train, model.predict(X_train))**0.5))
print("验证集的RMSE:{:.5f}".format(mean_squared_error(y_valid, model.predict(X_valid))**0.5))

# Early stopping requires an eval_set on every fit(); switch it off so the
# same estimator can later be refit without a validation set.
model.set_params(early_stopping_rounds=None)

训练集的RMSE:3.27003
验证集的RMSE:5.88910


# 输出数据 #

In [7]:
import os

# Refit on the complete training set, then predict the test samples.
model.fit(X_train_full, y_train_full)
preds_test = model.predict(X_test)

# One row per test sample: ids are 'id_0', 'id_1', ... in sample order.
ids = [f'id_{i}' for i in range(len(X_test))]
output = pd.DataFrame({'id': ids,
                       'value': preds_test})

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('output', exist_ok=True)
output.to_csv('output/submission.csv', index=False)