# 1. 读取数据 #

In [1]:
import pandas as pd

# Locations of the raw CSV files, relative to the notebook.
filepath_train = 'datas/train.csv'
filepath_test = 'datas/test.csv'

# Both files are decoded as Big5 (the original author notes this is the
# encoding used in Taiwan).
X_full_train = pd.read_csv(filepath_train, encoding='big5')

# The test file is read without a header row; its first column becomes the
# row index.
X_full_test = pd.read_csv(
    filepath_test,
    encoding='big5',
    header=None,
    index_col=0,
)

# 2. 数据清洗 #

In [2]:
# Keep only the PM2.5 rows and discard the identifier columns
# (date / station / measurement name), leaving just the reading columns.
X_train_pm = (
    X_full_train[X_full_train['測項'] == 'PM2.5']
    .reset_index(drop=True)
    .drop(columns=['日期', '測站', '測項'])
)

# Each sample is a run of 10 consecutive reading columns; the split step
# further down uses the last one (column '9') as the target.
window_size = 10
# Window start offsets 0..14, exactly as in the original loop.
# NOTE(review): presumably there are 24 hourly columns (24 - 10 + 1 = 15);
# verify against the CSV before changing this.
n_windows = 15

# Every window is relabelled with the names of the first 10 columns so the
# windows stack on top of each other instead of being aligned by hour.
window_names = list(X_train_pm.columns[:window_size])

windows = []
for start in range(n_windows):
    window = X_train_pm.iloc[:, start:start + window_size].copy()
    window.columns = window_names
    windows.append(window)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# appending inside the loop re-copies the accumulated frame on every iteration.
X_train_all = pd.concat(windows, ignore_index=True).astype(float)
    
# Keep only the PM2.5 rows of the test set; column 1 holds the measurement name.
X_test_pm = X_full_test.loc[X_full_test[1] == 'PM2.5'].copy()

# Drop the measurement-name column and convert the readings to float.
X_test = X_test_pm.drop(columns=[1]).astype(float)

# Rename the test columns, in order, to the training column names so both
# frames share the same feature labels (zip stops at the shorter of the two).
columns_new = dict(zip(X_test.columns, X_train_all.columns))
X_test = X_test.rename(columns=columns_new)

from sklearn.model_selection import train_test_split

# The last column of each window ('9') is the regression target; the
# remaining columns are the features.
y_train_all = X_train_all['9']
X_train_all = X_train_all.drop(columns='9')

# Hold out 20% of the rows as a validation set. A fixed random_state makes
# the split, and therefore the early-stopping point and the reported RMSEs,
# reproducible across runs.
# NOTE(review): the windows built from one source row overlap, so a random
# row-level split can put near-duplicate samples on both sides; the
# validation RMSE is probably optimistic. Consider splitting by source row.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_all,
    y_train_all,
    test_size=0.2,
    random_state=42,
)

# 3. 训练模型 #

In [3]:
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Gradient-boosted regressor with library-default hyperparameters. Training
# is monitored on the validation set and stopped after 5 rounds without
# improvement.
# NOTE(review): recent XGBoost releases deprecate passing
# early_stopping_rounds to fit() in favour of the constructor; confirm
# against the installed version before upgrading.
model = XGBRegressor()
model.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds=5,
          verbose=False)

# RMSE = sqrt(MSE). Arguments follow scikit-learn's documented order
# (y_true, y_pred); the value is the same as before because MSE is symmetric.
rmse_train = mean_squared_error(y_train, model.predict(X_train)) ** 0.5
rmse_valid = mean_squared_error(y_valid, model.predict(X_valid)) ** 0.5
print("训练集的RMSE:{:.5f}".format(rmse_train))
print("验证集的RMSE:{:.5f}".format(rmse_valid))

训练集的RMSE:4.90738
验证集的RMSE:6.79184


# 4. 输出数据 #

In [4]:
from pathlib import Path

# Predict on the test features and pair each prediction with its row id
# (the index of X_test).
preds_test = model.predict(X_test)
output = pd.DataFrame({'id': X_test.index,
                       'value': preds_test})

# to_csv does not create missing directories, so make sure the target
# folder exists before writing the submission file.
output_path = Path('output/submission.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
output.to_csv(output_path, index=False)