# 版本说明 #
这个版本只用 XGBoost 对数据做了一次基础训练，超参数沿用了之前做练习时得到的结果，尚未针对本数据集重新调参。

# 1. 读取数据 #

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Locations of the training and test CSV files.
file_path_train = './datas/train.csv'
file_path_test = './datas/test.csv'

# Load both datasets, using the 'Id' column as the row index.
X_full = pd.read_csv(file_path_train, index_col='Id')
X_full_test = pd.read_csv(file_path_test, index_col='Id')

# Rows without a target value cannot be used for training.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y_full = X_full['SalePrice']  # training target
X_full = X_full.drop(columns='SalePrice')  # keep the target out of the features

# Hold out 20% of the training data for validation. The seed is fixed so the
# split -- and the column selection below, which is computed from the training
# part of the split -- is the same on every run.
X_full_train, X_full_valid, y_train, y_valid = train_test_split(
    X_full, y_full, test_size=0.2, random_state=0)

# Text (object dtype) columns with fewer than 10 distinct values.
columns_low_cardinality = [
    c for c in X_full_train.columns
    if X_full_train[c].nunique() < 10 and X_full_train[c].dtype == 'object'
]
# Numeric columns.
columns_numeric = [
    c for c in X_full_train.columns
    if X_full_train[c].dtype in ['int64', 'float64']
]

# Columns actually used for training. Every selection is copied so later
# processing cannot modify the frames loaded above.
columns_my = columns_low_cardinality + columns_numeric
X_full = X_full[columns_my].copy()
X_train = X_full_train[columns_my].copy()
X_valid = X_full_valid[columns_my].copy()
X_test = X_full_test[columns_my].copy()

# 2. 数据清洗 #

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer

# Numeric columns: replace missing values with the column median.
imputer_numeric = SimpleImputer(strategy='median')

# Text columns: fill missing values with the most frequent entry, then
# one-hot encode. handle_unknown='ignore' keeps categories that were not
# seen during fit from raising an error at transform time.
imputer_categorical = SimpleImputer(strategy='most_frequent')
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_steps = [
    ('imp', imputer_categorical),
    ('oh', onehot_encoder),
]
transformer_categorical = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer.
column_transformers = [
    ('num', imputer_numeric, columns_numeric),
    ('text', transformer_categorical, columns_low_cardinality),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# 3. 训练模型 #

In [3]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Gradient-boosted regressor. Per the version note at the top of the file,
# these hyperparameters were carried over from an earlier exercise.
model = XGBRegressor(max_depth=6, n_estimators=250, learning_rate=0.03)

# Chain preprocessing and the model so both are fitted together.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

pipeline.fit(X_train, y_train)

# mean_absolute_error is documented as (y_true, y_pred): pass the ground
# truth first so the call stays correct if the metric is ever replaced by an
# asymmetric one.
preds_train = pipeline.predict(X_train)  # predictions on the training split
print("训练集的MAE:{:,.2f}".format(mean_absolute_error(y_train, preds_train)))

preds_valid = pipeline.predict(X_valid)  # predictions on the validation split
print("验证集的MAE:{:,.2f}".format(mean_absolute_error(y_valid, preds_valid)))

训练集的MAE:4,724.52
验证集的MAE:15,541.32


# 4. 生成测试集预测结果 #

In [4]:
import os

# Refit on the whole training set (training + validation rows) before
# predicting on the test set.
pipeline.fit(X_full, y_full)

preds_test = pipeline.predict(X_test)  # predictions for the test set
output = pd.DataFrame({'SalePrice': preds_test},
                      index=X_test.index)

# to_csv does not create missing directories, so make sure the target
# directory exists before writing.
os.makedirs('output', exist_ok=True)
output.to_csv('output/submissions.csv')