In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
pd.set_option("display.max_rows",500)
pd.set_option("display.max_columns",100)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [2]:
train_y=train["SalePrice"]
del train["SalePrice"]

In [3]:
total_data = pd.concat([train,test],ignore_index=True)

In [4]:
#データの加工
train_y = np.log(train_y+1)
total_data["GrLivArea"] = np.log(total_data["GrLivArea"]+1)
total_data["TotalBsmtSF"] = np.log(total_data["TotalBsmtSF"]+1)

In [5]:
#一旦一番シンプルなモデルを作る
#ラベルデータのエンコーディング
types=total_data.dtypes
object_columns = []
for k,v in types.items():
    if v == "object":
        object_columns.append(k)

for i in object_columns:
    le = LabelEncoder()
    total_data[i] = total_data[i].astype(str)
    le.fit(total_data[i])
    total_data[i] = le.transform(total_data[i])

In [6]:
#数値データの欠損値対応
nulls = total_data.isnull().sum()
null_columns = []
for k,v in nulls.items():
    if v != 0:
        null_columns.append(k)

#total_data[null_columns]
#total_data[null_columns].describe()
#total_data[["BsmtFinSF2","BsmtFullBath","BsmtHalfBath"]].fillna(0,inplace=True)
total_data.drop(null_columns,axis=1,inplace=True)

In [7]:
total_data.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotArea          0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
KitchenQual      0
TotRmsAbvGrd     0
Functional       0
Fireplaces       0
FireplaceQu      0
GarageType       0
GarageFinish     0
GarageQual  

In [8]:
#データの再分割
del total_data["Id"]
test_x = total_data[len(train_y):]
x = total_data[:len(train_y)]
train_x,valid_x,train_y,valid_y = train_test_split(x,train_y,test_size=0.2)

In [9]:
test_x

Unnamed: 0,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1460,20,2,11622,1,2,3,3,0,4,0,12,1,2,0,2,5,6,1961,1961,1,1,12,13,2,3,4,1,3,3,3,4,3,1,4,1,4,896,0,0,6.799056,1,0,2,1,3,5,6,0,5,1,2,4,4,2,140,0,0,0,120,0,3,2,4,0,6,2010,8,4
1461,20,3,14267,1,2,0,3,0,0,0,12,2,2,0,2,6,6,1958,1958,3,1,13,14,1,3,4,1,3,3,3,0,5,1,4,1,4,1329,0,0,7.192934,1,1,3,1,2,6,6,0,5,1,2,4,4,2,393,36,0,0,0,0,3,4,0,12500,6,2010,8,4
1462,60,3,13830,1,2,0,3,0,4,0,8,2,2,0,5,5,5,1997,1998,1,1,12,13,2,3,4,2,2,3,3,2,5,1,2,1,4,928,701,0,7.396335,2,1,3,1,3,6,6,1,4,1,0,4,4,2,212,34,0,0,0,0,3,2,4,0,3,2010,8,4
1463,60,3,9978,1,2,0,3,0,4,0,8,2,2,0,5,6,6,1998,1998,1,1,12,13,1,3,4,2,3,3,3,2,5,1,0,1,4,926,678,0,7.380879,2,1,3,1,2,7,6,1,2,1,0,4,4,2,360,36,0,0,0,0,3,4,4,0,6,2010,8,4
1464,120,3,5005,1,2,0,1,0,4,0,22,2,2,4,2,8,5,1992,1992,1,1,6,6,2,2,4,2,2,3,3,0,5,1,0,1,4,1280,0,0,7.155396,2,0,2,1,2,5,6,0,5,1,1,4,4,2,0,82,0,0,144,0,3,4,4,0,1,2010,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,4,1936,1,2,3,3,0,4,0,10,2,2,3,5,4,7,1970,1970,1,1,5,5,2,3,4,1,3,3,3,5,5,1,2,1,4,546,546,0,6.996681,1,1,3,1,3,5,6,0,5,6,3,5,5,2,0,0,0,0,0,0,3,4,4,0,6,2006,8,4
2915,160,4,1894,1,2,3,3,0,4,0,10,2,2,4,5,4,5,1970,1970,1,1,5,5,2,3,4,1,3,3,3,4,5,1,4,1,4,546,546,0,6.996681,1,1,3,1,3,6,6,0,5,4,2,4,4,2,0,24,0,0,0,0,3,4,4,0,4,2006,8,0
2916,20,3,20000,1,2,3,3,0,4,0,11,2,2,0,2,5,7,1960,1996,1,1,12,13,2,3,4,1,3,3,3,0,5,1,0,1,4,1224,0,0,7.110696,1,0,4,1,3,7,6,1,4,5,2,4,4,2,474,0,0,0,0,0,3,4,4,0,9,2006,8,0
2917,85,3,10441,1,2,3,3,0,4,0,11,2,2,0,6,5,5,1992,1992,1,1,6,15,2,3,4,2,2,3,0,2,5,1,4,1,4,970,0,0,6.878326,1,0,3,1,3,6,6,0,5,6,3,5,5,2,80,32,0,0,0,0,3,2,2,700,7,2006,8,4


In [10]:
#モデルの構築
model = XGBRegressor(
    max_depth = 5,
    n_estimators=1000,
    min_child_weight=1, #1.0がベースライン
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42
)
model.fit(
    train_x,
    train_y,
    eval_metric="rmse",
    eval_set=[(train_x,train_y),(valid_x,valid_y)],
    verbose=True,
    early_stopping_rounds = 10
)

[0]	validation_0-rmse:10.3721	validation_1-rmse:10.4187
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:9.33776	validation_1-rmse:9.38443
[2]	validation_0-rmse:8.40726	validation_1-rmse:8.45396
[3]	validation_0-rmse:7.56956	validation_1-rmse:7.6163
[4]	validation_0-rmse:6.81544	validation_1-rmse:6.8607
[5]	validation_0-rmse:6.13664	validation_1-rmse:6.18108
[6]	validation_0-rmse:5.52562	validation_1-rmse:5.56881
[7]	validation_0-rmse:4.97552	validation_1-rmse:5.01754
[8]	validation_0-rmse:4.48011	validation_1-rmse:4.52139
[9]	validation_0-rmse:4.03477	validation_1-rmse:4.07418
[10]	validation_0-rmse:3.63394	validation_1-rmse:3.6726
[11]	validation_0-rmse:3.2738	validation_1-rmse:3.31249
[12]	validation_0-rmse:2.94935	validation_1-rmse:2.98604
[13]	validation_0-rmse:2.65659	validation_1-rmse:2.69142
[14]	validation_0-rmse:2.3937	validation_1-rmse:2.42669
[

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.3, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=None, n_estimators=1000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             silent=None, subsample=0.8, verbosity=1)

In [11]:
#結果の予測
y_train_pred = model.predict(train_x)
y_valid_pred = model.predict(valid_x)
y_test_pred = model.predict(test_x)

In [12]:
#結果の検証
print("学習データのRMSE: " + str(np.sqrt(mean_squared_error(train_y,y_train_pred))))
print("検証データのRMSE: " +  str(np.sqrt(mean_squared_error(valid_y,y_valid_pred))))

学習データのRMSE: 0.05057812148030681
検証データのRMSE: 0.12835121328350385


In [17]:
result = pd.DataFrame({
    "Id":test["Id"],
    "SalePrice":(np.exp(y_test_pred)-1)
})

In [18]:
result.to_csv("result2.csv",index=False,header=True)

In [None]:
#追加で色々分析する
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
train["SalePrice"].describe()

In [None]:
sns.distplot(train["SalePrice"])

In [None]:
#Relationship with numerical variables
var = "GrLivArea"
data = pd.concat([train["SalePrice"],train[var]],axis=1)
data.plot.scatter(x=var,y="SalePrice",ylim=(0.8));
var = "TotalBsmtSF"
data = pd.concat([train["SalePrice"],train[var]],axis=1)
data.plot.scatter(x=var,y="SalePrice",ylim=(0.8));

In [None]:
#Relationship with categorical features
var = "OverallQual"
data = pd.concat([train["SalePrice"],train[var]],axis=1)
f, ax = plt.subplots(figsize=(8,6))
fig = sns.boxplot(x=var,y="SalePrice",data=data)
fig.axis(ymin=0,ymax=800000);

In [None]:
var = 'YearBuilt'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
plt.xticks(rotation=90);

In [None]:
#correlation matrix
corrmat = train.corr()
f, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corrmat, vmax=.8, square=True);

In [None]:
k = 10
cols = corrmat.nlargest(k,"SalePrice")["SalePrice"].index
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm,cbar=True,annot=True,square=True,fmt=".2f",annot_kws={"size":10},yticklabels=cols.values,xticklabels=cols.values)

In [None]:
#Normalize Data
test["SalePrice"] = np.log(train["SalePrice"]+1)
sns.distplot(test["SalePrice"])
fig = plt.figure()