# Leaderboard: 0.148
# CV mean 5: 0.02185

In [1]:
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

In [50]:
data_dir = '../data'
train_set_path = f'{data_dir}/train.csv'
test_set_path = f'{data_dir}/test.csv'

predictions_dir = '../predictions'

La colonne *MSSubClass* utilise des entiers pour identifier le type de bâtiments vendus: il faut les one-hot encoder.

In [29]:
train_df = pd.read_csv(train_set_path, header=0, dtype={'MSSubClass': object})
test_df = pd.read_csv(test_set_path, header=0, dtype={'MSSubClass': object})

print(f'Train shape: {train_df.shape}')
print(f'Test shape: {test_df.shape}')

train_df.head()

Train shape: (1460, 81)
Test shape: (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [30]:
labels = train_df['SalePrice']
train_df.drop('SalePrice', axis=1, inplace=True)

In [31]:
encoded_train_df = pd.get_dummies(train_df)
encoded_test_df = pd.get_dummies(test_df)

print(encoded_train_df.shape)
print(encoded_test_df.shape)

final_train, final_test = encoded_train_df.align(encoded_test_df, join='left', axis=1)

(1460, 303)
(1459, 286)


In [32]:
final_train.head()

Unnamed: 0,Id,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,2,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,3,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0
3,4,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,0,0,1,1,0,0,0,0,0
4,5,84.0,14260,8,5,2000,2000,350.0,655,0,...,0,0,0,1,0,0,0,0,1,0


In [34]:
pipeline = make_pipeline(SimpleImputer(), RandomForestRegressor(n_estimators=50))
cv_scores = cross_val_score(pipeline, final_train, labels, scoring='neg_mean_squared_log_error', cv=5)

print(f'Cross-val scores: {-1*cv_scores}')
print(f'Cross-val mean: {-1*cv_scores.mean()}')

Cross-val scores: [0.01944361 0.02364847 0.02063222 0.02039108 0.02515198]
Cross-val mean: 0.02185347267277806


In [35]:
fitted_pipeline = pipeline.fit(final_train, labels)

In [42]:
predictions = fitted_pipeline.predict(final_test)
print(len(predictions))
predictions[:5]

1459


array([128898.  , 151273.  , 177851.2 , 185279.2 , 192804.82])

In [49]:
predictions_df = final_test[['Id']].copy()
predictions_df['SalePrice'] = predictions
print(predictions_df.shape)
predictions_df.head()

(1459, 2)


Unnamed: 0,Id,SalePrice
0,1461,128898.0
1,1462,151273.0
2,1463,177851.2
3,1464,185279.2
4,1465,192804.82


In [51]:
predictions_df.to_csv(f'{predictions_dir}/2018-11-04-rf-num-estim-50.csv', header=True, index=False)