## Links
https://towardsdatascience.com/interpreting-random-forest-and-other-black-box-models-like-xgboost-80f9cc4a3c38

# Imports

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance


In [5]:
import matplotlib
print(matplotlib.__version__)

3.5.1


In [6]:
import matplotlib.pyplot as plt
from treeinterpreter import treeinterpreter as ti
from pdpbox import pdp

import waterfall_chart 

In [7]:
#!pip install treeinterpreter

In [8]:
#!pip install pdpbox

In [9]:
#!conda remove matplotlib
#! conda install matplotlib==3.1.1

# Load data

In [10]:
df = pd.read_csv('data/ready/train.csv')

# Split data

In [11]:
df_train, df_val = train_test_split(df, test_size=0.2)
df_train.shape, df_val.shape

((1168, 81), (292, 81))

In [12]:
x_train, y_train = df_train.drop('SalePrice', axis=1), df_train[['SalePrice', 'Id']]
x_val, y_val = df_val.drop('SalePrice', axis=1), df_val[['SalePrice', 'Id']]

In [13]:
x_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
986,987,50,1,59.0,5310,0,0,0,0,0,...,0,0,0,0,0,0,6,2006,0,0
980,981,85,0,-1.0,12122,0,0,1,0,0,...,0,0,0,0,0,0,7,2008,0,0
1253,1254,60,0,-1.0,17542,0,0,1,0,0,...,0,0,0,1,0,0,7,2007,0,0
808,809,80,0,85.0,13400,0,0,0,0,0,...,0,0,0,2,0,0,6,2006,0,0
567,568,20,0,70.0,10171,0,0,1,0,0,...,0,0,0,0,0,0,3,2010,0,0


In [14]:
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
986,987,50,1,59.0,5310,0,0,0,0,0,...,0,0,0,0,0,6,2006,0,0,117000
980,981,85,0,-1.0,12122,0,0,1,0,0,...,0,0,0,0,0,7,2008,0,0,178400
1253,1254,60,0,-1.0,17542,0,0,1,0,0,...,0,0,1,0,0,7,2007,0,0,294000
808,809,80,0,85.0,13400,0,0,0,0,0,...,0,0,2,0,0,6,2006,0,0,159950
567,568,20,0,70.0,10171,0,0,1,0,0,...,0,0,0,0,0,3,2010,0,0,214000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,944,90,0,100.0,25000,0,0,0,2,0,...,0,0,0,0,0,6,2007,0,0,143000
1039,1040,180,1,21.0,1477,0,0,0,0,0,...,0,0,0,0,0,4,2009,0,0,80000
1109,1110,20,0,107.0,11362,0,0,1,0,0,...,0,0,0,0,0,3,2009,0,0,280000
1104,1105,160,1,24.0,2016,0,0,0,0,0,...,0,0,0,0,0,4,2007,0,0,106000


# Logit target

In [15]:
def one_plus_log(x):
    return np.log(1 + x)

def one_plus_log_reverse(x):
    return np.exp(x) - 1

In [16]:
df_train['SalePrice'] = df_train['SalePrice'].apply(one_plus_log)
df_val['SalePrice'] = df_val['SalePrice'].apply(one_plus_log)
df_val.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
420,421,90,1,78.0,7060,0,0,0,0,0,...,0,0,0,0,0,11,2008,0,4,12.237092
219,220,120,0,43.0,3010,0,0,0,0,0,...,0,0,0,0,0,3,2006,1,2,12.027191
730,731,120,0,39.0,5389,0,0,1,0,0,...,0,0,0,0,0,3,2010,0,0,12.373708
862,863,20,0,81.0,9672,0,0,0,0,0,...,0,0,3,0,0,5,2010,0,0,11.931642
966,967,50,0,130.0,9600,0,0,1,3,0,...,0,0,0,0,0,6,2009,0,0,11.982935


# Apply models

In [17]:
def rmse(y, y_hat):
    return mean_squared_error(y, y_hat) ** 0.5

In [18]:
cols_x = list(df_train.drop(columns=['Id', 'SalePrice']).columns)
cols_y = 'SalePrice'
cols_x[:5]

['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street']

In [19]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.0541279991366629
Test RMSE 0.1549131882259671


Tune min_samples_leaf

In [20]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27, min_samples_leaf=3)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.07583009616886126
Test RMSE 0.15659825493704185


In [21]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27, min_samples_leaf=5)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.09446323972049972
Test RMSE 0.15874848590369614


In [22]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27, min_samples_leaf=7)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.10621170282567155
Test RMSE 0.16063924095848803


3 it is <br>
Optimize max_features

In [23]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27, max_features=0.1)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.05510209164219831
Test RMSE 0.154205422779658


In [26]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27, max_features=0.05)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.057268573119745945
Test RMSE 0.16009596671956433


In [27]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27, max_features=0.5)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.05253452751681316
Test RMSE 0.15168977748278445


In [28]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27, max_features=0.25)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.05382278502301724
Test RMSE 0.14753612001645794


In [35]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27, max_features=0.2)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.052809777759314065
Test RMSE 0.14983352694243646


0.25 it is

In [37]:
rf = RandomForestRegressor(n_jobs=-1, random_state=27, max_features=0.25, n_estimators=300)
rf.fit(df_train[cols_x], df_train[cols_y])

y_train_hat = rf.predict(df_train[cols_x])
y_val_hat = rf.predict(df_val[cols_x])

print('Train RMSE', rmse(y_train_hat, df_train[cols_y]))
print('Test RMSE', rmse(y_val_hat, df_val[cols_y]))

Train RMSE 0.05155085463642987
Test RMSE 0.14850119631996997


# Submission

In [38]:
df_test = pd.read_csv('data/ready/test.csv')
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,4,80.0,11622,0,-1,0,0,0,...,120,0,-1,1,-1,0,6,2010,0,0
1,1462,20,0,81.0,14267,0,-1,1,0,0,...,0,0,-1,-1,2,12500,6,2010,0,0
2,1463,60,0,74.0,13830,0,-1,1,0,0,...,0,0,-1,1,-1,0,3,2010,0,0
3,1464,60,0,78.0,9978,0,-1,1,0,0,...,0,0,-1,-1,-1,0,6,2010,0,0
4,1465,120,0,43.0,5005,0,-1,1,3,0,...,144,0,-1,-1,-1,0,1,2010,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,1,21.0,1936,0,-1,0,0,0,...,0,0,-1,-1,-1,0,6,2006,0,0
1455,2916,160,1,21.0,1894,0,-1,0,0,0,...,0,0,-1,-1,-1,0,4,2006,0,1
1456,2917,20,0,160.0,20000,0,-1,0,0,0,...,0,0,-1,-1,-1,0,9,2006,0,1
1457,2918,85,0,62.0,10441,0,-1,0,0,0,...,0,0,-1,1,1,700,7,2006,0,0


In [39]:
df_test['SalePrice'] = rf.predict(df_test[cols_x])

In [40]:
df_submission = df_test[['Id', 'SalePrice']]
df_submission['SalePrice'] = df_submission['SalePrice'].apply(one_plus_log_reverse)

In [41]:
df_submission

Unnamed: 0,Id,SalePrice
0,1461,123415.928421
1,1462,152986.978545
2,1463,181400.087268
3,1464,185660.751864
4,1465,190778.624298
...,...,...
1454,2915,86576.551136
1455,2916,86301.616818
1456,2917,160121.052389
1457,2918,123348.383955


In [42]:
df_submission.to_csv('data/results/rf_submission.csv', index=False)