## Set up data set

In [None]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../ames') # path the the directory
import config
from data_prep import clean, add_features, dummify

housing = clean(config.HOUSING_CSV)
housing = add_features(housing)

## Create testing and training sets

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
X = housing[config.CHOSEN_VARIABLES].copy()
dummy_df = dummify(housing, config.VARS_TO_DUMMIFY, drop_first=False)
X = pd.concat([X,dummy_df], axis=1)
y = np.log(housing['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

## Initial Trial of Random Forest

In [7]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(max_features='sqrt', random_state=42)
rfr.fit(X_train,y_train)
print(rfr.score(X_train, y_train))
print(rfr.score(X_test,y_test))

0.9812960018126767
0.8865657907528609


In [8]:
from sklearn.metrics import mean_squared_error
rms = mean_squared_error(y_test, rfr.predict(X_test), squared=False)
print(rms)

0.13152605358110536


## Grid Search for optimal hyperparameters

In [17]:
# Number of trees in random forest
n_estimators = [10,100,1000] #[int(x) for x in np.linspace(start=10, stop=80, num=10)]
# Maximum number of levels in tree
max_depth = [i*2+4 for i in range(1,5)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [1,2,3,4]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2 ]
# Method of selecting samples for training each tree
max_features = [8, 9, 10, 11, 'auto', 'sqrt'] 
# Method of selecting number of features    

bootstrap = [True]

param_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap,
              'max_features' : max_features}

In [18]:
%timeit
rfc = RandomForestRegressor(random_state=42)
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rfc, param_grid=param_grid, cv=5, verbose=2, n_jobs=4)
rf_Grid.fit(X_train,y_train)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  57 tasks      | elapsed:   13.3s
[Parallel(n_jobs=4)]: Done 178 tasks      | elapsed:   46.7s
[Parallel(n_jobs=4)]: Done 381 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 664 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done 1029 tasks      | elapsed:  6.5min
[Parallel(n_jobs=4)]: Done 1474 tasks      | elapsed: 10.3min
[Parallel(n_jobs=4)]: Done 2001 tasks      | elapsed: 15.2min
[Parallel(n_jobs=4)]: Done 2608 tasks      | elapsed: 21.4min
[Parallel(n_jobs=4)]: Done 3000 out of 3000 | elapsed: 26.5min finished


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=4,
             param_grid={'bootstrap': [True], 'max_depth': [6, 8, 10, 12, None],
                         'max_features': [8, 9, 10, 11, 'auto'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [1, 2, 3, 4],
                         'n_estimators': [10, 100, 1000]},
             verbose=2)

In [21]:
print(rf_Grid.best_params_)
rf_best = rf_Grid.best_estimator_

{'bootstrap': True, 'max_depth': None, 'max_features': 11, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 1000}


In [22]:
rf_best.fit(X_train, y_train)
print(rf_best.score(X_train, y_train))
print(rf_best.score(X_test, y_test))

0.9786118542497703
0.8965658916520878


In [23]:
from sklearn.metrics import mean_squared_error
rms = mean_squared_error(y_test, rf_best.predict(X_test), squared=False)
print(rms)

0.12559479832745096


In [24]:
rfr = RandomForestRegressor(bootstrap=True,
                            max_depth=None, 
                            min_samples_split=3, 
                            min_samples_leaf=1, 
                            n_estimators=1000, 
                            max_features=11, 
                            random_state=42) 

rfr.fit(X_train, y_train) 
print(rfr.score(X_train, y_train))

0.9786118542497703


In [15]:
print(rfr.score(X_test, y_test))

0.8917727241965219


## Feature Importance

In [32]:
from model_analysis import get_feature_importance, graph_importance
feats = get_feature_importance(rfr, X)
fig = graph_importance(feats, 'Random Forest')
fig.show()

ValueError: Index data must be 1-dimensional