## Set up data set

In [None]:
import pandas as pd
import numpy as np

import sys
sys.path.append('../ames') # add the ames directory to the import path
import config
from data_prep import clean, add_features, dummify

# Load the housing data through the project's clean() helper, then pass the
# result through add_features().
housing = add_features(clean(config.HOUSING_CSV))

## Import Random Forest Modeling packages

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

## Create testing and training sets

In [None]:
# Predictors: the columns listed in config.CHOSEN_VARIABLES, plus the output
# of dummify() for config.VARS_TO_DUMMIFY, joined column-wise.
X = housing[config.CHOSEN_VARIABLES].copy()
dummy_df = dummify(housing, config.VARS_TO_DUMMIFY, drop_first=False)
X = pd.concat((X, dummy_df), axis='columns')

# Target: natural log of the sale price.
y = np.log(housing['SalePrice'])

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Decide on most pertinent features

In [None]:
# Fit a random forest inside SelectFromModel and keep the columns that the
# fitted selector marks as supported.
importance_forest = RandomForestRegressor(
    n_estimators=100,
    oob_score=True,
    max_features='sqrt',
    min_impurity_decrease=1e-7,
    random_state=42,
)
sel = SelectFromModel(importance_forest)
sel.fit(X_train, y_train)

# get_support() returns a boolean mask aligned with the training columns.
selected_feat = X_train.columns[sel.get_support()]
print(selected_feat)

## Resplit and retest on new feature set

In [None]:
# Restrict the predictors to the selected columns and draw a new 80/20 split.
# NOTE(review): this split uses seed 18 while the selector was fitted on the
# seed-42 training split, so rows that informed feature selection can end up
# in this test set -- confirm that this is acceptable for the reported scores.
X2 = X.loc[:, selected_feat]
X_train, X_test, y_train, y_test = train_test_split(
    X2,
    y,
    test_size=0.2,
    random_state=18,
)

In [None]:
# Random forest on the reduced feature set.
rfr = RandomForestRegressor(
    oob_score=True,
    max_features='sqrt',
    min_impurity_decrease=1e-7,
    random_state=18,
)
rfr.fit(X_train, y_train)

# Print the score on the training split first, then on the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(rfr.score(features, target))

# Alternative Feature Set Analysis

In [None]:
from treeinterpreter import treeinterpreter as ti

# Break one prediction down into per-feature contributions with
# treeinterpreter. Only the last row of X_test is interpreted: the table holds
# a single observation, so looping over every test row would only overwrite it
# on each pass and leave the last row's result anyway.
row = X_test.iloc[-1]

# One-row frame that keeps the row's own index label; after the transpose
# below, that label becomes the name of the value column.
data_point = pd.DataFrame([row])
prediction, bias, contributions = ti.predict(rfr, data_point)

contribution_row = pd.DataFrame(
    [[round(c, 3) for c in contributions[0]]],
    columns=data_point.columns.tolist(),
    index=['contribution_variable'],
)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two rows.
# Transposing puts one feature per row, with the observation's values in the
# first column and the contributions in 'contribution_variable'.
local_interpretation = (
    pd.concat([data_point, contribution_row])
    .T
    .sort_values('contribution_variable', ascending=False)
)
print(local_interpretation)

In [None]:
# Rank features by the magnitude of their contribution, largest first,
# regardless of sign.
local_interpretation.sort_values(
    by='contribution_variable',
    ascending=False,
    key=np.abs,
)

In [None]:
# Rank features by the observation's own values, largest first. The value
# column is named after the interpreted row's index label, which changes with
# the train/test split, so it is looked up by position instead of hard-coding
# one particular label.
value_column = local_interpretation.columns[0]
local_interpretation.sort_values(value_column, ascending=False)

## GridSearchCV

In [None]:
# Candidate hyper-parameter values for a broad search.
n_estimators = list(range(30, 101, 10))  # number of trees in the forest
max_depth = [*range(3, 10, 2), None]     # maximum number of levels in a tree
min_samples_split = [2, 3, 4, 5]         # minimum samples required to split a node
min_samples_leaf = [1]                   # minimum samples required at each leaf
bootstrap = [True]                       # method of selecting samples for each tree

# Broad grid built from the candidates above.
param_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)

# The broad grid is immediately replaced by this much smaller one, which is
# the grid that later cells actually see.
param_grid = {
    'bootstrap': bootstrap,
    'n_estimators': [100],
    'max_depth': [3, None],
    'min_samples_split': [2, 3],
}

In [None]:
%timeit
rfc = RandomForestRegressor(oob_score=True, random_state=42)
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rfc, param_grid=param_grid, cv=5, verbose=2, n_jobs=4)
rf_Grid.fit(X_train,y_train)

In [None]:
print(rf_Grid.best_params_)

# With GridSearchCV's default refit=True, best_estimator_ has already been
# refit on the full data passed to rf_Grid.fit, so it is scored directly
# without fitting it again.
rfr_best = rf_Grid.best_estimator_
rfr_best.score(X_test, y_test)