In [1]:
import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../ames') # add the project directory to the import path so the local modules below resolve (presumably config.py and data_prep.py live there -- confirm)
import config
from data_prep import clean, add_features, dummify

In [3]:
# Build the modelling frame from the raw CSV: clean() reads the path taken
# from config, and add_features() post-processes its result.
# NOTE(review): what each helper does is defined in data_prep -- not visible here.
housing = add_features(clean(config.HOUSING_CSV))
# Alternative kept for reference (disabled): load a pickled frame instead.
# housing = pd.read_pickle(config.HOUSING_PICKLE)

In [10]:
# Candidate feature sets. Only `best_vars` feeds the design matrix built
# below; the other lists are defined but not used in this cell.
limited_variables = [
    'TotalLivingArea', 'UnusedLotSize', 'HasPool',
    'OverallQual', 'GarageArea', 'YearBuilt',
]
top_unique_features = [
    'TotalLivingArea', 'YearRemodAdd', 'GarageArea', 'TotalBsmtSF',
    'FullBath', 'UnusedLotSize', 'MasVnrArea', 'MoSold',
    'LotFrontage', 'Fireplaces', 'TotRmsAbvGrd', 'OverallQual',
    'HouseAge',
]
the_chosen_variables = [
    '3SsnPorch', 'BedroomAbvGr', 'EnclosedPorch', 'Fireplaces',
    'GarageArea', 'KitchenAbvGr', 'LotFrontage', 'LowQualFinSF',
    'MasVnrArea', 'MiscVal', 'MoSold', 'MSSubClass',
    'OpenPorchSF', 'OverallCond', 'OverallQual', 'ScreenPorch',
    'TotRmsAbvGrd', 'WoodDeckSF', 'YearRemodAdd', 'YrSold',
    'TotalLivingArea', 'UnusedLotSize', 'HasPool', 'HouseAge',
    'Toilets', 'Showers', 'UpDownRatio',
]
best_vars = [
    'Fireplaces', 'LotFrontage', 'UnusedLotSize', 'HouseAge',
    'HouseAgeSq', 'OverallQual', 'OverallCond', 'GrLivArea',
    'TotalLivingArea', 'Toilets', 'Showers', 'UpDownRatio',
    'GarageArea', 'HasPool',
]

# Column name -> short code handed to dummify().
# NOTE(review): presumably the code is used as the dummy-column prefix -- confirm in data_prep.
dummy_vars = dict(
    BldgType='BT',
    BsmtQual='BQ',
    LotConfig='LC',
    Neighborhood='Nbhd',
    SaleCondition='SC',
)
dummy_df = dummify(housing, dummy_vars)

# Design matrix: the selected columns side by side with the dummify() output.
X = pd.concat([housing[best_vars], dummy_df], axis=1)
# Target: natural log of the sale price.
y = np.log(housing['SalePrice'])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Baseline forest: all defaults except max_features='sqrt', seeded for
# reproducibility. fit() returns the estimator, so construction and fitting
# can be chained.
rfr = RandomForestRegressor(random_state=42, max_features='sqrt').fit(X_train, y_train)

# Report the model's score() on the training split, then on the held-out split.
train_score = rfr.score(X_train, y_train)
test_score = rfr.score(X_test, y_test)
print(train_score)
print(test_score)

0.9844847402912628
0.9040598586819906


In [20]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the model on the training and held-out splits
# (y is log(SalePrice), so the error is in log units).
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword is deprecated and later removed in newer scikit-learn
# releases, whereas np.sqrt(MSE) works on every version and yields the
# same value.
print(np.sqrt(mean_squared_error(y_train, rfr.predict(X_train))))
print(np.sqrt(mean_squared_error(y_test, rfr.predict(X_test))))

0.04812049255639273
0.1209594886860938


## Feature Importance

In [8]:
# Shortlist of feature names for this section, one per line for easy diffing.
top_unique_features = [
    'OverallQual',
    'TotalLivingArea',
    'YearRemodAdd',
    'GarageArea',
    'TotalBsmtSF',
    'FullBath',
    'UnusedLotSize',
    'MasVnrArea',
    'MoSold',
    'LotFrontage',
    'Fireplaces',
    'TotRmsAbvGrd',
]

In [10]:
# One row per column of X, holding the fitted forest's importance for it.
d = pd.DataFrame(
    data=rfr.feature_importances_,
    index=X.columns,
    columns=['Importance'],
)
# Display the 25 most important features, largest first.
d.sort_values(by='Importance', ascending=False).head(n=25)

Unnamed: 0,Importance
OverallQual,0.178221
TotalLivingArea,0.169418
GarageArea,0.098887
HouseAge,0.092726
TotalBsmtSF,0.086853
FullBath,0.05377
YearRemodAdd,0.052227
Fireplaces,0.042593
MasVnrArea,0.038428
UnusedLotSize,0.037434


In [11]:
# Display the full hyperparameter dict of the current `rfr` estimator
# (explicit settings plus library defaults) as a reference point for tuning.
rfr.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [14]:
# Candidate hyperparameter values for the random-forest grid search.

# Number of trees in the forest: 30, 40, ..., 100.
n_estimators = list(range(30, 101, 10))
# Maximum depth of each tree: 3, 5, 7, 9, or unlimited (None).
max_depth = [depth for depth in range(3, 10, 2)] + [None]
# Minimum number of samples required to split a node.
min_samples_split = [2, 3, 4, 5]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True]

# Full search space, kept under its own name so it is not silently
# overwritten by the reduced grid below (previously both were assigned to
# `param_grid`, making the first assignment dead code).
full_param_grid = {'n_estimators': n_estimators,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}

# Reduced grid actually passed to the search: only the number of trees varies.
param_grid = {'bootstrap': bootstrap, 'n_estimators': n_estimators}

In [15]:
%timeit
rfc = RandomForestRegressor(random_state=42)
from sklearn.model_selection import GridSearchCV
rf_Grid = GridSearchCV(estimator = rfc, param_grid=param_grid, cv=10, verbose=2, n_jobs=4)
rf_Grid.fit(X_train,y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   19.6s
[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed:   58.6s finished


GridSearchCV(cv=10, estimator=RandomForestRegressor(random_state=42), n_jobs=4,
             param_grid={'bootstrap': [True],
                         'n_estimators': [30, 40, 50, 60, 70, 80, 90, 100]},
             verbose=2)

In [16]:
# Display the parameter combination from `param_grid` that the fitted
# grid search selected as best.
rf_Grid.best_params_

{'bootstrap': True, 'n_estimators': 100}

In [18]:
# Refit the forest with the hyperparameters spelled out explicitly, collected
# in one dict so they are easy to read and tweak.
forest_kwargs = {
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'n_estimators': 100,
    'max_features': 'sqrt',
    'random_state': 42,
}
rfr = RandomForestRegressor(**forest_kwargs)
rfr.fit(X_train, y_train)

# Score on the training split first, then on the held-out split.
print(rfr.score(X_train, y_train))
print(rfr.score(X_test, y_test))

0.9844847402912628
0.9040598586819906


In [26]:
# Refit the seeded forest (max_features='sqrt', everything else default)
# on the current training split.
rfr = RandomForestRegressor(random_state=42, max_features='sqrt')
rfr = rfr.fit(X_train, y_train)  # fit() returns the same estimator

# Training-split score followed by held-out score.
train_r2 = rfr.score(X_train, y_train)
test_r2 = rfr.score(X_test, y_test)
print(train_r2)
print(test_r2)

0.9834122985666877
0.9012121672141873


In [19]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the current model on the training split.
# The square root is applied explicitly rather than via `squared=False`:
# that keyword is deprecated and later removed in newer scikit-learn
# releases, while np.sqrt(MSE) gives the same value on every version.
rms = np.sqrt(mean_squared_error(y_train, rfr.predict(X_train)))
print(rms)

0.04812049255639273
