In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
import sys
# Extend the module search path before the project-local imports below
# (presumably `config` and `data_prep` live in ../ames — confirm against the repo layout).
# The path is relative, so it resolves against the kernel's working directory.
sys.path.append('../ames') # path to the directory
import config
from data_prep import clean, add_features, dummify

In [3]:
# Load the prepared housing frame.
# The pickled frame is the one the rest of the notebook uses, so read it
# directly when it exists. Running clean() + add_features() first and then
# overwriting the result with the pickle would throw that work away, so the
# CSV pipeline is only the fallback for a missing pickle.
# NOTE(review): assumes the pickle holds the cleaned + feature-engineered
# frame (same content as add_features(clean(CSV))) — confirm with data_prep.
# NOTE(review): pd.read_pickle executes arbitrary code on load; only use it
# on pickle files produced by this project.
housing_pickle_path = Path(config.HOUSING_PICKLE)
if housing_pickle_path.exists():
    housing = pd.read_pickle(housing_pickle_path)
else:
    housing = add_features(clean(config.HOUSING_CSV))

In [4]:
# Full list of candidate predictor column names (41 entries): the numeric
# housing columns followed by five derived columns at the end.
the_chosen_variables = [
    "1stFlrSF", "2ndFlrSF", "3SsnPorch",
    "BedroomAbvGr", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtFullBath", "BsmtHalfBath", "BsmtUnfSF",
    "EnclosedPorch", "Fireplaces", "FullBath",
    "GarageArea", "GarageCars", "GarageYrBlt",
    "GrLivArea", "HalfBath", "KitchenAbvGr",
    "LotArea", "LotFrontage", "LowQualFinSF",
    "MasVnrArea", "MiscVal", "MoSold",
    "MSSubClass", "OpenPorchSF", "OverallCond",
    "OverallQual", "PoolArea", "ScreenPorch",
    "TotalBsmtSF", "TotRmsAbvGrd", "WoodDeckSF",
    "YearBuilt", "YearRemodAdd", "YrSold",
    # derived columns
    "DecadeBuilt", "DecadeRemodel",
    "TotalLivingArea", "UnusedLotSize", "HasPool",
]

In [5]:
# Two candidate feature subsets; only `top_unique_features` is used to build
# the design matrix in this cell.
limited_variables = [
    'TotalLivingArea', 'UnusedLotSize', 'HasPool',
    'OverallQual', 'GarageArea', 'YearBuilt',
]
top_unique_features = [
    'TotalLivingArea', 'YearRemodAdd', 'GarageArea',
    'TotalBsmtSF', 'FullBath', 'UnusedLotSize',
    'MasVnrArea', 'MoSold', 'LotFrontage',
    'Fireplaces', 'TotRmsAbvGrd', 'OverallQual',
]

# Categorical columns to encode, each mapped to a short label passed to dummify
# (presumably used as the dummy-column prefix — confirm in data_prep.dummify).
dummy_vars = {
    'Neighborhood': 'Nbhd',
    'LotConfig': 'LC',
    'SaleCondition': 'SC',
}

# Design matrix = selected columns side by side with the dummy columns;
# the target is the sale price.
selected_columns = housing[top_unique_features]
dummy_df = dummify(housing, dummy_vars)
X = pd.concat([selected_columns, dummy_df], axis=1)
y = housing['SalePrice']

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyperparameters and a fixed seed.
# fit() returns the estimator itself, so construction and fitting can be chained.
rfr = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Print the model's score on the training split first, then on the held-out split.
for split_features, split_target in ((X_train, y_train), (X_test, y_test)):
    print(rfr.score(split_features, split_target))

0.9826946892513438
0.9035966612843293


<p>0.9840305848772934</p>
<p>0.9023077469434058</p>
<p>From ALL the variables</p>
<p>24412.676585562695</p>

In [8]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the forest's predictions on the held-out split.
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises a TypeError. sqrt(MSE) gives the same value on every version for a
# single-target regression like this one.
rms = np.sqrt(mean_squared_error(y_test, rfr.predict(X_test)))
print(rms)

24251.09608439986


## Feature Importance

In [9]:
# Reassigns `top_unique_features` with the same twelve names used to build X,
# but with 'OverallQual' moved to the front. X and the fitted model were
# created from the earlier ordering and are not affected by this reassignment.
top_unique_features = [
    'OverallQual',
    'TotalLivingArea',
    'YearRemodAdd',
    'GarageArea',
    'TotalBsmtSF',
    'FullBath',
    'UnusedLotSize',
    'MasVnrArea',
    'MoSold',
    'LotFrontage',
    'Fireplaces',
    'TotRmsAbvGrd',
]

In [10]:
# One row per column of X, holding the fitted forest's importance for it.
d = pd.DataFrame(
    {'Importance': rfr.feature_importances_},
    index=X.columns,
)
# Show the 25 highest-importance rows, largest first.
d.sort_values(by='Importance', ascending=False).head(25)

Unnamed: 0,Importance
OverallQual,0.558901
TotalLivingArea,0.261964
GarageArea,0.027879
FullBath,0.024912
TotalBsmtSF,0.024453
UnusedLotSize,0.023911
YearRemodAdd,0.018594
MasVnrArea,0.013134
LotFrontage,0.009763
MoSold,0.00868
