## Running models

In this notebook, we run a series of models to see what model performs the best via minimum mean squared error. 


In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [113]:
## importing the data
home_data = pd.read_csv('cleaned_data.csv')

## the list of features we narrowed down from previous notebook
features = ['LotFrontage', 'LotArea', 'OverallQual', 'YearBuilt', 'ExterQual', 'BsmtFinSF1', 'TotalBsmtSF', 
            '2ndFlrSF', 'GrLivArea', 'FullBath', 'HalfBath', 'KitchenQual', 'Fireplaces', 'GarageArea', 
            'GarageQual', 'GarageCond', 'Bsmt_magnitude', 'Remod_diff', 'MSSubClass_encoded', 'Neighborhood_encoded', 
            'MSZoning_encoded', 'HouseStyle_encoded', 'LotConfig_encoded', 'Condition1_encoded', 'BldgType_encoded', 
            'SaleType_encoded', 'SaleCondition_encoded', 'Attchd', 'Detchd', 'other_garage', 'RFn', 'Unf', 'Fin']

## target is the log of selling price. 
target = home_data.columns[-1]

home_data.sample(5)

Unnamed: 0,Id,LotFrontage,LotArea,LotShape,LandContour,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,...,other_garage,RFn,Unf,Fin,Reg,IR1,other_lots,Gable,Hip,other_roofs
327,436,43.0,10667,IR2,Lvl,7,6,1996,1996,Gable,...,0,1,0,0,0,0,1,1,0,0
935,1243,85.0,10625,Reg,Lvl,7,6,1974,1974,Gable,...,0,1,0,0,1,0,0,1,0,0
802,1072,78.0,11700,Reg,Lvl,6,6,1968,1968,Gable,...,0,0,1,0,1,0,0,1,0,0
426,566,66.0,6858,Reg,Bnk,6,4,1915,1950,Gable,...,0,0,1,0,1,0,0,1,0,0
234,313,65.0,7800,Reg,Lvl,5,7,1939,1950,Gable,...,0,0,1,0,1,0,0,1,0,0


In [136]:
# a dictionary of models

models = {'slc': LinearRegression(),
          'lasso': Lasso(),
           'svc': SVR()
          ,'knn': Pipeline((('scale', StandardScaler()), ('knnr',KNeighborsRegressor()))),
         'ada': AdaBoostRegressor(),
         'gbc': GradientBoostingRegressor(),
         'rfc': RandomForestRegressor(),
          'etc': ExtraTreesRegressor(),
         'xgbc': XGBRegressor()}

## keeping track of model names to output the best model
model_list = []
for key in models.items():
    model_list.append(key[0])

In [137]:
# number of splits for cross validation
splits = 5

# initiating our KFold validation
kfold = KFold(n_splits = splits, 
             shuffle = True,
             random_state = 555)

# arrary to track our error
mses = np.zeros((splits, len(models)))


for i, (train_index, test_index) in enumerate(kfold.split(home_data)):
    house_tt = home_data.iloc[train_index]
    house_ho = home_data.iloc[test_index]
    
    ## running through our models
    for j,x in enumerate(models):

        model = models[x]
    
        # fit our model
        model.fit(house_tt[features].values, house_tt['log_price'].values)
        
        #predict
        pred = model.predict(house_ho[features].values)
        
        # record error
        mses[i,j] = mean_squared_error(house_ho['log_price'].values.reshape(-1,1), pred.reshape(-1,1))
        
print('Done')

Done


In [141]:
## compute the average mean squared error
mean_mses = np.mean(mses,axis = 0)
print(mean_mses)

[0.02488291 0.04404892 0.04424652 0.03120705 0.02949345 0.01811342
 0.02104612 0.02007264 0.02312234]


In [144]:
# pick out our top model with lowest average mean squared error
top_model = model_list[mean_mses.argmin()]
print(top_model + ' had minimum error ' + str(mean_mses.min()))

gbc had minimum error 0.018113417178606917


In [145]:
top = models[top_model]

In [146]:
## pulling out the important features per the top model
feat_importance = top.feature_importances_
feat_importance

## storing as a dictionary
feat_dict = {'feature': features, 'importance': feat_importance}

## 
df = pd.DataFrame(feat_dict)

## printing out the most important features
important_feat = df.loc[df.importance > .01]
important_feat

Unnamed: 0,feature,importance
1,LotArea,0.014298
2,OverallQual,0.476722
4,ExterQual,0.020631
5,BsmtFinSF1,0.020019
6,TotalBsmtSF,0.059776
8,GrLivArea,0.140417
11,KitchenQual,0.018207
13,GarageArea,0.037018
16,Bsmt_magnitude,0.023352
17,Remod_diff,0.03984
