In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

#this file path can be changed depending on where you want to work.
#for me, this folder is the root folder for all my ml_learning projects
ROOT_DIR = os.path.join('C:\\', 'users', 'sebas', 'onedrive', 'python', 'machine_learning', 'ml_learning')

#this data path is specific for the iowa housing project, and its where the datasets are stored on my local machine
DATA_PATH = os.path.join(ROOT_DIR, 'datasets', 'iowa_housing')

def load_data(filename, data_path=DATA_PATH):
    '''This function will load the data as a pandas dataframe. it takes the filename 
    as an argument which should be the name that the csv file is saved as in your directory.'''
    
    csv_path = os.path.join(data_path, filename)
    return pd.read_csv(csv_path).fillna(0) #we import the data with the null values replaced with 0s right off the bat. this avoids us having to do it later

training_data = load_data('train.csv')

training_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,0,Reg,Lvl,AllPub,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,0,Reg,Lvl,AllPub,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,0,IR1,Lvl,AllPub,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,0,IR1,Lvl,AllPub,...,0,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,0,IR1,Lvl,AllPub,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [2]:
#here we are setting the Id column as the index instead of having a seperate column for index
#the inplace=True parameter makes the changes to occur on the actual dataframe instead of creating a copy.
training_data.set_index('Id', inplace=True)
training_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,0,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,0,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,0,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,0,IR1,Lvl,AllPub,Corner,...,0,0,0,0,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,0,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [3]:
#lets use the describe method to get a better understanding of what this data looks like
training_data.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,57.623288,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.117123,443.639726,46.549315,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,42.300571,34.664304,9981.264932,1.382997,1.112799,30.202904,20.645407,180.731373,456.098091,161.319273,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,20.0,0.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,20.0,42.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,50.0,63.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,0.0,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [4]:
#notice all the columns which contains missing values in this dataset.
#there are lots of columns here to deal with, make sure to look through it carefully and see what is relevant
#and what is not relevant

training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 no

In [5]:
#lets make a copy of the dataset that contains only the columns which have categorical values, not numerical values
train_df_cats = training_data.select_dtypes(include=['object']).copy()

#now lets see the how many null valuesare in each column

print(train_df_cats.isnull().sum())

MSZoning         0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
FireplaceQu      0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
PoolQC           0
Fence            0
MiscFeature      0
SaleType         0
SaleCondition    0
dtype: int64


In [6]:
training_data = training_data.apply(lambda x: x.astype('|S') if x.dtype == 'object' else x, axis=0)

In [7]:
training_data

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,b'RL',65.0,8450,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,2,2008,b'WD',b'Normal',208500
2,20,b'RL',80.0,9600,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'FR2',...,0,b'0',b'0',b'0',0,5,2007,b'WD',b'Normal',181500
3,60,b'RL',68.0,11250,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,9,2008,b'WD',b'Normal',223500
4,70,b'RL',60.0,9550,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'Corner',...,0,b'0',b'0',b'0',0,2,2006,b'WD',b'Abnorml',140000
5,60,b'RL',84.0,14260,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'FR2',...,0,b'0',b'0',b'0',0,12,2008,b'WD',b'Normal',250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,b'RL',62.0,7917,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,8,2007,b'WD',b'Normal',175000
1457,20,b'RL',85.0,13175,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'MnPrv',b'0',0,2,2010,b'WD',b'Normal',210000
1458,70,b'RL',66.0,9042,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'GdPrv',b'Shed',2500,5,2010,b'WD',b'Normal',266500
1459,20,b'RL',68.0,9717,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,4,2010,b'WD',b'Normal',142125


After looking at all the categories with missing values and comparing them to the data_description.txt file provided, we can see that all the categories with missing
values are because they dont have any, and its not that its missing, so we can fill all the NAN values with 0, and then do something like label encoding for all the other categories to turn any other input into a number

In [8]:
#lets split this dataset into a seperate testing and training set so that we can evaluate our model, even though there is already a testing set provided.
from sklearn.model_selection import train_test_split
train_set, test_set, y_train, y_test = train_test_split(training_data, training_data.SalePrice, test_size=0.2, random_state=42)

train_set_cats = train_set.select_dtypes(include=['object'].copy()) #this line will show you all the columns which have a categorical value, if you needed to know that


In [9]:
train_set

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
255,20,b'RL',70.0,8400,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,6,2010,b'WD',b'Normal',145000
1067,60,b'RL',59.0,7837,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,5,2009,b'WD',b'Normal',178000
639,30,b'RL',67.0,8777,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'MnPrv',b'0',0,5,2008,b'WD',b'Normal',85000
800,50,b'RL',60.0,7200,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Corner',...,0,b'0',b'MnPrv',b'0',0,6,2007,b'WD',b'Normal',175000
381,50,b'RL',50.0,5000,b'Pave',b'Pave',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,5,2010,b'WD',b'Normal',127000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096,20,b'RL',78.0,9317,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,3,2007,b'WD',b'Normal',176432
1131,50,b'RL',65.0,7804,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'MnPrv',b'0',0,12,2009,b'WD',b'Normal',135000
1295,20,b'RL',60.0,8172,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,4,2006,b'WD',b'Normal',115000
861,50,b'RL',55.0,7642,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Corner',...,0,b'0',b'GdPrv',b'0',0,6,2007,b'WD',b'Normal',189950


In [10]:
X_train = train_set.drop('SalePrice', axis=1)
y_train = train_set['SalePrice'].copy()



'''This here deals with the cateforical data. It replaces all the null values with none, since 
in the original data, a null value means that the instance did not have that given attribute, none is appropriate'''

# from sklearn.impute import SimpleImputer

# imputer = SimpleImputer(fill_value='none', strategy='constant')

# X = imputer.fit_transform(train_set)
# X = imputer.transform(train_set)
# train_tr = pd.DataFrame(X, columns=train_set.columns, index=train_set.index)
# train_tr


'This here deals with the cateforical data. It replaces all the null values with none, since \nin the original data, a null value means that the instance did not have that given attribute, none is appropriate'

In [46]:
'''Now that we have done the imputing above and it works, lets try to change it into a pipeline, so that we can apply it all at once'''
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

#here we have to distinguish between the numerical columns and the categorical columns, because the transformations being applied to each is different
num_attributes = list(X_train.select_dtypes(exclude=['object'])) #to select all num columns, we exclude any column with object types
cat_attributes = list(X_train.select_dtypes(include=['object'])) #here we select all columns with object types

# cat_pipeline = Pipeline([
#     ('one_hot', OneHotEncoder())
# ])

full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), num_attributes),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attributes)
])

X_train_prepared = full_pipeline.fit_transform(X_train, y_train)

In [13]:
'''SELECT AND TRAIN YOUR MODEL'''
#first lets start with a linear regression model to see what kind of values we get
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

lin_reg.fit(X_train_prepared, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
some_data = train_set.iloc[:5]
some_labels = y_train.iloc[:5]

some_data_prepared = full_pipeline.transform(some_data)
print('Predictions: ', lin_reg.predict(some_data_prepared))
print('Labels: ', list(some_labels))

Predictions:  [145152.07722357 170222.90594586  95470.25501555 165644.34459812
 170383.14343404]
Labels:  [145000, 178000, 85000, 175000, 127000]


note that the results above are quite off from the actual labels, so our model isnt great at predicting.
lets try to measure the root mean squared error to see how off we are

In [15]:
from sklearn.metrics import mean_squared_error

predictions = lin_reg.predict(X_train_prepared)
lin_mse = mean_squared_error(y_train, predictions)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)

18895.185448724387


In [16]:
y_train.describe()
#as you can see, the rmse is around 18,900, while the mean sale price is around 181,000

count      1168.000000
mean     181441.541952
std       77263.583862
min       34900.000000
25%      130000.000000
50%      165000.000000
75%      214925.000000
max      745000.000000
Name: SalePrice, dtype: float64

In [17]:
#lets see if we can get a better score, using a different model
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train_prepared, y_train)

#now that model is trained, lets evaluate it on the training set
predictions = tree_reg.predict(X_train_prepared)
tree_mse = mean_squared_error(y_train, predictions)
tree_rmse = np.sqrt(tree_mse)
print(tree_rmse)

0.0


note we get an rmse of 0. clearly this is not correct. the problem is that the model is overfitting. instead, 
lets use cross-validation to test our model and actually evaluate the performance of the tree regressor

In [18]:
#lets use cross_val_score to split our training set
from sklearn.model_selection import cross_val_score

'''cross_val_score(estimator, X, y, scoring= , cv= )'''
scores = cross_val_score(tree_reg, X_train_prepared, y_train, 
                        scoring='neg_mean_squared_error', cv=10)
#cv is the amount of different times the model will be evaluated
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    '''This function displays the results in a neat way to look at'''
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('std: ', scores.std())

display_scores(tree_rmse_scores)


Scores:  [32259.33947864 37194.49874508 46129.79657473 47005.06878687
 57672.96205484 44898.91375088 35989.77818533 32576.19986246
 43932.12381686 34937.47191622]
Mean:  41259.6153171897
std:  7662.807379853428


In [19]:
#compute scores for Linear regression model to compare
lin_reg_scores = cross_val_score(lin_reg, X_train_prepared, y_train, 
                                scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-lin_reg_scores)

display_scores(lin_rmse_scores)

Scores:  [109585.75391255  32874.92719443  22491.97743204  46495.29003277
 104502.61086288  29490.62277859  50670.19058449  21309.52615828
  46717.17416725  26736.54627818]
Mean:  49087.461940144
std:  30612.259051640976


In [20]:
#now lets see how a random forest regressor works
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()

forest_reg.fit(X_train_prepared, y_train)
tree_predict = forest_reg.predict(X_train_prepared)
forest_mse = mean_squared_error(y_train, tree_predict)
forest_rmse = np.sqrt(forest_mse)
print('Forest RMSE: ', forest_rmse)
print('='*50)

forest_reg_scores = cross_val_score(forest_reg, X_train_prepared, y_train, 
                                   scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_reg_scores)


display_scores(forest_rmse_scores)



Forest RMSE:  13914.631477840803
Scores:  [24983.6001953  33342.37817784 25249.27247485 43066.12473551
 39836.9406963  37340.44159554 26971.70237345 25054.94555793
 26545.02325699 27588.67258667]
Mean:  30997.910165037814
std:  6493.793488790379


This model is performing much better than our other two models looked at (the Mean from the cross_val_score is much lower) however, notice that the mean is still higher than the score on the training set (that is, the model is performing better on the training set than it is on the validation set). this means that the model is still overfitting

In [39]:
#first lets save these models so that we can come back to them later
# import joblib
# def save_models(model, save_name):
#     os.mkdir('saved_models')
#     save_path = os.path.join('saved_models', save_name + '.pkl')
#     joblib.dump(model, save_path)

In [21]:
'''Now lets try and fine tune the model'''

#NOTE THAT THIS BLOCK OF CODE WILL TAKE QUITE SOME TIME TO COMPUTE.

from sklearn.model_selection import GridSearchCV

parameters = [
    {'n_estimators': [1, 10, 100], 'max_features': [2, 4, 8, 16, 32]},
    {'bootstrap': [False], 'n_estimators': [10, 100], 'max_features': [8, 32]}
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, parameters, cv=5,
                          scoring = 'neg_mean_squared_error', 
                          return_train_score = True)

grid_search.fit(X_train_prepared, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 8, 16, 

In [22]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 32, 'n_estimators': 100}

In [23]:
grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=32, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [24]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

66078.88613091993 {'max_features': 2, 'n_estimators': 1}
38377.37889576646 {'max_features': 2, 'n_estimators': 10}
36762.44441223664 {'max_features': 2, 'n_estimators': 100}
62812.86323789881 {'max_features': 4, 'n_estimators': 1}
37871.64234245261 {'max_features': 4, 'n_estimators': 10}
35027.7954549494 {'max_features': 4, 'n_estimators': 100}
64060.3630687361 {'max_features': 8, 'n_estimators': 1}
36950.808603310375 {'max_features': 8, 'n_estimators': 10}
33167.0787822414 {'max_features': 8, 'n_estimators': 100}
50464.59659678624 {'max_features': 16, 'n_estimators': 1}
34842.25865992637 {'max_features': 16, 'n_estimators': 10}
31440.577282757935 {'max_features': 16, 'n_estimators': 100}
55568.00254076728 {'max_features': 32, 'n_estimators': 1}
33167.513504815186 {'max_features': 32, 'n_estimators': 10}
30444.615979518185 {'max_features': 32, 'n_estimators': 100}
34396.52183898035 {'bootstrap': False, 'max_features': 8, 'n_estimators': 10}
31745.01943197347 {'bootstrap': False, 'max_f

You can also see the relative importance of each attribute for making accurate predictions

In [25]:
feature_importances = grid_search.best_estimator_.feature_importances_
print(feature_importances)

[4.05315612e-03 8.06053497e-03 1.92308010e-02 1.25632171e-01
 3.46389757e-03 3.65283212e-02 2.18096088e-02 8.84888809e-03
 3.20577347e-02 9.71471440e-04 6.81640575e-03 3.82276204e-02
 4.95144125e-02 2.65109067e-02 2.25613634e-04 7.65549325e-02
 2.99031637e-03 1.40874566e-04 3.38311274e-02 3.21187882e-03
 4.93769411e-03 1.06855378e-03 1.30006969e-02 2.19070212e-02
 2.63068803e-02 6.80747764e-02 3.86326370e-02 4.95853903e-03
 7.80645728e-03 9.43503051e-04 2.50267940e-04 3.07375525e-03
 2.08162015e-03 1.38234019e-04 3.28705504e-03 1.95621311e-03
 7.80986635e-05 1.63437208e-04 6.86594646e-05 1.50357425e-03
 1.18291507e-03 1.33470830e-05 2.30820695e-06 2.30697832e-04
 1.78477057e-04 1.76900439e-04 8.35749229e-04 5.14474909e-04
 4.35938635e-04 1.25486303e-03 1.44246978e-03 8.19807761e-04
 4.80315773e-04 1.50855188e-03 6.43099263e-07 3.31288724e-06
 8.26466830e-04 1.04731869e-03 1.32288409e-04 4.48973231e-05
 7.13589455e-04 1.16928036e-03 4.83351955e-04 4.12684343e-04
 3.74491334e-05 1.885374

In [26]:
attributes = num_attributes + cat_attributes
sorted(zip(feature_importances, attributes), reverse=True)

[(0.1256321707520391, 'OverallQual'),
 (0.07655493245875174, 'GrLivArea'),
 (0.06807477639816001, 'GarageCars'),
 (0.04951441251695005, '1stFlrSF'),
 (0.038632637011933486, 'GarageArea'),
 (0.03822762041515487, 'TotalBsmtSF'),
 (0.0365283211953355, 'YearBuilt'),
 (0.033831127350465624, 'FullBath'),
 (0.032057734725156405, 'BsmtFinSF1'),
 (0.026510906655914667, '2ndFlrSF'),
 (0.026306880297032813, 'GarageYrBlt'),
 (0.021907021190253242, 'Fireplaces'),
 (0.02180960878479208, 'YearRemodAdd'),
 (0.01923080099490955, 'LotArea'),
 (0.013000696885697246, 'TotRmsAbvGrd'),
 (0.008848888093126155, 'MasVnrArea'),
 (0.008060534966445551, 'LotFrontage'),
 (0.007806457278265271, 'OpenPorchSF'),
 (0.006816405746111255, 'BsmtUnfSF'),
 (0.004958539030550783, 'WoodDeckSF'),
 (0.004937694106859778, 'BedroomAbvGr'),
 (0.004053156115842183, 'MSSubClass'),
 (0.003463897569837035, 'OverallCond'),
 (0.0032870550390248347, 'MoSold'),
 (0.0032118788158796235, 'HalfBath'),
 (0.003073755249445144, 'ScreenPorch'),

In [47]:
final_model = grid_search.best_estimator_

X_test = test_set.drop('SalePrice', axis=1)
X_test_prepared = full_pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

30861.95615968882


In [84]:
# '''From the list above, we see the list of most important attributes, from highest to lowest. based on this, we make a new list of the top attributes
# and get rid of the rest. now lets try to see how well our model does after simplifying it this way'''


# cats_to_keep = ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', '1stFlrSF', 'YearBuilt', 'TotalBsmtSF', '2ndFlrSF', 'LotArea', 'BsmtFinSF1', 
#                'FullBath', 'TotRmsAbvGrd', 'GarageYrBlt']

# X_mod = train_set[cats_to_keep]

# X_mod
# #here i might need to make a custome pipeline that will select only those specific columns for further transformation

# scaler = StandardScaler()

# X_mod_tr = scaler.fit_transform(X_mod)

# scaler

In [58]:
test_data = load_data('test.csv')
index = test_data.Id

In [71]:
test_data_prepared = full_pipeline.transform(test_data)

predictions = final_model.predict(test_data_prepared)

predictions = pd.DataFrame(predictions)

final_predictions = predictions.set_index(index)
final_predictions.columns = ['SalePrice']

In [72]:
final_predictions.to_csv('iowa_predictions.csv')

In [73]:
final_predictions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1461 to 2919
Data columns (total 1 columns):
SalePrice    1459 non-null float64
dtypes: float64(1)
memory usage: 22.8 KB


In [74]:
final_predictions

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,128611.44
1462,151843.37
1463,188513.18
1464,183815.04
1465,198907.89
...,...
2915,102689.41
2916,105380.16
2917,176123.29
2918,127226.40
