In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

#this file path can be changed depending on where you want to work.
#for me, this folder is the root folder for all my ml_learning projects
ROOT_DIR = os.path.join('C:\\', 'users', 'sebas', 'onedrive', 'python', 'machine_learning', 'ml_learning')

#this data path is specific for the iowa housing project, and its where the datasets are stored on my local machine
DATA_PATH = os.path.join(ROOT_DIR, 'datasets', 'iowa_housing')

def load_data(filename, data_path=DATA_PATH):
    '''This function will load the data as a pandas dataframe. it takes the filename 
    as an argument which should be the name that the csv file is saved as in your directory.'''
    
    csv_path = os.path.join(data_path, filename)
    return pd.read_csv(csv_path).fillna(0) #we import the data with the null values replaced with 0s right off the bat. this avoids us having to do it later

training_data = load_data('train.csv')

#set id as our index column
training_data.set_index('Id', inplace=True)
#convert object categories to strings
training_data = training_data.apply(lambda x: x.astype('|S') if x.dtype == 'object' else x, axis=0)
training_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,b'RL',65.0,8450,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,2,2008,b'WD',b'Normal',208500
2,20,b'RL',80.0,9600,b'Pave',b'0',b'Reg',b'Lvl',b'AllPub',b'FR2',...,0,b'0',b'0',b'0',0,5,2007,b'WD',b'Normal',181500
3,60,b'RL',68.0,11250,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'Inside',...,0,b'0',b'0',b'0',0,9,2008,b'WD',b'Normal',223500
4,70,b'RL',60.0,9550,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'Corner',...,0,b'0',b'0',b'0',0,2,2006,b'WD',b'Abnorml',140000
5,60,b'RL',84.0,14260,b'Pave',b'0',b'IR1',b'Lvl',b'AllPub',b'FR2',...,0,b'0',b'0',b'0',0,12,2008,b'WD',b'Normal',250000


In [2]:
#lets split this dataset into a seperate testing and training set so that we can evaluate our model, even though there is already a testing set provided.
from sklearn.model_selection import train_test_split
train_set, test_set, y_train, y_test = train_test_split(training_data, training_data.SalePrice, test_size=0.2, random_state=42)

train_set_cats = train_set.select_dtypes(include=['object'].copy()) #this line will show you all the columns which have a categorical value, if you needed to know that

X_train = train_set.drop('SalePrice', axis=1)
y_train = train_set['SalePrice'].copy()

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):

        return X[self.columns]
 

In [4]:
relevant_columns = ['OverallQual','GrLivArea','GarageCars','GarageArea','YearBuilt','BsmtFinSF1','FullBath', 
                    'GarageYrBlt', 'TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'HalfBath']

cs = ColumnSelector(columns=relevant_columns)
transformed = cs.fit_transform(X_train)

transformed

Unnamed: 0_level_0,OverallQual,GrLivArea,GarageCars,GarageArea,YearBuilt,BsmtFinSF1,FullBath,GarageYrBlt,TotalBsmtSF,2ndFlrSF,1stFlrSF,HalfBath
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
255,5,1314,1,294,1957,922,1,1957.0,1314,0,1314,0
1067,6,1571,2,380,1993,0,2,1993.0,799,772,799,1
639,5,796,0,0,1910,0,1,0.0,796,0,796,0
800,5,1768,1,240,1937,569,1,1939.0,731,787,981,1
381,5,1691,1,308,1924,218,2,1924.0,1026,665,1026,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1096,6,1314,2,440,2006,24,2,2006.0,1314,0,1314,0
1131,4,1981,2,576,1928,622,2,1981.0,1122,653,1328,0
1295,5,864,2,572,1955,167,1,1957.0,864,0,864,0
861,7,1426,1,216,1918,0,1,1925.0,912,514,912,1


In [68]:
 total_bsmt_sa_ix, second_flr_ix, first_flr_ix, full_bath_ix, half_bath_ix = [
            list(transformed.columns).index(col) for col in ('TotalBsmtSF', '2ndFlrSF', '1stFlrSF', 'FullBath', 'HalfBath')]

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    
    
    def __init__(self, add_total_sa=True, add_total_baths=True):
        self.add_total_sa = add_total_sa
        self.add_total_baths = add_total_baths
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
            
            
        if self.add_total_sa and self.add_total_baths:
            total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
            total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
            return np.c_[X, total_sa, total_bath]

        elif self.add_total_sa:
            total_sa = X.iloc[:, total_bsmt_sa_ix] + X.iloc[:, second_flr_ix] + X.iloc[:, first_flr_ix]
            return np.c_[X, total_sa]

        elif self.add_total_baths:
            total_bath = X.iloc[:, full_bath_ix] + X.iloc[:, half_bath_ix]/2
            return np.c_[X, total_bath]

        else:
            return X
            
            

In [62]:
atr_adder = CombinedAttributesAdder()
housing_extra_attr = atr_adder.transform(transformed)

In [63]:
housing_extra_attr = pd.DataFrame(housing_extra_attr, columns=relevant_columns+['total_sa', 'total_bath'], index=transformed.index)
housing_extra_attr.head()
                                

Unnamed: 0_level_0,OverallQual,GrLivArea,GarageCars,GarageArea,YearBuilt,BsmtFinSF1,FullBath,GarageYrBlt,TotalBsmtSF,2ndFlrSF,1stFlrSF,HalfBath,total_sa,total_bath
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
255,5.0,1314.0,1.0,294.0,1957.0,922.0,1.0,1957.0,1314.0,0.0,1314.0,0.0,2628.0,1.0
1067,6.0,1571.0,2.0,380.0,1993.0,0.0,2.0,1993.0,799.0,772.0,799.0,1.0,2370.0,2.5
639,5.0,796.0,0.0,0.0,1910.0,0.0,1.0,0.0,796.0,0.0,796.0,0.0,1592.0,1.0
800,5.0,1768.0,1.0,240.0,1937.0,569.0,1.0,1939.0,731.0,787.0,981.0,1.0,2499.0,1.5
381,5.0,1691.0,1.0,308.0,1924.0,218.0,2.0,1924.0,1026.0,665.0,1026.0,0.0,2717.0,2.0


In [89]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('column_selector', ColumnSelector(columns=relevant_columns)),
    ('attr adder', CombinedAttributesAdder(add_total_sa=False)),
    ('scaler', StandardScaler())
])

X_train_prepd = pipeline.fit_transform(X_train)

In [24]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train_prepd, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
# let's try the full preprocessing pipeline on a few training instances
some_data = X_train.iloc[:5]
some_labels = y_train.iloc[:5]
some_data_prepared = pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [139923.45965399 173279.30614767  78841.29491201 140939.62845131
 126708.85399221]


In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train_prepd, y_train)

predictions = tree_reg.predict(X_train_prepd)
tree_mse = mean_squared_error(y_train, predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

516.0253391888149

In [28]:
#lets use cross_val_score to split our training set
from sklearn.model_selection import cross_val_score

'''cross_val_score(estimator, X, y, scoring= , cv= )'''
scores = cross_val_score(tree_reg, X_train_prepd, y_train, 
                        scoring='neg_mean_squared_error', cv=10)
#cv is the amount of different times the model will be evaluated
tree_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    '''This function displays the results in a neat way to look at'''
    print('Scores: ', scores)
    print('Mean: ', scores.mean())
    print('std: ', scores.std())

display_scores(tree_rmse_scores)


Scores:  [34021.14006026 40958.96952914 32462.65000347 51678.44553451
 49222.95737772 40303.73603518 42648.38661641 32908.82679478
 42406.17957337 38305.09150208]
Mean:  40491.63830269089
std:  6152.750149527408


In [29]:
#compute scores for Linear regression model to compare
lin_reg_scores = cross_val_score(lin_reg, X_train_prepd, y_train, 
                                scoring='neg_mean_squared_error', cv=10)
lin_rmse_scores = np.sqrt(-lin_reg_scores)

display_scores(lin_rmse_scores)

Scores:  [28516.07247209 41047.17472359 27603.77988333 47443.46001853
 67301.36004983 41006.98939888 33881.44749781 26648.90661731
 27735.38726962 31510.81646948]
Mean:  37269.539440047265
std:  12050.095509158655


In [30]:
#now lets see how a random forest regressor works
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor()

forest_reg.fit(X_train_prepd, y_train)
tree_predict = forest_reg.predict(X_train_prepd)
forest_mse = mean_squared_error(y_train, tree_predict)
forest_rmse = np.sqrt(forest_mse)
print('Forest RMSE: ', forest_rmse)
print('='*50)

forest_reg_scores = cross_val_score(forest_reg, X_train_prepd, y_train, 
                                   scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_reg_scores)


display_scores(forest_rmse_scores)



Forest RMSE:  12749.920801881799
Scores:  [21482.90739072 37248.07755856 26460.50888167 50033.22743721
 37295.43788283 34015.31607932 29523.10309105 27674.32705715
 27497.69226616 27136.58558627]
Mean:  31836.718323093904
std:  7711.299503139754


In [87]:
'''Now lets try and fine tune the model'''

#NOTE THAT THIS BLOCK OF CODE WILL TAKE QUITE SOME TIME TO COMPUTE.

from sklearn.model_selection import GridSearchCV

parameters = [
    {'n_estimators': [1, 10, 100], 'max_features': [2, 4, 8, 13]},
    {'bootstrap': [False], 'n_estimators': [10, 100], 'max_features': [8, 13]}
]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, parameters, cv=5,
                          scoring = 'neg_mean_squared_error', 
                          return_train_score = True)

grid_search.fit(X_train_prepd, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'max_features': [2, 4, 8, 13],

In [35]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 10}

In [84]:
final_model = grid_search.best_estimator_

X_test = test_set.drop('SalePrice', axis=1)
X_test_prepared = pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

29157.576811890325


In [88]:
#add_total_baths in pipleine set to false
final_model = grid_search.best_estimator_

X_test = test_set.drop('SalePrice', axis=1)
X_test_prepared = pipeline.transform(X_test)

final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)

30643.278722518327
