# Importing all the needed Libraries

In [212]:
'''Importing Numpy and Pandas'''
import numpy as np 
import pandas as pd 

'''Machine Learning Packages'''
from sklearn.cross_validation import cross_val_predict, cross_val_score 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,ElasticNetCV, LassoCV, RidgeCV , Ridge, Lasso, LassoLarsIC,  ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest, RFE, RFECV, SelectPercentile

'''Preprocessing Packages'''
from sklearn.preprocessing import StandardScaler, Imputer, PolynomialFeatures,Binarizer, OneHotEncoder, LabelEncoder


'''Plotting Packages'''
import seaborn as sns
import matplotlib.pyplot as plt 
%matplotlib inline 


from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.ensemble import RandomForestRegressor

# import xgboost as xgb


# Loading the Test and the Train Datasets

In [6]:
'''Loading the CSV into the training and the testing dataframe'''
train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')

In [7]:
'''Replacing the spaces in the column names with the _'''
train_raw.columns = train_raw.columns.str.replace(' ', '')
test_raw.columns = test_raw.columns.str.replace(' ', '')

# Preliminary Data Analysis

In [36]:
'''Validating the training dataset'''
train_raw.head(2)

Unnamed: 0,Id,PID,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,source
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,,,,0,3,2010,WD,Normal,130500,Train
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,,,,0,4,2009,WD,Normal,220000,Train


In [37]:
'''Validating the test dataset'''
test_raw.head(2)

Unnamed: 0,Id,PID,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,source
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,Test
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,Test


In [38]:
'''Adding a source column in the training dataset'''

train_raw['source'] = 'Train'
train_raw.head(2)

Unnamed: 0,Id,PID,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,source
0,109,533352170,60,RL,,13517,Pave,,IR1,Lvl,...,,,,0,3,2010,WD,Normal,130500,Train
1,544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,...,,,,0,4,2009,WD,Normal,220000,Train


In [45]:
'''Adding a source column in the test dataset'''

test_raw['source'] = 'Test'
test_raw.head(2)

Unnamed: 0,Id,PID,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,source
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,,,,0,4,2006,WD,Test
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,,,,0,8,2006,WD,Test


# Data Conversion

In [50]:
'''Converting MSSubClass and OverallCond to String'''

train_raw['MSSubClass'] = train_raw['MSSubClass'].astype(str)
test_raw['MSSubClass'] = test_raw['MSSubClass'].astype(str)
train_raw['OverallCond'] = train_raw['OverallCond'].astype(str)
test_raw['OverallCond'] = test_raw['OverallCond'].astype(str)

In [51]:
'''Merging the Training and the Test Dataset before creating the dummies'''
train_test_merged = train_raw.append(test_raw)
train_test_merged.head(2)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemod/Add,YrSold,source
0,725,754,0,,3,1Fam,TA,No,533.0,0.0,...,0,Pave,6,725.0,AllPub,0,1976,2005,2010,Train
1,913,1209,0,,4,1Fam,TA,No,637.0,0.0,...,0,Pave,8,913.0,AllPub,0,1996,1997,2009,Train


In [62]:
'''
Validating the count of the Train and Test Records in the Merged Dataframe 
Note: This column was added manually to deal with the mutually exclusive class categories in the training and the test dataset
'''
train_test_merged.source.value_counts()

Train    2051
Test      879
Name: source, dtype: int64

# Reviewing the list of non-null columns in the dataset

In [53]:
train_test_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2930 entries, 0 to 878
Data columns (total 83 columns):
1stFlrSF         2930 non-null int64
2ndFlrSF         2930 non-null int64
3SsnPorch        2930 non-null int64
Alley            198 non-null object
BedroomAbvGr     2930 non-null int64
BldgType         2930 non-null object
BsmtCond         2850 non-null object
BsmtExposure     2847 non-null object
BsmtFinSF1       2929 non-null float64
BsmtFinSF2       2929 non-null float64
BsmtFinType1     2850 non-null object
BsmtFinType2     2849 non-null object
BsmtFullBath     2928 non-null float64
BsmtHalfBath     2928 non-null float64
BsmtQual         2850 non-null object
BsmtUnfSF        2929 non-null float64
CentralAir       2930 non-null object
Condition1       2930 non-null object
Condition2       2930 non-null object
Electrical       2929 non-null object
EnclosedPorch    2930 non-null int64
ExterCond        2930 non-null object
ExterQual        2930 non-null object
Exterior1st      2930

Right away we noticed that there are handful of columns that have null values. This will require some level of handling (e.g. Imputation)

In [56]:
'''Dropping columns that have very little non-null values '''

cols_not_needed_for_regression = ['Alley', 'Fence', 'PoolQC', 'MiscFeature']
train_test_merged.drop(cols_not_needed_for_regression,inplace=True)

# Initial Review of the Correlation of the predictor variables with the Target Variables

In [58]:
'''Code below looks at the correlation of the individual predictor variables with the target variables, and also lists out the predictors that have a positive correlation with the target variables'''

traindata_corr = train_raw.corr()['SalePrice'][:-1]
positively_corelated = traindata_corr[traindata_corr.sort_values(ascending=False)>0]
positively_corelated_columns= list(positively_corelated.index)
positively_corelated_columns

['LotFrontage',
 'LotArea',
 'OverallQual',
 'YearBuilt',
 'YearRemod/Add',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'BsmtFullBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MoSold']

# List of Predictors that have a high level of correlation with the target Variables

In [61]:
positively_corelated.sort_values(ascending=False)

OverallQual      0.800207
GrLivArea        0.697038
GarageArea       0.650270
GarageCars       0.648220
TotalBsmtSF      0.628925
1stFlrSF         0.618486
YearBuilt        0.571849
YearRemod/Add    0.550370
FullBath         0.537969
GarageYrBlt      0.533922
MasVnrArea       0.512230
TotRmsAbvGrd     0.504014
Fireplaces       0.471093
BsmtFinSF1       0.423519
LotFrontage      0.341842
OpenPorchSF      0.333476
WoodDeckSF       0.326490
LotArea          0.296566
BsmtFullBath     0.283662
HalfBath         0.283001
2ndFlrSF         0.248452
BsmtUnfSF        0.190210
BedroomAbvGr     0.137067
ScreenPorch      0.134581
3SsnPorch        0.048732
MoSold           0.032735
PoolArea         0.023106
BsmtFinSF2       0.016255
Name: SalePrice, dtype: float64

# Shape Validation

In [64]:
'''Validating the Shape of the Training, Test and the Merged Dataframes'''

print ('Training Shape: ', train_raw.shape)
print ('Testing Shape: ', test_raw.shape)
print ('Merge Shape: ', train_test_merged.shape)

Training Shape:  (2051, 83)
Testing Shape:  (879, 81)
Merge Shape:  (2930, 83)


This step is necessary to confirm/validate that all the data manipulations has not convulated the dataset. We notice that, the test dataset has two columns less. This is because the test dataset does not contain the SalePrice and the SaleCondition

# List of Null Columns

In [70]:
'''Retrieving the list of null columns from the merged Dataset'''
null_col_list = list(train_test_merged.columns[train_test_merged.isnull().sum()>0])
null_col_list

['Alley',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtFullBath',
 'BsmtHalfBath',
 'BsmtQual',
 'BsmtUnfSF',
 'Electrical',
 'Fence',
 'FireplaceQu',
 'GarageArea',
 'GarageCars',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'GarageYrBlt',
 'LotFrontage',
 'MasVnrArea',
 'MasVnrType',
 'MiscFeature',
 'PoolQC',
 'SaleCondition',
 'SalePrice',
 'TotalBsmtSF']

# Total Count of Null Columns

In [71]:
'''Getting the count of total number of null columns'''
print ('Number of Null Columns: ', len(null_col_list))

Number of Null Columns:  29


# Total Count of Numeric and Non-Numeric Columns

In [79]:
# '''This step is to count the number of the Numeric and Non-Numeric columns'''
# object_col_count = 0 
# numeric_col_count = 0 


# for col_type in train_test_merged.dtypes: 
#     if col_type ==np.object: 
#         object_col_count += 1 
#     elif col_type == np.int64 or col_type == np.float64: 
#         numeric_col_count += 1
#     else: 
#         continue

# print ('Object Col Count: ', object_col_count)
# print ('Numberic Col Count: ', numeric_col_count)

In [80]:
object_col_list = []
numeric_col_list = [] 


for k,v in enumerate(train_test_merged.dtypes):
    if v == np.object: 
        object_col_list.append(train_test_merged.columns[k])
    else: 
        numeric_col_list.append(train_test_merged.columns[k])

print ('Columns of type Object:\n\n',  object_col_list)
print ('\n')
print ('Number of Object Columns: ', len(object_col_list))
print ('\n')
print ('Columns of type Int or Float:\n\n',  numeric_col_list)
print ('\n')
print ('Number of Numeric Columns: ', len(numeric_col_list))

Columns of type Object:

 ['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig', 'LotShape', 'MSSubClass', 'MSZoning', 'MasVnrType', 'MiscFeature', 'Neighborhood', 'OverallCond', 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType', 'Street', 'Utilities', 'source']


Number of Object Columns:  46


Columns of type Int or Float:

 ['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'Id', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'Lo

# Null Columns in the Merged Dataframe 

In [83]:
'''Reviewing the null columns types in the merged dataset'''
train_test_merged[null_col_list].dtypes

Alley             object
BsmtCond          object
BsmtExposure      object
BsmtFinSF1       float64
BsmtFinSF2       float64
BsmtFinType1      object
BsmtFinType2      object
BsmtFullBath     float64
BsmtHalfBath     float64
BsmtQual          object
BsmtUnfSF        float64
Electrical        object
Fence             object
FireplaceQu       object
GarageArea       float64
GarageCars       float64
GarageCond        object
GarageFinish      object
GarageQual        object
GarageType        object
GarageYrBlt      float64
LotFrontage      float64
MasVnrArea       float64
MasVnrType        object
MiscFeature       object
PoolQC            object
SaleCondition     object
SalePrice        float64
TotalBsmtSF      float64
dtype: object

# Function for the Categorical Imputer

In [85]:
'''Function to impute the categorical columns with the Most Frequent Values'''

def categorical_imputer(col):
    print ('Null Count in ', '"',  cols ,'"',  'column Pre Transform: ' , train_test_merged[col].isnull().sum())
    train_test_merged[col].fillna(value = train_test_merged[col].value_counts().index[0], inplace=True)
    print ('Null Count in ', '"',  cols ,'"',  'column Post Transform: ' , train_test_merged[col].isnull().sum())
    print ('\n')
    

In [86]:
'''Using the for loop on the null columns and imputing the object columns with the most frequent value'''

for cols in null_col_list: 
    if train_test_merged[cols].dtypes == 'object': 
        categorical_imputer(cols)

Null Count in  " Alley " column Pre Transform:  2732
Null Count in  " Alley " column Post Transform:  0


Null Count in  " BsmtCond " column Pre Transform:  80
Null Count in  " BsmtCond " column Post Transform:  0


Null Count in  " BsmtExposure " column Pre Transform:  83
Null Count in  " BsmtExposure " column Post Transform:  0


Null Count in  " BsmtFinType1 " column Pre Transform:  80
Null Count in  " BsmtFinType1 " column Post Transform:  0


Null Count in  " BsmtFinType2 " column Pre Transform:  81
Null Count in  " BsmtFinType2 " column Post Transform:  0


Null Count in  " BsmtQual " column Pre Transform:  80
Null Count in  " BsmtQual " column Post Transform:  0


Null Count in  " Electrical " column Pre Transform:  1
Null Count in  " Electrical " column Post Transform:  0


Null Count in  " Fence " column Pre Transform:  2358
Null Count in  " Fence " column Post Transform:  0


Null Count in  " FireplaceQu " column Pre Transform:  1422
Null Count in  " FireplaceQu " column Post

**The result above gives the entire list of categorical columns that were imputed using the most frequent value**

In [87]:
'''Validating the remaning null columms'''

train_test_merged.columns[train_test_merged.isnull().sum()>0]

Index(['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
       'GarageArea', 'GarageCars', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea',
       'SalePrice', 'TotalBsmtSF'],
      dtype='object')

**On quick review, these look like numerical columns. We are going to impute these columns in the subsequent cells using the median imputer from the Pre-Processing module**

# Creating the Dummies for the Categorical Columns

In [98]:
train_test_merge_dummies = pd.get_dummies(train_test_merged, drop_first=True)
print (train_test_merge_dummies.shape)

(2930, 286)


In [107]:
cols_to_exclude = ['PID', 'SaleCondition', 'SalePrice', 'Id']

In [100]:
# train_test_merged.source.value_counts()

# Setting the list of columns to be included in the Model Building

In [101]:
cols_to_include= []
for columns in list(train_test_merge_dummies.columns): 
    if columns not in cols_to_exclude: 
        cols_to_include.append(columns)
cols_to_include[0:5]

['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1']

The list of columns to be included is stored in the **cols_to_include** list

# Validating the list of Null Columns in the Dummied Dataframe

In [96]:
train_test_merge_dummies.columns[train_test_merge_dummies.isnull().sum()>0]

Index(['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
       'GarageArea', 'GarageCars', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea',
       'SalePrice', 'TotalBsmtSF'],
      dtype='object')

All of the columns are numerical values

# Extracting the test and training sets to build the model 

In [104]:
'''Setting up the Predictors (X) and the Target (Y) variables on the Training Set'''

X_new_training_data = train_test_merge_dummies.loc[train_test_merge_dummies['source_Train'] == 1,cols_to_include]
y_new_training_data = train_test_merge_dummies.loc[train_test_merge_dummies['source_Train'] == 1,'SalePrice']

In [105]:
'''Setting up the predictors (X) on the Test Set'''

X_new_test_data = train_test_merge_dummies.loc[train_test_merge_dummies['source_Train'] == 0,cols_to_include]

In [108]:
# X_new_training_data.drop(['GarageYrBlt', 'LotFrontage', 'MasVnrArea','Id'], axis=1, inplace=True)

In [109]:
# X_new_test_data.drop(['GarageYrBlt', 'LotFrontage', 'MasVnrArea', 'Id'], axis=1, inplace=True)

# Checking the Shape of the Datasets

In [127]:
print (X_new_training_data.shape)
print (y_new_training_data.shape)
print (X_new_test_data.shape)

(2051, 283)
(2051,)
(879, 283)


In [128]:
# X_new_training_data.columns

In [129]:
# def numeric_col_extractor(df):
#     for k,v in enumerate(df.dtypes):
#         if v == 'int64' or v == 'float64' :
#             numeric_col_list_dummies.append(df.columns[k])
#         else: 
#             continue
#     return numeric_col_list_dummies


In [130]:
# num_cols = numeric_col_extractor(X_new_training_data)
# numeric_df = X_new_training_data.loc[:,num_cols]
# im = Imputer(strategy = 'median')
# transformed_train = im.fit_transform(numeric_df)

# num_test_cols = numeric_col_extractor(X_new_training_data)
# numeric_test_df = X_new_test_data.loc[:,num_test_cols]
# transformed_test = im.transform(numeric_test_df)

# Setting up the Median Imputer

In [131]:
median_imputer = Imputer(strategy='median')

# Retrieving Count of Null Numeric Columns in the Training Set

In [132]:
'''Numeric Columns in the Training set that needs to be addressed. In other words, we are going to apply the median imputer on all the numerical columns in the training set'''

print ('Total Number of Null Numeric Columns that needs to be addressed: ', len(X_new_training_data.columns[X_new_training_data.isnull().sum()>0]))

print ('\nHere is the list: ')

X_new_training_data.columns[X_new_training_data.isnull().sum()>0]


Total Number of Null Numeric Columns that needs to be addressed:  11

Here is the list: 


Index(['BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF',
       'GarageArea', 'GarageCars', 'GarageYrBlt', 'LotFrontage', 'MasVnrArea',
       'TotalBsmtSF'],
      dtype='object')

# Getting a sense of numerical columns in the training set

In [133]:
numeric_col_list_dummies=[]
for k,v in enumerate(X_new_training_data.dtypes):
    if v == 'int64' or v == 'float64' : 
        numeric_col_list_dummies.append(X_new_training_data.columns[k])

    else: 
        continue
print (numeric_col_list_dummies)

['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF', 'MasVnrArea', 'MiscVal', 'MoSold', 'OpenPorchSF', 'OverallQual', 'PoolArea', 'ScreenPorch', 'TotRmsAbvGrd', 'TotalBsmtSF', 'WoodDeckSF', 'YearBuilt', 'YearRemod/Add', 'YrSold']


In [134]:
# '''Imputing all the numeric colums in X_Train'''
# for cols in numeric_col_list_dummies:
#     X_new_training_data[cols] = median_imputer.fit_transform(X_new_training_data[[cols]])

# Imputing the Null Columns with the Median Values using the Imputer

In [154]:
BsmtFinSF1_fit = median_imputer.fit(X_new_training_data[['BsmtFinSF1']])
X_new_training_data['BsmtFinSF1'] = BsmtFinSF1_fit.transform(X_new_training_data[['BsmtFinSF1']])

BsmtFinSF2_fit = median_imputer.fit(X_new_training_data[['BsmtFinSF2']])
X_new_training_data['BsmtFinSF2'] = BsmtFinSF2_fit.transform(X_new_training_data[['BsmtFinSF2']])

BsmtFullBath_fit = median_imputer.fit(X_new_training_data[['BsmtFullBath']])
X_new_training_data['BsmtFullBath'] = BsmtFullBath_fit.transform(X_new_training_data[['BsmtFullBath']])

BsmtHalfBath_fit = median_imputer.fit(X_new_training_data[['BsmtHalfBath']])
X_new_training_data['BsmtHalfBath'] = BsmtHalfBath_fit.transform(X_new_training_data[['BsmtHalfBath']])

BsmtUnfSF_fit = median_imputer.fit(X_new_training_data[['BsmtUnfSF']])
X_new_training_data['BsmtUnfSF'] = BsmtUnfSF_fit.transform(X_new_training_data[['BsmtUnfSF']])

GarageArea_fit = median_imputer.fit(X_new_training_data[['GarageArea']])
X_new_training_data['GarageArea'] = GarageArea_fit.transform(X_new_training_data[['GarageArea']])

GarageCars_fit = median_imputer.fit(X_new_training_data[['GarageCars']])
X_new_training_data['GarageCars'] = GarageCars_fit.transform(X_new_training_data[['GarageCars']])

GarageYrBlt_fit = median_imputer.fit(X_new_training_data[['GarageYrBlt']])
X_new_training_data['GarageYrBlt'] = GarageYrBlt_fit.transform(X_new_training_data[['GarageYrBlt']])

LotFrontage_fit = median_imputer.fit(X_new_training_data[['LotFrontage']])
X_new_training_data['LotFrontage'] = LotFrontage_fit.transform(X_new_training_data[['LotFrontage']])

MasVnrArea_fit = median_imputer.fit(X_new_training_data[['MasVnrArea']])
X_new_training_data['MasVnrArea'] = MasVnrArea_fit.transform(X_new_training_data[['MasVnrArea']])

TotalBsmtSF_fit = median_imputer.fit(X_new_training_data[['TotalBsmtSF']])
X_new_training_data['TotalBsmtSF'] = TotalBsmtSF_fit.transform(X_new_training_data[['TotalBsmtSF']])


**The above imputation should probably be carried out in a loop**

In [155]:
'''Validating to see if there are any null values in the predictors within the training set'''

X_new_training_data.columns[X_new_training_data.isnull().sum()>0]

Index([], dtype='object')

In [157]:
'''Validating to see if there are any null values in the Target Variables within the training set'''

y_new_training_data.isnull().sum()

0

As expected the predictors and the targets in the dataset dont have any null values 

# Model Building and Evaluation

# Basic Regression

In [160]:
'''Multiple Linear Regression '''

lr = LinearRegression()

In [165]:
'''Fitting the Model on the Training Set'''

lr.fit(X_new_training_data, y_new_training_data)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [167]:
'''Scoring the Model on the Training Set'''

lr.score(X_new_training_data, y_new_training_data)

0.9444094662289878

In [168]:
'''Predicting the Training Values '''

y_train_predicted = lr.predict(X_new_training_data)

# Applying the model on to Test Set 

#### Checking the list of null columns in the Test Set that will require imputation

In [171]:
X_new_test_data.columns[X_new_test_data.isnull().sum()>0]

Index(['GarageYrBlt', 'LotFrontage', 'MasVnrArea'], dtype='object')

The columns *'GarageYrBlt', 'LotFrontage', 'MasVnrArea'* needs imputation from the training model

In [173]:
X_new_test_data['GarageYrBlt'] = GarageYrBlt_fit.transform(X_new_test_data[['GarageYrBlt']])

X_new_test_data['LotFrontage'] = LotFrontage_fit.transform(X_new_test_data[['LotFrontage']])

X_new_test_data['MasVnrArea'] = MasVnrArea_fit.transform(X_new_test_data[['MasVnrArea']])

In [174]:
'''Performing validation to see if there are any more null values'''
X_new_test_data.columns[X_new_test_data.isnull().sum()>0]

Index([], dtype='object')

As expected after the imputation there are no more null values

# Predicting the value of the Target Variable on the Test Set

In [176]:
y_predicted = lr.predict(X_new_test_data)

In [177]:
'''Reviewing the first five values of the Predicted set'''
y_predicted[0:5]

array([ 132434.44175261,  254158.01500878,  193002.24266482,
        106105.24980435,  269366.29379859])

In [180]:
'''Creating a Dataframe of the predicted values. This will be used to create the CSV file'''

d = {'Id': test_raw['Id'], 'SalePrice': y_predicted}
predicted_df= pd.DataFrame(d)
predicted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 2 columns):
Id           879 non-null int64
SalePrice    879 non-null float64
dtypes: float64(1), int64(1)
memory usage: 13.8 KB


## CSV File Creation with the Basic Model

In [184]:
predicted_df.to_csv('ac_basic_predictions.csv', index=False)

## Root Mean Square Error 

In [185]:
rmse = np.sqrt(mean_squared_error(y_new_training_data, y_train_predicted))
print ('Root Mean Square Error: ', rmse)

Root Mean Square Error:  18682.769015


# Ridge 

In [199]:
'''Using the RidgeCV Approach to get the Optimal Alpha Value'''

ridge_alphas = np.logspace(0, 5, 200)

optimal_ridge = RidgeCV(alphas=ridge_alphas, cv=10)
optimal_ridge.fit(X_new_training_data, y_new_training_data)

print(optimal_ridge.alpha_)

5.05263106534


In [200]:
'''Using the Optimal Alpha Value calculated in the previous step to perform a Ridge Regression'''

ridge = Ridge(alpha=optimal_ridge.alpha_)
ridge_scores = cross_val_score(ridge, X_new_training_data, y_new_training_data, cv=10)

print(ridge_scores)
print(np.mean(ridge_scores))

[ 0.89090039  0.86367156  0.90952369  0.91556022  0.60831713  0.90414927
  0.90775448  0.89704457  0.89179566  0.72519327]
0.851391023175


In [201]:
'''Fitting the Model'''

ridge.fit(X_new_training_data, y_new_training_data)

Ridge(alpha=5.0526310653356807, copy_X=True, fit_intercept=True,
   max_iter=None, normalize=False, random_state=None, solver='auto',
   tol=0.001)

In [202]:
'''Scoring the Model'''
ridge.score(X_new_training_data, y_new_training_data)

0.91620867193542199

In [203]:
'''Predicting on the Training Set'''
y_ridge_train_predict = ridge.predict(X_new_training_data)

In [204]:
'''Predicting on the Test Set'''
y_ridge_test_predict = ridge.predict(X_new_test_data)

In [205]:
'''Creating a Dataframe of the predicted values. This will be used to create the CSV file'''

y_ridge_test_predict = ridge.predict(X_new_test_data)
d = {'Id': test_raw['Id'], 'SalePrice': y_ridge_test_predict}
predicted_df= pd.DataFrame(d)
predicted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 2 columns):
Id           879 non-null int64
SalePrice    879 non-null float64
dtypes: float64(1), int64(1)
memory usage: 13.8 KB


## CSV File Creation with the Ridge Model

In [206]:
predicted_df.to_csv('ac_ridge_predictions.csv', index=False)

## Root Mean Square Error 

In [207]:
rmse = np.sqrt(mean_squared_error(y_new_training_data, y_ridge_train_predict))
print ('Root Mean Square Error: ', rmse)

Root Mean Square Error:  22937.1977774


# Lasso

In [220]:
'''Calculating the Optimal Value of Alpha using the LassoLarsIC Method'''

model_aic = LassoLarsIC(criterion='aic')
model_aic.fit(X_new_training_data, y_new_training_data)
alpha_aic_ = model_aic.alpha_

print ('Optimal Alpha: ', alpha_aic_)

Optimal Alpha:  2.08540113522




In [227]:
'''Fitting the Model'''

lasso = Lasso(max_iter=8000, alpha=alpha_aic_)
lasso.fit(X_new_training_data, y_new_training_data)



Lasso(alpha=2.0854011352200117, copy_X=True, fit_intercept=True,
   max_iter=8000, normalize=False, positive=False, precompute=False,
   random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [228]:
'''Scoring the Model'''

lasso.score(X_new_training_data, y_new_training_data)

0.94388551546838118

In [229]:
'''Predicting on the Test Set'''
y_lasso_test_predict = lasso.predict(X_new_test_data)

In [231]:
'''Predicting on the Train Set'''
y_lasso_train_predict = lasso.predict(X_new_training_data)

In [232]:
'''Creating a Dataframe of the predicted values. This will be used to create the CSV file'''

d = {'Id': test_raw['Id'], 'SalePrice': y_lasso_test_predict}
predicted_df= pd.DataFrame(d)
predicted_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 2 columns):
Id           879 non-null int64
SalePrice    879 non-null float64
dtypes: float64(1), int64(1)
memory usage: 13.8 KB


## CSV File Creation from the Basic Lasso Model 

In [235]:
predicted_df.to_csv('ac_lasso_predictions.csv', index=False)

## Root Mean Square Error 

In [237]:
# Root Mean Square Error 
rmse = np.sqrt(mean_squared_error(y_new_training_data, y_lasso_train_predict))
print ('Root Mean Square Error: ', rmse)


Root Mean Square Error:  18770.6067548


# Elastic Net 

In [239]:
ensCV = ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 2, 10],\
                       l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000)



In [240]:
ensCV.get_params

<bound method BaseEstimator.get_params of ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 2, 10], copy_X=True,
       cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0.01, 0.1, 0.5, 0.9, 0.99], max_iter=5000, n_alphas=100,
       n_jobs=1, normalize=False, positive=False, precompute='auto',
       random_state=None, selection='cyclic', tol=0.0001, verbose=0)>

In [245]:
'''Fitting the Basic Model'''

ensCV.fit(X_new_training_data, y_new_training_data)



ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 2, 10], copy_X=True,
       cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0.01, 0.1, 0.5, 0.9, 0.99], max_iter=5000, n_alphas=100,
       n_jobs=1, normalize=False, positive=False, precompute='auto',
       random_state=None, selection='cyclic', tol=0.0001, verbose=0)

In [246]:
'''Scoring the Basic Model'''

ensCV.score(X_new_training_data, y_new_training_data)

0.92012928479840117

In [249]:
'''Predicting on the Train'''

y_enscv_train_predict = ensCV.predict(X_new_training_data)

In [248]:
'''Creating a Dataframe of the predicted values. This will be used to create the CSV file'''

y_enscv_test_predict = ensCV.predict(X_new_test_data)

d = {'Id': test_raw['Id'], 'SalePrice': y_enscv_test_predict}
predicted_df= pd.DataFrame(d)
predicted_df.info()

predicted_df.to_csv('ac_ens_predictions.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 2 columns):
Id           879 non-null int64
SalePrice    879 non-null float64
dtypes: float64(1), int64(1)
memory usage: 13.8 KB


# Root Mean Square Error 

In [250]:
rmse = np.sqrt(mean_squared_error(y_new_training_data, y_enscv_train_predict))
print ('Root Mean Square Error: ', rmse)

Root Mean Square Error:  22394.1513503


# Using RFE to perform Feature Selection

In [862]:
rfe = RFE(gb, n_features_to_select=100, verbose=2)

In [863]:
rfe.get_support

<bound method SelectorMixin.get_support of RFE(estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=8, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=8, min_samples_split=6,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
  n_features_to_select=100, step=1, verbose=2)>

In [864]:
rfe.fit(X_new_training_data, y_new_training_data)
rfe.score(X_new_training_data, y_new_training_data)

Fitting estimator with 281 features.
Fitting estimator with 280 features.
Fitting estimator with 279 features.
Fitting estimator with 278 features.
Fitting estimator with 277 features.
Fitting estimator with 276 features.
Fitting estimator with 275 features.
Fitting estimator with 274 features.
Fitting estimator with 273 features.
Fitting estimator with 272 features.
Fitting estimator with 271 features.
Fitting estimator with 270 features.
Fitting estimator with 269 features.
Fitting estimator with 268 features.
Fitting estimator with 267 features.
Fitting estimator with 266 features.
Fitting estimator with 265 features.
Fitting estimator with 264 features.
Fitting estimator with 263 features.
Fitting estimator with 262 features.
Fitting estimator with 261 features.
Fitting estimator with 260 features.
Fitting estimator with 259 features.
Fitting estimator with 258 features.
Fitting estimator with 257 features.
Fitting estimator with 256 features.
Fitting estimator with 255 features.
F

0.98931146572853856

In [865]:
# rfe.n_features_to_select

In [866]:
# rfe.support_

In [867]:
rfe_col_list = []
for col, keep in zip(X_new_training_data.columns, rfe.support_):
    if keep == True:
        rfe_col_list.append(col)
        print(col, 'kept!')
    else: 
        continue

1stFlrSF kept!
2ndFlrSF kept!
BedroomAbvGr kept!
BsmtFinSF1 kept!
BsmtFinSF2 kept!
BsmtFullBath kept!
BsmtUnfSF kept!
EnclosedPorch kept!
Fireplaces kept!
FullBath kept!
GarageArea kept!
GarageCars kept!
GrLivArea kept!
HalfBath kept!
Id kept!
LotArea kept!
LowQualFinSF kept!
MoSold kept!
OpenPorchSF kept!
OverallQual kept!
ScreenPorch kept!
TotRmsAbvGrd kept!
TotalBsmtSF kept!
WoodDeckSF kept!
YearBuilt kept!
YearRemod/Add kept!
YrSold kept!
BsmtCond_Fa kept!
BsmtCond_Gd kept!
BsmtCond_TA kept!
BsmtExposure_Gd kept!
BsmtExposure_No kept!
BsmtFinType1_GLQ kept!
BsmtFinType1_Rec kept!
BsmtQual_Gd kept!
BsmtQual_TA kept!
CentralAir_Y kept!
Condition1_Feedr kept!
Condition1_Norm kept!
ExterCond_Fa kept!
ExterCond_TA kept!
ExterQual_Gd kept!
ExterQual_TA kept!
Exterior1st_BrkFace kept!
Exterior1st_MetalSd kept!
Exterior1st_VinylSd kept!
Exterior1st_Wd Sdng kept!
Exterior2nd_HdBoard kept!
Exterior2nd_MetalSd kept!
Exterior2nd_VinylSd kept!
Exterior2nd_Wd Sdng kept!
FireplaceQu_Gd kept!
Foun

In [868]:
gb = GradientBoostingRegressor(learning_rate=0.05, max_depth=8, min_samples_leaf=8,\
                               min_samples_split=6, loss='ls', alpha=0.9)
gb.fit(X_new_training_data[rfe_col_list], y_new_training_data)
gb.score(X_new_training_data[rfe_col_list], y_new_training_data)


y_gb_train_predict = gb.predict(X_new_training_data[rfe_col_list])
y_gb_test_predict = gb.predict(X_new_test_data[rfe_col_list])

d = {'Id': X_new_test_data['Id'], 'SalePrice': y_gb_test_predict}
predicted_df= pd.DataFrame(d)
predicted_df.info()

predicted_df.to_csv('ac_gb_predictions.csv', index=False)

np.sqrt(mean_squared_error(y_new_training_data, y_gb_train_predict))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 879 entries, 0 to 878
Data columns (total 2 columns):
Id           879 non-null int64
SalePrice    879 non-null float64
dtypes: float64(1), int64(1)
memory usage: 20.6 KB


8192.187796094915

# Gradient Boost Ensemble Model 

In [253]:

gb = GradientBoostingRegressor()
gb_params = {
#     'n_estimators': [11000],
    'max_depth': [2,3,4,5,6,7,8,9,10],
    'max_features': [4,6,8,10,100,150, 200, 250, 'sqrt']
}
gb_model = GridSearchCV(gb, param_grid=gb_params, n_jobs=-1)
gb_model.fit(X_new_training_data, y_new_training_data)
print('best score', gb_model.best_score_)
print('best params', gb_model.best_params_)
print('test score', gb_model.score(X_new_training_data, y_new_training_data))

# y_gb_train_predict = GBest.predict(X_new_training_data)
# y_gb_test_predict = gb_model.predict(X_new_test_data)

# d = {'Id': X_new_test_data['Id'], 'SalePrice': y_gb_test_predict}
# predicted_df= pd.DataFrame(d)
# predicted_df.info()

# predicted_df.to_csv('ac_gb_predictions.csv', index=False)

# np.sqrt(mean_squared_error(y_new_training_data, y_gb_train_predict))

best score 0.915720964935
best params {'max_depth': 3, 'max_features': 100}
test score 0.960013365164


In [1164]:
# MSE of 26,195 - also submitted on Kaggle 
# GBest = GradientBoostingRegressor(n_estimators=8000, learning_rate=0.05, max_depth=4, max_features='sqrt',
#                                                min_samples_leaf=8, min_samples_split=8, loss='huber', alpha=0.95)

# MSE Of 28k but on the Train it was close to 2.3K 
# GBest = GradientBoostingRegressor(n_estimators=8000, learning_rate=0.05, max_depth=4, max_features=150,
#                                                min_samples_leaf=8, min_samples_split=8, loss='huber', alpha=0.99)

# MSE Of 28k, score of 0.9999, and score of 0.022 in train
# GBest = GradientBoostingRegressor(n_estimators=9000, learning_rate=0.05, max_depth=8, max_features=150,
#                                                min_samples_leaf=8, min_samples_split=6, loss='ls', alpha=0.9)


GBest = GradientBoostingRegressor(n_estimators=30000, learning_rate=0.05, max_depth=4, max_features='sqrt',
                                               min_samples_leaf=20, min_samples_split=15, loss='huber', alpha=0.8)


# GBest = GradientBoostingRegressor(n_estimators=30000, learning_rate=0.05,
#                                    max_depth=4, max_features='sqrt',
#                                    min_samples_leaf=15, min_samples_split=10, 
#                                    loss='huber')


In [1165]:
GBest.fit(X_new_training_data, y_new_training_data)

GradientBoostingRegressor(alpha=0.999, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='huber', max_depth=4,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=20,
             min_samples_split=15, min_weight_fraction_leaf=0.0,
             n_estimators=30000, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [1166]:
GBest.score(X_new_training_data, y_new_training_data)

0.99997474711539891

In [1167]:
y_gb_train_predict = GBest.predict(X_new_training_data)
y_gb_test_predict = GBest.predict(X_new_test_data)

d = {'Id': test_raw['Id'], 'SalePrice': y_gb_test_predict}
predicted_df= pd.DataFrame(d)
predicted_df.info()

predicted_df.to_csv('ac_gb_predictions.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 2 columns):
Id           879 non-null int64
SalePrice    879 non-null float64
dtypes: float64(1), int64(1)
memory usage: 13.8 KB


In [1168]:
np.sqrt(mean_squared_error(y_new_training_data, y_gb_train_predict))

398.19547403520789

In [1169]:
# Average of two models
# Final_labels = (np.exp(GBest.predict(X_new_test_data)) + np.exp(ensCV.predict(X_new_test_data))) / 2

In [1170]:
# pd.DataFrame({'Id': X_new_test_data['Id'], 'SalePrice': Final_labels}).to_csv('ac_combinded.csv', index =False)    

In [1163]:
# X_new_training_data.Id

In [766]:
# from sklearn.pipeline import make_pipeline
# pipeline = make_pipeline(StandardScaler(), gb)

In [767]:
# pipeline.fit(X_new_training_data, y_new_training_data)

In [768]:
# pipeline.score(X_new_training_data, y_new_training_data)

In [769]:
# pipeline.predict(X_new_training_data)

In [770]:
# np.sqrt(mean_squared_error(y_new_training_data, pipeline.predict(X_new_training_data)))

# Test

In [606]:
clf = RandomForestRegressor(n_estimators=8000, max_depth=4, n_jobs=-1)
clf.fit(X_new_training_data, y_new_training_data)
clf.score(X_new_training_data, y_new_training_data)

0.86807357339283664

In [607]:
y_rf_train_predict = clf.predict(X_new_training_data)
y_rf_test_predict = clf.predict(X_new_test_data)
np.sqrt(mean_squared_error(y_new_training_data, y_gb_train_predict))

4817.3763622222741

In [608]:
# rfe=RFECV(gb, n_jobs = -1, verbose=1, cv=5)