**Loading Required Libraries**

In [13]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x),'display.max_columns', None) #Limiting floats output to 3 decimal points

import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

**Data Ingestion**

In [83]:
df = pd.read_csv('train1.csv')

In [15]:
# pivot table for the missing values
total = df.isnull().sum().sort_values(ascending=False)
percentage = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percentage], axis=1, keys=['Total', 'Percentage'])
missing_data.head(20)

Unnamed: 0,Total,Percentage
PoolQC,1453,0.995
MiscFeature,1406,0.963
Alley,1369,0.938
Fence,1179,0.808
FireplaceQu,690,0.473
LotFrontage,238,0.163
GarageCond,81,0.055
GarageType,81,0.055
GarageYrBlt,81,0.055
GarageFinish,81,0.055


In [84]:
df1 = df.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu','LotFrontage','GarageCond','GarageType','GarageYrBlt','GarageFinish','GarageQual','BsmtExposure','BsmtFinType2','BsmtFinType1','BsmtCond','BsmtQual','MasVnrArea','MasVnrType'], axis=1)

In [85]:
corr_matrix = df1.corr()[['SalePrice']].sort_values(by = ['SalePrice'], ascending = False).drop(['SalePrice'])
corr_matrix.style.background_gradient(cmap = 'coolwarm').set_precision(2)


Unnamed: 0,SalePrice
OverallQual,0.79
GrLivArea,0.71
GarageCars,0.64
GarageArea,0.62
TotalBsmtSF,0.61
1stFlrSF,0.61
FullBath,0.56
TotRmsAbvGrd,0.53
YearBuilt,0.52
YearRemodAdd,0.51


In [86]:
df2 = df1.drop(['HalfBath','LotArea','BsmtFullBath','BsmtUnfSF','BedroomAbvGr','ScreenPorch','PoolArea','MoSold','3SsnPorch','BsmtFinSF2','BsmtHalfBath','MiscVal','LowQualFinSF','YrSold','OverallCond','MSSubClass','EnclosedPorch','KitchenAbvGr'], axis=1)

In [87]:
df2['Electrical'].fillna('None', inplace= True)

In [19]:
corr_matrix.shape

(34, 1)

In [21]:
df2.shape

(1460, 44)

In [22]:
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x),'display.max_columns', None)

In [23]:
df2.dtypes

MSZoning         object
Street           object
LotShape         object
LandContour      object
Utilities        object
LotConfig        object
LandSlope        object
Neighborhood     object
Condition1       object
Condition2       object
BldgType         object
HouseStyle       object
OverallQual       int64
YearBuilt         int64
YearRemodAdd      int64
RoofStyle        object
RoofMatl         object
Exterior1st      object
Exterior2nd      object
ExterQual        object
ExterCond        object
Foundation       object
BsmtFinSF1        int64
TotalBsmtSF       int64
Heating          object
HeatingQC        object
CentralAir       object
Electrical       object
1stFlrSF          int64
2ndFlrSF          int64
GrLivArea         int64
FullBath          int64
KitchenQual      object
TotRmsAbvGrd      int64
Functional       object
Fireplaces        int64
GarageCars        int64
GarageArea        int64
PavedDrive       object
WoodDeckSF        int64
OpenPorchSF       int64
SaleType        

#Feature Engineering and pipeline building

In [25]:
df2.select_dtypes(include=['int64']).columns.tolist()

['OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'FullBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'SalePrice']

In [88]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [89]:
numeric_features = ['OverallQual',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'FullBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF']
                    
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [90]:
categorical_features = ['MSZoning',
 'Street',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'PavedDrive',
 'SaleType',
 'SaleCondition']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [91]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [92]:
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LinearRegression())])

In [93]:
X = df2.drop('SalePrice', axis=1)
y = df2[['SalePrice']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1168, 44)
(292, 44)
(1168, 1)
(292, 1)


In [94]:
#Get the missing variables and their corresponding number of missing values
missing_columns = len(df2) - df2.loc[:, np.sum(df2.isnull())>0].count()
missing_columns

Series([], dtype: int64)

In [96]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['OverallQual', 'YearBuilt',
                                                   'YearRemodAdd', 'BsmtFinSF1',
                             

In [97]:
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [98]:
y_test_pred

array([[152597.35699993],
       [333576.14297397],
       [ 83517.12614239],
       [179307.19587296],
       [310152.542026  ],
       [ 56934.72386716],
       [230220.20273743],
       [151069.3730126 ],
       [ 46849.04896895],
       [122332.08104256],
       [139657.31533864],
       [105849.52910787],
       [ 68686.15704434],
       [229220.34916945],
       [174773.16737339],
       [130430.1335991 ],
       [193643.36669056],
       [129245.61737188],
       [122681.1161967 ],
       [218687.63034025],
       [166382.75485474],
       [210388.80967387],
       [165946.88595149],
       [120813.06670166],
       [204721.78146644],
       [132598.20633663],
       [197650.07400279],
       [ 95495.00961213],
       [173322.16964703],
       [193595.64519264],
       [137338.34634519],
       [286904.80182448],
       [254006.2760187 ],
       [ 97444.55731217],
       [235689.20531022],
       [158013.93200179],
       [141801.39201566],
       [207490.43139025],
       [3229

In [101]:
submission = pd.DataFrame()
submission['Id']= X_test.Id
submission['SalePrice'] = y_test_pred

In [102]:
submission

Unnamed: 0,Id,SalePrice
892,893,152597.357
1105,1106,333576.143
413,414,83517.126
522,523,179307.196
1036,1037,310152.542
...,...,...
479,480,144370.360
1361,1362,282383.147
802,803,185014.983
651,652,101683.016


In [43]:

clf.score(X_train, y_train)

0.9151732030274681

In [44]:

clf.score(X_test, y_test)

0.8694631181264455

In [76]:
import numpy as np

In [78]:
y_pred = np.expm1(clf.predict(X_test))

  """Entry point for launching an IPython kernel.


In [53]:
print(X_test, y_test)

     MSZoning Street LotShape LandContour Utilities LotConfig LandSlope  \
892        RL   Pave      Reg         Lvl    AllPub    Inside       Gtl   
1105       RL   Pave      IR1         Lvl    AllPub    Corner       Gtl   
413        RM   Pave      Reg         Lvl    AllPub    Inside       Gtl   
522        RM   Pave      Reg         Lvl    AllPub    Corner       Gtl   
1036       RL   Pave      IR1         HLS    AllPub    Inside       Gtl   
...       ...    ...      ...         ...       ...       ...       ...   
479        RM   Pave      Reg         Bnk    AllPub    Inside       Gtl   
1361       RL   Pave      IR1         Low    AllPub    Inside       Mod   
802        RL   Pave      Reg         Lvl    AllPub    Inside       Gtl   
651        RL   Pave      Reg         Lvl    AllPub    Inside       Gtl   
722        RL   Pave      Reg         Lvl    AllPub    Inside       Gtl   

     Neighborhood Condition1 Condition2 BldgType HouseStyle  OverallQual  \
892        Sawyer      

In [48]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer, accuracy_score
from math import sqrt

In [49]:
scorer = make_scorer(mean_squared_error, greater_is_better = False)
rmse_train = np.sqrt(-cross_val_score(clf, X_train, y_train, scoring = scorer, cv=10))
rmse_test = np.sqrt(-cross_val_score(clf, X_test, y_test, scoring = scorer, cv=10))
print ('Mean RMSE for training set is',rmse_train.mean())
print ('Mean RMSE for the test set is',rmse_test.mean())

Mean RMSE for training set is 33327.76569658233
Mean RMSE for the test set is 45397.432229284044


In [51]:
import pickle

pickle.dump(clf, open('lr_model.pkl','wb'))
# Loading model to compare the results
model = pickle.load(open('lr_model.pkl','rb'))
print(model)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['OverallQual', 'YearBuilt',
                                                   'YearRemodAdd', 'BsmtFinSF1',
                             

In [65]:
df_test = pd.read_csv('test1.csv')

In [66]:
df_t = df_test.drop(['PoolQC','MiscFeature','Alley','Fence','FireplaceQu','LotFrontage','GarageCond','GarageType','GarageYrBlt','GarageFinish','GarageQual','BsmtExposure','BsmtFinType2','BsmtFinType1','BsmtCond','BsmtQual','MasVnrArea','MasVnrType',
                     'HalfBath','LotArea','BsmtFullBath','BsmtUnfSF','BedroomAbvGr','ScreenPorch','PoolArea','MoSold','3SsnPorch','BsmtFinSF2','BsmtHalfBath','MiscVal','Id','LowQualFinSF','YrSold','OverallCond','MSSubClass','EnclosedPorch','KitchenAbvGr'], axis=1)

In [67]:
#Get the missing variables and their corresponding number of missing values
missing_columns_t = len(df_t) - df_t.loc[:, np.sum(df_t.isnull())>0].count()
missing_columns_t

MSZoning       4
Utilities      2
Exterior1st    1
Exterior2nd    1
BsmtFinSF1     1
TotalBsmtSF    1
KitchenQual    1
Functional     2
GarageCars     1
GarageArea     1
SaleType       1
dtype: int64

In [72]:
to_impute_by_none = df_t.loc[:, ['MSZoning','Utilities','Exterior1st','Exterior2nd','BsmtFinSF1','TotalBsmtSF','KitchenQual','Functional','GarageCars','GarageArea']]
for i in to_impute_by_none.columns:
    df_t[i].fillna('None', inplace = True)

In [103]:
def predict_with_optimized_models(mdl):
    mdl.fit(X_train, y_train)
    y_pred = np.expm1(mdl.predict(X_test))
    submission = pd.DataFrame()
    submission['Id']= X_test.Id
    submission['SalePrice'] = y_test_pred
    return submission

In [104]:
predict_with_optimized_models(clf).to_csv('lr_reg.csv', index = False)
# predict_with_optimized_models(ridge_opt).to_csv('ridge_optimized.csv', index = False)

  This is separate from the ipykernel package so we can avoid doing imports until
