# Regression Project - Prediction of Sales Price for a given House

## 1. Reading Dataset CSV file

In [1]:
from warnings import filterwarnings
filterwarnings ('ignore')

In [111]:
import pandas as pd
df = pd.read_csv('training_set.csv',na_values=[''])
df = df.drop(columns='Id')
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## 2. Basic Data Quality check

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSSubClass     1460 non-null   int64 
 1   MSZoning       1460 non-null   object
 2   LotFrontage    1460 non-null   object
 3   LotArea        1460 non-null   int64 
 4   Street         1460 non-null   object
 5   Alley          1460 non-null   object
 6   LotShape       1460 non-null   object
 7   LandContour    1460 non-null   object
 8   Utilities      1460 non-null   object
 9   LotConfig      1460 non-null   object
 10  LandSlope      1460 non-null   object
 11  Neighborhood   1460 non-null   object
 12  Condition1     1460 non-null   object
 13  Condition2     1460 non-null   object
 14  BldgType       1460 non-null   object
 15  HouseStyle     1460 non-null   object
 16  OverallQual    1460 non-null   int64 
 17  OverallCond    1460 non-null   int64 
 18  YearBuilt      1460 non-null

In [4]:
m = df.isna().sum()
m[m>0]

Series([], dtype: int64)

In [5]:
df.duplicated().sum()

0

## 3. Separate X and Y features

In [6]:
X = df.drop(columns=['SalePrice'])
Y = df[['SalePrice']]

In [7]:
X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


In [8]:
Y.head()

Unnamed: 0,SalePrice
0,208500
1,181500
2,223500
3,140000
4,250000


## 4. Exploratory Data Analysis - Univariate & Bivariate

### Separate Categorical & Continuous features for EDA

In [9]:
catA = list(df.columns[df.dtypes=='object'])
conA = list(df.columns[df.dtypes!='object'])

In [10]:
catA

['MSZoning',
 'LotFrontage',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [11]:
conA

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

### Univariate Analysis

# Descriptive Analysis of Categorical features
df[catA].describe().T


# Descriptive Analysis of Continuous features
df[conA].describe().T

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl


# Plotting Bar Chart/Count Plot for catA

for i in catA:
    df[i].value_counts().plot(kind='bar',
                              title=f'Count Plot for {i}',
                              figsize=(16,8))
    plt.show()

## 5. Separating Categorical & Continuous features for MLR

In [13]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [14]:
cat

['MSZoning',
 'LotFrontage',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [15]:
con

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

## 5. Data Preprocessing on X (independent features) - Feature Selection Pipelines 

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [17]:
num_pipe1 = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                            ('scaler', StandardScaler())])

In [18]:
cat_pipe1 = Pipeline(steps=[('impute', SimpleImputer(strategy='constant', fill_value='NotAvail')),
                            ('ordinal', OrdinalEncoder())])

In [19]:
pre1 = ColumnTransformer([('num', num_pipe1, con),
                          ('cat', cat_pipe1, cat)]).set_output(transform='pandas')

In [20]:
pre1

In [21]:
X_pre = pre1.fit_transform(X)
X_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__YearRemodAdd,num__BsmtFinSF1,num__BsmtFinSF2,num__BsmtUnfSF,num__TotalBsmtSF,...,cat__GarageYrBlt,cat__GarageFinish,cat__GarageQual,cat__GarageCond,cat__PavedDrive,cat__PoolQC,cat__Fence,cat__MiscFeature,cat__SaleType,cat__SaleCondition
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.878668,0.575425,-0.288653,-0.944591,-0.459303,...,89.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,-0.429577,1.171992,-0.288653,-0.641228,0.466465,...,62.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.830215,0.092907,-0.288653,-0.301643,-0.313369,...,87.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.720298,-0.499274,-0.288653,-0.06167,-0.687324,...,84.0,3.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,0.733308,0.463568,-0.288653,-0.174865,0.19968,...,86.0,2.0,5.0,5.0,2.0,3.0,4.0,1.0,8.0,4.0


## 5(i). Forward Feature Selection Pipeline

#### Categorical Features --> Ordinal Encoding

In [22]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector

In [23]:
flr = LinearRegression() #forward selection
fsel = SequentialFeatureSelector(flr, direction='forward')
fsel.fit_transform(X_pre, Y)
fsel_cols = fsel.get_feature_names_out()
fsel_cols

array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
       'num__OverallCond', 'num__YearBuilt', 'num__BsmtFinSF1',
       'num__GrLivArea', 'num__BsmtFullBath', 'num__Fireplaces',
       'num__GarageCars', 'num__WoodDeckSF', 'num__OpenPorchSF',
       'num__ScreenPorch', 'num__PoolArea', 'num__YrSold',
       'cat__LotFrontage', 'cat__Street', 'cat__LandContour',
       'cat__Utilities', 'cat__Neighborhood', 'cat__BldgType',
       'cat__HouseStyle', 'cat__RoofStyle', 'cat__RoofMatl',
       'cat__Exterior1st', 'cat__ExterQual', 'cat__BsmtQual',
       'cat__BsmtCond', 'cat__BsmtExposure', 'cat__HeatingQC',
       'cat__Electrical', 'cat__KitchenQual', 'cat__Functional',
       'cat__FireplaceQu', 'cat__GarageYrBlt', 'cat__GarageCond',
       'cat__PavedDrive', 'cat__Fence', 'cat__MiscFeature'], dtype=object)

In [24]:
# getting original column names per dataframe
imp_cols_f = []
for i in fsel_cols:
    s = i.split('__')[1]
    imp_cols_f.append(s)

imp_cols_f 

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'BsmtFinSF1',
 'GrLivArea',
 'BsmtFullBath',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'ScreenPorch',
 'PoolArea',
 'YrSold',
 'LotFrontage',
 'Street',
 'LandContour',
 'Utilities',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageYrBlt',
 'GarageCond',
 'PavedDrive',
 'Fence',
 'MiscFeature']

#### Converting the above to Dataframe

In [25]:
X_fsel = X[imp_cols_f]
X_fsel.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,BsmtFinSF1,GrLivArea,BsmtFullBath,Fireplaces,GarageCars,...,HeatingQC,Electrical,KitchenQual,Functional,FireplaceQu,GarageYrBlt,GarageCond,PavedDrive,Fence,MiscFeature
0,60,8450,7,5,2003,706,1710,1,0,2,...,Ex,SBrkr,Gd,Typ,,2003,TA,Y,,
1,20,9600,6,8,1976,978,1262,0,1,2,...,Ex,SBrkr,TA,Typ,TA,1976,TA,Y,,
2,60,11250,7,5,2001,486,1786,1,1,2,...,Ex,SBrkr,Gd,Typ,TA,2001,TA,Y,,
3,70,9550,7,5,1915,216,1717,1,1,3,...,Gd,SBrkr,Gd,Typ,Gd,1998,TA,Y,,
4,60,14260,8,5,2000,655,2198,1,1,3,...,Ex,SBrkr,Gd,Typ,TA,2000,TA,Y,,


In [26]:
X_fsel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 39 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   MSSubClass    1460 non-null   int64 
 1   LotArea       1460 non-null   int64 
 2   OverallQual   1460 non-null   int64 
 3   OverallCond   1460 non-null   int64 
 4   YearBuilt     1460 non-null   int64 
 5   BsmtFinSF1    1460 non-null   int64 
 6   GrLivArea     1460 non-null   int64 
 7   BsmtFullBath  1460 non-null   int64 
 8   Fireplaces    1460 non-null   int64 
 9   GarageCars    1460 non-null   int64 
 10  WoodDeckSF    1460 non-null   int64 
 11  OpenPorchSF   1460 non-null   int64 
 12  ScreenPorch   1460 non-null   int64 
 13  PoolArea      1460 non-null   int64 
 14  YrSold        1460 non-null   int64 
 15  LotFrontage   1460 non-null   object
 16  Street        1460 non-null   object
 17  LandContour   1460 non-null   object
 18  Utilities     1460 non-null   object
 19  Neighb

#### Separating Categorical and Continuous features from X_fsel 

In [27]:
cat_fsel = list(X_fsel.columns[X_fsel.dtypes=='object'])
con_fsel = list(X_fsel.columns[X_fsel.dtypes!='object'])

In [28]:
cat_fsel

['LotFrontage',
 'Street',
 'LandContour',
 'Utilities',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageYrBlt',
 'GarageCond',
 'PavedDrive',
 'Fence',
 'MiscFeature']

In [29]:
con_fsel

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'BsmtFinSF1',
 'GrLivArea',
 'BsmtFullBath',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'ScreenPorch',
 'PoolArea',
 'YrSold']

## 5(ii). Final Pipeline on X_fsel for Model Building

#### Categorical Features --> One Hot Encoding

In [30]:
from sklearn.preprocessing import OneHotEncoder

In [31]:
num_pipe2 = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                            ('scaler', StandardScaler())])

In [32]:
cat_pipe2 = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')),
                            ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

In [33]:
pre2 = ColumnTransformer([('num', num_pipe2, con_fsel),
                          ('cat', cat_pipe2, cat_fsel)]).set_output(transform='pandas')
pre2

In [34]:
X_fsel_pre = pre2.fit_transform(X_fsel)
X_fsel_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__BsmtFinSF1,num__GrLivArea,num__BsmtFullBath,num__Fireplaces,num__GarageCars,...,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__Fence_NA,cat__MiscFeature_Gar2,cat__MiscFeature_NA,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.575425,0.370333,1.10781,-0.951226,0.311725,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,1.171992,-0.482512,-0.819964,0.600495,0.311725,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.092907,0.515013,1.10781,0.600495,0.311725,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.499274,0.383659,1.10781,0.600495,1.650307,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,0.463568,1.299326,1.10781,0.600495,1.650307,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


## 5(iii). Backward-Elimination Feature Selection

#### note: scikit packages already imported in Forward Feature Selection steps

In [35]:
blr = LinearRegression() #backward-elimination feature selection
bsel = SequentialFeatureSelector(flr, direction='backward')
bsel.fit_transform(X_pre, Y)
bsel_cols = bsel.get_feature_names_out()
bsel_cols

array(['num__MSSubClass', 'num__LotArea', 'num__OverallQual',
       'num__OverallCond', 'num__YearBuilt', 'num__BsmtFinSF1',
       'num__BsmtUnfSF', 'num__TotalBsmtSF', 'num__1stFlrSF',
       'num__2ndFlrSF', 'num__LowQualFinSF', 'num__GrLivArea',
       'num__BsmtHalfBath', 'num__TotRmsAbvGrd', 'num__Fireplaces',
       'num__GarageCars', 'num__WoodDeckSF', 'num__OpenPorchSF',
       'num__ScreenPorch', 'num__PoolArea', 'num__YrSold',
       'cat__LotFrontage', 'cat__LandContour', 'cat__Utilities',
       'cat__LandSlope', 'cat__Neighborhood', 'cat__BldgType',
       'cat__HouseStyle', 'cat__RoofStyle', 'cat__RoofMatl',
       'cat__ExterQual', 'cat__BsmtQual', 'cat__BsmtCond',
       'cat__BsmtExposure', 'cat__HeatingQC', 'cat__KitchenQual',
       'cat__Functional', 'cat__FireplaceQu', 'cat__GarageYrBlt',
       'cat__MiscFeature'], dtype=object)

In [36]:
# getting original column names per dataframe
imp_cols_b = []
for i in bsel_cols:
    s = i.split('__')[1]
    imp_cols_b.append(s)

imp_cols_b

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'BsmtFinSF1',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtHalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'ScreenPorch',
 'PoolArea',
 'YrSold',
 'LotFrontage',
 'LandContour',
 'Utilities',
 'LandSlope',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageYrBlt',
 'MiscFeature']

#### Converting the above to Dataframe

In [37]:
X_bsel = X[imp_cols_b]
X_bsel.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,ExterQual,BsmtQual,BsmtCond,BsmtExposure,HeatingQC,KitchenQual,Functional,FireplaceQu,GarageYrBlt,MiscFeature
0,60,8450,7,5,2003,706,150,856,856,854,...,Gd,Gd,TA,No,Ex,Gd,Typ,,2003,
1,20,9600,6,8,1976,978,284,1262,1262,0,...,TA,Gd,TA,Gd,Ex,TA,Typ,TA,1976,
2,60,11250,7,5,2001,486,434,920,920,866,...,Gd,Gd,TA,Mn,Ex,Gd,Typ,TA,2001,
3,70,9550,7,5,1915,216,540,756,961,756,...,TA,TA,Gd,No,Gd,Gd,Typ,Gd,1998,
4,60,14260,8,5,2000,655,490,1145,1145,1053,...,Gd,Gd,TA,Av,Ex,Gd,Typ,TA,2000,


#### Separating Categorical and Continuous features from X_bsel 

In [38]:
cat_bsel = list(X_bsel.columns[X_bsel.dtypes=='object'])
con_bsel = list(X_bsel.columns[X_bsel.dtypes!='object'])

In [39]:
cat_bsel

['LotFrontage',
 'LandContour',
 'Utilities',
 'LandSlope',
 'Neighborhood',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'ExterQual',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'HeatingQC',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageYrBlt',
 'MiscFeature']

In [40]:
con_bsel

['MSSubClass',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'BsmtFinSF1',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtHalfBath',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'WoodDeckSF',
 'OpenPorchSF',
 'ScreenPorch',
 'PoolArea',
 'YrSold']

## 5(iV). Final Pipeline on X_bsel for Model Building

#### Categorical Features --> One Hot Encoding

In [41]:
num_pipe3 = Pipeline(steps=[('impute', SimpleImputer(strategy='mean')),
                            ('scaler', StandardScaler())])

In [42]:
cat_pipe3 = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                            ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

In [43]:
pre3 = ColumnTransformer([('num', num_pipe3, con_bsel),
                          ('cat', cat_pipe3, cat_bsel)]).set_output(transform='pandas')
pre3

In [44]:
X_bsel_pre = pre3.fit_transform(X_bsel)
X_bsel_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__BsmtFinSF1,num__BsmtUnfSF,num__TotalBsmtSF,num__1stFlrSF,num__2ndFlrSF,...,cat__GarageYrBlt_2007,cat__GarageYrBlt_2008,cat__GarageYrBlt_2009,cat__GarageYrBlt_2010,cat__GarageYrBlt_NA,cat__MiscFeature_Gar2,cat__MiscFeature_NA,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
0,0.073375,-0.207142,0.651479,-0.5172,1.050994,0.575425,-0.944591,-0.459303,-0.793434,1.161852,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.872563,-0.091886,-0.071836,2.179628,0.156734,1.171992,-0.641228,0.466465,0.25714,-0.795163,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.073375,0.07348,0.651479,-0.5172,0.984752,0.092907,-0.301643,-0.313369,-0.627826,1.189351,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.309859,-0.096897,0.651479,-0.5172,-1.863632,-0.499274,-0.06167,-0.687324,-0.521734,0.937276,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.073375,0.375148,1.374795,-0.5172,0.951632,0.463568,-0.174865,0.19968,-0.045611,1.617877,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## 6. Train-Test Split - Feature Selection

In [45]:
from sklearn.model_selection import train_test_split

### 6(i) Forward Feature Selection

In [46]:
xftrain, xftest, yftrain, yftest = train_test_split(X_fsel_pre, Y, test_size=0.2, random_state=21)

In [47]:
xftrain.shape

(1168, 365)

In [48]:
xftest.shape

(292, 365)

### 6(ii) Backward-Elimination Feature Selection

In [49]:
xbtrain, xbtest, ybtrain, ybtest = train_test_split(X_bsel_pre, Y, test_size=0.2, random_state=21) 

In [50]:
xbtrain.shape

(1168, 337)

In [51]:
xbtest.shape

(292, 337)

In [52]:
xbtrain.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__BsmtFinSF1,num__BsmtUnfSF,num__TotalBsmtSF,num__1stFlrSF,num__2ndFlrSF,...,cat__GarageYrBlt_2007,cat__GarageYrBlt_2008,cat__GarageYrBlt_2009,cat__GarageYrBlt_2010,cat__GarageYrBlt_NA,cat__MiscFeature_Gar2,cat__MiscFeature_NA,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
710,-0.636078,-0.640101,-2.241782,0.381743,-1.201217,-0.973018,-0.672923,-1.795509,-1.122062,-0.795163,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1098,-0.163109,-0.452686,-1.518467,0.381743,-1.168096,0.500854,-1.284176,-0.878862,-1.049608,0.504166,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1286,-0.872563,-0.072844,-0.071836,-0.5172,-0.273836,0.274948,0.250749,0.616959,0.427923,-0.795163,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
992,0.073375,-0.075851,-0.071836,2.179628,-0.240715,0.20257,-0.901577,-0.53683,-0.369064,1.109145,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
631,1.492282,-0.593999,1.374795,-0.5172,1.150356,-0.92038,2.179592,1.132288,1.012726,-0.795163,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [53]:
ybtrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [54]:
xbtest.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__BsmtFinSF1,num__BsmtUnfSF,num__TotalBsmtSF,num__1stFlrSF,num__2ndFlrSF,...,cat__GarageYrBlt_2007,cat__GarageYrBlt_2008,cat__GarageYrBlt_2009,cat__GarageYrBlt_2010,cat__GarageYrBlt_NA,cat__MiscFeature_Gar2,cat__MiscFeature_NA,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
880,-0.872563,-0.350058,-0.795151,-0.5172,1.117235,1.176379,-1.035147,0.074268,-0.18793,-0.795163,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
605,0.073375,0.309002,0.651479,0.381743,-0.207594,0.022723,-0.573311,-0.659961,0.060481,1.038106,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1166,-0.872563,-0.004192,1.374795,-0.5172,1.216598,-0.973018,2.550871,1.451518,1.374993,-0.795163,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
216,-0.872563,-0.207142,0.651479,-0.5172,1.084115,1.101808,-0.174865,0.863222,0.707387,-0.795163,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
970,-0.163109,0.02838,-1.518467,-1.416142,-0.737526,-0.973018,0.345832,-0.769412,-1.14535,0.286466,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [55]:
ybtest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


## 7. Evaluate Feature Selection Model

In [56]:
from sklearn.linear_model import LinearRegression

### Forward Selection - Linear Regression Model

In [57]:
modelf = LinearRegression()
modelf.fit(xftrain, yftrain)

In [58]:
modelf.score(xftrain, yftrain)

0.9485773154793286

In [59]:
modelf.score(xftest, yftest)

-3.929063325723063e+18

### Backward-Elimination Selection - Linear Regression Model

In [60]:
modelb = LinearRegression()
modelb.fit(xbtrain, ybtrain)

In [61]:
modelb.score(xbtrain, ybtrain)

0.948399160066343

In [62]:
modelb.score(xbtest, ybtest)

-4.147866448519663e+16

## Conclusion on Feature Selection Methods

#### In both the methods the train score is good test score is not good i.e. the data is overfitting
#### For this reason, we will perform regularization on the data 

## 8. Train-Test Split - Regularization

In [63]:
xtrain, xtest, ytrain, ytest = train_test_split(X_pre, Y, test_size=0.2, random_state=21)

In [64]:
xtrain.shape

(1168, 79)

In [65]:
xtest.shape

(292, 79)

### 8(i) Creating Linear Regression Model for comparison

In [66]:
model = LinearRegression()
model.fit(xtrain, ytrain)

In [67]:
model.score(xtrain, ytrain)

0.8598554595135445

In [68]:
model.score(xtest,ytest)

0.7969754641408158

### 8(ii) Ridge/L2 Regularization

In [69]:
from sklearn.linear_model import Ridge
modelr = Ridge(alpha=2)
modelr.fit(xtrain, ytrain)

In [70]:
modelr.score(xtrain, ytrain)

0.859274139671927

In [71]:
modelr.score(xtest, ytest)

0.7979950392463381

#### Hyperparameter tuning - Alpha tuning

In [72]:
import numpy as np 
params = {'alpha' : np.arange(1, 1000, 1)}
params

{'alpha': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
        170, 171, 172, 173, 174, 175, 176,

In [73]:
from sklearn.model_selection import GridSearchCV
rr = Ridge()
rgscv = GridSearchCV(estimator=rr, param_grid=params, cv=5, scoring='neg_mean_squared_error')
rgscv.fit(xtrain, ytrain)

In [74]:
rgscv.best_params_

{'alpha': 354}

In [75]:
rgscv.best_score_

-1203643793.8880317

In [76]:
best_ridge = rgscv.best_estimator_
best_ridge

### 8(iii) Lasso/L1 Regularization

In [77]:
from sklearn.linear_model import Lasso
modell = Lasso(alpha=2)
modell.fit(xtrain, ytrain)

In [78]:
modell.score(xtrain, ytrain)

0.8598540245069991

In [79]:
modell.score(xtest, ytest)

0.7970410461978703

#### Hyperparameter tuning - Alpha tuning

In [80]:
from sklearn.model_selection import GridSearchCV
ls = Lasso()
lgscv = GridSearchCV(estimator=ls, param_grid=params, cv=5, scoring='neg_mean_squared_error')
lgscv.fit(xtrain, ytrain)

In [81]:
lgscv.best_params_

{'alpha': 999}

In [82]:
lgscv.best_score_

-1252345048.6810582

In [83]:
best_lasso = lgscv.best_estimator_
best_lasso

## 9. Evaluate Ridge & Lasso Model

### 9(i) Ridge Model Evaluation

In [84]:
best_ridge.score(xtrain, ytrain)

0.8440716952440149

In [85]:
best_ridge.score(xtest, ytest)

0.8031487397137105

#### Cross Validate R2 for above model

In [86]:
from sklearn.model_selection import cross_val_score
rscores = cross_val_score(best_ridge, xtrain, ytrain, cv=5, scoring='r2')
rscores.mean()

0.8082586916941

### 9(ii) Lasso Model Evaluation

In [87]:
best_lasso.score(xtrain, ytrain)

0.8460497858586942

In [88]:
best_lasso.score(xtest, ytest)

0.8015680271701066

#### Cross Validate R2 for above model

In [89]:
from sklearn.model_selection import cross_val_score
lscored = cross_val_score(best_lasso, xtrain, ytrain, cv=5, scoring='neg_mean_absolute_error')

## 10. Model Prediction for Train-Test split

### 10(i) Ridge - Model Prediction 

In [90]:
rypred_train = best_ridge.predict(xtrain)
rypred_test = best_ridge.predict(xtest)

In [91]:
rypred_train[0:5]

array([[ 47736.4225102 ],
       [ 97603.01613896],
       [171486.57546754],
       [215968.23810227],
       [215943.38014977]])

In [92]:
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [93]:
rypred_test[0:5]

array([[165323.04836907],
       [206876.63930512],
       [254170.18225604],
       [217740.16949917],
       [ 83279.29228823]])

In [94]:
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


### 10(ii) Lasso - Model Prediction 

In [95]:
lypred_train = best_ridge.predict(xtrain)
lypred_test = best_ridge.predict(xtest)

In [96]:
lypred_train[0:5]

array([[ 47736.4225102 ],
       [ 97603.01613896],
       [171486.57546754],
       [215968.23810227],
       [215943.38014977]])

In [97]:
ytrain.head()

Unnamed: 0,SalePrice
710,52000
1098,128000
1286,143000
992,187000
631,209500


In [98]:
lypred_test[0:5]

array([[165323.04836907],
       [206876.63930512],
       [254170.18225604],
       [217740.16949917],
       [ 83279.29228823]])

In [99]:
ytest.head()

Unnamed: 0,SalePrice
880,157000
605,205000
1166,245350
216,210000
970,135000


## 11. Evaluate model with MSE, RMSE, MAE, R2

In [100]:
def evaluate_model(xtrain, ytrain, xtest, ytest, model):

    # R2 scores
    r2_train = model.score(xtrain, ytrain)
    r2_test = model.score(xtest, ytest)

    # Get prediction on train and test
    ypred_train = model.predict(xtrain)
    ypred_test = model.predict(xtest)

    # Getting mse, rmse, mae
    from sklearn.metrics import mean_squared_error ,mean_absolute_error
    mse_train = mean_squared_error(ytrain, ypred_train)
    mse_test = mean_squared_error(ytest, ypred_test)
    rmse_train = mse_train**(1/2)
    rmse_test = mse_test**(1/2)
    mae_train = mean_absolute_error(ytrain, ypred_train)
    mae_test = mean_absolute_error(ytest, ypred_test)

    # Print above results
    print('Training Evaluation :\n')
    print(f'MSE : {mse_train:.2f}')
    print(f'RMSE : {rmse_train:.2f}')
    print(f'MAE : {mae_train:.2f}')
    print(f'R2 : {r2_train:.2f}')
    print('\n===========================================\n')
    print('Testing Evaluation :\n')
    print(f'MSE : {mse_test:.2f}')
    print(f'RMSE : {rmse_test:.2f}')
    print(f'MAE : {mae_test:.2f}')
    print(f'R2 : {r2_test:.4f}')

In [101]:
# Ridge Regularization Model
evaluate_model(xtrain, ytrain, xtest, ytest, modelr)

Training Evaluation :

MSE : 870941868.50
RMSE : 29511.72
MAE : 18776.14
R2 : 0.86


Testing Evaluation :

MSE : 1369050641.16
RMSE : 37000.68
MAE : 20522.18
R2 : 0.7980


In [102]:
# Lasso Regularization Model
evaluate_model(xtrain, ytrain, xtest, ytest, modell)

Training Evaluation :

MSE : 867353004.45
RMSE : 29450.86
MAE : 18799.19
R2 : 0.86


Testing Evaluation :

MSE : 1375516149.68
RMSE : 37087.95
MAE : 20576.65
R2 : 0.7970


In [103]:
# Forward Feature Selection
evaluate_model(xftrain, yftrain, xftest, yftest, modelf)

Training Evaluation :

MSE : 318251164.61
RMSE : 17839.60
MAE : 12488.18
R2 : 0.95


Testing Evaluation :

MSE : 26628487959731039403761467392.00
RMSE : 163182376376038.34
MAE : 14430255260222.41
R2 : -3929063325723062784.0000


In [104]:
# Backward-Elimination Feature Selection
evaluate_model(xbtrain, ybtrain, xbtest, ybtest, modelb)

Training Evaluation :

MSE : 319353755.19
RMSE : 17870.47
MAE : 12609.48
R2 : 0.95


Testing Evaluation :

MSE : 281113849857973167702671360.00
RMSE : 16766450126904.42
MAE : 1823951636352.72
R2 : -41478664485196632.0000


## 12. Out of Sample data prediction

In [109]:
xnew = pd.read_csv('sample_set.csv', na_values=[''])
xnew = xnew.drop(columns='Id')
xnew.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal


### Apply pre2.transform on xnew

In [113]:
xnew_pre = pre2.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__MSSubClass,num__LotArea,num__OverallQual,num__OverallCond,num__YearBuilt,num__BsmtFinSF1,num__GrLivArea,num__BsmtFullBath,num__Fireplaces,num__GarageCars,...,cat__Fence_GdPrv,cat__Fence_GdWo,cat__Fence_MnPrv,cat__Fence_MnWw,cat__Fence_NA,cat__MiscFeature_Gar2,cat__MiscFeature_NA,cat__MiscFeature_Othr,cat__MiscFeature_Shed,cat__MiscFeature_TenC
0,-0.872563,0.110763,-0.795151,0.381743,-0.340077,0.053428,-1.179256,-0.819964,-0.951226,-1.026858,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.872563,0.37585,-0.071836,0.381743,-0.43944,1.051363,-0.354966,-0.819964,-0.951226,-1.026858,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.073375,0.332053,-0.795151,-0.5172,0.852269,0.761852,0.216136,-0.819964,0.600495,0.311725,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.073375,-0.054002,-0.071836,0.381743,0.88539,0.347326,0.168544,-0.819964,0.600495,0.311725,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,1.492282,-0.552407,1.374795,-0.5172,0.686666,-0.39619,-0.448246,-0.819964,-0.951226,0.311725,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


### Predict the result for above by applying Ridge Model

In [115]:
preds = best_ridge.predict(xnew_pre)
preds

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- cat__BldgType_1Fam
- cat__BldgType_2fmCon
- cat__BldgType_Duplex
- cat__BldgType_Twnhs
- cat__BldgType_TwnhsE
- ...
Feature names seen at fit time, yet now missing:
- cat__Alley
- cat__BldgType
- cat__BsmtCond
- cat__BsmtExposure
- cat__BsmtFinType1
- ...
