# Analysis of 'HousePrice' DataSet for Prediction using Ridge and Lasso Regression

In [114]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("HousePrice.csv")
df


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


<h3> Preprocessing the Dataset

In [115]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

<h3>Drop Attributes with High NAN values

In [116]:
# Drop attributes with NAN values greater than 200
null_feat = ["Alley","PoolQC","Fence","MiscFeature","FireplaceQu","LotFrontage"]
df.drop(null_feat,axis=1,inplace=True)


In [117]:
# Delete records with NAN values
df.dropna(inplace=True)

In [118]:
df.shape

(1338, 75)

In [119]:
# Split the Dataset considering predictors and target variable
x = df.iloc[:,:-1]
y = df["SalePrice"]


In [120]:
# List the Numeric and Categoric columns
numeric_col = list(x.select_dtypes(include=[np.number]).columns)
categorical_col = list(x.select_dtypes(exclude=[np.number]).columns)

print(numeric_col)
print(categorical_col)


['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'Sa

<h3>Encoding the Categoric features

In [121]:
# Applying the LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='ignore')
le = LabelEncoder()

for i in categorical_col:
    x[i] = le.fit_transform(x[i])


In [122]:
x

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,3,8450,1,3,3,0,4,0,...,61,0,0,0,0,0,2,2008,8,4
1,2,20,3,9600,1,3,3,0,2,0,...,0,0,0,0,0,0,5,2007,8,4
2,3,60,3,11250,1,0,3,0,4,0,...,42,0,0,0,0,0,9,2008,8,4
3,4,70,3,9550,1,0,3,0,0,0,...,35,272,0,0,0,0,2,2006,8,0
4,5,60,3,14260,1,0,3,0,2,0,...,84,0,0,0,0,0,12,2008,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,7917,1,3,3,0,4,0,...,40,0,0,0,0,0,8,2007,8,4
1456,1457,20,3,13175,1,3,3,0,4,0,...,0,0,0,0,0,0,2,2010,8,4
1457,1458,70,3,9042,1,3,3,0,4,0,...,60,0,0,0,0,2500,5,2010,8,4
1458,1459,20,3,9717,1,3,3,0,4,0,...,0,112,0,0,0,0,4,2010,8,4


<h3>Scaling the Numeric features

In [123]:
# Applying the StandardScaler
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

x[numeric_col] = ss.fit_transform(x[numeric_col])


<h3> Spliting the dataset into train and test sets

In [148]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=45)


<h3> Ridge Regression

In [174]:
from sklearn.linear_model import Ridge

ridge_r = Ridge(alpha=1.0)
ridge_r.fit(x_train,y_train)


Ridge()

In [175]:
y_pred_ridge = ridge_r.predict(x_test)


In [176]:
# Evaluating Accuracy
print("Accuracy: ",ridge_r.score(x_test,y_test))

Accuracy:  0.8312118988230774


<h3>Lasso Regression

In [177]:
from sklearn.linear_model import Lasso

Lasso_r = Lasso(alpha=1.0)
Lasso_r.fit(x_train,y_train)


Lasso()

In [178]:
y_pred_lasso = Lasso_r.predict(x_test)

In [179]:
# Evaluating Accuracy
print("Accuracy: ",Lasso_r.score(x_test,y_test))

Accuracy:  0.8307515787964729


<h3>Model Deployment on 'HousePriceTest.csv' Dataset 

In [183]:
test = pd.read_csv("HousePriceTest.csv")
test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [184]:
# Drop attributes with NAN values greater than 200
null_feat = ["Alley","PoolQC","Fence","MiscFeature","FireplaceQu","LotFrontage"]
test.drop(null_feat,axis=1,inplace=True)

In [185]:
test.shape

(1459, 74)

In [186]:
# Removing NAN records from dataset 
test.dropna(inplace=True)

In [187]:
test.shape

(1319, 74)

In [190]:
le = LabelEncoder()
ss = StandardScaler()

for i in categorical_col:
    test[i] = le.fit_transform(test[i])

test[numeric_col] = ss.fit_transform(test[numeric_col])
    

<h3>Ridge Regression Prediction

In [191]:
pred_ridge = ridge_r.predict(test)
pred_ridge

array([ 98276.8706393 , 162271.7867134 , 163883.71350954, ...,
        52866.89640246, 166299.47699197, 259543.62188719])

<h3>Lasso Regression Prediction

In [193]:
pred_Lasso = Lasso_r.predict(test)
pred_Lasso

array([ 99652.65279038, 162959.32950832, 163965.31292537, ...,
        54067.47619912, 167005.6197615 , 259873.7242179 ])