## House Price Prediction

Implement a linear regression model to predict the prices of houses based on their square footage and the number of bedrooms and bathrooms

In [461]:
#importing necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [462]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [463]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [464]:
#Shape of train dataset
train_df.shape

(1460, 81)

In [465]:
#Shape of test dataset
test_df.shape

(1459, 80)

In [466]:
#Combining Train and Test dataset for preprocessing

df = pd.concat([train_df , test_df])

In [467]:
df.shape

(2919, 81)

In [468]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             2919 non-null   int64  
 1   MSSubClass     2919 non-null   int64  
 2   MSZoning       2915 non-null   object 
 3   LotFrontage    2433 non-null   float64
 4   LotArea        2919 non-null   int64  
 5   Street         2919 non-null   object 
 6   Alley          198 non-null    object 
 7   LotShape       2919 non-null   object 
 8   LandContour    2919 non-null   object 
 9   Utilities      2917 non-null   object 
 10  LotConfig      2919 non-null   object 
 11  LandSlope      2919 non-null   object 
 12  Neighborhood   2919 non-null   object 
 13  Condition1     2919 non-null   object 
 14  Condition2     2919 non-null   object 
 15  BldgType       2919 non-null   object 
 16  HouseStyle     2919 non-null   object 
 17  OverallQual    2919 non-null   int64  
 18  OverallC

In [469]:
#Statistical Analysis
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,2919.0,2919.0,2433.0,2919.0,2919.0,2919.0,2919.0,2919.0,2896.0,2918.0,...,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,1460.0
mean,1460.0,57.137718,69.305795,10168.11408,6.089072,5.564577,1971.312778,1984.264474,102.201312,441.423235,...,93.709832,47.486811,23.098321,2.602261,16.06235,2.251799,50.825968,6.213087,2007.792737,180921.19589
std,842.787043,42.517628,23.344905,7886.996359,1.409947,1.113131,30.291442,20.894344,179.334253,455.610826,...,126.526589,67.575493,64.244246,25.188169,56.184365,35.663946,567.402211,2.714762,1.314964,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,730.5,20.0,59.0,7478.0,5.0,5.0,1953.5,1965.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129975.0
50%,1460.0,50.0,68.0,9453.0,6.0,5.0,1973.0,1993.0,0.0,368.5,...,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,2189.5,70.0,80.0,11570.0,7.0,6.0,2001.0,2004.0,164.0,733.0,...,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,2919.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0,755000.0


### Handling Missing Values

In [470]:
#Checking Missing values
df.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       486
LotArea             0
                 ... 
MoSold              0
YrSold              0
SaleType            1
SaleCondition       0
SalePrice        1459
Length: 81, dtype: int64

In [471]:
#Extracting columns with missing values
missing = df.isnull().sum()
missing[missing!=0] 

MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
SalePrice       1459
dtype: int64

In [472]:
#Dealing with missing values in "MSZoning" (Identifies the general zoning classification of the sale)
#Replacing Nan with "Unknown"

df.MSZoning.fillna( "Unknown", inplace=True)

In [473]:
#Dealing with missing values in "LotFrontage" (Linear feet of street connected to property)

#Assuming this distance to be zero, replacing all NaN with 0

df.LotFrontage.fillna(0, inplace=True)

In [474]:
#Dealing with missing values in "Alley"(Type of alley access to property: Grvl(Gravel), Pave(Paved), NA(No alley access))
#Replacing NaN with "None"

df.Alley.fillna("None", inplace=True)

In [475]:
#Dealing with missing values in "Utilities"(Type of utilities available)
#Replacing NaN with "None" as no utilities available in these cases

df.Utilities.fillna("None", inplace=True)

In [476]:
#Dealing with missing values in "Exterior1st" and "Exterior2nd" (Exterior covering on house)
#Replacing Nan with "Unknown"

df.Exterior1st.fillna( "Unknown", inplace=True)
df.Exterior2nd.fillna( "Unknown", inplace=True)

In [477]:
#Dealing with missing values in "MasVnrType"(Masonry veneer type : BrkCmn(Brick Common),BrkFace(Brick Face),
#                                              CBlock(Cinder Block), None, Stone)

#Replacing NaN with "None"

df.MasVnrType.fillna("None", inplace=True)

In [478]:
#Dealing with missing values in "MasVnrArea"
df[df.MasVnrArea.isnull()][['MasVnrType','MasVnrArea']]

Unnamed: 0,MasVnrType,MasVnrArea
234,,
529,,
650,,
936,,
973,,
977,,
1243,,
1278,,
231,,
246,,


In [479]:
#Replacing Nan with 0
df.MasVnrArea.fillna(0, inplace=True)

In [480]:
#Dealing with missing values in "BsmtQual"(Evaluates the height of the basement :
#                                           Ex-Excellent (100+ inches),
#                                           Gd-Good (90-99 inches)
#                                           TA-Typical (80-89 inches)
#                                           Fa-Fair (70-79 inches)
#                                           Po-Poor (<70 inches
#                                           NA-No Basement )
#Replacing NaN with "None"

df.BsmtQual.fillna("None", inplace=True)

In [481]:
#Dealing with missing values in "BsmtCond"(Evaluates the general condition of the basement)
df[df.BsmtCond.isnull()][['BsmtQual','BsmtCond']]

Unnamed: 0,BsmtQual,BsmtCond
17,,
39,,
90,,
102,,
156,,
...,...,...
1343,,
1344,,
1364,,
1431,,


In [482]:
#Replacing Nan with 'None' as no basement present in these cases
df.BsmtCond.fillna("None", inplace=True)

In [483]:
#Dealing with missing values in "BsmtExposure"(Refers to walkout or garden level walls:
#                                              Gd - Good Exposure,
#                           Av - Average Exposure (split levels or foyers typically score average or above),
#                           Mn - Mimimum Exposure,
#                           No - No Exposure,
#                           NA - No Basement)

#Replacing NaN with "No"
df.BsmtExposure.fillna("None", inplace=True)

In [484]:
#Dealing with missing values in "BsmtFinType1" and "BsmtFinType2" (Rating of basement finished area)
#Replacing NaN with "None" as no basement present 

df.BsmtFinType1.fillna("None", inplace=True)
df.BsmtFinType2.fillna("None", inplace=True)

In [485]:
#Dealing with missing values in "BsmtFinSF1" and "BsmtFinSF2" (finished square feet)
#Replacing Nan with 0

df.BsmtFinSF1.fillna(0, inplace=True)
df.BsmtFinSF2.fillna(0, inplace=True)

In [486]:
#Dealing with missing values in "BsmtUnfSF" (Unfinished square feet of basement area) 
#                           and "TotalBsmtSF" (Total square feet of basement area)
#Replacing Nan with 0

df.BsmtUnfSF.fillna(0, inplace=True)
df.TotalBsmtSF.fillna(0, inplace=True)

In [487]:
#Dealing with missing values in "Electrical"(Electrical system)
#Replacing NaN with "None"

df.Electrical.fillna("None", inplace=True)

In [488]:
#Dealing with missing values in "BsmtFullBath" (Basement full bathrooms) and "BsmtHalfBath" (Basement half bathrooms)
#Replacing NaN with "0" as no bathroom present

df.BsmtFullBath.fillna(0, inplace=True)
df.BsmtHalfBath.fillna(0, inplace=True)

In [489]:
#Dealing with missing values in "KitchenQual" (Kitchen quality)
#Replacing NaN with "Unknown"

df.KitchenQual.fillna("None", inplace=True)

In [490]:
#Dealing with missing values in "Functional" (Home functionality )
#Replacing NaN with "Unknown"

df.Functional.fillna("None", inplace=True)

In [491]:
#Dealing with missing values in "FireplaceQu" (Fireplace quality: 
#                                                Ex:Excellent - Exceptional Masonry Fireplace
#                                                Gd:Good - Masonry Fireplace in main level
#                                                TA:Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
#                                                Fa:Fair - Prefabricated Fireplace in basement
#                                                Po:Poor - Ben Franklin Stove
#                                                NA:No Fireplace)
#Replacing NaN with "None" as no Fireplace

df.FireplaceQu.fillna("None", inplace=True)

In [492]:
#Dealing with missing values in "GarageType" (Garage location), "GarageYrBlt" (Year garage was built), 
#                               "GarageFinish" (Interior finish of the garage), "GarageQual"(Garage quality),
#                               "GarageCars" (Size of garage in car capacity), "GarageArea" (Size of garage in square feet) and
#                               "GarageCond"(Garage condition)

#Replacing Nan in "GarageYrBlt" and "GarageArea" with 0 and for others replacing Nan with "None" as no garage present here.

df.GarageYrBlt.fillna(0, inplace=True)
df.GarageCars.fillna(0, inplace=True)
df.GarageArea.fillna(0, inplace=True)

df.GarageType.fillna("None", inplace=True)
df.GarageFinish.fillna("None", inplace=True)
df.GarageQual.fillna("None", inplace=True)
df.GarageCond.fillna("None", inplace=True)

In [493]:
#Dealing with missing values in "SaleType"(Type of sale)
#Replacing NaN with "Unknown"
df.SaleType.fillna("Unknown", inplace=True)

In [494]:
#Dealing with missing values in "PoolQC" (Pool quality: Ex-Excellent, Gd-Good, TA-Average/Typical, Fa-Fair, NA-No Pool)
#Replacing NaN with "None"
df.PoolQC.fillna("None", inplace=True)

In [495]:
#Dealing with missing values in "Fence" (Fence quality: GdPrv-Good Privacy, MnPrv-Minimum Privacy, GdWo-Good Wood,
#                                                       MnWw-Minimum Wood/Wire, NA-No Fence)
#Replacing NaN with "None"
df.Fence.fillna("None", inplace=True)

In [496]:
#Dealing with missing values in "MiscFeature"(Miscellaneous feature not covered in other categories)
#Replacing NaN with "None" as no Miscellaneous feature present

df.MiscFeature.fillna("None", inplace=True)

In [497]:
df.isnull().sum().sum()

1459

There are no mising values now except for Sales Price that is a part of our Test dataset!

In [498]:
#Checking for Duplicates

df.duplicated().sum()

0

There are no duplicates !

### Encoding

In [499]:
#Selecting Categorical columns
cat_cols = df.select_dtypes("object").columns

In [500]:
#Encoding dataset
df_dummies = pd.get_dummies(df, columns=cat_cols, drop_first=True)

### Scaling

In [501]:
#Scaling dataset
scaler = StandardScaler()
df_final = pd.DataFrame(scaler.fit_transform(df_dummies), columns = df_dummies.columns)

In [502]:
df_final.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_Unknown,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-1.731458,0.067331,0.216075,-0.217879,0.646183,-0.507284,1.046258,0.896833,0.529034,0.581145,...,-0.052423,-0.298629,-0.049029,-0.018512,0.395018,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
1,-1.730271,-0.873616,0.664158,-0.072044,-0.063185,2.188279,0.154764,-0.395604,-0.567016,1.178255,...,-0.052423,-0.298629,-0.049029,-0.018512,0.395018,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
2,-1.729084,0.067331,0.305692,0.137197,0.646183,-0.507284,0.980221,0.848965,0.338903,0.098189,...,-0.052423,-0.298629,-0.049029,-0.018512,0.395018,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
3,-1.727897,0.302568,0.066714,-0.078385,0.646183,-0.507284,-1.859351,-0.682812,-0.567016,-0.494529,...,-0.052423,-0.298629,-0.049029,-0.018512,0.395018,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693
4,-1.726711,0.067331,0.783647,0.518903,1.355551,-0.507284,0.947203,0.753229,1.390216,0.469187,...,-0.052423,-0.298629,-0.049029,-0.018512,0.395018,-0.064249,-0.09105,-0.126535,0.463937,-0.302693


###### Our dataset is now ready . Let us split the data to our original Train and Test dataset

In [535]:
#Splitting data to original datasets
Train_final = df_final[df_final["SalePrice"].notna()]
Test_final = df_final[df_final["SalePrice"].isnull()]
Test_final.drop("SalePrice", axis=1, inplace=True)

In [536]:
print("Train dataset shape: ", Train_final.shape)
print("Test dataset shape: ", Test_final.shape)

Train dataset shape:  (1460, 269)
Test dataset shape:  (1459, 268)


##### Our data is now ready for Modelling!

In [505]:
#Defining our X and y
X = Train_final.drop('SalePrice', axis=1)
y = Train_final['SalePrice']

### Linear Regression Model

In [506]:
#Train Test Split

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3)

In [507]:
#Fitting model on test data
LR = LinearRegression()
LR.fit(train_X, train_y)

LinearRegression()

In [508]:
print('Train Score: {}'.format(LR.score(train_X, train_y)))  
print('Test Score: {}'.format(LR.score(test_X, test_y)))   

Train Score: 0.9407060564755615
Test Score: -1.4617559617271502e+21


In [509]:
pred_sales = LR.predict(test_X)
r2_score(test_y, pred_sales)

-1.4617559617271502e+21

In [510]:
#Evaluation on Train and Test data

preds_train = LR.predict(train_X)
preds_test = LR.predict(test_X)

print('TRAIN DATA RESULTS')
print("Mean Absolute Error (MAE) on Train data  : ", mean_absolute_error(preds_train,train_y))
print("Mean Squared Error (MSE) on Train data  : ", mean_squared_error(preds_train,train_y))
print('RMSE on Train Data :', np.sqrt(mean_squared_error(preds_train,train_y)))
print('Training Set R squared score : ', LR.score(train_X, train_y))

print('\nTEST DATA RESULTS')
print('Mean Absolute Error (MAE) on Test data :', mean_absolute_error(preds_test,test_y))
print('Mean Squared Error (MSE) on Test data :', mean_squared_error(preds_test,test_y))
print('RMSE on Test Data :', np.sqrt(mean_squared_error(preds_test,test_y)))
print('Test Set R squared score :', LR.score(test_X, test_y))


TRAIN DATA RESULTS
Mean Absolute Error (MAE) on Train data  :  0.15464990445543533
Mean Squared Error (MSE) on Train data  :  0.059500173191902046
RMSE on Train Data : 0.24392657336153856
Training Set R squared score :  0.9407060564755615

TEST DATA RESULTS
Mean Absolute Error (MAE) on Test data : 4357160815.852334
Mean Squared Error (MSE) on Test data : 1.4454840899440751e+21
RMSE on Test Data : 38019522484.429955
Test Set R squared score : -1.4617559617271502e+21


##### Our Model is highly overfitted!! Let us use regularisation!

#### L1 Regularisation(Lasso)

In [511]:
from sklearn.linear_model import Lasso

lasso = Lasso()
lasso.fit(train_X, train_y)

print('train score: {}'.format(lasso.score(train_X,train_y)))
print('test score: {}'.format(lasso.score(test_X, test_y)))
pred_sales = lasso.predict(test_X)
print('R2 score: ', r2_score(test_y, pred_sales))

train score: 0.0
test score: -0.004357342218363813
R2 score:  -0.004357342218363813


In [512]:
#Hyperparameter Tuning
params = {'alpha':[0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 15], 'max_iter':[1500, 10000, 100000]}
grid_search = GridSearchCV(lasso, params)
grid_search.fit(train_X, train_y)

GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5,
                                   0.8, 1, 5, 10, 15],
                         'max_iter': [1500, 10000, 100000]})

In [513]:
grid_search.best_params_

{'alpha': 0.01, 'max_iter': 1500}

In [514]:
grid_search.best_estimator_

Lasso(alpha=0.01, max_iter=1500)

In [515]:
#Using these best parameters 
lasso = grid_search.best_estimator_
lasso.fit(train_X, train_y)

print('train score: {}'.format(lasso.score(train_X,train_y)))
print('test score: {}'.format(lasso.score(test_X, test_y)))
pred_sales = lasso.predict(test_X)
print('R2 score:', r2_score(test_y, pred_sales))

train score: 0.9009871693529095
test score: 0.8698108418759333
R2 score: 0.8698108418759333


In [516]:
#### Evaluation on Train and Test data
            
preds_train = lasso.predict(train_X)
preds_test = lasso.predict(test_X)

print('TRAIN DATA RESULTS')
print("Mean Absolute Error (MAE) on Train data  : ", mean_absolute_error(preds_train,train_y))
print("Mean Squared Error (MSE) on Train data  : ", mean_squared_error(preds_train,train_y))
print('RMSE on Train Data :', np.sqrt(mean_squared_error(preds_train,train_y)))
print('Training Set R squared score : ', lasso.score(train_X, train_y))

print('\nTEST DATA RESULTS')
print('Mean Absolute Error (MAE) on Test data :', mean_absolute_error(preds_test,test_y))
print('Mean Squared Error (MSE) on Test data :', mean_squared_error(preds_test,test_y))
print('RMSE on Test Data :', np.sqrt(mean_squared_error(preds_test,test_y)))
print('Test Set R squared score :', lasso.score(test_X, test_y))

TRAIN DATA RESULTS
Mean Absolute Error (MAE) on Train data  :  0.1920802676568343
Mean Squared Error (MSE) on Train data  :  0.09935720617560571
RMSE on Train Data : 0.3152097812181686
Training Set R squared score :  0.9009871693529095

TEST DATA RESULTS
Mean Absolute Error (MAE) on Test data : 0.2253742480663887
Mean Squared Error (MSE) on Test data : 0.12873992764783984
RMSE on Test Data : 0.3588034666051038
Test Set R squared score : 0.8698108418759333


In [517]:
#Saving all scores for model comparison
Lasso_train = lasso.score(train_X,train_y)
Lasso_test = lasso.score(test_X, test_y)
Lasso_R2 = r2_score(test_y, pred_sales)
Lasso_MAE = mean_absolute_error(preds_test,test_y)
Lasso_MSE = mean_squared_error(preds_test,test_y)
Lasso_RMSE = np.sqrt(mean_squared_error(preds_test,test_y))

#### L2 Regularisation(Ridge)

In [518]:
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(train_X, train_y)

print('train score: {}'.format(ridge.score(train_X,train_y)))
print('test score: {}'.format(ridge.score(test_X, test_y)))
pred_sales = ridge.predict(test_X)
print('R2 score: ', r2_score(test_y, pred_sales))

train score: 0.9392053230043065
test score: 0.8727448325063709
R2 score:  0.8727448325063709


In [519]:
#Hyperparameter Tuning
params = {'alpha':[0.01, 0.05, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 15], 'max_iter':[1500, 10000, 100000]}
grid_search = GridSearchCV(ridge, params)
grid_search.fit(train_X, train_y)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0.01, 0.05, 0.1, 0.2, 0.5, 0.8, 1, 5, 10,
                                   15],
                         'max_iter': [1500, 10000, 100000]})

In [520]:
grid_search.best_estimator_

Ridge(alpha=15, max_iter=1500)

In [521]:
#Using best parameters
ridge = grid_search.best_estimator_
ridge.fit(train_X, train_y)

print('train score: {}'.format(ridge.score(train_X,train_y)))
print('test score: {}'.format(ridge.score(test_X, test_y)))
pred_sales = ridge.predict(test_X)
print('R2 score: ', r2_score(test_y, pred_sales))

train score: 0.931975196444363
test score: 0.883833231926604
R2 score:  0.883833231926604


In [522]:
#### Evaluation on Train and Test data
            
preds_train = ridge.predict(train_X)
preds_test = ridge.predict(test_X)

print('TRAIN DATA RESULTS')
print("Mean Absolute Error (MAE) on Train data  : ", mean_absolute_error(preds_train,train_y))
print("Mean Squared Error (MSE) on Train data  : ", mean_squared_error(preds_train,train_y))
print('RMSE on Train Data :', np.sqrt(mean_squared_error(preds_train,train_y)))
print('Training Set R squared score : ', ridge.score(train_X, train_y))

print('\nTEST DATA RESULTS')
print('Mean Absolute Error (MAE) on Test data :', mean_absolute_error(preds_test,test_y))
print('Mean Squared Error (MSE) on Test data :', mean_squared_error(preds_test,test_y))
print('RMSE on Test Data :', np.sqrt(mean_squared_error(preds_test,test_y)))
print('Test Set R squared score :', ridge.score(test_X, test_y))

TRAIN DATA RESULTS
Mean Absolute Error (MAE) on Train data  :  0.17004553629494107
Mean Squared Error (MSE) on Train data  :  0.06826139993939369
RMSE on Train Data : 0.2612688269568218
Training Set R squared score :  0.931975196444363

TEST DATA RESULTS
Mean Absolute Error (MAE) on Test data : 0.23166090686379448
Mean Squared Error (MSE) on Test data : 0.11487363104845023
RMSE on Test Data : 0.3389301270888297
Test Set R squared score : 0.883833231926604


In [523]:
#Saving all scores for model comparison
Ridge_train = ridge.score(train_X,train_y)
Ridge_test = ridge.score(test_X, test_y)
Ridge_R2 = r2_score(test_y, pred_sales)
Ridge_MAE = mean_absolute_error(preds_test,test_y)
Ridge_MSE = mean_squared_error(preds_test,test_y)
Ridge_RMSE = np.sqrt(mean_squared_error(preds_test,test_y))

#### ElasticNet

In [524]:
from sklearn.linear_model import ElasticNet

Elasticnet = ElasticNet()
Elasticnet.fit(train_X, train_y)

print('train score: {}'.format(Elasticnet.score(train_X,train_y)))
print('test score: {}'.format(Elasticnet.score(test_X, test_y)))
pred_sales = Elasticnet.predict(test_X)
print('R2 score: ', r2_score(test_y, pred_sales))

train score: 0.3307529048878106
test score: 0.3182243019487254
R2 score:  0.3182243019487254


In [525]:
params = {'alpha':[0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5, 0.8, 1, 5, 10, 15], 'max_iter':[1500, 10000, 100000]}
grid_search = GridSearchCV(Elasticnet, params)
grid_search.fit(train_X, train_y)

GridSearchCV(estimator=ElasticNet(),
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.5,
                                   0.8, 1, 5, 10, 15],
                         'max_iter': [1500, 10000, 100000]})

In [526]:
grid_search.best_estimator_

ElasticNet(alpha=0.05, max_iter=1500)

In [527]:
#Using best parameters
Elasticnet = grid_search.best_estimator_
Elasticnet.fit(train_X, train_y)

print('train score: {}'.format(Elasticnet.score(train_X,train_y)))
print('test score: {}'.format(Elasticnet.score(test_X, test_y)))
pred_sales = Elasticnet.predict(test_X)
print('R2 score: ', r2_score(test_y, pred_sales))

train score: 0.8668930816442514
test score: 0.8574455530584222
R2 score:  0.8574455530584222


In [528]:
#### Evaluation on Train and Test data
            
preds_train = Elasticnet.predict(train_X)
preds_test = Elasticnet.predict(test_X)

print('TRAIN DATA RESULTS')
print("Mean Absolute Error (MAE) on Train data  : ", mean_absolute_error(preds_train,train_y))
print("Mean Squared Error (MSE) on Train data  : ", mean_squared_error(preds_train,train_y))
print('RMSE on Train Data :', np.sqrt(mean_squared_error(preds_train,train_y)))
print('Training Set R squared score : ', Elasticnet.score(train_X, train_y))

print('\nTEST DATA RESULTS')
print('Mean Absolute Error (MAE) on Test data :', mean_absolute_error(preds_test,test_y))
print('Mean Squared Error (MSE) on Test data :', mean_squared_error(preds_test,test_y))
print('RMSE on Test Data :', np.sqrt(mean_squared_error(preds_test,test_y)))
print('Test Set R squared score :', Elasticnet.score(test_X, test_y))

TRAIN DATA RESULTS
Mean Absolute Error (MAE) on Train data  :  0.21687322052239646
Mean Squared Error (MSE) on Train data  :  0.133569876187155
RMSE on Train Data : 0.3654721277842607
Training Set R squared score :  0.8668930816442514

TEST DATA RESULTS
Mean Absolute Error (MAE) on Test data : 0.23651530452661396
Mean Squared Error (MSE) on Test data : 0.14096756941654975
RMSE on Test Data : 0.37545648138838905
Test Set R squared score : 0.8574455530584222


In [529]:
#Saving all scores for model comparison
Elasticnet_train = Elasticnet.score(train_X,train_y)
Elasticnet_test = Elasticnet.score(test_X, test_y)
Elasticnet_R2 = r2_score(test_y, pred_sales)
Elasticnet_MAE = mean_absolute_error(preds_test,test_y)
Elasticnet_MSE = mean_squared_error(preds_test,test_y)
Elasticnet_RMSE = np.sqrt(mean_squared_error(preds_test,test_y))

### Model Comparison

In [530]:
models = pd.DataFrame({
    'Model': ['Lasso', 'Ridge', 'ElasticNet'],
    'Train Score': [Lasso_train, Ridge_train, Elasticnet_train],
    'Test Score': [Lasso_test, Ridge_test, Elasticnet_test],
    'R2 Score' : [Lasso_R2, Ridge_R2, Elasticnet_R2],
    'MAE' : [Lasso_MAE, Ridge_MAE, Elasticnet_MAE],
    'MSE':[Lasso_MSE, Ridge_MSE, Elasticnet_MSE],
    'RMSE' : [Lasso_RMSE, Ridge_RMSE, Elasticnet_RMSE]
})

models.sort_values(by='R2 Score', ascending = False, ignore_index=True)

Unnamed: 0,Model,Train Score,Test Score,R2 Score,MAE,MSE,RMSE
0,Ridge,0.931975,0.883833,0.883833,0.231661,0.114874,0.33893
1,Lasso,0.900987,0.869811,0.869811,0.225374,0.12874,0.358803
2,ElasticNet,0.866893,0.857446,0.857446,0.236515,0.140968,0.375456


### Final Model

Based on the performances of Ridge, Lasso and ElasticNet Models, it appears that the Ridge model is the best choice for our regression task. Let us compare the Models to understand why!

**R-squared (R2) Score**: R2 score measures the proportion of the variance in the dependent variable that is predictable from the independent variables. A higher R2 score indicates a better fit.
Ridge has a higher R2 score on the test data compared to Lasso and ElasticNet. 

**Mean Absolute Error (MAE)**: MAE measures the average magnitude of errors between actual and predicted values. Lower MAE indicates better accuracy.
Ridge has a lower MAE on the test data compared to Lasso and ElasticNet. 

**Mean Squared Error (MSE)**: MSE penalizes larger errors more than MAE, and a lower MSE suggests better performance.
Ridge has a lower MSE on the test data compared to Lasso and ElasticNet. 

**Root Mean Squared Error (RMSE)**: RMSE provides a measure of the standard deviation of errors, and lower RMSE indicates better accuracy.
Ridge has a lower RMSE on the test data compared to Lasso and ElasticNet.


In summary, Ridge regression outperforms both Lasso and ElasticNet in terms of R2 score, MAE, MSE and RMSE

In [531]:
#Final Model

Model = ridge
Model.fit(train_X, train_y)

print('train score: ', (Model.score(train_X,train_y)))
print('test score: ', (Model.score(test_X, test_y)))
pred_sales = Model.predict(test_X)
print('R2 score: ', r2_score(test_y, pred_sales))
print('Mean Absolute Error (MAE) on Test data :', mean_absolute_error(preds_test,test_y))
print('Mean Squared Error (MSE) on Test data :', mean_squared_error(preds_test,test_y))
print('RMSE on Test Data :', np.sqrt(mean_squared_error(preds_test,test_y)))

train score:  0.931975196444363
test score:  0.883833231926604
R2 score:  0.883833231926604
Mean Absolute Error (MAE) on Test data : 0.23651530452661396
Mean Squared Error (MSE) on Test data : 0.14096756941654975
RMSE on Test Data : 0.37545648138838905


### Predictions on Test Dataset

In [537]:
Predictions = Model.predict(Test_final)
Predictions

array([-0.95572499, -1.01589421,  0.00464227, ..., -0.08336731,
       -0.75848845,  0.69042707])

#### Project By
 - Archana Kokate