># <font color='green'>L1 and L2 Regularization <font>

For Equation $\hspace{7.7cm}h_{\theta}(x_i)=\theta_o+\theta_1x_1+\theta_2x_2+\theta_3x_3$
<br><br>

- **OLS=>**
$\hspace{10cm}mse=\frac{1}{n}\sum_{i=1}^{n}(y_i-h_{\theta}(x_i))^2$
<br><br>
- **L1 regularization=>**

$\hspace{10cm}mse=\frac{1}{n}\sum_{i=1}^{n}(y_i-h_{\theta}(x_i))^2+\lambda\sum_{i=1}^{n}|\theta_i|$


- **L2 regularization=>**

$\hspace{10cm}mse=\frac{1}{n}\sum_{i=1}^{n}(y_i-h_{\theta}(x_i))^2 +\lambda\sum_{i=1}^{n}\theta_i^2$

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
#Melbourne housing price dataset

dataset=pd.read_csv(r'D:\Datasets\20.Melbourne_housing_FULL.csv')
dataset.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [3]:
dataset.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [4]:
cols_to_use=['Suburb','Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Distance','Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea','CouncilArea','Regionname', 'Propertycount']
dataset=dataset[cols_to_use]
dataset.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
0,Abbotsford,2,h,,SS,Jellis,2.5,2.0,1.0,1.0,126.0,,Yarra City Council,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.0,Yarra City Council,Northern Metropolitan,4019.0
3,Abbotsford,3,u,,VB,Rounds,2.5,3.0,2.0,1.0,0.0,,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.0,Yarra City Council,Northern Metropolitan,4019.0


In [5]:
dataset.shape

(34857, 15)

In [6]:
dataset.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        3
dtype: int64

In [7]:
cols_to_fill_zero=['Car','Distance','Bedroom2','Bathroom','Propertycount']
dataset[cols_to_fill_zero]=dataset[cols_to_fill_zero].fillna(0)
dataset.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             0
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        0
dtype: int64

In [8]:
cols_to_fill_mean=['Landsize','BuildingArea','Price']
dataset[cols_to_fill_mean]=dataset[cols_to_fill_mean].fillna(dataset[cols_to_fill_mean].mean())
dataset.isna().sum()

Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
CouncilArea      3
Regionname       3
Propertycount    0
dtype: int64

In [9]:
dataset.dropna(inplace=True)
dataset.isna().sum()

Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
CouncilArea      0
Regionname       0
Propertycount    0
dtype: int64

In [10]:
dataset=pd.get_dummies(dataset,drop_first=True)
dataset.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Suburb_Aberfeldie,...,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
0,2,1050173.0,2.5,2.0,1.0,1.0,126.0,160.2564,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
1,2,1480000.0,2.5,2.0,1.0,1.0,202.0,160.2564,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
3,3,1050173.0,2.5,3.0,2.0,1.0,0.0,160.2564,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,4019.0,0,...,0,1,0,0,1,0,0,0,0,0


In [11]:
X=dataset.drop('Price',axis=1)
Y=dataset['Price']

In [12]:
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y=train_test_split(X,Y,test_size=0.2,random_state=5)

- Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression
reg=LinearRegression().fit(train_x,train_y)

In [14]:
reg.score(test_x,test_y) 

-11809790945.052074

In [15]:
reg.score(train_x,train_y)  # so this model is performing poorly in train but extremely poorly in test
                            # out goal here is to demonstrate it performed poorly on test dataset

0.494391273690317

- **Lasso regression**

In [20]:
# lasso regression is L1 regularization
from sklearn import linear_model

lasso_reg=linear_model.Lasso(alpha=0.5,max_iter=1000,tol=0.01)

lasso_reg.fit(train_x,train_y)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=0.5, tol=0.01)

In [23]:
lasso_reg.score(test_x,test_y) # so we can clearly see that we got rid of overfitting to some extend

0.4725320713642748

In [24]:
lasso_reg.score(train_x,train_y)

0.4943841398423966

- **L2 Regularization**

In [25]:
#Ridge is a L2 Regularization technique
from sklearn.linear_model import Ridge

ridge_reg=Ridge(alpha=0.5,max_iter=1000,tol=0.01)
ridge_reg.fit(train_x,train_y)

Ridge(alpha=0.5, max_iter=1000, tol=0.01)

In [26]:
ridge_reg.score(test_x,test_y) # similarly improved results for Ridge regression

0.47335630280194685

In [27]:
ridge_reg.score(test_x,test_y)

0.47335630280194685