In [3]:
# Regularization

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# NOTE(review): called with only the action, this filter applies to every warning
# category, so any warning raised by the cells that follow is not displayed.
# Consider narrowing it to specific categories once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')


In [6]:
# Load the raw housing CSV (path is relative to the working directory).
housing_csv_path = 'Melbourne_housing_FULL.csv'
dataset = pd.read_csv(housing_csv_path)

In [7]:
# Preview the first rows of the freshly loaded frame as a sanity check.
dataset.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [8]:
# Count the distinct values in each column (shows which columns are high-cardinality).
dataset.nunique()

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64

In [18]:
# Keep only the predictors used for modelling, plus the target column 'Price'.
columns_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Distance', 'CouncilArea','Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'Price']

# Take an explicit copy: later cells assign columns into `dataset` and drop rows
# from it, and doing that on a bare column selection of the loaded frame is the
# chained-assignment case pandas flags with SettingWithCopyWarning.
dataset = dataset[columns_to_use].copy()

In [15]:
# Preview the frame after restricting it to the selected columns.
dataset.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3,h,SP,Biggin,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0


In [21]:
# (rows, columns) of the current frame.
dataset.shape

(34857, 13)

In [24]:
# Number of missing values in each column; drives the imputation choices that follow.
dataset.isna().sum()

Suburb              0
Rooms               0
Type                0
Method              0
SellerG             0
Distance            1
CouncilArea         3
Bedroom2         8217
Bathroom         8226
Car              8728
Landsize        11810
BuildingArea    21115
Price            7610
dtype: int64

In [26]:
# Handling missing / null values: these columns get 0 wherever a value is absent.
# NOTE(review): this treats "unknown" as a literal 0 (e.g. 0 bedrooms, 0 km) -
# confirm that is the intended imputation.

cols_to_fill_zero = ['Distance', 'Bedroom2', 'Bathroom', 'Car']
zero_fill_values = dict.fromkeys(cols_to_fill_zero, 0)
dataset = dataset.fillna(value=zero_fill_values)


In [28]:
# Impute the continuous area columns with their column mean (a quick, simple choice).
# NOTE(review): the means are taken over the whole frame, i.e. before the
# train/test split performed later in the notebook.

mean_fill_values = {
    'Landsize': dataset['Landsize'].mean(),
    'BuildingArea': dataset['BuildingArea'].mean(),
}
dataset = dataset.fillna(value=mean_fill_values)

In [30]:
# Drop every row that still contains a missing value in any column.
dataset = dataset.dropna()

In [32]:
# Report the frame's dimensions after imputation and row removal.
print("Dataset Shape:",dataset.shape)

Dataset Shape: (27244, 13)


In [34]:
# One-hot encode the categorical columns. drop_first=True omits the first level of
# each encoded column, so k levels become k-1 indicator columns.
# NOTE(review): no `columns=` is given, so pandas decides which columns to encode
# from their dtypes - confirm the set matches what is intended.
dataset = pd.get_dummies(dataset, drop_first=True)
# Preview the widened frame.
dataset.head()

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,Suburb_Airport West,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,2.5,2.0,1.0,1.0,202.0,160.2564,1480000.0,False,False,...,False,False,False,False,False,False,False,False,True,False
2,2,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,False,False,...,False,False,False,False,False,False,False,False,True,False
4,3,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,False,False,...,False,False,False,False,False,False,False,False,True,False
5,3,2.5,3.0,2.0,1.0,94.0,160.2564,850000.0,False,False,...,False,False,False,False,False,False,False,False,True,False
6,4,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,False,False,...,False,False,False,False,False,False,False,False,True,False


In [35]:
# Separate the target (sale price) from the feature matrix.
target_column = 'Price'
Y = dataset[target_column]
X = dataset.drop(columns=[target_column])

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
split_options = {'test_size': 0.3, 'random_state': 2}
x_train, x_test, y_train, y_test = train_test_split(X, Y, **split_options)

In [43]:
# Baseline: plain least-squares linear regression, no regularization.
reg = LinearRegression()
reg.fit(x_train, y_train)

# Score the fitted model on the held-out split.
baseline_test_score = reg.score(x_test, y_test)
print("Regression Score for Testing Data:", baseline_test_score)

Regression Score for Testing Data: 0.13850577886284754


In [42]:
# Score the same fitted model on the data it was trained on, for comparison with
# the held-out score.
print("Regression Score for Training Data:",reg.score(x_train, y_train))

Regression Score for Training Data: 0.6827792395792723


In [56]:
# Lasso (L1-regularized) regression model

from sklearn import linear_model

# alpha sets the regularization strength; max_iter and tol bound the solver's work.
lasso_settings = {'alpha': 50, 'max_iter': 100, 'tol': 0.1}
lasso_reg = linear_model.Lasso(**lasso_settings)
lasso_reg.fit(x_train, y_train)

# Score on the held-out split first, then on the training split.
lasso_test_score = lasso_reg.score(x_test, y_test)
print("Regression Scre for L1 Testing Data:", lasso_test_score)
lasso_train_score = lasso_reg.score(x_train, y_train)
print("Regression Scre for L1 Training Data:", lasso_train_score)

Regression Scre for L1 Testing Data: 0.6636673123689183
Regression Scre for L1 Training Data: 0.6765532437058862


In [57]:
# Ridge (L2-regularized) regression model

from sklearn.linear_model import Ridge

# alpha sets the regularization strength.
# NOTE(review): no solver is specified; depending on which solver scikit-learn
# selects, max_iter and tol may have no effect - confirm against the Ridge docs.
ridge_settings = {'alpha': 50, 'max_iter': 100, 'tol': 0.1}
ridge_reg = Ridge(**ridge_settings)
ridge_reg.fit(x_train, y_train)

# Score on the held-out split first, then on the training split.
ridge_test_score = ridge_reg.score(x_test, y_test)
print("Regression Scre for L2 Testing Data:", ridge_test_score)
ridge_train_score = ridge_reg.score(x_train, y_train)
print("Regression Scre for L2 Training Data:", ridge_train_score)

Regression Scre for L2 Testing Data: 0.6667728206546331
Regression Scre for L2 Training Data: 0.6619036003704484
