In [16]:
# Loading dataset
import pandas as pd
import numpy as np

# Columns kept for the analysis; every other column in the CSV is discarded.
cols_to_use = [
    'Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname',
    'Propertycount', 'Distance', 'CouncilArea', 'Bedroom2', 'Bathroom',
    'Car', 'Landsize', 'BuildingArea', 'Price',
]

# Read the raw file and immediately narrow it to the selected columns,
# in the order listed above.
df = pd.read_csv("Melbourne_housing.csv").loc[:, cols_to_use]
df.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price
0,Abbotsford,2,h,SS,Jellis,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,126.0,,
1,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,1.0,202.0,,1480000.0
2,Abbotsford,2,h,S,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,2.0,1.0,0.0,156.0,79.0,1035000.0
3,Abbotsford,3,u,VB,Rounds,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,1.0,0.0,,
4,Abbotsford,3,h,SP,Biggin,Northern Metropolitan,4019.0,2.5,Yarra City Council,3.0,2.0,0.0,134.0,150.0,1465000.0


In [17]:
# Percentage of missing entries per column (mean of the NaN mask, scaled to 0-100)
df.isna().mean().mul(100)

Unnamed: 0,0
Suburb,0.0
Rooms,0.0
Type,0.0
Method,0.0
SellerG,0.0
Regionname,0.008607
Propertycount,0.008607
Distance,0.002869
CouncilArea,0.008607
Bedroom2,23.573457


In [18]:
# Replace missing values with 0, but only in the columns listed in cols_zero;
# all other columns keep their NaNs for the later steps.
cols_zero = ['Propertycount', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
df = df.fillna({col: 0 for col in cols_zero})

In [19]:
# Impute missing Landsize / BuildingArea values with each column's own mean.
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split performed later in the notebook, so held-out rows
# contribute to the imputed values.
df = df.fillna({
    'Landsize': df['Landsize'].mean(),
    'BuildingArea': df['BuildingArea'].mean(),
})

In [20]:
# Drop every row that still contains a missing value in any column
# (this includes rows whose Price, the column modelled below, is NaN).
# Rebind the result instead of using inplace=True: the cell then produces a
# new frame from its input rather than mutating an object in place.
df = df.dropna()

In [21]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column so the indicators are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)
df.head()

Unnamed: 0,Rooms,Propertycount,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Price,Suburb_Aberfeldie,...,CouncilArea_Moorabool Shire Council,CouncilArea_Moreland City Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
1,2,4019.0,2.5,2.0,1.0,1.0,202.0,160.2564,1480000.0,False,...,False,False,False,False,False,False,False,False,True,False
2,2,4019.0,2.5,2.0,1.0,0.0,156.0,79.0,1035000.0,False,...,False,False,False,False,False,False,False,False,True,False
4,3,4019.0,2.5,3.0,2.0,0.0,134.0,150.0,1465000.0,False,...,False,False,False,False,False,False,False,False,True,False
5,3,4019.0,2.5,3.0,2.0,1.0,94.0,160.2564,850000.0,False,...,False,False,False,False,False,False,False,False,True,False
6,4,4019.0,2.5,3.0,1.0,2.0,120.0,142.0,1600000.0,False,...,False,False,False,False,False,False,False,False,True,False


In [22]:
# Train / test split
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is Price.
X = df.drop('Price', axis=1)
Y = df['Price']

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so
# that a Restart & Run All yields the same split (and therefore the same
# scores in the cells below) on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [23]:
# Baseline: ordinary least-squares linear regression.
# .score() on a regressor returns the coefficient of determination (R^2),
# reported here for the held-out rows and for the training rows.
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, Y_train)

print("Test score :-", reg.score(X_test, Y_test))
print("Train score :-", reg.score(X_train, Y_train))

Test score :- 0.6639095111574729
Train score :- 0.6825630659136583


In [24]:
# L1-regularised linear model (Lasso); prints R^2 on the test and training rows.
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=50, max_iter=1000, tol=0.1).fit(X_train, Y_train)

print("Test score :-", lasso_reg.score(X_test, Y_test))
print("Train score :-", lasso_reg.score(X_train, Y_train))

Test score :- 0.664649111037851
Train score :- 0.678434463298712


In [25]:
# L2-regularised linear model (Ridge); prints R^2 on the test and training rows.
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=50, max_iter=1000, tol=0.1).fit(X_train, Y_train)

print("Test score :-", ridge_reg.score(X_test, Y_test))
print("Train score :-", ridge_reg.score(X_train, Y_train))

Test score :- 0.6568050882966512
Train score :- 0.6672532059808791
