In [22]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import accuracy_score, confusion_matrix
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): the 'ignore' action is applied without a category filter, so
# library warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Read the `melb_data.csv` file (relative path) into the raw frame and
# display it as the cell output.
df = pd.read_csv(filepath_or_buffer='../Data/melb_data.csv')
df

Unnamed: 0.1,Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,...,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,...,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18391,23540,Williamstown,8/2 Thompson St,2,t,622500.0,SP,Greg,26/08/2017,6.8,...,2.0,1.0,,89.0,2010.0,,-37.86393,144.90484,Western Metropolitan,6380.0
18392,23541,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,...,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0
18393,23544,Yallambie,17 Amaroo Wy,4,h,1100000.0,S,Buckingham,26/08/2017,12.7,...,3.0,2.0,,,,,-37.72006,145.10547,Northern Metropolitan,1369.0
18394,23545,Yarraville,6 Agnes St,4,h,1285000.0,SP,Village,26/08/2017,6.3,...,1.0,1.0,362.0,112.0,1920.0,,-37.81188,144.88449,Western Metropolitan,6543.0


In [3]:
# Show the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method',
       'SellerG', 'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom',
       'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea',
       'Lattitude', 'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [4]:
# Columns left out of the working frame used in the rest of the notebook.
col_to_remove = [
    'Unnamed: 0',
    'Address',
    'Date',
    'YearBuilt',
    'Lattitude',
    'Longtitude',
    'Postcode',
]
# `drop` returns a new frame, so the raw `df` keeps all of its columns.
df_use = df.drop(columns=col_to_remove)
df_use.columns

Index(['Suburb', 'Rooms', 'Type', 'Price', 'Method', 'SellerG', 'Distance',
       'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
       'CouncilArea', 'Regionname', 'Propertycount'],
      dtype='object')

In [5]:
# Print dtypes and non-null counts for each remaining column.
df_use.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18396 entries, 0 to 18395
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         18396 non-null  object 
 1   Rooms          18396 non-null  int64  
 2   Type           18396 non-null  object 
 3   Price          18396 non-null  float64
 4   Method         18396 non-null  object 
 5   SellerG        18396 non-null  object 
 6   Distance       18395 non-null  float64
 7   Bedroom2       14927 non-null  float64
 8   Bathroom       14925 non-null  float64
 9   Car            14820 non-null  float64
 10  Landsize       13603 non-null  float64
 11  BuildingArea   7762 non-null   float64
 12  CouncilArea    12233 non-null  object 
 13  Regionname     18395 non-null  object 
 14  Propertycount  18395 non-null  float64
dtypes: float64(8), int64(1), object(6)
memory usage: 2.1+ MB


In [6]:
# Count the missing values in each column before any imputation.
df_use.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price                0
Method               0
SellerG              0
Distance             1
Bedroom2          3469
Bathroom          3471
Car               3576
Landsize          4793
BuildingArea     10634
CouncilArea       6163
Regionname           1
Propertycount        1
dtype: int64

In [9]:
# Columns whose missing entries are replaced with 0.
col_to_zero = ['Bedroom2', 'Car', 'Bathroom', 'Propertycount', 'Distance']
# A per-column fill mapping leaves every other column untouched.
df_use = df_use.fillna(value={name: 0 for name in col_to_zero})


In [10]:
# Re-count the missing values per column after the zero fill.
df_use.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price                0
Method               0
SellerG              0
Distance             0
Bedroom2             0
Bathroom             0
Car                  0
Landsize          4793
BuildingArea     10634
CouncilArea       6163
Regionname           1
Propertycount        0
dtype: int64

In [11]:
# Impute the two area columns with their own column means.
# The values are read from the working frame `df_use` rather than the raw
# `df`, so the step does not depend on both frames holding the same rows.
# NOTE(review): the means are taken over every row, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
col_to_mean = ['Landsize', 'BuildingArea']
df_use[col_to_mean] = df_use[col_to_mean].fillna(df_use[col_to_mean].mean())
# Check which columns still contain missing values.
df_use.isna().sum()

Suburb              0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
CouncilArea      6163
Regionname          1
Propertycount       0
dtype: int64

In [12]:
# Drop every row that still has a missing value in any column.
# `.copy()` makes the result an independent frame, so the column assignments
# done on `df_use` in later cells write to this frame directly instead of to
# a possible view of the pre-drop data (pandas' SettingWithCopy hazard).
df_use = df_use.dropna().copy()
# Confirm that no missing values remain.
df_use.isna().sum()

Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
CouncilArea      0
Regionname       0
Propertycount    0
dtype: int64

In [13]:
# (rows, columns) left after dropping the incomplete rows.
df_use.shape

(12233, 15)

In [14]:
# Display the cleaned frame.
df_use

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
0,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,151.220219,Yarra,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.000000,Yarra,Northern Metropolitan,4019.0
2,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.000000,Yarra,Northern Metropolitan,4019.0
3,Abbotsford,3,h,850000.0,PI,Biggin,2.5,3.0,2.0,1.0,94.0,151.220219,Yarra,Northern Metropolitan,4019.0
4,Abbotsford,4,h,1600000.0,VB,Nelson,2.5,3.0,1.0,2.0,120.0,142.000000,Yarra,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15391,Williamstown,3,h,1285000.0,S,Jas,6.8,3.0,1.0,0.0,296.0,151.220219,Hobsons Bay,Western Metropolitan,6380.0
15392,Windsor,2,u,560000.0,PI,hockingstuart,4.6,2.0,1.0,1.0,0.0,61.600000,Stonnington,Southern Metropolitan,4380.0
15393,Wollert,3,h,525300.0,S,Stockdale,25.5,3.0,2.0,2.0,400.0,151.220219,Whittlesea,Northern Metropolitan,2940.0
15394,Yarraville,2,h,750000.0,SP,hockingstuart,6.3,2.0,1.0,2.0,269.0,151.220219,Maribyrnong,Western Metropolitan,6543.0


In [16]:
# Replace each categorical column with integer codes.
# Every column gets its own LabelEncoder, and the fitted encoders are kept in
# `encoders` (keyed by column name) so the codes can be mapped back to the
# original labels later with `encoders[col].inverse_transform(...)`.
# NOTE(review): the codes are plain integers; the linear models fitted later
# in the notebook will treat them as ordered numeric values - confirm that
# this is acceptable for these nominal columns.
col_to_encode = ['Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname']
encoders = {}
for col in col_to_encode:
    # `le` is rebound on every pass; after the loop it refers to the encoder
    # of the last column, as it did when a single encoder was reused.
    le = LabelEncoder()
    df_use[col] = le.fit_transform(df_use[col])
    encoders[col] = le

df_use

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
0,0,2,0,1480000.0,1,22,2.5,2.0,1.0,1.0,202.0,151.220219,31,2,4019.0
1,0,2,0,1035000.0,1,22,2.5,2.0,1.0,0.0,156.0,79.000000,31,2,4019.0
2,0,3,0,1465000.0,3,22,2.5,3.0,2.0,0.0,134.0,150.000000,31,2,4019.0
3,0,3,0,850000.0,0,22,2.5,3.0,2.0,1.0,94.0,151.220219,31,2,4019.0
4,0,4,0,1600000.0,4,147,2.5,3.0,1.0,2.0,120.0,142.000000,31,2,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15391,300,3,0,1285000.0,1,101,6.8,3.0,1.0,0.0,296.0,151.220219,10,6,6380.0
15392,302,2,2,560000.0,0,247,4.6,2.0,1.0,1.0,0.0,61.600000,26,5,4380.0
15393,303,3,0,525300.0,1,208,25.5,3.0,2.0,2.0,400.0,151.220219,29,2,2940.0
15394,308,2,0,750000.0,3,247,6.3,2.0,1.0,2.0,269.0,151.220219,16,6,6543.0


In [19]:
# Target: the sale price column.
y = df_use['Price']
# Features: every remaining column of the working frame.
X = df_use.drop(columns=['Price'])

In [73]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

In [74]:
# Ordinary least-squares baseline: fit on the training split, predict the
# held-out split, then show the (test, train) scores side by side.
model_LR = LinearRegression().fit(X_train, y_train)
y_pred_LR = model_LR.predict(X_test)
(
    model_LR.score(X_test, y_test),
    model_LR.score(X_train, y_train),
)

(0.4664611397679215, 0.48662677096817675)

In [75]:
# Lasso regression; alpha, max_iter and tol are the knobs to experiment with.
# NOTE(review): no feature scaling is applied in the visible cells, so the
# penalty acts on coefficients of very differently scaled columns - confirm.
model_Lasso = Lasso(
    alpha=50,
    max_iter=100,
    tol=0.1,
).fit(X_train, y_train)
y_pred_lasso = model_Lasso.predict(X_test)
# Displayed as (test score, train score).
(
    model_Lasso.score(X_test, y_test),
    model_Lasso.score(X_train, y_train),
)

(0.46648643619907715, 0.4866267111284589)

In [76]:
# Ridge regression; alpha, max_iter and tol are the knobs to experiment with.
# NOTE(review): no feature scaling is applied in the visible cells, so the
# penalty acts on coefficients of very differently scaled columns - confirm.
model_Ridge = Ridge(
    alpha=50,
    max_iter=100,
    tol=0.1,
).fit(X_train, y_train)
y_pred_Ridge = model_Ridge.predict(X_test)
# Displayed as (test score, train score).
(
    model_Ridge.score(X_test, y_test),
    model_Ridge.score(X_train, y_train),
)

(0.46651351354071613, 0.48660826345659103)