# Melbourne house price prediction
A simple project demonstrating some of the things I learned about machine learning.

In [29]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
# Load the dataset; the relative path means the CSV must be in the notebook's working directory.
data = pd.read_csv('melb_data.csv')
data

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,...,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,...,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,...,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,...,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0


In [31]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [32]:
# Column dtypes: separates the numeric columns from the string-valued (object) ones.
data.dtypes

Suburb            object
Address           object
Rooms              int64
Type              object
Price            float64
Method            object
SellerG           object
Date              object
Distance         float64
Postcode         float64
Bedroom2         float64
Bathroom         float64
Car              float64
Landsize         float64
BuildingArea     float64
YearBuilt        float64
CouncilArea       object
Lattitude        float64
Longtitude       float64
Regionname        object
Propertycount    float64
dtype: object

# Data preparation

Let's handle missing values and drop features we do not need.

In [33]:
# Number of NaNs in each column
data.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

As we can see, NaNs make up a large portion of the BuildingArea and YearBuilt values, so let's remove these features. Some other features are clearly redundant and/or unlikely to be useful: address, coordinates, council area, date, postcode, and property count. We will remove them too. The few missing Car values are filled with 0.

In [34]:
# Columns removed before modelling: BuildingArea and YearBuilt have many missing values;
# the others are considered redundant or unlikely to help the model.
useless_features = ['BuildingArea', 'YearBuilt','CouncilArea','Lattitude','Longtitude', 'Address', 'Date', 'Propertycount', 'Postcode']
# Fill the missing 'Car' values with 0 via a per-column dict, which returns a new frame.
# This replaces the chained `processed_data['Car'].fillna(0, inplace=True)`: pandas warns
# about that pattern, and under Copy-on-Write it no longer updates the parent frame.
processed_data = data.drop(columns=useless_features).fillna({'Car': 0})
processed_data

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,Regionname
0,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,Northern Metropolitan
1,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,Northern Metropolitan
2,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,Northern Metropolitan
3,Abbotsford,3,h,850000.0,PI,Biggin,2.5,3.0,2.0,1.0,94.0,Northern Metropolitan
4,Abbotsford,4,h,1600000.0,VB,Nelson,2.5,3.0,1.0,2.0,120.0,Northern Metropolitan
...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,4,h,1245000.0,S,Barry,16.7,4.0,2.0,2.0,652.0,South-Eastern Metropolitan
13576,Williamstown,3,h,1031000.0,SP,Williams,6.8,3.0,2.0,2.0,333.0,Western Metropolitan
13577,Williamstown,3,h,1170000.0,S,Raine,6.8,3.0,2.0,4.0,436.0,Western Metropolitan
13578,Williamstown,4,h,2500000.0,PI,Sweeney,6.8,4.0,1.0,5.0,866.0,Western Metropolitan


In [35]:
# Re-count NaNs per column to confirm that none remain after the clean-up.
processed_data.isna().sum()

Suburb        0
Rooms         0
Type          0
Price         0
Method        0
SellerG       0
Distance      0
Bedroom2      0
Bathroom      0
Car           0
Landsize      0
Regionname    0
dtype: int64

Note that features are either of numeric type or object type, but object features are actually strings, and they can be converted to category type.

In [36]:
# String-valued columns that are re-typed as pandas categoricals.
categorial_features = ['Suburb', 'Type', 'Method', 'SellerG', 'Regionname']

# Target: the sale price.
y = processed_data['Price']

# Features: every remaining column, with the string columns cast to 'category'.
X = (
    processed_data
    .drop(columns='Price')
    .astype({column: 'category' for column in categorial_features})
)
X.dtypes

Suburb        category
Rooms            int64
Type          category
Method        category
SellerG       category
Distance       float64
Bedroom2       float64
Bathroom       float64
Car            float64
Landsize       float64
Regionname    category
dtype: object

# Training and comparing different models

First, we need to choose our metric - let's choose MAPE, as relative error would be more informative.

It is generally useful to normalize numeric features and to one-hot encode categorical ones, so let's do this.

In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Column groups used by transform_data: categorical columns are one-hot encoded,
# numeric columns are standardized.
categorial_features = [
    'Suburb',
    'Type',
    'Method',
    'SellerG',
    'Regionname',
]
numeric_features = [
    'Rooms',
    'Distance',
    'Bedroom2',
    'Bathroom',
    'Car',
    'Landsize',
]

def transform_data(X):
  """Turn a feature frame into a dense numeric matrix.

  The columns listed in `numeric_features` are standardized with a freshly
  fitted StandardScaler; the columns listed in `categorial_features` are
  one-hot encoded with a freshly fitted OneHotEncoder (first level of each
  feature dropped). The two parts are stacked side by side, numeric first.

  NOTE(review): both transformers are fitted on whatever frame is passed in.
  Calling this on the full dataset before the train/test split lets test-set
  statistics and categories influence the training features.

  Parameters:
    X: DataFrame containing every column named in the two lists above.

  Returns:
    2-D numpy array with one row per input row.
  """
  scaled_numeric = StandardScaler().fit_transform(X[numeric_features])
  encoded_categorical = OneHotEncoder(drop='first').fit_transform(X[categorial_features])
  return np.hstack([scaled_numeric, encoded_categorical.toarray()])

In [38]:
# Encode the full feature frame; the shape shows how many columns the one-hot encoding yields.
X_transformed = transform_data(X)
X_transformed.shape

(13580, 599)

Now we are going to try several regression algorithms: ridge regression, random forest, and gradient boosting. Hyperparameters will be tuned with cross-validation on the training part, so we first split the data into a training set and a held-out test set.

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=42,
)

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error

def test_model(X_train, y_train, X_test, y_test, regressor, params):
  """Tune `regressor` by grid search and report its test-set MAPE.

  A GridSearchCV over `params` is fitted on the training data, using negated
  MAPE as the selection score. The fitted search then predicts the test set.
  Two lines are printed: the test MAPE, followed by the best parameters found.

  Parameters:
    X_train, y_train: data used for the cross-validated search.
    X_test, y_test: held-out data used only for the reported score.
    regressor: estimator to tune.
    params: parameter grid handed to GridSearchCV.

  Returns:
    None; the results are printed.
  """
  search = GridSearchCV(regressor, params, scoring='neg_mean_absolute_percentage_error')
  search.fit(X_train, y_train)
  test_mape = mean_absolute_percentage_error(y_test, search.predict(X_test))
  print(test_mape, search.best_params_, sep='\n')

First, let's try out ridge regression:

In [41]:
from sklearn.linear_model import Ridge

# Regularization strengths to search, one decade apart.
ridge_parameters = {'alpha': [0.01, 0.1, 1, 10, 100]}

ridge = Ridge()
test_model(
    X_train, y_train, X_test, y_test,
    regressor=ridge,
    params=ridge_parameters,
)

0.2435130493606874
{'alpha': 10}


Now, let's try a random forest:

In [17]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Grid over the number of trees and the maximum tree depth.
forest_parameters = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 7, 10]
}

# Random forests are stochastic (bootstrap samples, feature sub-sampling). Without a seed
# the reported score and the selected parameters change from run to run, so the seed is
# fixed to the same value used for the train/test split.
forest = RandomForestRegressor(random_state=42)
test_model(X_train, y_train, X_test, y_test, forest, forest_parameters)

0.1881272516778739
{'max_depth': 10, 'n_estimators': 200}


A decent improvement over the previous algorithm, but it also took much more time.

Finally, let's try gradient boosting.

In [42]:
!pip install CatBoost
from catboost import CatBoostRegressor




First, let's see its performance right out of the box:

In [43]:
# Baseline: CatBoost with its default hyperparameters, trained on the same training split as above.
model = CatBoostRegressor()
model.fit(X_train, y_train, verbose=False)  # verbose=False suppresses the training log

<catboost.core.CatBoostRegressor at 0x794f32331db0>

In [44]:
# Score the untuned model on the held-out test split with the same metric (MAPE) used above.
y_pred = model.predict(X_test)
mean_absolute_percentage_error(y_test, y_pred)

0.16471391298678653

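Without any tuning, gradient boosting achieved the lowest error of the three algorithms (MAPE ≈ 0.165, versus ≈ 0.188 for random forest and ≈ 0.244 for ridge regression; lower is better), and it trained quickly. On this dataset, gradient boosting is the strongest of the techniques we tried, which is consistent with its reputation as a very effective method for structured (tabular) data.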