In [5]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Lift pandas' column-display cap before rendering, so every column of the
# frame is shown instead of being elided.
pd.set_option('display.max_columns', None)

# Read the CSV into the notebook-wide frame and render it as the cell output.
df = pd.read_csv('melb_data.csv')
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,4.0,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,3.0,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,3.0,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0


In [3]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [7]:
# Print the frame summary: index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [6]:
# Baseline: linear regression of Price on six numeric columns.
cols = ['Rooms', 'Bathroom', 'Bedroom2', 'Landsize', 'Lattitude', 'Longtitude']
X, y = df[cols], df['Price']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the min/max ranges from the training rows only, then apply the same
# scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Report error and explained variance on the held-out rows.
print(f'MSE: {mean_squared_error(y_test, y_pred):.2f}')
print(f'R^2: {r2_score(y_test, y_pred):.2f}')

MSE: 255507462236.66
R^2: 0.36


In [12]:
# Decision-tree regression on the same six columns, comparing tree depths.
# DecisionTreeRegressor is imported in the notebook's top imports cell.
cols = ['Rooms', 'Bathroom', 'Bedroom2', 'Landsize', 'Lattitude', 'Longtitude']
X = df[cols]
y = df['Price']

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE(review): each depth is scored on the held-out test split, so a depth
# chosen from these numbers will look better than it would on unseen data.
# Consider a separate validation split or cross-validation for selection.
for depth in (2, 5, 10, 20, None):  # None lets the tree grow without a depth cap
    dtr = DecisionTreeRegressor(max_depth=depth, random_state=1)
    dtr.fit(X_train, y_train)
    y_pred = dtr.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f'max_depth={depth} -> MSE: {mse:.2f}, R^2: {r2:.2f}')

max_depth=2 -> MSE: 272236488880.31, R^2: 0.31
max_depth=5 -> MSE: 183548040909.42, R^2: 0.54
max_depth=10 -> MSE: 141746500047.70, R^2: 0.64
max_depth=20 -> MSE: 162149317796.13, R^2: 0.59
max_depth=None -> MSE: 179939339229.32, R^2: 0.55


In [16]:
# Compare min_samples_split values with the depth held at 10.
# NOTE(review): scores come from the test split, so using them to pick a
# setting gives an optimistic estimate.
for min_split in (2, 10, 20, 50):
    dtr = DecisionTreeRegressor(
        max_depth=10,
        min_samples_split=min_split,
        random_state=1,
    ).fit(X_train, y_train)
    y_pred = dtr.predict(X_test)

    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    print(f'min_samples_split={min_split} -> MSE: {mse:.2f}, R^2: {r2:.2f}')

min_samples_split=2 -> MSE: 141746500047.70, R^2: 0.64
min_samples_split=10 -> MSE: 141851764546.55, R^2: 0.64
min_samples_split=20 -> MSE: 142426516620.52, R^2: 0.64
min_samples_split=50 -> MSE: 139608345611.20, R^2: 0.65


In [22]:
# Compare min_samples_leaf values with max_depth=10 and min_samples_split=50.
# NOTE(review): scores come from the test split, so using them to pick a
# setting gives an optimistic estimate.
for min_leaf in (1, 5, 10, 20, 30):
    dtr = DecisionTreeRegressor(
        max_depth=10,
        min_samples_split=50,
        min_samples_leaf=min_leaf,
        random_state=1,
    ).fit(X_train, y_train)
    y_pred = dtr.predict(X_test)

    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    print(f'min_samples_leaf={min_leaf} -> MSE: {mse:.2f}, R^2: {r2:.2f}')
    

min_samples_leaf=1 -> MSE: 139608345611.20, R^2: 0.65
min_samples_leaf=5 -> MSE: 123326580557.58, R^2: 0.69
min_samples_leaf=10 -> MSE: 125657842726.87, R^2: 0.68
min_samples_leaf=20 -> MSE: 123708080599.36, R^2: 0.69
min_samples_leaf=30 -> MSE: 123792532732.06, R^2: 0.69


In [24]:
# Fit one tree with fixed settings (depth 10, split 50, leaf 5) and score it
# on the held-out rows.
dtr = DecisionTreeRegressor(
    max_depth=10,
    min_samples_split=50,
    min_samples_leaf=5,
    random_state=1,
).fit(X_train, y_train)
y_pred = dtr.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'MSE: {mse:.2f}, R^2: {r2:.2f}')

MSE: 123326580557.58, R^2: 0.69
