In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./cleaned_data.csv')

In [3]:
df.head()

Unnamed: 0,date,price,address,bedrooms,bathrooms,parking,area,median_price,population,average_age,zipcode,state
0,29-04-24,480000,"1307/60 A'beckett Street, Melbourne",2,1,0,53.0,550000,47279,20 to 39,3000,Victoria
1,27-03-24,815000,"1103/108 Flinders Street, Melbourne",2,2,1,93.0,550000,47279,20 to 39,3000,Victoria
2,29-02-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,1072000000,47279,20 to 39,3000,Victoria
3,29-02-24,3350000,"3801/35 Spring Street, Melbourne",3,2,2,175.0,1072000000,47279,20 to 39,3000,Victoria
4,8 12 23,590000,"508/181 Exhibition Street, Melbourne",2,1,1,80.0,550000,47279,20 to 39,3000,Victoria


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55006 entries, 0 to 55005
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          55006 non-null  object 
 1   price         55006 non-null  int64  
 2   address       55006 non-null  object 
 3   bedrooms      55006 non-null  int64  
 4   bathrooms     55006 non-null  int64  
 5   parking       55006 non-null  int64  
 6   area          55006 non-null  float64
 7   median_price  55006 non-null  int64  
 8   population    55006 non-null  int64  
 9   average_age   55006 non-null  object 
 10  zipcode       55006 non-null  int64  
 11  state         55006 non-null  object 
dtypes: float64(1), int64(7), object(4)
memory usage: 5.0+ MB


In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

df['state'] = df['state'].str.strip()
df_encoded = df.copy()


le = LabelEncoder()
df_encoded['date'] = le.fit_transform(df_encoded['date'])
df_encoded['address'] = le.fit_transform(df_encoded['address'])
df_encoded['average_age'] = le.fit_transform(df_encoded['average_age'])

In [6]:
x = df_encoded.drop('state', axis=1)
y = df_encoded['state']

le_state = LabelEncoder()
y_encoded = le_state.fit_transform(y)

smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y_encoded)

y_resampled_decoded = le_state.inverse_transform(y_resampled)

df = x_resampled
df['state'] = y_resampled_decoded

In [7]:
df['state'].value_counts()

Victoria              17092
New South Wales       17092
Queensland            17092
Western Australia     17092
Southern Australia    17092
Northern Territory    17092
Tasmania              17092
Name: state, dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119644 entries, 0 to 119643
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   date          119644 non-null  int64  
 1   price         119644 non-null  int64  
 2   address       119644 non-null  int64  
 3   bedrooms      119644 non-null  int64  
 4   bathrooms     119644 non-null  int64  
 5   parking       119644 non-null  int64  
 6   area          119644 non-null  float64
 7   median_price  119644 non-null  int64  
 8   population    119644 non-null  int64  
 9   average_age   119644 non-null  int64  
 10  zipcode       119644 non-null  int64  
 11  state         119644 non-null  object 
dtypes: float64(1), int64(10), object(1)
memory usage: 11.0+ MB


In [9]:
from sklearn.preprocessing import StandardScaler

df['state'] = le_state.fit_transform(df['state'])

X = df[['bedrooms', 'bathrooms', 'parking', 'area', 'state']]
y = df['price']
scaler = StandardScaler()
x = scaler.fit_transform(X)

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2345)

In [11]:
from catboost import CatBoostRegressor

model = CatBoostRegressor()
model.fit(x_train, y_train)

Learning rate set to 0.083319
0:	learn: 594479.9969308	total: 55.3ms	remaining: 55.2s
1:	learn: 582193.8850694	total: 62.3ms	remaining: 31.1s
2:	learn: 571235.4676010	total: 71.2ms	remaining: 23.7s
3:	learn: 561776.7797657	total: 78.6ms	remaining: 19.6s
4:	learn: 552595.5349795	total: 84.4ms	remaining: 16.8s
5:	learn: 544987.2376142	total: 87.9ms	remaining: 14.6s
6:	learn: 538716.6042416	total: 92.1ms	remaining: 13.1s
7:	learn: 532365.7850066	total: 95.6ms	remaining: 11.9s
8:	learn: 527526.3958457	total: 99ms	remaining: 10.9s
9:	learn: 522588.2604696	total: 102ms	remaining: 10.1s
10:	learn: 518455.9901990	total: 105ms	remaining: 9.45s
11:	learn: 514961.5035568	total: 108ms	remaining: 8.86s
12:	learn: 511862.1610203	total: 110ms	remaining: 8.39s
13:	learn: 509064.5188305	total: 113ms	remaining: 7.93s
14:	learn: 506072.5854215	total: 115ms	remaining: 7.53s
15:	learn: 503456.6754684	total: 117ms	remaining: 7.2s
16:	learn: 501047.5020555	total: 119ms	remaining: 6.9s
17:	learn: 498845.91862

<catboost.core.CatBoostRegressor at 0x7cf59509a530>

In [12]:
from sklearn.metrics import r2_score

r2_score(y_test, model.predict(x_test))

0.49435372928405474

In [13]:
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

model2 = XGBRegressor()
model2.fit(x_train, y_train)
r2_score(y_test, model2.predict(x_test))

0.4915731370975176

In [14]:
model3 = DecisionTreeRegressor()
model3.fit(x_train, y_train)
r2_score(y_test, model3.predict(x_test))

0.2280028125061906

In [15]:
model4 = RandomForestRegressor()
model4.fit(x_train, y_train)
r2_score(y_test, model4.predict(x_test))

0.427992766971492

In [16]:
model5 = SVR(kernel="linear")
model5.fit(x_train, y_train)
r2_score(y_test, model5.predict(x_test))

-0.006787512873712043

In [17]:
model6 = KNeighborsRegressor()
model6.fit(x_train, y_train)
r2_score(y_test, model6.predict(x_test))

0.434123628640639