In [1]:
import pandas as pd


In [5]:
df = pd.read_csv('all.csv', parse_dates=[6])

In [199]:
df = df[df.date.dt.year==2022]
df = df.drop(columns='date')

In [200]:
suburb_median = df.groupby('suburb')['price'].agg('median').to_frame().rename(columns={'price':'medianSuburbPrice'})
df = df.merge(suburb_median, on='suburb')

In [201]:
df = df.dropna()
df.loc[df.landUnit == 'ha', 'landSize'] = df[df.landUnit == 'ha'].landSize*100
df = df.drop(columns=['id', 'listingType', 'url', 'tagClassName', 'tagText',
             'street', 'suburb', 'state', 'propertyTypeFormatted', 'landUnit'])
df.astype({'beds': 'int64', 'baths': 'int64', 'parking': 'int64'})
df = pd.get_dummies(df)

In [202]:
n_rows = 10_000
X_train = pd.get_dummies(df.sample(n_rows)).drop(columns=['price'])
y_train = pd.get_dummies(df.sample(n_rows))['price']

In [203]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import sklearn.ensemble as E


def test_model(model):
    errors = []
    for train_idx, test_idx in KFold().split(X_train):
        model.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
        errors.append(mean_absolute_error(y_train.iloc[test_idx], model.predict(X_train.iloc[test_idx])))
    error = sum(errors)/len(errors)
    print(f"{error:,.0f} - {model}")


In [204]:
models = [E.RandomForestRegressor(),
          E.ExtraTreesRegressor(),
          E.BaggingRegressor(),
          E.AdaBoostRegressor(),
          E.GradientBoostingRegressor(),
          E.HistGradientBoostingRegressor()]

for model in models:
    test_model(model)


529,993 - RandomForestRegressor()
568,780 - ExtraTreesRegressor()
553,156 - BaggingRegressor()
1,047,370 - AdaBoostRegressor()
483,639 - GradientBoostingRegressor()
494,587 - HistGradientBoostingRegressor()
