# Imports

In [19]:
import pandas as pd
import numpy as np
import os
import datetime
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow import keras

# Set up

In [2]:
# Single seed shared by every stochastic step in the notebook
# (NumPy's global RNG, TensorFlow's global RNG, and the scikit-learn
# estimators/splits that receive it through `random_state`).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Data

In [3]:
# Load the raw listings table; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="melb_data.csv")

In [4]:
# Preview the first rows of the freshly loaded frame (rich display as the
# cell's last expression).
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [6]:
# Build a row key from the coordinates (longitude * 1000 + latitude), use it
# as the index, then discard every row that has a missing value in any column.
# NOTE(review): listings that share coordinates get the same key, so the
# index is not guaranteed to be unique — confirm nothing relies on that.
coordinate_key = df["Longtitude"] * 1000 + df["Lattitude"]
df = df.assign(id=coordinate_key).set_index("id").dropna()

In [7]:
# Summarise column dtypes and non-null counts after the rows with missing
# values were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 6196 entries, 144955.59209999998 to 144856.07961999997
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         6196 non-null   object 
 1   Address        6196 non-null   object 
 2   Rooms          6196 non-null   int64  
 3   Type           6196 non-null   object 
 4   Price          6196 non-null   float64
 5   Method         6196 non-null   object 
 6   SellerG        6196 non-null   object 
 7   Date           6196 non-null   object 
 8   Distance       6196 non-null   float64
 9   Postcode       6196 non-null   float64
 10  Bedroom2       6196 non-null   float64
 11  Bathroom       6196 non-null   float64
 12  Car            6196 non-null   float64
 13  Landsize       6196 non-null   float64
 14  BuildingArea   6196 non-null   float64
 15  YearBuilt      6196 non-null   float64
 16  CouncilArea    6196 non-null   object 
 17  Lattitude      6196

# Pre-processing

In [8]:
# Turn the day-first "Date" strings (e.g. "3/12/2016") into three numeric
# features and drop the original column.
# The parse is done once, vectorised, with an explicit format so that
# day/month are never swapped; the `.dt` accessor replaces the per-row
# Python lambdas. (A stray `df.head()` whose result was discarded because
# it was not the cell's last expression has been removed.)
sale_date = pd.to_datetime(df["Date"], format="%d/%m/%Y")
df["SellMonth"] = sale_date.dt.month
df["SellDay"] = sale_date.dt.day
df["SellYear"] = sale_date.dt.year
df = df.drop("Date", axis=1)

In [9]:
# Hold out 20% of the rows for testing; the seed makes the split repeatable.
train_set, test_set = train_test_split(df, random_state=RANDOM_SEED, test_size=0.2)

# Separate the regression target from the training features.
# `test_set` still contains its "Price" column at this point.
train_labels = train_set.loc[:, "Price"]
train_set = train_set.drop(columns=["Price"])

In [10]:
# Columns treated as categorical; every other training column is treated as
# numeric.
# NOTE(review): "Address" looks close to unique per listing, so one-hot
# encoding it adds roughly one column per row — confirm it should be a
# feature at all.
cat_attribs = ['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname',]
num_attribs = [col for col in train_set.columns.tolist() if col not in cat_attribs]

# Standardise numeric columns and one-hot encode categorical ones.
# handle_unknown="ignore" makes the fitted encoder emit an all-zero group for
# categories it did not see during fit instead of raising, so the same
# pipeline can later `transform` the held-out test set or new listings.
# The output for the training data itself is unchanged.
full_pipeline = ColumnTransformer([
        ('std_scaler', StandardScaler(), num_attribs),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    ])
prepped = full_pipeline.fit_transform(train_set)

# Linear Regression

In [11]:
# Ordinary least-squares baseline. `fit` returns the estimator itself, so
# construction and fitting are chained.
lin_reg = LinearRegression().fit(prepped, train_labels)

# Predictions for the same rows the model was fitted on — these describe
# training fit only, not generalisation.
lin_reg_predicitons = lin_reg.predict(prepped)

# Decision Tree

In [12]:
# Single decision tree, seeded so that tie-breaking between equally good
# splits is repeatable.
tree_reg = DecisionTreeRegressor(random_state=RANDOM_SEED).fit(prepped, train_labels)

# Predictions for the same rows the tree was fitted on (training fit only).
decision_tree_predicitons = tree_reg.predict(prepped)

# Random Forest

In [13]:
# Random forest of 100 trees, seeded for repeatable bootstrapping and
# feature sampling.
forest_reg = RandomForestRegressor(
    random_state=RANDOM_SEED,
    n_estimators=100,
).fit(prepped, train_labels)

# Predictions for the same rows the forest was fitted on (training fit only).
random_forest_tree_predicitons = forest_reg.predict(prepped)

# Neural Network - Needs some work

In [24]:
# Dropping non-numerical columns for ease of use.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# categorical columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=cat_attribs, errors="ignore")

# Feature matrix and target vector as float arrays.
target = df["Price"].to_numpy(dtype="float64")
data = df.drop(columns="Price").to_numpy(dtype="float64")

# Two successive splits: full-train/test, then train/validation.
X_train_full, X_test, y_train_full, y_test = train_test_split(data, target, random_state=RANDOM_SEED)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=RANDOM_SEED)

# Standardise the features with statistics learned from the training split
# only (no leakage from validation/test). Plain SGD on MSE is numerically
# unstable when inputs live on very different scales, so every X_* array
# the model will ever see is passed through the same fitted scaler.
nn_scaler = StandardScaler()
X_train = nn_scaler.fit_transform(X_train)
X_valid = nn_scaler.transform(X_valid)
X_test = nn_scaler.transform(X_test)
X_train_full = nn_scaler.transform(X_train_full)

# The targets are raw prices; squared errors on that scale produce enormous
# gradients. Train on standardised prices instead. The y_* arrays themselves
# stay in original price units; convert predictions back with
#     model.predict(X) * price_std + price_mean
price_mean = y_train.mean()
price_std = y_train.std()
y_train_scaled = (y_train - price_mean) / price_std
y_valid_scaled = (y_valid - price_mean) / price_std

# One hidden ReLU layer of 30 units followed by a single linear output unit.
model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu",
    input_shape=X_train.shape[1:]),
    keras.layers.Dense(1)
])
model.compile(loss="mean_squared_error", optimizer="sgd")
history = model.fit(X_train, y_train_scaled, epochs=20, validation_data=(X_valid, y_valid_scaled))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
