In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Read the CSV
df = pd.read_csv("../cleaned_data/cleaned_house_crime_school.csv")
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,CrimeRate,NearbySchools
0,Abbotsford,85 Turner St,2,h,1480000.0,2016-12-03,2.5,3067,1,1,202,0,0,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019,157,2
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,2016-02-04,2.5,3067,1,0,156,79,1900,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019,157,2
2,Abbotsford,5 Charles St,3,h,1465000.0,2017-03-04,2.5,3067,2,0,134,150,1900,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019,157,2
3,Abbotsford,40 Federation La,3,h,850000.0,2017-03-04,2.5,3067,2,1,94,0,0,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019,157,2
4,Abbotsford,55a Park St,4,h,1600000.0,2016-06-04,2.5,3067,1,2,120,142,2014,Yarra City Council,-37.8072,144.9941,Northern Metropolitan,4019,157,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16035,Yarraville,78 Bayview Rd,3,h,1101000.0,2018-02-24,6.3,3013,1,0,288,0,0,Maribyrnong City Council,-37.8110,144.8852,Western Metropolitan,6543,136,5
16036,Yarraville,13 Burns St,4,h,1480000.0,2018-02-24,6.3,3013,1,3,593,0,0,Maribyrnong City Council,-37.8105,144.8847,Western Metropolitan,6543,136,5
16037,Yarraville,29A Murray St,2,h,888000.0,2018-02-24,6.3,3013,2,1,98,104,2018,Maribyrnong City Council,-37.8155,144.8883,Western Metropolitan,6543,136,5
16038,Yarraville,147A Severn St,2,t,705000.0,2018-02-24,6.3,3013,1,2,220,120,2000,Maribyrnong City Council,-37.8229,144.8786,Western Metropolitan,6543,136,5


# Select features (columns)

In [3]:
# Set features to be used as X values.
X = df.drop(["Price", "Suburb", "Date", "Address", "Type", "Postcode", "CouncilArea", "Lattitude", "Longtitude", "Regionname", "Distance", "CrimeRate"], axis = "columns")
y = df["Price"]
print(X.shape, y.shape)

(16040, 8) (16040,)


In [4]:
# GET housing stats
total_houses = len(df)
max_value = df["Price"].describe()["max"]
min_value = df["Price"].describe()["min"]
print(f"Total houses: {total_houses}")
print(f"Highest price: {max_value}")
print(f"Lowest price: {min_value}")

Total houses: 16040
Highest price: 11200000.0
Lowest price: 131000.0


# Split the data into test and train data using `train_test_split` with test size of 33%

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.33)
X_train

Unnamed: 0,Rooms,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,NearbySchools
6367,3,2,2,226,140,2002,3593,7
14911,5,2,3,635,0,0,3619,12
1597,3,1,2,699,0,0,5051,4
10351,3,2,1,1007,250,1980,2985,13
6765,3,2,2,257,120,0,21650,14
...,...,...,...,...,...,...,...,...
13418,3,2,4,560,154,1990,7630,8
5390,3,1,1,202,0,0,6543,5
860,2,1,2,750,0,0,10969,7
15795,5,2,2,710,0,0,2671,3


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [6]:
# Scale X values
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
y_train = y_train.array.reshape(-1, 1)


In [8]:
# Label Encode the target feature from human words to machine id's
label_encoder = OrdinalEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

ValueError: X does not contain any features, but OrdinalEncoder is expecting 1 features

In [None]:
print(X_train_scaled.shape, y_train_categorical.shape)

# Tune Model Parameters

In [None]:
# Create the model and the layers
model = Sequential()
model.add(Dense(units = 50, activation = "relu", input_dim = X_train_scaled.shape[1]))
model.add(Dense(units = 100, activation = "relu"))
model.add(Dense(units = 150, activation = "relu"))
model.add(Dense(units = 200, activation = "relu"))
model.add(Dense(units = y_train_categorical.shape[1], activation = "softmax"))

In [None]:
# Compile the model
model.compile(optimizer = "adam",
              loss = "categorical_crossentropy",
              metrics = ["accuracy"])
model.summary()

# Train Tuned Model

In [None]:
model.fit(X_train_scaled,
          y_train_categorical,
          epochs = 500,
          shuffle = True,
          verbose = 2)

# Quantify our Trained Model

In [None]:
model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose = 2)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

# Save the Model

In [None]:
filename = "../models/deep_learning_model.h5"
model.save(filename)