In [1]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import tree
from sklearn.preprocessing import MinMaxScaler
import joblib

In [2]:
# Read the CSV
df = pd.read_csv("../cleaned_data/cleaned_house_crime_school.csv")
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,CrimeRate,NearbySchools
0,Abbotsford,85 Turner St,2,h,1480000.0,2016-12-03,2.5,3067,1,1,202,0,0,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019,157,2
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,2016-02-04,2.5,3067,1,0,156,79,1900,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019,157,2
2,Abbotsford,5 Charles St,3,h,1465000.0,2017-03-04,2.5,3067,2,0,134,150,1900,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019,157,2
3,Abbotsford,40 Federation La,3,h,850000.0,2017-03-04,2.5,3067,2,1,94,0,0,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019,157,2
4,Abbotsford,55a Park St,4,h,1600000.0,2016-06-04,2.5,3067,1,2,120,142,2014,Yarra City Council,-37.8072,144.9941,Northern Metropolitan,4019,157,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16035,Yarraville,78 Bayview Rd,3,h,1101000.0,2018-02-24,6.3,3013,1,0,288,0,0,Maribyrnong City Council,-37.8110,144.8852,Western Metropolitan,6543,136,5
16036,Yarraville,13 Burns St,4,h,1480000.0,2018-02-24,6.3,3013,1,3,593,0,0,Maribyrnong City Council,-37.8105,144.8847,Western Metropolitan,6543,136,5
16037,Yarraville,29A Murray St,2,h,888000.0,2018-02-24,6.3,3013,2,1,98,104,2018,Maribyrnong City Council,-37.8155,144.8883,Western Metropolitan,6543,136,5
16038,Yarraville,147A Severn St,2,t,705000.0,2018-02-24,6.3,3013,1,2,220,120,2000,Maribyrnong City Council,-37.8229,144.8786,Western Metropolitan,6543,136,5


# Select features (columns)

In [3]:
# Set features to be used as X values.
X = df.drop(["Price",
             "Suburb",
             "Date",
             "Address",
             "Type",
             "Postcode",
             "CouncilArea",
             "Lattitude",
             "Longtitude",
             "Regionname",
             "Distance",
             "YearBuilt",
             "CrimeRate"], axis = "columns")
y = df["Price"]
print(X.shape, y.shape)

(16040, 7) (16040,)


In [4]:
X

Unnamed: 0,Rooms,Bathroom,Car,Landsize,BuildingArea,Propertycount,NearbySchools
0,2,1,1,202,0,4019,2
1,2,1,0,156,79,4019,2
2,3,2,0,134,150,4019,2
3,3,2,1,94,0,4019,2
4,4,1,2,120,142,4019,2
...,...,...,...,...,...,...,...
16035,3,1,0,288,0,6543,5
16036,4,1,3,593,0,6543,5
16037,2,2,1,98,104,6543,5
16038,2,1,2,220,120,6543,5


In [5]:
# GET housing stats
total_houses = len(df)
max_value = df["Price"].describe()["max"]
min_value = df["Price"].describe()["min"]
print(f"Total houses: {total_houses}")
print(f"Highest price: {max_value}")
print(f"Lowest price: {min_value}")

Total houses: 16040
Highest price: 11200000.0
Lowest price: 131000.0


# Split the data into test and train data using `train_test_split` with test size of 33%

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.33)
X_train

Unnamed: 0,Rooms,Bathroom,Car,Landsize,BuildingArea,Propertycount,NearbySchools
6367,3,2,2,226,140,3593,7
14911,5,2,3,635,0,3619,12
1597,3,1,2,699,0,5051,4
10351,3,2,1,1007,250,2985,13
6765,3,2,2,257,120,21650,14
...,...,...,...,...,...,...,...
13418,3,2,4,560,154,7630,8
5390,3,1,1,202,0,6543,5
860,2,1,2,750,0,10969,7
15795,5,2,2,710,0,2671,3


In [7]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state = 42, test_size = 16039)
X_train2

Unnamed: 0,Rooms,Bathroom,Car,Landsize,BuildingArea,Propertycount,NearbySchools
7270,3,1,3,758,0,5498,6


# Pre-processing

In [8]:
# Scale your data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model using RandomForestRegression

In [9]:
regressor = tree.DecisionTreeRegressor()
regressor= regressor.fit(X_train_scaled, y_train)

training_score = regressor.score(X_train_scaled, y_train)
base_accuracy = regressor.score(X_test_scaled, y_test)

print(f"DecisionTreeClassifier training Data Score: {training_score}")
print(f"DecisionTreeClassifier testing Data Score: {base_accuracy}")

DecisionTreeClassifier training Data Score: 0.9994803733210903
DecisionTreeClassifier testing Data Score: 0.21610589520508272


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

# Parameters: n_estimators=600, max_features= Auto, max_depth= None

In [10]:
# Create the GridSearchCV model
param_grid = {"max_features": ["auto", "sqrt", "log2"],
              "max_depth": [17, 18, None]}

grid = GridSearchCV(regressor, param_grid, error_score = "raise", verbose = 3, cv = 5, n_jobs = -1)

In [11]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


GridSearchCV(cv=5, error_score='raise', estimator=DecisionTreeRegressor(),
             n_jobs=-1,
             param_grid={'max_depth': [17, 18, None],
                         'max_features': ['auto', 'sqrt', 'log2']},
             verbose=3)

In [12]:
print(f"Best grid params: {grid.best_params_}")
print(f"Best grid score: {grid.best_score_}")

Best grid params: {'max_depth': 17, 'max_features': 'auto'}
Best grid score: 0.24617087872995044


# Train Tuned Model

In [13]:
# Tuned parameters
max_features = grid.best_params_["max_features"]
max_depth = grid.best_params_["max_depth"]

# Tuned model
tuned_model = tree.DecisionTreeRegressor(max_features = max_features, 
                                    max_depth = max_depth,
                                    random_state = 42)
tuned_model.fit(X_train, y_train)

tuned_model_score = tuned_model.score(X_train, y_train)
tuned_accuracy = tuned_model.score(X_test, y_test)

print(f"Training Data Score: {tuned_model_score}")
print(f"Testing Data Score: {tuned_accuracy}")

Training Data Score: 0.9259686693433833
Testing Data Score: 0.23371226869091077


In [14]:
type(tuned_model)

sklearn.tree._classes.DecisionTreeRegressor

In [15]:
# Make predictions with the hypertuned model
predictions = tuned_model.predict(X_test)

prediction_actual = {"Actual": y_test,
                     "Prediction": predictions}

prediction_df = pd.DataFrame(prediction_actual)
prediction_df = prediction_df.set_index("Actual").reset_index()
prediction_df

Unnamed: 0,Actual,Prediction
0,765000.0,1.310000e+06
1,1000000.0,9.080357e+05
2,1025000.0,8.937500e+05
3,812000.0,1.150000e+06
4,725000.0,5.600000e+05
...,...,...
5289,682000.0,5.416667e+05
5290,1200000.0,1.100000e+06
5291,1377000.0,1.291194e+06
5292,1126000.0,8.938152e+05


In [16]:
# Make predictions with the hypertuned model
predictions = tuned_model.predict(X_test2)

prediction_actual = {"Actual": y_test2,
                     "Prediction": predictions}

full_prediction_df = pd.DataFrame(prediction_actual)
full_prediction_df = full_prediction_df.set_index("Actual").reset_index()
full_prediction_df

Unnamed: 0,Actual,Prediction
0,765000.0,1.310000e+06
1,1000000.0,9.080357e+05
2,1025000.0,8.937500e+05
3,812000.0,1.150000e+06
4,725000.0,5.600000e+05
...,...,...
16034,1005000.0,9.468391e+05
16035,718000.0,7.116000e+05
16036,955000.0,1.063013e+06
16037,1160000.0,1.100047e+06


In [17]:
full_prediction_df = full_prediction_df.sort_values("Actual")
full_prediction_df = full_prediction_df.rename({"Actual": "Price"}, axis = "columns")
full_prediction_df.dtypes

Price         float64
Prediction    float64
dtype: object

In [18]:
full_prediction_df

Unnamed: 0,Price,Prediction
7258,131000.0,131000.0
7129,145000.0,145000.0
13371,145000.0,145000.0
2315,160000.0,1200000.0
4739,170000.0,365000.0
...,...,...
1968,6500000.0,3800000.0
5218,7650000.0,5510000.0
2828,8000000.0,2950000.0
7677,9000000.0,9000000.0


In [19]:
price_df = pd.DataFrame(y_test2)
price_df = price_df.sort_values("Price")
price_df = price_df.merge(full_prediction_df, on = "Price")

In [20]:
price_df

Unnamed: 0,Price,Prediction
0,131000.0,131000.0
1,145000.0,145000.0
2,145000.0,145000.0
3,145000.0,145000.0
4,145000.0,145000.0
...,...,...
610790,6500000.0,3800000.0
610791,7650000.0,5510000.0
610792,8000000.0,2950000.0
610793,9000000.0,9000000.0


In [21]:
df = df.sort_values("Price")
df = df.merge(price_df, on = "Price")
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,...,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,CrimeRate,NearbySchools,Prediction
0,Caulfield,30 Pyne St,4,h,131000.0,2017-02-25,8.9,3162,1,2,...,155,1920,Glen Eira City Council,-37.8864,145.0242,Southern Metropolitan,2379,221,4,131000.0
1,Coburg,171 Moreland Rd,4,h,145000.0,2016-06-04,7.8,3058,1,1,...,164,1910,Darebin City Council,-37.7555,144.9658,Northern Metropolitan,11204,313,11,145000.0
2,Coburg,171 Moreland Rd,4,h,145000.0,2016-06-04,7.8,3058,1,1,...,164,1910,Darebin City Council,-37.7555,144.9658,Northern Metropolitan,11204,313,11,145000.0
3,Coburg,171 Moreland Rd,4,h,145000.0,2016-06-04,7.8,3058,1,1,...,164,1910,Darebin City Council,-37.7555,144.9658,Northern Metropolitan,11204,313,11,145000.0
4,Coburg,171 Moreland Rd,4,h,145000.0,2016-06-04,7.8,3058,1,1,...,164,1910,Darebin City Council,-37.7555,144.9658,Northern Metropolitan,11204,313,11,145000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45789360,Kew,15 Barry St,6,h,6500000.0,2016-08-13,5.6,3101,6,3,...,365,1890,Boroondara City Council,-37.8029,145.0267,Southern Metropolitan,10331,152,12,3800000.0
45789361,Hawthorn,49 Lisson Gr,4,h,7650000.0,2017-06-17,5.3,3122,2,4,...,284,1863,Boroondara City Council,-37.8265,145.0305,Southern Metropolitan,11308,144,8,5510000.0
45789362,Canterbury,49 Mangarra Rd,5,h,8000000.0,2017-05-13,9.0,3126,5,4,...,464,1880,Boroondara City Council,-37.8179,145.0694,Southern Metropolitan,3265,86,6,2950000.0
45789363,Mulgrave,35 Bevis St,3,h,9000000.0,2017-07-29,18.8,3170,1,1,...,117,1960,Monash City Council,-37.9317,145.1613,South-Eastern Metropolitan,7113,149,5,9000000.0


In [22]:
evaluations = {"": ["Base Model", "Tuned Model"],
               "Accuracy": [f"%s" % round(base_accuracy, 3), f"%s" % round(tuned_accuracy, 3)]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index("")

evaluations_df.to_csv("../evaluations/decision_tree_eval.csv")
evaluations_df

Unnamed: 0,Accuracy
,
Base Model,0.216
Tuned Model,0.234


# Save the model

In [23]:
# filename = "../models/decision_tree.sav"
# joblib.dump(tuned_model, filename)