In [None]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
import joblib

In [None]:
# Load the cleaned dataset from its path relative to this notebook.
df = pd.read_csv(filepath_or_buffer="../cleaned_data/cleaned_house_crime_school.csv")
# Bare last expression so the notebook renders the loaded frame.
df

# Select features (columns)

In [None]:
# Feature matrix: every column of df except the target ("Price") and the
# other columns listed below, which are excluded from modelling.
X = df.drop(
    columns=[
        "Price",
        "Suburb",
        "Date",
        "Address",
        "Type",
        "Postcode",
        "CouncilArea",
        "Lattitude",   # spelling matches the column name used in the dataset
        "Longtitude",  # spelling matches the column name used in the dataset
        "Regionname",
    ]
)
# Target vector: the "Price" column.
y = df["Price"]
# Sanity check that features and target have the same number of rows.
print(X.shape, y.shape)

In [None]:
# Housing stats: row count plus the extremes of the "Price" column.
# describe() is evaluated once and its "max" / "min" entries are read from it.
price_summary = df["Price"].describe()
total_houses = df.shape[0]
max_value = price_summary["max"]
min_value = price_summary["min"]
print(
    f"Total houses: {total_houses}",
    f"Highest price: {max_value}",
    f"Lowest price: {min_value}",
    sep="\n",
)

# Split the data into training and test sets using `train_test_split` with a test size of 33%

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
# Display the training features.
X_train

In [None]:
# Baseline model: a 200-tree random forest with a fixed seed.
# NOTE: despite the variable name, this is a regressor; for scikit-learn
# regressors .score() returns the R^2 coefficient of determination.
classifier = RandomForestRegressor(random_state=42, n_estimators=200).fit(X_train, y_train)

# Score on the data the model was fitted on, then on the held-out test split.
training_score = classifier.score(X_train, y_train)
base_accuracy = classifier.score(X_test, y_test)

print(
    f"RandomForestRegressor training Data Score: {training_score}",
    f"RandomForestRegressor testing Data Score: {base_accuracy}",
    sep="\n",
)

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [None]:
# Create the GridSearchCV model.
# Search space: 5 x 3 x 6 = 90 parameter combinations, each evaluated with
# 5-fold cross-validation on all available cores (n_jobs = -1).
#
# max_features uses 1.0 (consider every feature at each split) rather than
# "auto": "auto" was deprecated in scikit-learn 1.1 and removed in 1.3, where
# it is rejected as an invalid parameter -- and because error_score = "raise"
# that failure would abort the whole search. For RandomForestRegressor,
# "auto" meant "use all features", which is exactly what 1.0 selects, and
# the float form is accepted by older releases as well.
param_grid = {"n_estimators": [200, 400, 800, 1600, 3200],
              "max_features": [1.0, "sqrt", "log2"],
              "max_depth": [14, 15, 16, 17, 18, None]}

# error_score = "raise" surfaces any failing fit immediately instead of
# recording it as NaN; verbose = 3 logs progress for each fit.
grid = GridSearchCV(classifier, param_grid, error_score = "raise", verbose = 3, cv = 5, n_jobs = -1)

In [None]:
# Run the grid search on the training split only; X_test / y_test are not
# passed in here. Every combination in the parameter grid is fitted once per
# CV fold, so this is the expensive cell of the notebook.
grid.fit(X=X_train, y=y_train)

In [None]:
# Report the winning parameter combination and its mean cross-validated score.
print(
    f"Best grid params: {grid.best_params_}",
    f"Best grid score: {grid.best_score_}",
    sep="\n",
)

# Train Tuned Model

In [None]:
# Pull the three tuned hyperparameters out of the finished grid search.
max_features, n_estimators, max_depth = (
    grid.best_params_[name]
    for name in ("max_features", "n_estimators", "max_depth")
)

# Rebuild a forest from the tuned values (same seed as the baseline model)
# and fit it on the full training split.
tuned_model = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    random_state=42,
).fit(X_train, y_train)

# Score on the training split first, then on the held-out test split.
tuned_model_score = tuned_model.score(X_train, y_train)
tuned_accuracy = tuned_model.score(X_test, y_test)

print(
    f"Training Data Score: {tuned_model_score}",
    f"Testing Data Score: {tuned_accuracy}",
    sep="\n",
)

In [None]:
# Predict prices for the held-out test rows with the tuned model.
predictions = tuned_model.predict(X_test)
# Distinct actual prices in the test split.
# NOTE(review): not referenced by any other visible cell -- confirm it is still needed.
classifications = y_test.unique().tolist()

# Pair each actual price with its prediction, column order Actual -> Prediction.
prediction_actual = dict(Actual=y_test, Prediction=predictions)

# The frame inherits y_test's original row labels; replace them with a
# fresh 0..n-1 index so the table reads as a plain side-by-side comparison.
prediction_df = pd.DataFrame(prediction_actual).reset_index(drop=True)
prediction_df

In [None]:
# Side-by-side comparison of the baseline and tuned models on the test split.
# Scores are rounded to 3 decimals and stored as strings. The column is
# labelled "Accuracy", but the values come from the regressors' .score().
evaluations = {
    "": ["Base Model", "Tuned Model"],
    "Accuracy": [str(round(score, 3)) for score in (base_accuracy, tuned_accuracy)],
}

# The unnamed ("") column holds the model labels and becomes the index.
evaluations_df = pd.DataFrame(evaluations).set_index("")

# Write the comparison to disk, then display it.
evaluations_df.to_csv("../evaluations/random_forest_eval.csv")
evaluations_df

# Save the model

In [None]:
# Serialise the fitted tuned model to disk with joblib.
filename = "../models/random_forest.sav"
joblib.dump(value=tuned_model, filename=filename)