In [15]:
# Dependencies
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
import joblib

In [16]:
# Read the CSV
df = pd.read_csv("../cleaned_data/cleaned_house_crime_school.csv")
df

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Date,Distance,Postcode,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount,CrimeRate,NearbySchools
0,Abbotsford,85 Turner St,2,h,1480000.0,2016-12-03,2.5,3067,1,1,202,0,0,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019,157,2
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,2016-02-04,2.5,3067,1,0,156,79,1900,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019,157,2
2,Abbotsford,5 Charles St,3,h,1465000.0,2017-03-04,2.5,3067,2,0,134,150,1900,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019,157,2
3,Abbotsford,40 Federation La,3,h,850000.0,2017-03-04,2.5,3067,2,1,94,0,0,Yarra City Council,-37.7969,144.9969,Northern Metropolitan,4019,157,2
4,Abbotsford,55a Park St,4,h,1600000.0,2016-06-04,2.5,3067,1,2,120,142,2014,Yarra City Council,-37.8072,144.9941,Northern Metropolitan,4019,157,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16035,Yarraville,78 Bayview Rd,3,h,1101000.0,2018-02-24,6.3,3013,1,0,288,0,0,Maribyrnong City Council,-37.8110,144.8852,Western Metropolitan,6543,136,5
16036,Yarraville,13 Burns St,4,h,1480000.0,2018-02-24,6.3,3013,1,3,593,0,0,Maribyrnong City Council,-37.8105,144.8847,Western Metropolitan,6543,136,5
16037,Yarraville,29A Murray St,2,h,888000.0,2018-02-24,6.3,3013,2,1,98,104,2018,Maribyrnong City Council,-37.8155,144.8883,Western Metropolitan,6543,136,5
16038,Yarraville,147A Severn St,2,t,705000.0,2018-02-24,6.3,3013,1,2,220,120,2000,Maribyrnong City Council,-37.8229,144.8786,Western Metropolitan,6543,136,5


# Select features (columns)

In [17]:
# Set features to be used as X values.
X = df.drop(["Price",
             "Suburb",
             "Date",
             "Address",
             "Type",
             "Postcode",
             "CouncilArea",
             "Lattitude",
             "Longtitude",
             "Regionname",
             "Distance",
             "YearBuilt",
             "CrimeRate"], axis = "columns")
y = df["Price"]
print(X.shape, y.shape)

(16040, 7) (16040,)


In [18]:
# GET housing stats
total_houses = len(df)
max_value = df["Price"].describe()["max"]
min_value = df["Price"].describe()["min"]
print(f"Total houses: {total_houses}")
print(f"Highest price: {max_value}")
print(f"Lowest price: {min_value}")

Total houses: 16040
Highest price: 11200000.0
Lowest price: 131000.0


# Split the data into test and train data using `train_test_split` with test size of 33%

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.33)
X_train

Unnamed: 0,Rooms,Bathroom,Car,Landsize,BuildingArea,Propertycount,NearbySchools
6367,3,2,2,226,140,3593,7
14911,5,2,3,635,0,3619,12
1597,3,1,2,699,0,5051,4
10351,3,2,1,1007,250,2985,13
6765,3,2,2,257,120,21650,14
...,...,...,...,...,...,...,...
13418,3,2,4,560,154,7630,8
5390,3,1,1,202,0,6543,5
860,2,1,2,750,0,10969,7
15795,5,2,2,710,0,2671,3


In [20]:
# Scale the data
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

training_score = classifier.score(X_train, y_train)
base_accuracy = classifier.score(X_test, y_test)

print(f"LogisticRegression training Data Score: {training_score}")
print(f"LogisticRegression testing Data Score: {base_accuracy}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression training Data Score: 0.009678019728270985
LogisticRegression testing Data Score: 0.007366830374008312


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [22]:
# Create the GridSearchCV model
param_grid = {"C": [0.1, 1, 10],
              "max_iter": [1000, 5000, 10000]}
grid = GridSearchCV(classifier, param_grid, error_score = "raise", verbose = 3)

In [23]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




KeyboardInterrupt: 

In [None]:
print(f"Best grid params: {grid.best_params_}")
print(f"Best grid score: {grid.best_score_}")

# Train Tuned Model

In [None]:
# Tuned parameters
C = grid.best_params_["C"]
max_iter = grid.best_params_["max_iter"]

# Tuned model
tuned_model = LogisticRegression(solver = "newton-cg",
                                 multi_class = "auto",
                                 C = C,
                                 max_iter = max_iter)
tuned_model.fit(X_train_scaled, y_train)

tuned_model_score = tuned_model.score(X_train_scaled, y_train)
tuned_accuracy = tuned_model.score(X_test_scaled, y_test)

print(f"Training Data Score: {tuned_model_score}")
print(f"Testing Data Score: {tuned_accuracy}")

In [None]:
# Make predictions with the hypertuned model
predictions = tuned_model.predict(X_test)
classifications = y_test.unique().tolist()

prediction_actual = {"Actual": y_test,
                     "Prediction": predictions}

prediction_df = pd.DataFrame(prediction_actual)
prediction_df = prediction_df.set_index("Actual").reset_index()
prediction_df

In [None]:
evaluations = {"": ["Base Model", "Tuned Model"],
               "Accuracy": [f"%s" % round(base_accuracy, 3), f"%s" % round(tuned_accuracy, 3)]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index("")

evaluations_df.to_csv("../evaluations/random_forest_eval.csv")
evaluations_df

# Save the model

In [None]:
filename = "../models/random_forest.sav"
joblib.dump(tuned_model, filename)