In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
warnings.filterwarnings("ignore")

In [None]:
# Dataset dimensions. Presumably the number of LSOAs (Lower Layer Super Output
# Areas) and the number of months covered by the data - TODO confirm; neither
# constant is referenced again in the cells visible here.
nLSOAs, nMonths = 215, 70

In [None]:
### LOAD THE DATA THAT WAS CLEANED IN model_data.ipynb ###

# NOTE(review): absolute, user-specific path - the notebook only runs on a
# machine that has this exact OneDrive folder. Consider a configurable data
# directory.
model_data_csv = "C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\Arson Project\\Clean Data\\model_data.csv"
df = pd.read_csv(model_data_csv)

# Number of columns in the freshly loaded frame. A later cell uses it as the
# position at which the first prediction column is inserted into the test set.
ncols = df.shape[1]

In [None]:
### SPLIT DATAFRAME INTO TRAINING AND TESTING SETS ###

# Hold out 33% of the rows for testing; random_state pins the shuffle so the
# same rows land in each part on every run.
#
# Both parts are copied explicitly because later cells write to them (the
# "arson.bool" flag is added to the training set, prediction columns to the
# test set). Without the copy, pandas versions that do not use copy-on-write
# can treat the parts as slices of `df` and emit SettingWithCopyWarning on
# those writes.
training_set, test_set = (
    part.copy()
    for part in train_test_split(df, test_size=0.33, random_state=1)
)

In [None]:
### FLAG EACH TRAINING ENTRY: DID AT LEAST ONE ARSON ATTACK HAPPEN IN IT? ###

# The flag is appended as the LAST column of the training set; the
# undersampling cell relies on that position to separate it from the rest of
# the frame.
had_arson = training_set["Arson Count"].gt(0)
training_set.loc[:, "arson.bool"] = had_arson

In [None]:
### UNDERSAMPLE THE TRAINING SET SO THAT THERE IS AN EQUAL NUMBER OF ENTRIES THAT DID HAVE AN ARSON ATTACK AND THOSE THAT DIDN'T ###

print(f"The initial proportion of entries where an arson attack occurred is {sum(training_set['arson.bool']) / len(training_set)}, with {len(training_set)} entries in total")

# sampling_strategy=1 asks for a 1:1 ratio between the two classes of the
# boolean flag. The sampler picks the retained majority-class rows at random,
# so it is seeded like every other stochastic step in this notebook; without
# a seed the training data (and every metric below) changes on each run.
undersamp = RandomUnderSampler(sampling_strategy=1, random_state=1)

# Separate the flag from the remaining columns by NAME rather than by
# position, so the cell does not depend on "arson.bool" being the last column.
columns_without_flag = training_set.drop(columns="arson.bool")
arson_flag = training_set["arson.bool"]

X, y = undersamp.fit_resample(columns_without_flag, arson_flag)

# Rebuild the training set from the resampled rows and re-attach the flag so
# the balance can be reported below.
training_set = pd.DataFrame(X)

training_set["arson.bool"] = y

print(f"The new proportion of entries where an arson attack occurred is {sum(training_set['arson.bool']) / len(training_set)}, with {len(training_set)} entries in total")

In [None]:
### REMOVE THE BOOLEAN FLAG AGAIN - IT WAS ONLY NEEDED FOR THE UNDERSAMPLING ###

# After this, the last column of the training set is once more the column the
# next cell takes as the response.
training_set = training_set.drop(columns=["arson.bool"])

In [None]:
### DEFINE PREDICTORS (X) AND RESPONSE (y) IN BOTH TRAINING AND TESTING SETS ###

# The response is the last column of the data as loaded (position ncols - 1);
# every column before it is a predictor. Slicing at that fixed position
# instead of at -1 keeps this cell correct when it is re-run after later cells
# have appended prediction columns to test_set, or while training_set still
# carries the temporary "arson.bool" column. On a clean top-to-bottom run the
# result is identical to slicing at -1.
response_pos = ncols - 1

X_train = training_set.iloc[:, :response_pos]
y_train = training_set.iloc[:, response_pos]
X_test = test_set.iloc[:, :response_pos]
y_test = test_set.iloc[:, response_pos]

In [None]:
### FIT FOUR DIFFERENT CLASSIFIERS ON THE TRAINING DATA TO PREDICT THE RESPONSE GIVEN THE PREDICTORS ###

# Every estimator is created with the same fixed random_state.
adaboost = AdaBoostClassifier(random_state=1)
rf = RandomForestClassifier(random_state=1)
# NOTE: despite the variable name, this is scikit-learn's
# GradientBoostingClassifier, not a model from the separate XGBoost library.
xgboost = GradientBoostingClassifier(random_state=1)
mlp = MLPClassifier(random_state=1, solver="lbfgs")

# Fit in the same order in which the estimators were created.
for _estimator in (adaboost, rf, xgboost):
    _estimator.fit(X_train, y_train)

# The MLP is fitted last and left as the closing expression of the cell, so
# the notebook still displays the fitted estimator as the cell's output.
mlp.fit(X_train, y_train)

In [None]:
### USE THE TRAINED MODELS TO MAKE PREDICTIONS ON NEW DATA. STORE THE PREDICTIONS AS NEW COLUMNS IN THE TEST DATA ###

y_ada_pred = adaboost.predict(X_test)
y_rf_pred = rf.predict(X_test)
y_xg_pred = xgboost.predict(X_test)
y_mlp_pred = mlp.predict(X_test)

# Column name -> predictions, in the order the columns should appear.
prediction_columns = {
    "AdaBoost Predictions": y_ada_pred,
    "RF Predictions": y_rf_pred,
    "XGBoost Predictions": y_xg_pred,
    "MLP Predictions": y_mlp_pred,
}

# Plain column assignment appends a new column at the end on the first run
# (the same layout the positional insert produced) and simply overwrites the
# existing column when the cell is re-run. DataFrame.insert would raise
# "already exists" on a re-run instead.
for column_name, predictions in prediction_columns.items():
    test_set[column_name] = predictions

# Number of prediction columns now sitting at the end of test_set; the MSE
# cell uses it to locate them.
no_classifiers = len(prediction_columns)

In [None]:
### VIEW THE TEST DATA PREDICTION RESULTS ###

# Show only the test entries in which at least one arson attack was recorded.
arson_rows = test_set["Arson Count"].ge(1)
test_set.loc[arson_rows]

# Alternative view, kept for reference:
# test_set.sort_values("IMD Decile", ascending=True)

In [None]:
### VIEW BINARY CLASSIFICATION METRICS USING THE SAME BOOLEAN CONDITION AS EARLIER. THE HIGHER THE RECALL AND PRECISION, THE BETTER ###


def _binary_recall_precision(actual_counts, predicted_counts):
    """Score count predictions as a binary "at least one arson?" classifier.

    An entry is treated as positive when its count is >= 1.

    Parameters
    ----------
    actual_counts : pandas.Series
        Observed counts.
    predicted_counts : pandas.Series
        Predicted counts, sharing the index of ``actual_counts``.

    Returns
    -------
    tuple
        ``(num_positives, num_true_positives, recall, precision)`` where
        ``num_positives`` is the number of predicted positives, recall is
        true positives / actual positives and precision is
        true positives / predicted positives. A ratio whose denominator is
        zero is returned as NaN instead of relying on a silent numpy
        divide-by-zero.
    """
    actual_positive = actual_counts >= 1
    predicted_positive = predicted_counts >= 1

    num_actual_positives = actual_positive.sum()
    num_positives = predicted_positive.sum()
    num_true_positives = (actual_positive & predicted_positive).sum()

    undefined = float("nan")
    recall = num_true_positives / num_actual_positives if num_actual_positives else undefined
    precision = num_true_positives / num_positives if num_positives else undefined
    return num_positives, num_true_positives, recall, precision


real_positives = (test_set["Arson Count"] >= 1).sum()

(ada_num_positives, ada_num_true_positives,
 ada_recall, ada_precision) = _binary_recall_precision(test_set["Arson Count"], test_set["AdaBoost Predictions"])
print(f"AdaBoost Recall: {round(ada_recall, 4)}, AdaBoost Precision: {round(ada_precision, 4)}")

(rf_num_positives, rf_num_true_positives,
 rf_recall, rf_precision) = _binary_recall_precision(test_set["Arson Count"], test_set["RF Predictions"])
print(f"RF Recall: {round(rf_recall, 4)}, RF Precision: {round(rf_precision, 4)}")

(xg_num_positives, xg_num_true_positives,
 xg_recall, xg_precision) = _binary_recall_precision(test_set["Arson Count"], test_set["XGBoost Predictions"])
print(f"XGBoost Recall: {round(xg_recall, 4)}, XGBoost Precision: {round(xg_precision, 4)}")

(mlp_num_positives, mlp_num_true_positives,
 mlp_recall, mlp_precision) = _binary_recall_precision(test_set["Arson Count"], test_set["MLP Predictions"])
print(f"MLP Recall: {round(mlp_recall, 4)}, MLP Precision: {round(mlp_precision, 4)}")

In [None]:
### DISPLAY THE MEAN SQUARED ERROR OF EACH MODEL'S PREDICTIONS AGAINST THE RECORDED ARSON COUNT. LOWER IS BETTER ###

# The prediction columns are taken to be the last `no_classifiers` columns of
# test_set; walk them from left to right.
total_columns = len(test_set.columns)
for pos in range(total_columns - no_classifiers, total_columns):
    model_column = test_set.columns[pos]
    model_mse = mean_squared_error(test_set["Arson Count"], test_set.iloc[:, pos])
    print(model_column, model_mse)

In [None]:
### PLOT THE FEATURE IMPORTANCE GRAPH, SHOWING WHICH COLUMNS ARE THE MOST INFLUENTIAL TO THE RANDOM FOREST'S DECISIONS ###

features = rf.feature_importances_

# One row per predictor column (every column of df except the last), paired
# with the random forest's importance score and ordered from most to least
# important.
importance_table = pd.DataFrame({"column_name": df.columns[:-1], "score": features})
ftrs = importance_table.sort_values(by="score", ascending=False).reset_index(drop=True)

# Rows labelled 0 through 15 of the ranked table, i.e. the 16 highest-scoring
# columns.
top_features = ftrs.head(16)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(y=top_features["column_name"], x=top_features["score"], ax=ax)
ax.set_title("Random Forest Feature Importance")
ax.set_xlabel("Score")
ax.set_ylabel("Column Names")
plt.show()