In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import scikitplot as skplt
import matplotlib.pyplot as plt
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

In [None]:
# Location of the merged risk/incident extract that the whole notebook works from.
RISK_JOIN_CSV = "C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\RBIP Project\\Merged Data\\risk_join.csv"

df = pd.read_csv(RISK_JOIN_CSV)

In [None]:
# Columns retained for modelling.
# Order matters: "inc.2020" stays last because later cells select the target
# by position (the final column of the frame).
model_cols = [
    # risk / inspection attributes
    "RISK_OF_FIRE",
    "SEVERITY_OF_FIRE",
    "SLEEPING_RISK",
    "SLEEPING_RISK_ABOVE",
    "SSRI_SCORE",
    "FSEC_DESCRIPT",
    "GRS",
    "FIRE SAFETY STATUS",
    "Satisfactory",
    # building / energy attributes
    "ASSET_RATING",
    "ASSET_RATING_BAND",
    "PROPERTY_TYPE",
    "MAIN_HEATING_FUEL",
    # yearly incident columns
    "inc.2010",
    "inc.2011",
    "inc.2012",
    "inc.2013",
    "inc.2014",
    "inc.2015",
    "inc.2016",
    "inc.2017",
    "inc.2018",
    "inc.2019",
    "inc.2020",
]

In [None]:
# Map the ordered fire-safety labels to integer codes 0..4
# ("(1) Well Above Average" -> 0, ..., "(5) Very Below Average" -> 4).
# Values that are not one of these labels are left unchanged.
# NOTE(review): a later cell replaces NaN with 0, which is the same code as
# "(1) Well Above Average" - confirm that conflation is intended.
fire_safety_labels = [
    "(1) Well Above Average",
    "(2) Above Average",
    "(3) Average",
    "(4) Below Average",
    "(5) Very Below Average",
]
fire_safety_codes = {label: code for code, label in enumerate(fire_safety_labels)}

df["FIRE SAFETY STATUS"] = df["FIRE SAFETY STATUS"].replace(fire_safety_codes)

In [None]:
# Map the asset rating bands to integer codes 0..6 ("A" -> 0, ..., "G" -> 6).
# Values outside A-G are left unchanged.
# NOTE(review): a later cell replaces NaN with 0, which is the same code as
# band "A" - confirm that conflation is intended.
asset_bands = ["A", "B", "C", "D", "E", "F", "G"]
asset_band_codes = {band: code for code, band in enumerate(asset_bands)}

df["ASSET_RATING_BAND"] = df["ASSET_RATING_BAND"].replace(asset_band_codes)

In [None]:
# Columns that are one-hot encoded further down; every other model column is
# passed to the classifiers as-is.
categorical_cols = [
    "SLEEPING_RISK",
    "FSEC_DESCRIPT",
    "Satisfactory",
    "PROPERTY_TYPE",
    "MAIN_HEATING_FUEL",
]

In [None]:
# Keep only the modelling columns. The explicit .copy() makes `df` an
# independent frame, so the in-place edits in the following cells
# (replace / .loc assignment / rename / drop) do not trigger pandas'
# SettingWithCopyWarning.
df = df[model_cols].copy()

In [None]:
# Replace every missing value in the frame with 0 (modifies `df` in place).
# NOTE(review): this also writes the integer 0 into the string-valued
# categorical columns wherever they are missing - confirm the encoder
# downstream copes with that mix.
df.replace(to_replace=np.nan, value=0, inplace=True)

In [None]:
# Collapse the 2020 incident value to a 0/1 flag: any positive value becomes 1.
# A boolean mask is used directly instead of filtering the frame and feeding
# the resulting index labels back into .loc; that round-trip was redundant and
# would also overwrite non-matching rows if index labels were ever duplicated.
df.loc[df["inc.2020"] > 0, "inc.2020"] = 1

In [None]:
# The column now holds a 0/1 flag rather than a raw value, so rename it
# accordingly (in place).
df.rename(columns={"inc.2020": "inc.2020.bool"}, inplace=True)

In [None]:
# One-hot encode the categorical columns into a dense array, dropping the first
# level of each feature (drop="first").
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to the
# legacy one on older installations.
try:
    encoder = OneHotEncoder(drop="first", sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop="first", sparse=False)

dummy_view = encoder.fit_transform(df[categorical_cols])

In [None]:
# Wrap the encoded array in a DataFrame that carries df's own index, so the
# index-based join with `df` further down lines rows up correctly even when
# `df` does not have a default 0..n-1 index.
encoded_df = pd.DataFrame(dummy_view, index=df.index)

In [None]:
# Label the dummy columns with "<source column>_<category>" names.
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and only fall back to the legacy method on old
# versions. The source column names are passed explicitly so both code paths
# yield the same labels.
if hasattr(encoder, "get_feature_names_out"):
    encoded_df.columns = encoder.get_feature_names_out(categorical_cols)
else:
    encoded_df.columns = encoder.get_feature_names(categorical_cols)

In [None]:
# The raw categorical columns are superseded by their one-hot encoding, so
# remove them from `df` (in place).
df.drop(columns=categorical_cols, inplace=True)

In [None]:
# Put the dummy columns first and the remaining model columns after them
# (index-aligned left join), which keeps the target as the last column.
df = encoded_df.join(other=df, how="left")

In [None]:
# Class balance of the 0/1 target.
target_counts = df["inc.2020.bool"].value_counts()
target_counts

In [None]:
# Total number of columns (encoded features plus the target).
df.shape[1]

In [None]:
# df.to_csv("C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\RBIP Project\\Merged Data\\clean_data.csv", index=False)

In [None]:
# Fixed seed so the train/test partition (and every metric computed from it)
# is the same on each Restart & Run All.
SPLIT_SEED = 42

# Hold out 33% of the rows for evaluation.
training_set, test_set = train_test_split(df, test_size=0.33, random_state=SPLIT_SEED)

# Column count of the modelling frame (features + target). Later cells use it
# to address the target (ncols-1) and the prediction columns (ncols, ncols+1, ...)
# by position.
ncols = len(df.columns)

In [None]:
# Switch for the resampling cell below: when True the training set is
# rebalanced with an oversampler before the models are fitted.
OVERSAMPLE: bool = True

In [None]:
# Features are every column but the last; the last column is the target.
# X and y are bound unconditionally so the cells that follow also work when
# OVERSAMPLE is False (previously `y` only existed after resampling).
X, y = training_set.iloc[:, :-1], training_set.iloc[:, -1]

if OVERSAMPLE:

    # Alternative samplers (disabled):
    # oversamp = ADASYN()
    # oversamp = SMOTE(sampling_strategy=0.9)
    # Seeded so the resampled training set is reproducible between runs.
    oversamp = RandomOverSampler(random_state=42)

    X, y = oversamp.fit_resample(X, y)

    # Rebuild the training frame from the resampled features and re-attach the
    # target as the last column, matching the layout of the original frame.
    training_set = pd.DataFrame(X)

    training_set["inc.2020.bool"] = y

In [None]:
# Number of training labels in `y`, the target vector produced by the
# resampling cell above.
len(y)

In [None]:
# Separate features from target by absolute position rather than "all but the
# last column": a later cell appends prediction columns to `test_set`, so on a
# re-run `iloc[:, :-1]` would leak the target and earlier predictions into
# X_test. `ncols` is the column count of the modelling frame (features +
# target), so the target always sits at position ncols - 1.
n_features = ncols - 1

X_train = training_set.iloc[:, :n_features]
y_train = training_set.iloc[:, n_features]
X_test = test_set.iloc[:, :n_features]
y_test = test_set.iloc[:, n_features]

In [None]:
# Shared seed so every stochastic estimator gives the same result on each run.
MODEL_SEED = 42

# --- Fit the five classifiers on the training data ---------------------------
adaboost = AdaBoostClassifier(random_state=MODEL_SEED)
adaboost.fit(X_train, y_train)

rf = RandomForestClassifier(random_state=MODEL_SEED)
rf.fit(X_train, y_train)

logreg = LogisticRegression(random_state=MODEL_SEED)
logreg.fit(X_train, y_train)

# NOTE: despite the variable name and the "XGBoost" labels used in this
# notebook, this is scikit-learn's GradientBoostingClassifier, not the xgboost
# package. The name is kept because later cells refer to it.
xgboost = GradientBoostingClassifier(random_state=MODEL_SEED)
xgboost.fit(X_train, y_train)

mlp = MLPClassifier(random_state=MODEL_SEED)
mlp.fit(X_train, y_train)

# --- Predict on the held-out test features -----------------------------------
y_ada_pred = adaboost.predict(X_test)
y_rf_pred = rf.predict(X_test)
y_lr_pred = logreg.predict(X_test)
y_xg_pred = xgboost.predict(X_test)
y_mlp_pred = mlp.predict(X_test)

# --- Store the predictions next to the true labels ---------------------------
# Each prediction column lives at a fixed position (ncols, ncols+1, ...) because
# the evaluation cells address them by position. DataFrame.insert raises if the
# column already exists, so on a re-run the existing column is overwritten in
# place instead, which keeps its position.
prediction_columns = [
    ("AdaBoost Predictions", y_ada_pred),
    ("RF Predictions", y_rf_pred),
    ("LogReg Predictions", y_lr_pred),
    ("XGBoost Predictions", y_xg_pred),
    ("MLP Predictions", y_mlp_pred),
]

for offset, (column, values) in enumerate(prediction_columns):
    if column in test_set.columns:
        test_set[column] = values
    else:
        test_set.insert(ncols + offset, column, values)

In [None]:
# First 20 test rows whose true label is positive, with all columns
# (features, target and model predictions).
is_real_positive = test_set["inc.2020.bool"] == 1.0
test_set.loc[is_real_positive].head(20)

In [None]:
def _n_positive(column_name):
    """Return the number of test-set rows whose value in `column_name` is 1.0."""
    return len(test_set[test_set[column_name] == 1.0])


# True positives in the data, followed by how many positives each model called.
real_positives = _n_positive("inc.2020.bool")
adaboost_positives = _n_positive("AdaBoost Predictions")
rf_positives = _n_positive("RF Predictions")
logreg_positives = _n_positive("LogReg Predictions")
XGBoost_positives = _n_positive("XGBoost Predictions")
MLP_positives = _n_positive("MLP Predictions")

print(f"There are {len(test_set)} entries in the test set, of which {real_positives} are real positives")
print(f"AdaBoost predicted {adaboost_positives} positives")
print(f"Random Forest predicted {rf_positives} positives")
print(f"Logistic Regression predicted {logreg_positives} positives")
print(f"XGBoost predicted {XGBoost_positives} positives")
print(f"MLP predicted {MLP_positives} positives")

In [None]:

# Precision / recall / F1 for the first two models. The true labels sit at
# column position ncols-1 and the prediction columns start at position ncols.
y_actual = test_set.iloc[:, ncols - 1]

for label, offset in (("AdaBoost:", 0), ("Random Forest:", 1)):
    print(label, classification_report(y_actual, test_set.iloc[:, ncols + offset]))

In [None]:

# Precision / recall / F1 for the third and fourth models. The true labels sit
# at column position ncols-1; these predictions are at ncols+2 and ncols+3.
y_actual = test_set.iloc[:, ncols - 1]

for label, offset in (("Logistic Regression:", 2), ("XGBoost:", 3)):
    print(label, classification_report(y_actual, test_set.iloc[:, ncols + offset]))

In [None]:
# Precision / recall / F1 for the MLP, whose predictions are the fifth
# prediction column (position ncols+4); true labels are at position ncols-1.
mlp_report = classification_report(test_set.iloc[:, ncols - 1], test_set.iloc[:, ncols + 4])
print("MLP:", mlp_report)

In [None]:

def _count_matches(frame, truth_pos, pred_pos):
    """Return how many rows of `frame` hold equal values at two column positions.

    Compares the two columns as whole arrays (one vectorised operation) instead
    of reading every cell through `.iloc` in a Python loop, and counts exact
    label agreement rather than relying on arithmetic that only works for 0/1
    labels.
    """
    truth = frame.iloc[:, truth_pos].values
    predicted = frame.iloc[:, pred_pos].values
    return int((truth == predicted).sum())


# Number of rows in the test set.
length = len(test_set.iloc[:,ncols-1])

# Correct predictions per model; true labels are at position ncols-1 and the
# five prediction columns follow at ncols .. ncols+4.
ada_no_matched = _count_matches(test_set, ncols - 1, ncols)
rf_no_matched = _count_matches(test_set, ncols - 1, ncols + 1)
lr_no_matched = _count_matches(test_set, ncols - 1, ncols + 2)
xg_no_matched = _count_matches(test_set, ncols - 1, ncols + 3)
mlp_no_matched = _count_matches(test_set, ncols - 1, ncols + 4)

# Accuracy = correct predictions / test-set size.
ada_accuracy = ada_no_matched / length
rf_accuracy = rf_no_matched / length
lr_accuracy = lr_no_matched / length
xg_accuracy = xg_no_matched / length
mlp_accuracy = mlp_no_matched / length

print("AdaBoost Proportion Correctly Guessed:", ada_accuracy)
print("Random Forest Proportion Correctly Guessed:", rf_accuracy)
print("Logistic Regression Proportion Correctly Guessed:", lr_accuracy)
print("XGBoost Proportion Correctly Guessed:", xg_accuracy)
print("MLP Proportion Correctly Guessed:", mlp_accuracy)

In [None]:
# Per-class probability estimates on the test features, one array per model
# (same model order as everywhere else in the notebook).
adaprobs, rfprobs, lrprobs, xgprobs, mlpprobs = (
    clf.predict_proba(X_test) for clf in (adaboost, rf, logreg, xgboost, mlp)
)

In [None]:
#### PLOTS

# One ROC figure per model, titled with the model's display name.
probas = [adaprobs, rfprobs, lrprobs, xgprobs, mlpprobs]
titles = ["AdaBoost", "Random Forest", "Logistic Regression", "XGBoost", "MLP"]

for title, proba in zip(titles, probas):

    skplt.metrics.plot_roc(y_test, proba, title=title)

    # plt.savefig("C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\RBIP Project\\Report\\images\\"+title+"_ROC.png", dpi = 200, bbox_inches = "tight")

plt.show()

In [None]:
# positive_probs = [x[1] for x in adaprobs]  # THIS CODE SNIPPET IS IN CASE YOU WANT TO SPLIT INTO QUARTILES
# positive_probs_assigned = [p for p in positive_probs if p > 0.5]
# positive_indices = [positive_probs.index(y) for y in positive_probs_assigned]
# test_set.reset_index(drop=True, inplace=True)
# test_set.loc[positive_indices, "quartile"] = pd.qcut(positive_probs_assigned, q=4, labels=[4, 3, 2, 1])

In [None]:
# Importance score of every feature according to the fitted random forest.
features = rf.feature_importances_

# Pair each score with its column name (all columns of df except the target,
# which is last), then rank from most to least important.
ftrs = pd.DataFrame({"column_name": df.columns[:-1], "score": features})
ftrs = ftrs.sort_values(by="score", ascending=False)
ftrs = ftrs.reset_index(drop=True)

# The original label slice .loc[:15] is end-inclusive on the fresh 0..n-1
# index, i.e. it selects the top 16 rows.
top_ftrs = ftrs.head(16)

plt.figure(figsize=(10,8))
sns.barplot(x=top_ftrs["score"], y=top_ftrs["column_name"])
plt.title("Random Forest Feature Importance")
plt.xlabel("Score")
plt.ylabel("Column Names")
# plt.savefig("C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\RBIP Project\\Report\\Images\\rf_features.png", dpi = 200, bbox_inches = "tight")
plt.show()