In [271]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import scikitplot as skplt
import matplotlib.pyplot as plt
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import OneHotEncoder

In [272]:
df = pd.read_csv("C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\RBIP Project\\Merged Data\\rbip_epc_irs_join.csv")

In [273]:
model_cols = ["FSEC_DESCRIPT", "GRS", "FIRE SAFETY STATUS",
"Satisfactory", "ASSET_RATING", "ASSET_RATING_BAND", "PROPERTY_TYPE", "MAIN_HEATING_FUEL", "inc.2010", "inc.2011", "inc.2012", "inc.2013",
"inc.2014", "inc.2015", "inc.2016", "inc.2017",
"inc.2018", "inc.2019", "inc.2020"]

In [274]:
categorical_cols = ["FSEC_DESCRIPT", "FIRE SAFETY STATUS", "ASSET_RATING_BAND", "Satisfactory", "PROPERTY_TYPE", "MAIN_HEATING_FUEL"]

In [275]:
df = df[model_cols]

In [276]:
df.replace(np.nan, 0, inplace=True)

In [277]:
df.loc[df[df["inc.2020"] > 0].index, "inc.2020"] = 1

In [278]:
df.rename({"inc.2020": "inc.2020.bool"}, axis=1, inplace=True)

In [279]:
encoder = OneHotEncoder(drop="first", sparse=False)

dummy_view = encoder.fit_transform(df[categorical_cols])

In [280]:
encoded_df = pd.DataFrame(dummy_view)

In [281]:
encoded_df.columns = encoder.get_feature_names()

In [282]:
df.drop(categorical_cols, axis=1, inplace=True)

In [283]:
df = encoded_df.join(df)

In [284]:
training_set, test_set = train_test_split(df, test_size = 0.3)

ncols = len(df.columns)

In [285]:
OVERSAMPLE = True

In [286]:
if OVERSAMPLE:

    # oversamp = ADASYN()
    oversamp = SMOTE(sampling_strategy=0.9)
    # oversamp = RandomOverSampler()

    ncols = len(df.columns)

    X, y = oversamp.fit_resample(training_set.iloc[:,:-1], training_set.iloc[:,-1])

    training_set = pd.DataFrame(X)

    training_set["inc.2020.bool"] = y

In [287]:
len(y)

1968

In [288]:
X_train = training_set.iloc[:,:-1]
y_train = training_set.iloc[:,-1]
X_test = test_set.iloc[:,:-1]
y_test = test_set.iloc[:,-1]

In [289]:
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

xgboost = GradientBoostingClassifier()
xgboost.fit(X_train, y_train)

y_ada_pred = adaboost.predict(X_test)
test_set.insert(ncols, "AdaBoost Predictions", y_ada_pred)

y_rf_pred = rf.predict(X_test)
test_set.insert(ncols+1, "RF Predictions", y_rf_pred)

y_lr_pred = logreg.predict(X_test)
test_set.insert(ncols+2, "LogReg Predictions", y_lr_pred)

y_xg_pred = xgboost.predict(X_test)
test_set.insert(ncols+3, "XGBoost Predictions", y_xg_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [290]:
test_set[test_set["inc.2020.bool"] == 1.0].iloc[:20, :]

Unnamed: 0,x0_Factory or warehouse,x0_Further education,x0_Hospitals,x0_Hostel,x0_Hotel,x0_House converted to flat,x0_Licensed premise,x0_Office,x0_Other premises open to the public,x0_Other sleeping accomodation,...,inc.2015,inc.2016,inc.2017,inc.2018,inc.2019,inc.2020.bool,AdaBoost Predictions,RF Predictions,LogReg Predictions,XGBoost Predictions
809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1270,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1292,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1480,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1060,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0,0.0,1.0


In [291]:
print(len(test_set))
print(len(test_set[test_set["inc.2020.bool"] == 1.0]))
print(len(test_set[test_set["LogReg Predictions"] == 1.0]))

483
23
68


In [292]:

print("AdaBoost:", classification_report(test_set.iloc[:,ncols-1], test_set.iloc[:,ncols]))
print("Random Forest:", classification_report(test_set.iloc[:,ncols-1], test_set.iloc[:,ncols+1]))

AdaBoost:               precision    recall  f1-score   support

         0.0       0.97      0.95      0.96       460
         1.0       0.29      0.39      0.33        23

    accuracy                           0.93       483
   macro avg       0.63      0.67      0.65       483
weighted avg       0.94      0.93      0.93       483

Random Forest:               precision    recall  f1-score   support

         0.0       0.96      0.98      0.97       460
         1.0       0.43      0.26      0.32        23

    accuracy                           0.95       483
   macro avg       0.70      0.62      0.65       483
weighted avg       0.94      0.95      0.94       483



In [293]:

print("Logistic Regression:", classification_report(test_set.iloc[:,ncols-1], test_set.iloc[:,ncols+2]))
print("XGBoost:", classification_report(test_set.iloc[:,ncols-1], test_set.iloc[:,ncols+3]))

Logistic Regression:               precision    recall  f1-score   support

         0.0       0.97      0.88      0.92       460
         1.0       0.16      0.48      0.24        23

    accuracy                           0.86       483
   macro avg       0.57      0.68      0.58       483
weighted avg       0.93      0.86      0.89       483

XGBoost:               precision    recall  f1-score   support

         0.0       0.97      0.97      0.97       460
         1.0       0.30      0.30      0.30        23

    accuracy                           0.93       483
   macro avg       0.63      0.63      0.63       483
weighted avg       0.93      0.93      0.93       483



In [294]:

length = len(test_set.iloc[:,ncols-1])

ada_no_matched = sum([(test_set.iloc[i,ncols-1] * test_set.iloc[i,ncols]) + ((1-test_set.iloc[i,ncols-1]) * (1-test_set.iloc[i,ncols])) for i in range(length)])
rf_no_matched = sum([(test_set.iloc[i,ncols-1] * test_set.iloc[i,ncols+1]) + ((1-test_set.iloc[i,ncols-1]) * (1-test_set.iloc[i,ncols+1])) for i in range(length)])
lr_no_matched = sum([(test_set.iloc[i,ncols-1] * test_set.iloc[i,ncols+2]) + ((1-test_set.iloc[i,ncols-1]) * (1-test_set.iloc[i,ncols+2])) for i in range(length)])
xg_no_matched = sum([(test_set.iloc[i,ncols-1] * test_set.iloc[i,ncols+3]) + ((1-test_set.iloc[i,ncols-1]) * (1-test_set.iloc[i,ncols+3])) for i in range(length)])

ada_accuracy = ada_no_matched / length
rf_accuracy = rf_no_matched / length
lr_accuracy = lr_no_matched / length
xg_accuracy = xg_no_matched / length

print("AdaBoost Proportion Correctly Guessed:", ada_accuracy)
print("Random Forest Proportion Correctly Guessed:", rf_accuracy)
print("Logistic Regression Proportion Correctly Guessed:", lr_accuracy)
print("XGBoost Proportion Correctly Guessed:", xg_accuracy)

AdaBoost Proportion Correctly Guessed: 0.9254658385093167
Random Forest Proportion Correctly Guessed: 0.94824016563147
Logistic Regression Proportion Correctly Guessed: 0.8571428571428571
XGBoost Proportion Correctly Guessed: 0.9337474120082816
