In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import scikitplot as skplt
import matplotlib.pyplot as plt
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

In [16]:
df = pd.read_csv("C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\RBIP Project\\Merged Data\\epc_irs_left_join.csv")

In [17]:
model_cols = ["ASSET_RATING", "ASSET_RATING_BAND", "PROPERTY_TYPE", "MAIN_HEATING_FUEL", "inc.2010", "inc.2011", "inc.2012", "inc.2013",
"inc.2014", "inc.2015", "inc.2016", "inc.2017",
"inc.2018", "inc.2019", "inc.2020"]

In [18]:
df = df[model_cols]

In [19]:
for field_name in ["PROPERTY_TYPE", "MAIN_HEATING_FUEL", "ASSET_RATING_BAND"]:
    df[field_name] = df[field_name].replace(df[field_name].unique(), range(len(df[field_name].unique())))

In [20]:
df.replace(np.nan, 0, inplace=True)

In [21]:
df.loc[df[df["inc.2020"] > 0].index, "inc.2020"] = 1

In [22]:
df.rename({"inc.2020": "inc.2020.bool"}, axis=1, inplace=True)

In [23]:
training_set, test_set = train_test_split(df, test_size = 0.3)

ncols = len(df.columns)

In [24]:
OVERSAMPLE = True

In [25]:
if OVERSAMPLE:

    oversamp = ADASYN(sampling_strategy = 0.9)
    # oversamp = SMOTE()
    # oversamp = RandomOverSampler()

    ncols = len(df.columns)

    X, y = oversamp.fit_resample(training_set.iloc[:,:ncols-1], training_set.iloc[:,ncols-1])

    training_set = pd.DataFrame(X)

    training_set["inc.2020.bool"] = y

In [26]:
len(y)

11458

In [27]:
X_train = training_set.iloc[:,:ncols-1]
y_train = training_set.iloc[:,ncols-1]
X_test = test_set.iloc[:,:ncols-1]
y_test = test_set.iloc[:,ncols-1]

In [28]:
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

xgboost = GradientBoostingClassifier()
xgboost.fit(X_train, y_train)

y_ada_pred = adaboost.predict(X_test)
test_set.insert(ncols, "AdaBoost Predictions", y_ada_pred)

y_rf_pred = rf.predict(X_test)
test_set.insert(ncols+1, "RF Predictions", y_rf_pred)

y_lr_pred = logreg.predict(X_test)
test_set.insert(ncols+2, "LogReg Predictions", y_lr_pred)

y_xg_pred = xgboost.predict(X_test)
test_set.insert(ncols+3, "XGBoost Predictions", y_xg_pred)

print("AdaBoost:", classification_report(test_set.iloc[:,ncols-1], test_set.iloc[:,ncols]))
print("Random Forest:", classification_report(test_set.iloc[:,ncols-1], test_set.iloc[:,ncols+1]))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


AdaBoost:               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      2599
         1.0       0.25      0.31      0.27        68

    accuracy                           0.96      2667
   macro avg       0.61      0.64      0.63      2667
weighted avg       0.96      0.96      0.96      2667

Random Forest:               precision    recall  f1-score   support

         0.0       0.98      0.96      0.97      2599
         1.0       0.08      0.12      0.09        68

    accuracy                           0.94      2667
   macro avg       0.53      0.54      0.53      2667
weighted avg       0.95      0.94      0.95      2667



In [29]:
print("Logistic Regression:", classification_report(test_set.iloc[:,ncols-1], test_set.iloc[:,ncols+2]))
print("XGBoost:", classification_report(test_set.iloc[:,ncols-1], test_set.iloc[:,ncols+3]))

Logistic Regression:               precision    recall  f1-score   support

         0.0       0.98      0.92      0.95      2599
         1.0       0.12      0.43      0.19        68

    accuracy                           0.91      2667
   macro avg       0.55      0.67      0.57      2667
weighted avg       0.96      0.91      0.93      2667

XGBoost:               precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      2599
         1.0       0.26      0.26      0.26        68

    accuracy                           0.96      2667
   macro avg       0.62      0.62      0.62      2667
weighted avg       0.96      0.96      0.96      2667



In [30]:
length = len(test_set.iloc[:,ncols-1])

ada_no_matched = sum([(test_set.iloc[i,ncols-1] * test_set.iloc[i,ncols]) + ((1-test_set.iloc[i,ncols-1]) * (1-test_set.iloc[i,ncols])) for i in range(length)])
rf_no_matched = sum([(test_set.iloc[i,ncols-1] * test_set.iloc[i,ncols+1]) + ((1-test_set.iloc[i,ncols-1]) * (1-test_set.iloc[i,ncols+1])) for i in range(length)])
lr_no_matched = sum([(test_set.iloc[i,ncols-1] * test_set.iloc[i,ncols+2]) + ((1-test_set.iloc[i,ncols-1]) * (1-test_set.iloc[i,ncols+2])) for i in range(length)])
xg_no_matched = sum([(test_set.iloc[i,ncols-1] * test_set.iloc[i,ncols+3]) + ((1-test_set.iloc[i,ncols-1]) * (1-test_set.iloc[i,ncols+3])) for i in range(length)])

ada_accuracy = ada_no_matched / length
rf_accuracy = rf_no_matched / length
lr_accuracy = lr_no_matched / length
xg_accuracy = xg_no_matched / length

print("AdaBoost Proportion Correctly Guessed:", ada_accuracy)
print("Random Forest Proportion Correctly Guessed:", rf_accuracy)
print("Logistic Regression Proportion Correctly Guessed:", lr_accuracy)
print("XGBoost Proportion Correctly Guessed:", xg_accuracy)

AdaBoost Proportion Correctly Guessed: 0.9583802024746907
Random Forest Proportion Correctly Guessed: 0.9418822647169104
Logistic Regression Proportion Correctly Guessed: 0.9092613423322085
XGBoost Proportion Correctly Guessed: 0.9625046869141357
