In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns
from tqdm import tqdm

In [None]:
# Number of rows of `data` (one per LSOA) that the row-wise loops below iterate over.
# NOTE(review): hard-coded; presumably equals len(data) — confirm against the CSV.
nLSOAs = 215

In [None]:
# Number of monthly columns per measure in `data`: columns 1..nMonths are read as
# flytipping counts and columns nMonths+1..2*nMonths as arson counts further down.
nMonths = 70

In [None]:
# Per-LSOA incident table. Later cells read its `lsoa_code` column and slice its
# month columns by position (1..nMonths and nMonths+1..2*nMonths).
# NOTE(review): absolute, user-specific path — the notebook cannot be re-run on
# another machine; consider a configurable data directory.
data = pd.read_csv("C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\Arson Project\\inc_per_lsoa.csv")

In [None]:
# IMD lookup table; later cells use its `lsoa_code` and `imd_decile` columns.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
imd_data = pd.read_csv("C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\Arson Project\\imd_2019_lsoa.csv")

In [None]:
# Incident-level table; each row carries an `inc_time_of_call` string and an `LSOA`.
cbandb_data = pd.read_csv("C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\Arson Project\\cbandb.csv")

# Keep the part of the timestamp before the first space and drop its first three
# characters, leaving the key that is later matched against the month column labels.
# NOTE(review): presumably turns "dd/mm/yyyy hh:mm" into "mm/yyyy" — confirm the format.
cbandb_data["monthyear"] = cbandb_data["inc_time_of_call"].map(
    lambda call_time: call_time.split(" ")[0][3:]
)

In [None]:
# Count incidents per (monthyear, LSOA) pair and flatten the result into a tidy
# three-column frame: Date, LSOA, "CB & B Count".
aggregate_data = (
    cbandb_data[["monthyear", "LSOA"]]
    .value_counts()
    .rename_axis(["Date", "LSOA"])
    .reset_index(name="CB & B Count")
)

In [None]:
# Attach each LSOA's IMD decile to `data` with a single keyed lookup.
#
# This replaces a row-by-row loop that filtered `imd_data` once per LSOA and
# called int() on the resulting one-element Series (deprecated since pandas 2.1).
# Every row of `data` is filled, matching the later cells that iterate over the
# whole `IMD` column rather than only the first `nLSOAs` rows.
imd_lookup = imd_data.set_index("lsoa_code")["imd_decile"]

# The old loop raised when a code matched more than one row; keep that strictness.
if not imd_lookup.index.is_unique:
    duplicated_codes = imd_lookup.index[imd_lookup.index.duplicated()].unique().tolist()
    raise ValueError(f"imd_data has duplicate lsoa_code values: {duplicated_codes}")

data["IMD"] = data["lsoa_code"].map(imd_lookup)

# The old loop also raised when a code had no match; fail loudly instead of
# letting NaN deciles flow into the model.
missing_codes = data.loc[data["IMD"].isna(), "lsoa_code"].tolist()
if missing_codes:
    raise ValueError(f"No IMD decile found for lsoa_code values: {missing_codes}")

In [None]:
# Build the flat list of "CB & B" counts, one entry per (LSOA, month) cell, ordered
# LSOA-major / month-minor so it lines up with the other flattened columns.
#
# A dictionary keyed by (date, LSOA) replaces the previous full scan of
# `aggregate_data` for every cell, so the work is linear instead of
# nLSOAs * nMonths * len(aggregate_data). If a key occurred twice the later row
# would win, exactly as in the scan it replaces.
count_by_date_and_lsoa = {
    (date, lsoa): count
    for date, lsoa, count in zip(
        aggregate_data["Date"],
        aggregate_data["LSOA"],
        aggregate_data["CB & B Count"],
    )
}

# The month key is the second space-separated token of each month column label.
month_keys = [label.split(" ")[1] for label in data.columns[1:nMonths + 1]]

# Pairs with no recorded incident default to 0.
cbandb_counts = [
    count_by_date_and_lsoa.get((month_key, lsoa_code), 0)
    for lsoa_code in data["lsoa_code"].iloc[:nLSOAs]
    for month_key in month_keys
]
            

In [None]:
# Flatten columns 1..nMonths of the first nLSOAs rows into one list, row by row
# (all months of the first LSOA, then all months of the second, ...).
flytipping_counts = [
    monthly_count
    for row in range(nLSOAs)
    for monthly_count in data.iloc[row, 1:nMonths + 1].tolist()
]

In [None]:
# Flatten columns nMonths+1..2*nMonths of the first nLSOAs rows into one list,
# row by row, so the order matches the other flattened columns.
arson_counts = [
    monthly_count
    for row in range(nLSOAs)
    for monthly_count in data.iloc[row, nMonths + 1:2 * nMonths + 1].tolist()
]

In [None]:
# One date key per month column (second space-separated token of the label),
# repeated once per LSOA so the list lines up with the flattened count lists.
dates = [label.split(" ")[1] for label in data.columns[1:nMonths + 1]] * nLSOAs

# Split every "<month>/<year>" key into its two parts; both stay strings.
months = []
years = []
for date_key in dates:
    date_parts = date_key.split("/")
    months.append(date_parts[0])
    years.append(date_parts[1])

In [None]:
# Repeat each value of the first column of `data` nMonths times in a row, so that
# every (LSOA, month) cell of the long-format table gets its LSOA identifier.
lsoas = data.iloc[:, 0].repeat(nMonths).tolist()

In [None]:
# Repeat each LSOA's IMD value nMonths times in a row, matching the layout of `lsoas`.
imd_deciles = data["IMD"].repeat(nMonths).tolist()

In [None]:
# Assemble the long-format modelling table: one row per (LSOA, month).
# "Arson Count" is deliberately last — later cells take the final column as the target.
long_format_columns = {
    "Month": months,
    "Year": years,
    "LSOA": lsoas,
    "IMD Decile": imd_deciles,
    "Flytipping Count": flytipping_counts,
    "CB & B Count": cbandb_counts,
    "Arson Count": arson_counts,
}
df = pd.DataFrame(long_format_columns)

In [None]:
# Preview the first rows of the assembled long-format table.
df.head()

In [None]:
# Columns that are one-hot encoded below and then dropped from `df`.
categorical_cols = ["Month", "Year", "LSOA"]

In [None]:
# One-hot encode the categorical columns, dropping the first level of each to
# avoid a redundant indicator.
#
# The `sparse=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4, and
# its replacement `sparse_output=` does not exist before 1.2. Relying on the
# default sparse output and densifying explicitly works on every version.
encoder = OneHotEncoder(drop="first")

dummy_view = encoder.fit_transform(df[categorical_cols]).toarray()

In [None]:
# Wrap the encoded array in a DataFrame; it gets a default 0..n-1 index, which is
# what the later index-based join with `df` relies on.
encoded_df = pd.DataFrame(dummy_view)

In [None]:
# Label the indicator columns as "<source column>_<category>".
# `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out`
# (available since 1.0) returns the same names.
encoded_df.columns = encoder.get_feature_names_out(categorical_cols)

In [None]:
# Remove the raw categorical columns now that their one-hot encodings exist.
df = df.drop(columns=categorical_cols)

In [None]:
# Put the indicator columns first and the remaining columns after them, aligned
# on the row index; the target ("Arson Count") therefore stays the last column.
# NOTE(review): re-running this cell without rebuilding `df` would try to join
# already-present columns — run the notebook top to bottom.
df = encoded_df.join(df)

In [None]:
# Column count of the full feature+target frame; used later as the insertion
# position for prediction columns.
ncols = df.shape[1]

# Hold out 33% of the rows for evaluation; the seed is fixed for reproducibility.
training_set, test_set = train_test_split(df, test_size=0.33, random_state=1)

In [None]:
# Features are every column except the last; the target is the last column.
X_train, y_train = training_set.iloc[:, :-1], training_set.iloc[:, -1]
X_test, y_test = test_set.iloc[:, :-1], test_set.iloc[:, -1]

In [None]:
# Fit four classifiers on the training split. `fit` returns the estimator itself,
# so construction and fitting are chained. All seeds are fixed for reproducibility.
# The target is the last column of the frame, so each distinct count value is
# treated as a class label.
adaboost = AdaBoostClassifier(random_state=1).fit(X_train, y_train)
rf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
# NOTE: despite its name, this is scikit-learn's GradientBoostingClassifier, not the
# XGBoost library; the name and column label are kept for compatibility.
xgboost = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
mlp = MLPClassifier(random_state=1).fit(X_train, y_train)

# Predict on the held-out features.
y_ada_pred = adaboost.predict(X_test)
y_rf_pred = rf.predict(X_test)
y_xg_pred = xgboost.predict(X_test)
y_mlp_pred = mlp.predict(X_test)

# Append one prediction column per model, in this order, after the existing columns.
# `assign` overwrites columns that already exist, so the cell can be re-run safely
# (positional `insert` raised "already exists" on a second run) and no longer
# depends on `ncols` still matching the frame's width.
prediction_columns = {
    "AdaBoost Predictions": y_ada_pred,
    "RF Predictions": y_rf_pred,
    "XGBoost Predictions": y_xg_pred,
    "MLP Predictions": y_mlp_pred,
}
test_set = test_set.assign(**prediction_columns)

# Number of trailing prediction columns; the evaluation loop reads this many
# columns from the end of `test_set`.
no_classifiers = len(prediction_columns)

In [None]:
# Show the held-out rows that recorded at least one arson incident, together with
# each model's prediction for them.
has_arson = test_set["Arson Count"] >= 1
test_set.loc[has_arson]

In [None]:
# Print the mean squared error of each model's predictions against the actual
# counts. The prediction columns are the last `no_classifiers` columns of
# `test_set`; walk them left to right by their offset from the end.
for offset_from_end in reversed(range(1, no_classifiers + 1)):
    model_column = test_set.columns[-offset_from_end]
    model_mse = mean_squared_error(test_set["Arson Count"], test_set.iloc[:, -offset_from_end])
    print(model_column, model_mse)

In [None]:
# Feature importances reported by the fitted random forest, one per input column.
features = rf.feature_importances_

# Pair each importance with its column name (all columns of `df` except the last,
# which is the target) and rank from most to least important.
importance_table = pd.DataFrame({"column_name": df.columns[:-1], "score": features})
ftrs = importance_table.sort_values(by="score", ascending=False).reset_index(drop=True)

# `.loc` slicing is label-inclusive, so this keeps rows 0..15, i.e. 16 features.
# NOTE(review): confirm whether a top-15 chart was intended.
top_features = ftrs.loc[:15]

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=top_features, x="score", y="column_name", ax=ax)
ax.set_title("Random Forest Feature Importance")
ax.set_xlabel("Score")
ax.set_ylabel("Column Names")
plt.show()