In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

In [None]:
# Dimensions of the modelling grid: one row per (LSOA, month) pair.
# nLSOAs - how many LSOA rows of the incidents table are used.
# nMonths - how many monthly columns there are per incident type.
nLSOAs, nMonths = 215, 70

In [None]:
### LOAD THE FLYTIPPING AND ARSON INCIDENT COUNTS (PER LSOA, PER MONTH) ###

# NOTE(review): absolute, user-specific path -- this cell only runs where this exact folder exists.
data = pd.read_csv(
    filepath_or_buffer="C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\Arson Project\\inc_per_lsoa.csv"
)

In [None]:
### LOAD THE IMD DECILE OF EVERY LSOA IN ENGLAND ###

# NOTE(review): absolute, user-specific path -- this cell only runs where this exact folder exists.
imd_data = pd.read_csv(
    filepath_or_buffer="C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\Arson Project\\imd_2019_lsoa.csv"
)

In [None]:
### LOAD THE WEATHER DATA ###

# NOTE(review): absolute, user-specific path -- this cell only runs where this exact folder exists.
weather_data = pd.read_csv(
    filepath_or_buffer="C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\Arson Project\\Weather Data\\weather.csv"
)

In [None]:
### LOAD THE CONTROLLED BURNING AND BONFIRE (CB & B) INCIDENTS (WITH LSOA AND DATE INFO), THEN DERIVE A MONTH/YEAR COLUMN ###

# NOTE(review): absolute, user-specific path -- this cell only runs where this exact folder exists.
cbandb_data = pd.read_csv(
    filepath_or_buffer="C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\Arson Project\\cbandb.csv"
)

# Keep the text before the first space of each call time and drop its first
# three characters (presumably the "dd/" of a "dd/mm/yyyy hh:mm" stamp -- TODO confirm).
cbandb_data["monthyear"] = cbandb_data["inc_time_of_call"].map(
    lambda call_time: call_time.split(" ")[0][3:]
)

In [None]:
### COUNT THE CB & B INCIDENTS FOR EACH (MONTH/YEAR, LSOA) COMBINATION ###

# value_counts() yields one row per distinct (monthyear, LSOA) pair; the index
# levels are renamed and then flattened back into ordinary columns.
aggregate_data = (
    cbandb_data[["monthyear", "LSOA"]]
    .value_counts()
    .rename_axis(["Date", "LSOA"])
    .reset_index(name="CB & B Count")
)

In [None]:
### ATTACH IMD DECILE TO THE INCIDENTS DATA ###

# Resolve every LSOA's decile in one vectorised lookup rather than filtering
# imd_data once per row and calling int() on a one-element Series (deprecated
# in recent pandas releases).
imd_lookup = imd_data.set_index("lsoa_code")["imd_decile"]
imd_for_lsoa = data["lsoa_code"].map(imd_lookup)

# Fail loudly, naming the offending codes, if any LSOA has no IMD entry.
unmatched_lsoas = data.loc[imd_for_lsoa.isna(), "lsoa_code"].tolist()
if unmatched_lsoas:
    raise ValueError("No IMD decile found for LSOA code(s): {}".format(unmatched_lsoas))

# Truncate to whole deciles (as int() did) and store as float, which is the
# dtype the previous cell-by-cell .loc assignment produced for this column.
data["IMD"] = imd_for_lsoa.astype(int).astype(float)

In [None]:
### CREATE LIST OF CB & B INCIDENT COUNTS. THIS LIST CORRESPONDS TO A MATRIX OF DIMENSIONS nLSOAs x nMonths ###

# Index the aggregated counts by (month/year, LSOA) once, so each cell of the
# matrix is a constant-time dictionary lookup instead of a full scan of
# aggregate_data. If a key repeated, the last row would win, exactly as in a
# sequential scan.
cbandb_lookup = dict(
    zip(
        zip(aggregate_data["Date"], aggregate_data["LSOA"]),
        aggregate_data["CB & B Count"],
    )
)

# The month/year label is the second space-separated token of each of the first
# nMonths count columns (column 0 is skipped; it holds the LSOA code).
month_labels = [data.columns[j].split(" ")[1] for j in range(1, nMonths + 1)]

# LSOA-major, month-minor ordering; pairs with no recorded CB & B incident get 0.
# The lookup makes this fast enough that no progress bar is needed.
cbandb_counts = [
    cbandb_lookup.get((month_label, data.loc[i, "lsoa_code"]), 0)
    for i in range(nLSOAs)
    for month_label in month_labels
]
            

In [None]:
### FLATTEN THE FLYTIPPING COUNTS ROW BY ROW INTO ONE LIST (nLSOAs x nMonths VALUES) ###

# Columns 1..nMonths of each LSOA row are read as that LSOA's flytipping counts.
flytipping_counts = [
    count
    for row in range(nLSOAs)
    for count in data.iloc[row, 1:nMonths+1].tolist()
]

In [None]:
### FLATTEN THE ARSON COUNTS ROW BY ROW INTO ONE LIST (nLSOAs x nMonths VALUES) ###

# Columns nMonths+1..2*nMonths of each LSOA row are read as that LSOA's arson counts.
arson_counts = [
    count
    for row in range(nLSOAs)
    for count in data.iloc[row, nMonths+1:2*nMonths+1].tolist()
]

In [None]:
### BUILD THE DATE LABELS AND SPLIT THEM INTO MONTHS AND YEARS ###

# Second space-separated token of each of the first nMonths count columns,
# repeated once per LSOA so it lines up with the flattened count lists.
column_dates = [label.split(" ")[1] for label in data.columns[1:nMonths+1]]
dates = column_dates * nLSOAs

# Each label is split on "/": first part is the month, second part the year.
date_parts = [label.split("/") for label in dates]
months = [parts[0] for parts in date_parts]
years = [parts[1] for parts in date_parts]

In [None]:
### REPEAT EACH LSOA nMonths TIMES IN A ROW SO IT ALIGNS WITH THE FLATTENED COUNTS ###

lsoas = []

# The first column of the incidents table holds the LSOA identifier.
for lsoa_code in data.iloc[:,0]:
    lsoas.extend([lsoa_code] * nMonths)

In [None]:
### REPEAT EACH LSOA'S IMD DECILE nMonths TIMES IN A ROW, MATCHING THE LSOA LIST ###

imd_deciles = []

for decile in data.IMD:
    imd_deciles.extend([decile] * nMonths)

In [None]:
### TURN WEATHER DATA INTO USABLE FORM ###

# Position of the first weather row used. The window length is tied to nMonths
# so it cannot drift out of step with the incident columns (previously the
# slice end was hard-coded as 73 = 3 + 70).
# NOTE(review): rows before this offset presumably pre-date the first incident month -- confirm.
weather_start_row = 3
weather_window = weather_data.iloc[weather_start_row:weather_start_row + nMonths]

# A short weather file would otherwise only surface later as a length mismatch
# when the model DataFrame is built.
if len(weather_window) != nMonths:
    raise ValueError(
        "Expected {} weather rows starting at row {}, found {}".format(
            nMonths, weather_start_row, len(weather_window)
        )
    )

# Repeat the monthly series once per LSOA so it lines up with the flattened count lists.
max_temp = list(weather_window.max_temp) * nLSOAs
min_temp = list(weather_window.min_temp) * nLSOAs
af_days = list(weather_window.af_days) * nLSOAs        # AF - Air Frost
rain_mm = list(weather_window.rain_mm) * nLSOAs
sun_hours = list(weather_window.sun_hours) * nLSOAs

In [None]:
### COMBINE ALL OF THE PREPARED LISTS INTO ONE DATAFRAME (ONE ROW PER LSOA PER MONTH) ###

df = pd.DataFrame(
    data={
        "Month": months,
        "Year": years,
        "Max Temp": max_temp,
        "Min Temp": min_temp,
        "AF Days": af_days,
        "Rainfall": rain_mm,
        "Hours of Sun": sun_hours,
        "LSOA": lsoas,
        "IMD Decile": imd_deciles,
        "Flytipping Count": flytipping_counts,
        "CB & B Count": cbandb_counts,
        "Arson Count": arson_counts,
    }
)

In [None]:
### LIST THE FEATURES THAT ARE CATEGORICAL (i.e. NOT ORDERED NUMERIC) ###

categorical_cols = [
    "Month",
    "Year",
    "LSOA",
]

In [None]:
### INITIATE A DUMMY ENCODING FOR THE CATEGORICAL COLUMNS SO THAT THE MODEL CAN LEARN PROPERLY ###

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword. Try the current spelling first and fall back for older installs, so
# the cell runs on either. drop="first" omits the first category of each column.
try:
    encoder = OneHotEncoder(drop="first", sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop="first", sparse=False)

# Dense array of 0/1 indicator columns for the categorical features.
dummy_view = encoder.fit_transform(df[categorical_cols])

In [None]:
### WRAP THE DUMMY ENCODED ARRAY IN A NEW DATAFRAME ###

encoded_df = pd.DataFrame(data=dummy_view)

In [None]:
### NAME THESE COLUMNS IN THE FORMAT "columnname_value" ###

# `get_feature_names` was removed in scikit-learn 1.2; its replacement
# `get_feature_names_out` exists from 1.0. Prefer the new method and fall back
# to the old one only when the installed version lacks it.
feature_namer = getattr(encoder, "get_feature_names_out", None) or encoder.get_feature_names
encoded_df.columns = feature_namer(categorical_cols)

In [None]:
### REMOVE THE ORIGINAL (UN-ENCODED) CATEGORICAL COLUMNS ###

df = df.drop(columns=categorical_cols)

In [None]:
### PLACE THE DUMMY ENCODED COLUMNS IN FRONT OF THE REMAINING FEATURES ###

# Index-aligned left join: the encoded columns come first, then the rest of df.
df = encoded_df.join(df, how="left")

In [None]:
### WRITE THE FINAL DATAFRAME TO DISK FOR USE IN ml_models.ipynb ###

# NOTE(review): absolute, user-specific path -- this cell only runs where this exact folder exists.
df.to_csv(
    path_or_buf="C:\\Users\\agozacan\\OneDrive - Humberside Fire and Rescue Service\\Arson Project\\Clean Data\\model_data.csv",
    index=False,
)