# Extract Crime Data

In [None]:
# Import Relevant Modules
import os
import pandas as pd
import numpy as np

In [None]:
# Directory Navigation
# Locate every monthly crime CSV below the London Crime folder. All paths are
# built as absolute paths, so no per-folder chdir is required.
d = os.getcwd()
print(f'Current: {d}')
file_location = "/home/uthlakanyana/Dropbox/Dissertation Code/London Crime"
os.chdir(file_location)

# Obtain Directory Items
# Only sub-directories are kept: a stray file in the root (e.g. sync or OS
# metadata) would otherwise make the per-folder listing below raise.
locations = [os.path.join(file_location, entry)
             for entry in sorted(os.listdir(file_location))
             if os.path.isdir(os.path.join(file_location, entry))]

# Obtain the Files Locations in Each Directory
# Restricted to CSV files because every path collected here is later handed
# to pd.read_csv.
dataframes = []
for folder in locations:
    dataframes.extend(os.path.join(folder, name)
                      for name in sorted(os.listdir(folder))
                      if name.lower().endswith(".csv"))


In [None]:
# Collate Data into a Single Dataframe
# `dataframes` enters this cell as a list of CSV paths and leaves it as a list
# of loaded frames whose column names contain no spaces.
_crime_columns = ["Month", "Reported by", "Longitude", "Latitude",
                  "Location", "LSOA code", "LSOA name", "Crime type"]
_crime_renames = {"Reported by": "ReportedBy",
                  "LSOA code": "LSOA_Code",
                  "LSOA name": "LSOA_Name"}
dataframes = [pd.read_csv(path, usecols=_crime_columns).rename(columns=_crime_renames)
              for path in dataframes]

# sort_values returns a new frame, so the result must be assigned back
# (the bare call left Combined unsorted). ignore_index gives the combined
# frame a unique 0..n-1 row index instead of repeating each file's index.
Combined = pd.concat(dataframes, axis=0, ignore_index=True)
Combined = Combined.sort_values(by="Month", kind="stable", ignore_index=True)
# Rows without an LSOA name get a sentinel so later string matching never sees NaN
Combined.LSOA_Name = Combined.LSOA_Name.fillna("ZZZ")
#Combined.to_csv("/home/uthlakanyana/Combined.csv",index=False)
        

In [None]:
# LSOA Data
# Note that city of London data is marked as confidential
_household_source_columns = ["Area name",
                             "Working households (thousands)",
                             "Working households (per cent)",
                             "Mixed households (thousands)",
                             "Mixed households (per cent)",
                             "Workless households (thousands)",
                             "Workless households (per cent)"]
lsoa_data = pd.read_csv(
    "/home/uthlakanyana/Dropbox/Dissertation Code/LSOA Data -2021.csv",
    usecols=_household_source_columns,
)
# "Area Code" is not among the loaded columns, so in practice only
# "Area name" is renamed here.
lsoa_data = lsoa_data.rename(columns={"Area Code": "LSOA_Code",
                                      "Area name": "LSOA_Name"})

In [None]:
# List and Designate Areas/Boroughs
# The City of London plus the 32 London boroughs. Any LSOA name containing one
# of these strings is later collapsed to the bare borough name. "Sutton" and
# "Waltham Forest" were absent from the original list, so their records were
# never collapsed to a borough.
areas = ["Camden","City of London","Hackney","Hammersmith and Fulham",
         "Haringey","Islington","Kensington and Chelsea","Lambeth","Lewisham",
         "Newham","Southwark","Tower Hamlets","Wandsworth","Westminster",
         "Barking and Dagenham","Barnet","Bexley","Brent","Bromley","Croydon","Ealing",
         "Enfield","Greenwich","Harrow","Havering","Hillingdon","Hounslow",
         "Kingston upon Thames","Merton","Redbridge","Richmond upon Thames",
         "Sutton","Waltham Forest",]

In [None]:
# Replace Full Area Names With Basic Versions; Keep Track of the Amount of Crime They Account for as Well
# x accumulates the number of crime rows that end up under a borough name.
x = 0
for zone in areas:
    crime_hits = Combined["LSOA_Name"].str.contains(zone)
    Combined.loc[crime_hits, "LSOA_Name"] = zone

    household_hits = lsoa_data["LSOA_Name"].str.contains(zone)
    lsoa_data.loc[household_hits, "LSOA_Name"] = zone

    zone_total = len(Combined[Combined["LSOA_Name"] == zone])
    print(zone, zone_total)
    x += zone_total

In [None]:
# Calculate the Percentage of Crime under the Umbrella of Proper LSOAs
print(x/len(Combined))       

In [None]:
# Export the Data
# Left join keeps every crime row; household columns stay empty where the
# borough name has no match in lsoa_data.
Comprehensive_LSOA = Combined.merge(lsoa_data, on="LSOA_Name", how="left")
Comprehensive_LSOA = Comprehensive_LSOA.fillna(np.nan)
print(len(Comprehensive_LSOA))
Comprehensive_LSOA.to_csv(
    "/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data.csv",
    index=False,
)

# Preliminary Work

In [None]:
# Import Modules

import pandas as pd
import geopandas as gpd

In [None]:
# Load Crime Data and Reorganize It
# Rows are ordered chronologically and renumbered from zero; initial_size is
# the baseline for the "how much data is left" checks further down.

data = pd.read_csv(
    "/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data.csv",
    low_memory=False,
)
data["Month"] = data["Month"].astype("datetime64[ns]")
data = data.sort_values(by="Month").reset_index(drop=True)
initial_size = len(data)

In [None]:
data.sort_values(by="Month").head()

In [None]:
# Load Geographic Data and Join It to the Crime Data
# Use the EPSG Relevant to the UK

london_geo_data = gpd.read_file("/home/uthlakanyana/Dropbox/Dissertation Code/statistical-gis-boundaries-london/ESRI/LSOA_2011_London_gen_MHW.shp")
london_geo_data = london_geo_data.rename(columns={"LSOA11CD":"LSOA_Code"}).to_crs(27700)
# Polygon area in CRS units divided by 10**6
# (presumably square metres -> square kilometres for EPSG:27700 - confirm)
london_geo_data["Area"] = london_geo_data["geometry"].area/10**6

# Attach the shapefile attributes to each crime row by LSOA code, then drop
# the columns that are not used downstream.
_unused_columns = ["ReportedBy","Location","MSOA11CD","MSOA11NM","LAD11CD",
                   "LAD11NM","RGN11CD","RGN11NM","geometry","USUALRES"]
data = data.join(london_geo_data.set_index('LSOA_Code'),on="LSOA_Code")
data = data.drop(columns=_unused_columns)

In [None]:
p = data.merge(london_geo_data,on="LSOA_Code")

In [None]:
p

In [None]:
list(data.LSOA_Code.unique())

In [None]:
list(london_geo_data.LSOA_Code.unique())

In [None]:
london_geo_data.head()

In [None]:
# Sample the First Entry

data.iloc[0]

In [None]:
data

In [None]:
# Clear Missing Data
# Keep only rows that have BOTH coordinates. The previous two-step filter
# restarted from `data` on its second line, so rows with a missing Longitude
# were silently kept.

data2 = data.dropna(subset=["Longitude", "Latitude"])
after_missing_size = len(data2)

In [None]:
# Percentage of Initial Data Remaining

print(after_missing_size/initial_size * 100)

In [None]:
# City of London LSOA Information Is always Confidential and/or Not Collected
# Print the distinct values of each household column for City of London rows.

_household_columns = ["Working households (thousands)",
                      "Working households (per cent)",
                      "Mixed households (thousands)",
                      "Mixed households (per cent)",
                      "Workless households (thousands)",
                      "Workless households (per cent)"]
_is_city = data2["LSOA_Name"] == "City of London"
for _column in _household_columns:
    print(data2.loc[_is_city, _column].unique())

# We Can Confirm How Much It Contributes to the Data Set in Terms of Criminal Instances
# The share is rows-in-City / all rows. (count().sum() added up the non-null
# cells of every column and therefore overstated the share.)

city_of_london = data[data["LSOA_Name"]=="City of London"]
print("City of London contribution is",len(city_of_london)/len(data) * 100,"%")

# Thus We Can Remove It Entirely For Now. It Can Be Treated Separately Later

data3 = data2[data2["LSOA_Name"] != "City of London"]



In [None]:
data3

In [None]:
# Convert Data to Proper Values
# The six household columns are cast to float64; dtypes are printed first so
# the before/after can be compared with the next cell.

print(data3.dtypes)
_float_columns = ["Working households (thousands)",
                  "Working households (per cent)",
                  "Mixed households (thousands)",
                  "Mixed households (per cent)",
                  "Workless households (thousands)",
                  "Workless households (per cent)"]
data3 = data3.astype(dict.fromkeys(_float_columns, "float64"))

In [None]:
# Check that Relevant Columns Have Been Changed

data3.dtypes

In [None]:
# Check for any NaN Values in the dataset

data3.isna().any()

In [None]:
# The First Six Belong to the LSOAs not Represented in the Economic LSOA Data. To Confirm:
# One mask marks the rows without household data and is reused for the
# count, the proportion and the final drop.

_no_household_data = data3["Working households (thousands)"].isna()
unique_na = data3.loc[_no_household_data, "LSOA_Name"].unique()
number_na = data3.loc[_no_household_data, "LSOA_Name"].count()
print(number_na)

# They Form a Small Proportion of the Dataset

proportion = round(number_na / len(data3) * 100, 2)

print(f"{proportion}%")

# We Can Drop Them

data4 = data3[~_no_household_data]


In [None]:
# There Are Now No NaN Values from the Crime Dataset + LSOA Information

data4.isna().any()

In [None]:
# Total Percentage Reduction in the Dataset Size So Far

print(f"{round((100-((len(data4)/initial_size) * 100)),2)}%")

In [None]:
# Some LSOAs in Brent Are Not Recognized in the Geographic Dataset
# Rows whose LSOA11NM is empty found no partner in the shapefile join.

_unmatched = data4[data4["LSOA11NM"].isna()]
print(_unmatched.iloc[0])

# Check if the First LSOA Code Here Is in the Geographic Dataset
# The code is read by column label rather than by position, and from the same
# row that was printed above.
_brent_codes = london_geo_data.loc[london_geo_data["LAD11NM"]=="Brent", "LSOA_Code"].unique()
_unmatched.iloc[0]["LSOA_Code"] in _brent_codes

In [None]:
data4["LSOA11NM"]

In [None]:
# Identify Problematic Columns and their Quantities
# Count, per borough name, the rows that have no LSOA11NM from the shapefile.

missing_lsoa11nm = data4[data4["LSOA11NM"].isna()].assign(Instance=1)
missing_lsoa11nm.groupby("LSOA_Name", as_index=False)[["Instance"]].sum()

In [None]:
# We Can Thus Drop the Entries Since They Are Insignificant and Useless

data5 = data4[data4["LSOA11NM"].isna() == False]


In [None]:
# If We Check for Null Values in the Dataset, We Can See That There Are None

data5.isna().any()

In [None]:
data5.to_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 2.csv",index=False)

# Begin Graphing 

## Loads

In [None]:
# Import Modules

import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Confirm that the Dataset is Cleaned. Each Feature Should Be Appropriately Typed, and there Should Be No NaN Values in the Dataset

data = pd.read_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 2.csv")
data["Month"] = data["Month"].astype("datetime64[ns]")
#print(data.dtypes)
#data.isna().any()

In [None]:
data[data["Month"].dt.year==2022]

## Bar Charts

In [None]:
# Create a Bar Chart of Crime Incidence By Location
# One horizontal bar per LSOA_Name value, ordered by descending row count.
sns.set_style("white")

#plt.ticklabel_format(style = 'plain')
# Figure size and style are set globally for this and later seaborn plots
sns.set(rc={'figure.figsize':(9,7)},style="white")
zone_plot = sns.countplot(y=data.LSOA_Name,order = data.LSOA_Name.value_counts().index,palette="RdBu")
# NOTE(review): a scalar xlim presumably sets only the lower x-limit, so the
# bars would start at 20,000 rather than 0 - confirm this truncation is intended
zone_plot.set(xlabel='Criminal Incidents', ylabel='Borough',xlim=20000)
# Show tick labels with thousands separators and no decimals
zone_plot.get_xaxis().set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))

In [None]:
# Get the Ranked List of the Most Crime-Ridden Areas
# assign() works on a copy, so the helper column "Sum" no longer leaks into
# `data` (plain assignment only created a second name for the same frame).
crime_ridden_areas = data.assign(Sum=1)
crime_ridden_areas.groupby("LSOA_Name")["Sum"].sum().sort_values(ascending=False)

In [None]:
# Create Bar Chart of Crime Incidence by Type
# One horizontal bar per "Crime type" value, in order of first appearance.

sns.set(rc={'figure.figsize':(9,7)})
zone_plot = sns.countplot(y=data["Crime type"],palette="RdBu")
# NOTE(review): a scalar xlim presumably sets only the lower x-limit, so the
# bars would start at 5,000 rather than 0 - confirm this truncation is intended
zone_plot.set(xlabel='Criminal Incidents', ylabel='Type of Crime',xlim=5000)
# Show tick labels with thousands separators and no decimals
zone_plot.get_xaxis().set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))

## Time Series

In [None]:
# Time Series Plot of Crime Incidence Over Time
# assign() works on a copy, so the helper column "Counts" is not added to
# `data` itself (plain assignment only aliased the same frame).

data_temp = data.assign(Counts=1)
crime_over_time = data_temp.groupby("Month")["Counts"].count()
time_plot = sns.lineplot(x=crime_over_time.index,y=crime_over_time,color="r")
time_plot.set(xlabel='Time', ylabel='Criminal Incidents')
plt.xticks(rotation=30)
# Show tick labels with thousands separators and no decimals
time_plot.get_yaxis().set_major_formatter(mpl.ticker.StrMethodFormatter('{x:,.0f}'))

In [None]:
data.sort_values(by=["Month"])

# Group Data and Plot Geography

## Loading

In [None]:
# Import Modules

import pandas as pd
import os
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import geopandas as gpd
import geodatasets
import contextily
import category_encoders as ce
import warnings
from shapely.errors import ShapelyDeprecationWarning
warnings.filterwarnings("ignore", category=ShapelyDeprecationWarning) 
import plotly.express as px

In [None]:
#Load Data
# After this cell "Year" is the first column, "Month" holds the month number
# and every row carries Instances = 1 so that sums count crimes.

data = pd.read_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 2.csv")
_timestamps = data["Month"].astype("datetime64[ns]")
data["Instances"] = 1
data.insert(0, "Year", _timestamps.dt.year)
data["Month"] = _timestamps.dt.month


In [None]:
# Select the Data from the Top 5 Regions
# Keys are the lower-case borough names; values are the matching rows of `data`.

Regions = {
    borough.lower(): data.loc[data["LSOA_Name"] == borough]
    for borough in ("Westminster", "Tower Hamlets", "Southwark", "Newham", "Lambeth")
}


## Visualizations

In [None]:
# Load the Shapefiles for London

london_geo_data = gpd.read_file("/home/uthlakanyana/Dropbox/Dissertation Code/statistical-gis-boundaries-london/ESRI/LSOA_2011_London_gen_MHW.shp")
london_geo_data.to_crs(epsg=4326, inplace=True)
london_geo_data.head()

In [None]:
# Crime counts per (year, month, coordinate, LSOA), with the LSOA polygon
# attached from the shapefile.
_point_keys = ["Year","Month","Longitude","Latitude","LSOA_Name","LSOA11NM"]
instances = data[_point_keys + ["Instances"]].groupby(_point_keys, as_index=False).sum()
instances = instances.merge(london_geo_data[["LSOA11NM","geometry"]], on="LSOA11NM")

In [None]:
# Total crime count and crime density per LSOA polygon.
_lsoa_totals = instances[["LSOA11NM","Instances"]].groupby(["LSOA11NM"],as_index=False).sum()
crime_instances = london_geo_data.merge(_lsoa_totals)
# The frame itself stays in EPSG:4326 for plotting, but areas must be measured
# in a projected CRS: to_crs returns a new object (the bare call was discarded,
# leaving the area in squared degrees). Divided by 10**6 like the "Area"
# column computed during the preliminary work.
crime_instances["area"] = crime_instances["geometry"].to_crs(epsg=27700).area/10**6
crime_instances["Crime Density"] = crime_instances["Instances"]/crime_instances["area"]

In [None]:


# Map each selected borough's LSOAs coloured by crime-density quartile.
crimes = {"westminster":pd.DataFrame(),"newham":pd.DataFrame(),"lambeth":pd.DataFrame(),"southwark":pd.DataFrame(),"tower hamlets":pd.DataFrame()}

num_classes = 4
num_qtiles = [0, .25, .5, .75, 1.]
qlabels = ["1st quartile","2nd quartile","3rd quartile","4th quartile"]

for i in crimes.keys():
    # copy() so that the new column is written to an independent frame and
    # not to a slice of crime_instances
    borough = crime_instances[crime_instances["LAD11NM"]==i.title()].copy()
    borough["Density_Quartile"] = pd.qcut(borough["Crime Density"], num_qtiles, labels=qlabels)
    crimes[i] = borough
    # .plot() opens its own figure, so no separate plt.plot() call is needed
    ax = borough.plot(column="Density_Quartile",legend=True,cmap="coolwarm",figsize=(7, 7))
    ax.set_title(f"Crime Density Distribution in {i.title()}")
    # The shapefile was reprojected to EPSG:4326: x is longitude, y is latitude
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")


In [None]:
# Visualize the Geographical Dimensions of the Regions in Question
# Draws the LSOA boundaries of each selected borough on its own figure.

for i in Regions.keys():
    print(i)
    # Draw on the axes created here; letting .plot() open a second figure
    # left this one empty.
    fig, ax = plt.subplots(figsize=(7, 7))
    london_geo_data[london_geo_data['LAD11NM']== i.title()].boundary.plot(color="darkred",ax=ax)
    ax.set_title(f"Layout of the {i.title()} Borough")
    # The shapefile was reprojected to EPSG:4326: x is longitude, y is latitude
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    # plt.show() takes no axes argument; a positional value is read as `block`
    plt.show()

In [None]:
# Visualize the Distribution of Crimes in Each Borough
# Colours each LSOA by its total crime count, binned by the 'quantiles'
# scheme. The column "Incident_Quartile" is never created in this notebook;
# "Instances" is the per-LSOA count that crime_instances actually holds.

for i in Regions.keys():
    borough = crime_instances[crime_instances['LAD11NM']== i.title()]
    # .plot() opens its own figure, so no separate plt.plot() call is needed
    ax = borough.plot(column="Instances",legend=True,cmap="coolwarm",figsize=(7, 7),scheme='quantiles')
    ax.set_title(f"Crime Distribution in {i.title()}")
    # The shapefile was reprojected to EPSG:4326: x is longitude, y is latitude
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    #fig.show()

In [None]:
# Visualize the Distribution of Household Residents in each Borough
# HH_Plots maps each region key to the title object of its choropleth.

HH_Plots = {}
for _region in Regions.keys():
    _borough_shapes = london_geo_data[london_geo_data['LAD11NM'] == _region.title()]
    _axes = _borough_shapes.plot(column="HHOLDRES", legend=True, cmap="coolwarm", figsize=(7, 7))
    HH_Plots[_region] = _axes.set_title(f"Household Distribution in {_region.title()}")

In [None]:
# Visualize the Distribution of Population Density in Each Borough
# PD_Plots maps each region key to the title object of its choropleth.

PD_Plots = {}
for _region in Regions.keys():
    _borough_shapes = london_geo_data[london_geo_data['LAD11NM'] == _region.title()]
    _axes = _borough_shapes.plot(column="POPDEN", legend=True, cmap="coolwarm", figsize=(7, 7))
    PD_Plots[_region] = _axes.set_title(f"Population Density Distribution in the {_region.title()}")

In [None]:
# Visualize the Distribution of Population Density in Every Borough of the Shapefile
# The names come straight from LAD11NM, so they are matched as-is: applying
# .title() would turn e.g. "Kensington and Chelsea" into a name that matches
# nothing. The plotted column is POPDEN (the bare name `column` was undefined).

for i in london_geo_data["LAD11NM"].unique():
    ax = london_geo_data[london_geo_data["LAD11NM"]==i].plot(column="POPDEN",legend=True,cmap="coolwarm",figsize=(7, 7))
    ax.set_title(f"Population Density Distribution in {i}")

#PD_Plots = {i:london_geo_data[london_geo_data['LAD11NM']== i.title()].plot(column="POPDEN",legend=True,cmap="coolwarm",figsize=(7, 7)).set_title(f"Population Density Distribution in the {i.title()} Borough") for i in Regions.keys()}

In [None]:
# General Crime Plotting Function

def crime_plotter(region):
    """Scatter every crime of ``region`` over that borough's LSOA boundaries.

    ``region`` is a key of the module-level ``Regions`` dict; its title-cased
    form is matched against ``LAD11NM`` in ``london_geo_data``.

    Returns whatever ``plt.show()`` returns.
    """
    borough_name = region.title()
    figure, axes = plt.subplots(figsize=(8,6))

    # Boundaries first, then one small black dot per crime record on top
    outline = london_geo_data[london_geo_data["LAD11NM"] == borough_name]
    outline.boundary.plot(color="darkred", ax=axes)
    Regions[region].plot.scatter(x="Longitude", y="Latitude", ax=axes, s=1, c="black")

    axes.set_title(f"Crime Plot of {borough_name} Borough")
    return plt.show()

# Specific Crime plotting Function

def specific_crime_plotter(region,crime_type):
    """Scatter one crime type of ``region`` over that borough's LSOA boundaries.

    ``region`` is a key of the module-level ``Regions`` dict; ``crime_type``
    is compared for equality with the "Crime type" column.

    Returns whatever ``plt.show()`` returns.
    """
    borough_name = region.title()
    figure, axes = plt.subplots(figsize=(8,6))

    outline = london_geo_data[london_geo_data["LAD11NM"] == borough_name]
    outline.boundary.plot(color="darkred", ax=axes)

    # Only the rows of the requested crime type are drawn
    borough_crimes = Regions[region]
    selected = borough_crimes[borough_crimes["Crime type"] == crime_type]
    selected.plot.scatter(x="Longitude", y="Latitude", ax=axes, s=1, c="black")

    axes.set_title(f"{crime_type.title()} Crime Incidences in {borough_name} Borough")
    return plt.show()

In [None]:
# Plot General Crime

for i in Regions.keys():
    crime_plotter(i)

In [None]:
# List Types of Crimes

data["Crime type"].unique()

In [None]:
# Plot the Specific Crimes That Take Place in Each Zone

for i in Regions.keys():
    for v in data["Crime type"].unique():
        specific_crime_plotter(i,v)

## Data Splitting

### Specific Crimes

In [None]:
# Number of Unique LSOAs

print(len(set(data["LSOA_Code"].unique())))

# Arrange Specific Crimes Into a Dictionary
# Keys are crime types; values are the rows of the five selected boroughs
# that carry that crime type.

groups = pd.concat(Regions.values())
specific_crime = {}
for i in groups["Crime type"].unique():
    specific_crime[i] = groups[groups["Crime type"]==i]

# Monthly Counts and Long-Run Crime Density per LSOA, For Each Specific Crime

# Columns that identify one LSOA in one month; everything else is summed.
_lsoa_month_keys = ["Year","Month",
                    "LSOA_Code","LSOA_Name",
                    "Working households (thousands)",
                    "Working households (per cent)",
                    "Mixed households (thousands)",
                    "Mixed households (per cent)",
                    "Workless households (thousands)",
                    "Workless households (per cent)",
                    "LSOA11NM","HHOLDRES","COMESTRES",
                    "POPDEN","HHOLDS","AVHHOLDSZ",
                    "Area"]

total_specific_crime = {}
combined_crime = {}

for i in specific_crime.keys():

    # Coordinates are left out so that only "Instances" is summed
    temp_data = specific_crime[i][_lsoa_month_keys + ["Instances"]].copy()

    specific_area_group = (temp_data
                           .groupby(by=_lsoa_month_keys, as_index=False).sum()
                           .sort_values(by=["Year","Month","LSOA_Name"],ascending=True))

    # Specific Crime Instances Per Zone For Years Up to 2022

    _before_2023 = specific_area_group.loc[specific_area_group["Year"]!=2023,
                                           ["LSOA_Code","LSOA11NM","Area","Instances"]]
    total_specific_crime[i] = _before_2023.groupby(by=["LSOA_Code","LSOA11NM","Area"],as_index=False).sum()
    # Density is crimes per unit area. The ratio was previously inverted
    # (Area / Instances), which ranked the quietest LSOAs highest.
    total_specific_crime[i]["Crime Density 2022"] = total_specific_crime[i]["Instances"] / total_specific_crime[i]["Area"]

    # Join specific_area_group to total_specific_crime on Basis of LSOA_Code

    combined_crime[i] = pd.merge(specific_area_group,total_specific_crime[i][["LSOA_Code","Crime Density 2022"]],on="LSOA_Code")
    #specific_crime[i] = specific_area_group

combined_crime[i].head()

### Total Crimes

In [None]:
# Group The Occurrence of Crimes Per LSOA In General

temp_data = pd.concat(Regions.values())[["Year","Month","Longitude",
                "Latitude","LSOA_Code","LSOA_Name",
                "Working households (thousands)",
                "Working households (per cent)",
                "Mixed households (thousands)",
                "Mixed households (per cent)",
                "Workless households (thousands)",
                "Workless households (per cent)",
                "LSOA11NM","HHOLDRES","COMESTRES",
                "POPDEN","HHOLDS","AVHHOLDSZ",
                "Area","Instances"]]

# Columns that identify one LSOA in one month; "Instances" is summed over them.
_lsoa_month_keys = ["Year","Month",
                    "LSOA_Code","LSOA_Name",
                    "Working households (thousands)",
                    "Working households (per cent)",
                    "Mixed households (thousands)",
                    "Mixed households (per cent)",
                    "Workless households (thousands)",
                    "Workless households (per cent)",
                    "LSOA11NM","HHOLDRES","COMESTRES",
                    "POPDEN","HHOLDS","AVHHOLDSZ",
                    "Area"]

# Coordinates are dropped before grouping so that they are not summed
per_area_group = (temp_data.drop(columns=["Longitude","Latitude"])
                  .groupby(by=_lsoa_month_keys, as_index=False).sum()
                  .sort_values(by=["Year","Month","LSOA_Name"],ascending=True))


# Crime Instances Per Zone For Years not 2023

_before_2023 = per_area_group.loc[per_area_group["Year"]!=2023,
                                  ["LSOA_Code","LSOA11NM","Area","Instances"]]
total_crime = _before_2023.groupby(by=["LSOA_Code","LSOA11NM","Area"],as_index=False).sum()
# Density is crimes per unit area. The ratio was previously inverted
# (Area / Instances), which ranked the quietest LSOAs highest.
total_crime["Crime Density 2022"] = total_crime["Instances"]/total_crime["Area"]

# Join per_area_group to total_crime on Basis of LSOA_Code

per_area_group = pd.merge(per_area_group,total_crime[["LSOA_Code","Crime Density 2022"]],on="LSOA_Code")
per_area_group

### Monthly Imputations

In [None]:
# Generate Year-Month Tuples to Fill In Any Gaps Due to Months Not Having Any Crime
# Covers July 2020 through June 2023 inclusive, one (year, month) pair per month.

tuples = [(year, month)
          for year, first_month, last_month in ((2020, 7, 12), (2021, 1, 12),
                                                (2022, 1, 12), (2023, 1, 6))
          for month in range(first_month, last_month + 1)]

#locations = per_area_group["LSOA11NM"].unique()

len(tuples)

In [None]:
# Function to Fill Missing Months With Crime Values

def new_filler(dataframe,locations,tuples=tuples):
    """Add a zero-crime row for every (year, month) a location is missing.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns "LSOA11NM", "Year", "Month" and "Instances".
    locations : iterable
        "LSOA11NM" values to check. Values with no rows in ``dataframe`` are
        skipped, because there is no row to copy the other columns from.
    tuples : sequence of (year, month) pairs
        The months every location is expected to have.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` itself when nothing is missing; otherwise a new frame
        with a fresh 0..n-1 index and the filler rows placed after the
        original rows. A filler row copies the location's first row and sets
        "Instances" to 0 and "Year"/"Month" to the missing pair.
    """
    expected = [(int(year), int(month)) for year, month in tuples]
    fillers = []

    for l in locations:
        existing = dataframe[dataframe["LSOA11NM"] == l]
        if existing.empty:
            continue

        present = set(zip(existing["Year"].astype(int), existing["Month"].astype(int)))
        missing = [pair for pair in expected if pair not in present]
        if not missing:
            continue

        print(l, f" has {len(existing)} entries; adding {len(missing)} empty months")

        # Repeating a one-row frame (not a Series) keeps every column's dtype
        block = pd.concat([existing.iloc[[0]]] * len(missing), ignore_index=True)
        block["Instances"] = 0
        block["Year"] = [year for year, _ in missing]
        block["Month"] = [month for _, month in missing]
        fillers.append(block)

    if not fillers:
        return dataframe

    # A single concat replaces the row-by-row DataFrame.append calls;
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.concat([dataframe, *fillers], ignore_index=True)

        

In [None]:
# Impute General Crime With Missing Months
# The LSOA names are taken from the frame being filled; the only other
# `locations` in this notebook is the list of crime-folder paths.

per_area_group = new_filler(per_area_group,per_area_group["LSOA11NM"].unique())

In [None]:
# Impute Specific Crime with Missing Months
# Same keys as combined_crime; each frame is filled for the LSOAs it contains.
combined_crime2 = dict.fromkeys(combined_crime)

for i, _crime_frame in combined_crime.items():
    _crime_lsoas = _crime_frame["LSOA11NM"].unique()
    combined_crime2[i] = new_filler(_crime_frame, _crime_lsoas)


In [None]:
# Check Length of Imputed Values for Singular Crimes

for i in combined_crime2.keys():
    print(i,len(combined_crime2[i]))

### Validation/Sorting

In [None]:
# Check That Every LSOA Now Has Exactly 36 Monthly Rows After Imputation
# The Original Check Expected 4,828 LSOAs over 17-19 Months (Some LSOAs Did Not Have Crimes in Every Month). The Dataset Has Since Been Expanded, So the Time Period and Regions Have Changed: the Period is 36 Months Now

(len(set(per_area_group["LSOA_Code"].unique())) * 36) == len(per_area_group)

In [None]:
# Sort General Crime Into Boroughs
# Keys are the lower-case borough names; values are that borough's rows.

new_regions = {
    borough.lower(): per_area_group.loc[per_area_group["LSOA_Name"] == borough]
    for borough in ("Westminster", "Tower Hamlets", "Southwark", "Newham", "Lambeth")
}

In [None]:
# Do the Same As Above But For Specific Crimes
# crime_type[crime][borough] holds that crime's rows for that borough.

crime_type = {}
_borough_names = ("Westminster", "Tower Hamlets", "Southwark", "Newham", "Lambeth")

for i in combined_crime2.keys():
    _crime_frame = combined_crime2[i]
    crime_type[i] = {
        borough.lower(): _crime_frame.loc[_crime_frame["LSOA_Name"] == borough]
        for borough in _borough_names
    }


In [None]:
# Test Stuff

crime_type[i]["southwark"].sort_values(by=["Year","Month"]).head()
new_regions["southwark"][new_regions["southwark"]["Year"]==2022]
per_area_group.loc[per_area_group["Year"]==2022][["LSOA_Code","LSOA11NM","Area","Instances"]].groupby(by=["LSOA_Code","LSOA11NM","Area"],as_index=False).sum()
len(new_regions["southwark"]["LSOA_Code"].unique())

In [None]:
# Count the Number of LSOA Occurences Across Months

new_regions["southwark"].groupby(by=["LSOA_Code"]).count()

# List Number of Occurrences

print(new_regions["lambeth"].groupby(by=["LSOA_Code"]).count()["Year"].unique())

# It Can be Seen that Some LSOA's Don't Report Crimes For All 17/36 Months, But None Has Reported for More Than 19/36 Months
# Corrected with Imputation

In [None]:
crime_type

## Transformations

In [None]:
# Set Up Encoder for the Various Datasets

encoder = ce.BinaryEncoder(cols='LSOA_Code')

# Identify Total Starting Features

og_columns = set(new_regions["southwark"].columns)


# Transformer Function

def btransformer(dataframe,old_cols=og_columns,label=None):
    """Binary-encode "LSOA_Code" and report whether any codes collided.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing an "LSOA_Code" column.
    old_cols : collection of str
        Column names that existed before encoding; every column of the
        result that is not in here is treated as an encoder output column.
    label : object, optional
        Text printed in front of the collision report.

    Returns
    -------
    pandas.DataFrame
        The result of ``encoder.fit_transform(dataframe)``.
    """
    # Establish Pre-Transformation Length

    pre_length = len(dataframe["LSOA_Code"].unique())

    # Transform Data

    dataframe = encoder.fit_transform(dataframe)

    # Identify the Columns Added by the Transformation (in frame order)

    diffs = [column for column in dataframe.columns if column not in old_cols]

    # Calculate Post-Transformation Length: distinct combinations of the new columns
    post_length = len(dataframe.groupby(by=diffs).count())

    # If Post-Transformation Length Is Less than Pre-Transformation Length, then Collisions Have occurred

    report = f"Collisions = {post_length-pre_length}"
    if label is None:
        print(report)
    else:
        print(label,":",report)

    return dataframe


# Transform and Check for Potential Collisions

for i in new_regions.keys():
    new_regions[i] = btransformer(new_regions[i],label=i)

# Same as Above but For Specific Crimes

for i in crime_type.keys():
    print(i)
    for q in crime_type[i].keys():
        print(q)
        crime_type[i][q] = btransformer(crime_type[i][q],label=f"{i} - {q}")
crime_type["Other theft"]["southwark"]

## Hotspot Detection

### Specific Crimes

In [None]:
# Assign the Vingtile Distributions For Specific Crimes
# Each (crime, borough) frame gets a "Vingtile Rank" from 20 quantile bins of
# "Crime Density 2022" and is then ordered newest month / highest rank first.

for i in crime_type.keys():
    for q in crime_type[i].keys():
        _frame = crime_type[i][q]
        _frame.loc[:, "Vingtile Rank"] = pd.qcut(_frame["Crime Density 2022"], 20,
                                                 labels=False, duplicates="drop")
        crime_type[i][q] = _frame.sort_values(by=["Year","Month","Vingtile Rank"],
                                              ascending=False)

In [None]:
crime_type["Anti-social behaviour"]["westminster"]

In [None]:
# Identify the Sub Regions That Are Within the Top 5% In Terms of Crime Density For Each Region
# specific_region_hotspots[crime][borough] lists the LSOA11NM values whose
# "Vingtile Rank" equals 19.
# NOTE(review): the ranks are assigned with duplicates="drop" elsewhere in this
# notebook, so a frame may have fewer than 20 bins and never reach rank 19 -
# confirm that an empty list is acceptable in that case.

specific_region_hotspots = {}

for i in crime_type.keys():
    specific_region_hotspots[i] = {"southwark":0,"tower hamlets":0,"westminster":0,"lambeth":0,"newham":0}
    for q in crime_type[i].keys():
        _ranked = crime_type[i][q]
        _top_names = _ranked.loc[_ranked["Vingtile Rank"] == 19, "LSOA11NM"]
        specific_region_hotspots[i][q] = list(_top_names.unique())

        
        

In [None]:
crime_type["Theft from the person"]

### Total Crimes

In [None]:
# Assign the Vingtile Distributions For General Crimes
# Each borough frame gets a "Vingtile Rank" from 20 quantile bins of
# "Crime Density 2022" and is then ordered newest month / highest rank first.

for i in new_regions.keys():
    _frame = new_regions[i]
    _frame.loc[:, "Vingtile Rank"] = pd.qcut(_frame["Crime Density 2022"], 20, labels=False)
    new_regions[i] = _frame.sort_values(by=["Year","Month","Vingtile Rank"], ascending=False)

new_regions[i][["Year","Month","LSOA11NM","Vingtile Rank"]]

In [None]:
# Identify the Sub Regions That Are Within the Top 5% In Terms of Crime Density For Each Region
# region_hotspots[borough] lists the LSOA11NM values whose "Vingtile Rank" is 19.

region_hotspots = {}

for i in new_regions.keys():
    _ranked = new_regions[i]
    _top_names = _ranked.loc[_ranked["Vingtile Rank"] == 19, "LSOA11NM"]
    region_hotspots[i] = list(_top_names.unique())



In [None]:
region_hotspots

In [None]:
new_regions[i]

### Test Stuff

In [None]:
region_hotspots

In [None]:
specific_region_hotspots

## Export

In [None]:
# Function to Export and Tag Specific Crime Datasets

def specific_exporter(dataframe,title):
    """Write the rank-19 rows of ``dataframe`` to the crime's own CSV file.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain "Vingtile Rank" and the exported columns listed below.
    title : str
        Crime type. Stored in the "Crime Type" column and used for both the
        folder and the file name, so the function no longer depends on a
        global loop variable. The folder must already exist.

    Returns
    -------
    None
        The return value of ``DataFrame.to_csv`` when given a path.
    """
    export_columns = ["Year","Month","Working households (thousands)",
                      "Working households (per cent)","Mixed households (thousands)",
                      "Mixed households (per cent)","Workless households (thousands)",
                      "Workless households (per cent)","LSOA11NM","COMESTRES","POPDEN",
                      "HHOLDS","AVHHOLDSZ","Area","Instances","Crime Density 2022"]

    # assign() adds the tag on a new frame rather than on a slice of the input
    export = (dataframe.loc[dataframe["Vingtile Rank"]==19, export_columns]
              .assign(**{"Crime Type": title})
              .sort_values(by=["Year","Month"]))

    return export.to_csv(f"/home/uthlakanyana/Dropbox/Dissertation Code/Specific Crime Dataset/{title}/{title}.csv",index=False)

In [None]:
# Call Exporting Function for Specific Datasets
# One folder and one CSV per crime type. makedirs(exist_ok=True) lets the cell
# be re-run, and the exporter's return value (None) is no longer stored back
# into crime_type, so the per-borough frames stay available afterwards.

parent_dir = "/home/uthlakanyana/Dropbox/Dissertation Code/Specific Crime Dataset/"

for i in crime_type.keys():
    path = os.path.join(parent_dir, i)
    os.makedirs(path, exist_ok=True)

    specific_exporter(pd.concat(crime_type[i].values()),i)

In [None]:
specific_crime.keys()

In [None]:
# Full Dataset Prep
# Stack the five borough frames, keep the rows whose "Vingtile Rank" is 19 and
# the columns to export, then order the result chronologically.

_export_columns = ["Year","Month","Working households (thousands)",
                   "Working households (per cent)","Mixed households (thousands)",
                   "Mixed households (per cent)","Workless households (thousands)",
                   "Workless households (per cent)","LSOA11NM","COMESTRES","POPDEN",
                   "HHOLDS","AVHHOLDSZ","Area","Instances","Crime Density 2022"]
_all_boroughs = pd.concat(new_regions.values())
full_set = (_all_boroughs.loc[_all_boroughs["Vingtile Rank"] == 19, _export_columns]
            .sort_values(by=["Year","Month"]))
full_set

In [None]:
full_set.columns

In [None]:
# Export Full Dataset

full_set.to_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 3.csv",index=False)

# Text Analysis

## Loads

In [None]:
# Load Modules

import pandas as pd
from collections.abc import Iterable
import regex as re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import string
import preprocessor as p
import nltk
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.tokenize import TweetTokenizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load Twitter Data

# One tweet table per borough account. The CSV for a borough is named
# "MPS <Title Cased Borough>.csv"; the same seven columns are read from each.
twitter_data = {
    borough: pd.read_csv(
        f"/home/uthlakanyana/Dropbox/Dissertation Code/Tweet Data 2/MPS {borough.title()}.csv",
        usecols=[
            "created_at",
            "text",
            "bookmark_count",
            "favorite_count",
            "retweet_count",
            "reply_count",
            "view_count",
        ],
    )
    for borough in ("southwark", "lambeth", "newham", "tower hamlets", "westminster")
}



In [None]:
twitter_data["newham"].columns

## Begin Sentiment Analysis

### Generate Sentiment

In [None]:
# Initialize VADER

sentimentAnalyser = SentimentIntensityAnalyzer()

# Set Pandas Column Width to Larger

pd.options.display.max_colwidth = 400


In [None]:
# Function to Calculate the Sentiment

def calculate_sentiment(text):
    """Return the VADER compound score for ``text``.

    The text is scored with the module-level ``sentimentAnalyser`` and the
    value stored under the ``'compound'`` key of its result is returned.
    """
    return sentimentAnalyser.polarity_scores(text)['compound']

In [None]:
 # Apply calculate_sentiment Function to Every Dataframe

for i in twitter_data:
    # Flatten multi-line tweets onto one line before scoring them
    flattened = twitter_data[i]["text"].str.replace("\n"," ")
    twitter_data[i]["text"] = flattened
    twitter_data[i]["sentiment_score"] = flattened.apply(calculate_sentiment)



In [None]:
twitter_data["newham"].head()

### Text Cleaning

In [None]:
# Set Punctuation Characters to punct_chars

punct_chars = set(string.punctuation)

In [None]:
# Strip Punctuation

#for i in twitter_data.keys():
#    twitter_data[i]['text'] = twitter_data[i]['text'].apply(lambda x: ' '.join(char for char in x if char not in punct_chars))

In [None]:
# Function to Correct All  Entries and Their Dtypes

def correcter(dataframe):
    """Normalise one raw tweet table ahead of the monthly aggregation.

    The function
      1. keeps the timestamp, text, engagement counters and sentiment score,
      2. parses ``created_at`` (unparseable values become NaT),
      3. derives leading ``Year`` and ``Month`` columns and drops ``created_at``,
      4. cleans every tweet with ``p.clean`` (the ``preprocessor`` module),
      5. drops rows that lack ``text``, ``Year`` or ``Month``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``created_at``, ``text``, ``bookmark_count``,
        ``favorite_count``, ``retweet_count``, ``reply_count`` and
        ``sentiment_score``.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``Year``, ``Month``, ``text``, the four
        counters and ``sentiment_score``. The input frame is left untouched.
    """
    kept_columns = ["created_at", "text", "bookmark_count", "favorite_count",
                    "retweet_count", "reply_count", "sentiment_score"]

    # Explicit copy: every later assignment targets this frame, not a slice of
    # the caller's data (avoids chained-assignment warnings and surprises).
    dataframe = dataframe[kept_columns].copy()

    # Change "created_at" column to datetime
    dataframe["created_at"] = pd.to_datetime(dataframe["created_at"], errors="coerce")

    # Set Year and Month
    dataframe.insert(0, "Year", dataframe["created_at"].dt.year)
    dataframe.insert(1, "Month", dataframe["created_at"].dt.month)

    # Drop Created_At Column
    dataframe = dataframe.drop(columns=["created_at"])

    # Clean individual tweets. Column-wise apply replaces the old row loop,
    # which paired `.loc[i]` (label) with `.iloc[i]` (position) and was only
    # correct for a default 0..n-1 index. Non-string entries are passed through
    # so the dropna below can remove them instead of the cleaner raising.
    dataframe["text"] = dataframe["text"].apply(
        lambda tweet: p.clean(tweet) if isinstance(tweet, str) else tweet
    )

    # Drop NA values (relevant for Westminster alone)
    dataframe = dataframe.dropna(subset=["text", "Year", "Month"])

    return dataframe
    

In [None]:
# Apply Correcting Function to All Dataframes

# Report the row count of each table before and after correction
for i in twitter_data:
    print(f"{i} beginning - {len(twitter_data[i])}")
    twitter_data[i] = correcter(twitter_data[i])
    print(f"{i} end - {len(twitter_data[i])}")

# Note that Westminster Loses Some Entries Due to Errors in the Scraping Procedure

In [None]:
twitter_data["newham"].head()

In [None]:
# Function to Lemmatize and Tokenize

# Shared NLTK helpers: built once and reused for every tweet
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = TweetTokenizer()


def lemmatize_text(text):
    """Tokenise ``text`` with ``w_tokenizer`` and return the lemmatised tokens as a list."""
    tokens = w_tokenizer.tokenize(text)
    return [lemmatizer.lemmatize(token) for token in tokens]

In [None]:
# Lemmatize All Tweets in Dataframes

# Replace each tweet string by its list of lemmatised tokens
for i in twitter_data:
    twitter_data[i]["text"] = twitter_data[i]["text"].map(lemmatize_text)


In [None]:
twitter_data["newham"]

In [None]:
# Remove Stop Words and Punctuation

stop_words = set(stopwords.words('english'))

for i in twitter_data:
    # One pass per tweet: keep tokens that are neither an English stop word
    # nor a single punctuation character
    twitter_data[i]["text"] = twitter_data[i]["text"].apply(
        lambda tokens: [
            token
            for token in tokens
            if token not in stop_words and token not in punct_chars
        ]
    )
    twitter_data[i] = twitter_data[i].sort_values(by=["Year","Month"])

In [None]:
twitter_data["newham"].head()

### Group Text and Features

In [None]:
# Generate Average of Sentiment Scores on a Monthly Basis

# Mean sentiment score per (Year, Month) for every account
monthly_sentiments = {}

for i in twitter_data:
    scores = twitter_data[i][["Year","Month","sentiment_score"]]
    monthly_sentiments[i] = scores.groupby(["Year","Month"],as_index=False).mean()


In [None]:
# Show That Monthly Sentiments Are Grouped Properly

monthly_sentiments["newham"]

In [None]:
# Function to Group Twitter Non-text Features by Year and Month 

def compiler(dataframe):
    """Sum the engagement counters of ``dataframe`` per (Year, Month).

    Only ``Year``, ``Month``, ``bookmark_count``, ``favorite_count``,
    ``retweet_count`` and ``reply_count`` are used. The result has one row per
    (Year, Month) pair, with the two keys kept as ordinary columns.
    """
    wanted = ["Year","Month","bookmark_count","favorite_count","retweet_count","reply_count"]
    per_month = dataframe[wanted].groupby(["Year","Month"],as_index=False)
    return per_month.sum()

In [None]:
# Fix Dtypes

# Cast the date parts and engagement counters to integers.
# DataFrame.astype returns a re-typed frame. The previous
# `df.loc[:, col] = df.loc[:, col].astype("int")` form writes the values back
# into the existing column on pandas >= 2.0, which keeps the old dtype and
# silently discards the cast.
for i in twitter_data.keys():
    twitter_data[i] = twitter_data[i].astype({
        "Year": "int",
        "Month": "int",
        "favorite_count": "int",
        "bookmark_count": "int",
        "retweet_count": "int",
        "reply_count": "int",
    })

In [None]:
# Show that Reassignment of Dtypes Worked

twitter_data[i].dtypes

In [None]:
# Create Dictionary that Stores Grouped Twitter Non-Text Data

# Monthly engagement totals for every account, keyed like twitter_data
compiled_tweet_features = {}

for i in twitter_data:
    compiled_tweet_features[i] = compiler(twitter_data[i])

In [None]:
compiled_tweet_features["newham"]

### Combine Text and Non-Textual Features

In [None]:
# Tack on Sentiment Score to the Compiled tweet Properties

# The assignment aligns on the row index, so it relies on both tables having
# been grouped by the same (Year, Month) keys with as_index=False.
for i in twitter_data:
    compiled_tweet_features[i]["average_sentiment"] = monthly_sentiments[i]["sentiment_score"]

In [None]:
compiled_tweet_features["newham"]

In [None]:
# Create Dictionary to Store Grouped textual Data

# Summing a column of token lists concatenates them, giving one combined
# token list per (Year, Month)
textual_collations = {}

for i in twitter_data:
    tokens = twitter_data[i][["Year","Month","text"]]
    textual_collations[i] = tokens.groupby(["Year","Month"],as_index=False).sum()

In [None]:
# Function to Remove Punctuations from the Lists 

def punctuation_remover(text):
    """Return a list of the items of ``text`` that do not occur in ``string.punctuation``.

    ``string.punctuation`` is a ``str``, so the membership check is a substring
    test: the empty string and any run of characters that appears contiguously
    in it (for example ``"./"``) are removed as well as single punctuation marks.
    """
    kept = []
    for token in text:
        if token in string.punctuation:
            continue
        kept.append(token)
    return kept

In [None]:
# Remove Punctuations From the Listified Text Data for Each Sub-Region

# Strip punctuation tokens from every monthly token list
for i in twitter_data:
    textual_collations[i]["text"] = textual_collations[i]["text"].apply(punctuation_remover)

In [None]:
textual_collations["newham"]

In [None]:
compiled_tweet_features["newham"]

In [None]:
# Merge the Grouped textual and Non-Textual Data

# Join the monthly token lists onto the monthly engagement table
for i in compiled_tweet_features:
    print(i)
    compiled_tweet_features[i] = compiled_tweet_features[i].merge(
        textual_collations[i],
        on=["Year","Month"],
    )

In [None]:
compiled_tweet_features["newham"]

## Graph Social Media Information

In [None]:
graph_twitter_data = twitter_data.copy()

In [None]:

# Collapse every account to one row per calendar month for plotting.
for i in graph_twitter_data.keys():
    graph_twitter_data[i] = (
        graph_twitter_data[i]
        # graph_twitter_data is a shallow dict copy, so its frames are the very
        # objects held by twitter_data; assign() adds "Date" to a new frame
        # instead of writing the column into the shared one.
        # pd.to_datetime assembles first-of-month timestamps from the numeric
        # Year/Month columns; the unit-less astype("datetime64") used before is
        # rejected by pandas >= 2.0.
        .assign(Date=pd.to_datetime(graph_twitter_data[i][["Year","Month"]].assign(Day=1)))
        [["Date","sentiment_score","bookmark_count","favorite_count","retweet_count","reply_count"]]
        .groupby(["Date"],as_index=False)
        .agg({"sentiment_score":"mean","bookmark_count":"sum","favorite_count":"sum","retweet_count":"sum","reply_count":"sum"})
        .sort_values(by=["Date"])
    )

In [None]:
graph_twitter_data[i]

In [None]:
## Graph Social Media Behaviour for Each MPS

# One figure per account: sentiment on the left axis, bookmark count on a
# second y-axis sharing the same Date x-axis.
for i in graph_twitter_data.keys():
    # Sentiment line (red); legend suppressed here and drawn once for the figure below
    ax = graph_twitter_data[i].plot(x="Date",y="sentiment_score",legend=False,color="r")
    # Twin axis so the two differently scaled series can share one plot
    ax2 = ax.twinx()
    graph_twitter_data[i].plot(x="Date",y="bookmark_count",ax=ax2,legend=False)
    # Single figure-level legend covering both axes
    ax.figure.legend()
    plt.title(f"Sentiment/Bookmark Patterns for MPS {i.title()}",x=0.32)
    ax.set_ylabel('Sentiment Score')
    ax2.set_ylabel("Bookmark Count")


    # Applies to the current axes (the last one created in this iteration)
    plt.xticks(rotation=45)

In [None]:
## Graph Social Media Behaviour for Each MPS (Sentiment)

# One figure per account showing only the monthly mean sentiment score
for i in graph_twitter_data.keys():
    ax = graph_twitter_data[i].plot(x="Date",y="sentiment_score",legend=False,color="r")
    plt.title(f"VADER Sentiment Over Time for MPS {i.title()}")
    ax.set_ylabel('Sentiment Score')


    #plt.xticks(rotation=45)

In [None]:

graph_twitter_data[i].corr().style.background_gradient(cmap='coolwarm').set_precision(2)


## Export

In [None]:
# Add Columns for Name of Region 

# Tag every monthly row with its title-cased account name
for i in twitter_data:
    compiled_tweet_features[i]["LSOA_Name"] = i.title()

In [None]:
final_doc = pd.concat(compiled_tweet_features.values())

In [None]:
final_doc

In [None]:
final_doc.to_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Grouped Social Media Data.csv",index=False)

# Combine Comprehensive Data and Textual Information

## Loads

In [None]:
# Load Modules

import os
import pandas as pd
import numpy as np

In [None]:
# Load General Crime Data

comp_data = pd.read_csv(
    "/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 3.csv"
).sort_values(by=["Year","Month"])
tweet_data = pd.read_csv(
    "/home/uthlakanyana/Dropbox/Dissertation Code/Grouped Social Media Data.csv"
).sort_values(by=["Year","Month"])
specific_lsoa_data = pd.read_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Specific LSOA Data.csv")

# Report the current working directory, then move into the export folder
d = os.getcwd()
print(f'Current: {d}')
file_location = "/home/uthlakanyana/Dropbox/Dissertation Code/Specific Crime Dataset"
os.chdir(file_location)

# Full path of every entry in the export folder
# (presumably one sub-folder per crime type -- confirm)
locations = [f"{file_location}/{entry}" for entry in os.listdir()]

# Full path of every file found inside each of those entries
dataframes = []
for l in locations:
    os.chdir(l)
    dataframes += [f"{l}/{name}" for name in os.listdir(l)]

# Read every file, keyed by its file name up to the first dot
specific_crimes = {}

for d in dataframes:
    specific_crimes[os.path.basename(d).split(".")[0]] = pd.read_csv(d).sort_values(by=["Year","Month"])


## Generate Lags

In [None]:
# Create New Column For comp_data For Prior Instances (This Will Help Generate Crime Motions later On)

# Placeholder column, filled in later by lag_function
comp_data["Prior Instances"] = np.nan

for i in specific_crimes:
    specific_crimes[i]["Prior Instances"] = np.nan

In [None]:
def lag_function(dataframe):
    """Split ``dataframe`` by LSOA and add each area's previous-row crime count.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``LSOA11NM`` and ``Instances``. ``shift()`` lags by row
        position, so the rows of each LSOA must already be in chronological
        order for "Prior Instances" to mean "previous period".

    Returns
    -------
    dict
        Maps each LSOA name (in order of first appearance) to a copy of that
        area's rows with a ``"Prior Instances"`` column holding the preceding
        row's ``Instances``; the first row of each area gets NaN. Rows whose
        ``LSOA11NM`` is missing belong to no area and are left out.
    """
    DataFrameDict = {}

    # One groupby pass replaces a full boolean scan per LSOA; sort=False keeps
    # the first-appearance key order that unique() produced.
    for key, group in dataframe.groupby("LSOA11NM", sort=False):
        # Explicit copy so the new column is not written into a slice of the
        # caller's frame (the source of SettingWithCopyWarning).
        lagged = group.copy()
        lagged["Prior Instances"] = lagged["Instances"].shift()
        DataFrameDict[key] = lagged

    return DataFrameDict


### Total Crimes

In [None]:
# Create Unique List of LSOA Names for General Crimes

#UniqueNames = comp_data.LSOA11NM.unique()

# Store LSOA Dictionaries in Dataframe
#DataFrameDict = {elem : pd.DataFrame() for elem in UniqueNames}

#for key in DataFrameDict.keys():
#    DataFrameDict[key] = comp_data[:][comp_data.LSOA11NM == key]

# Lag the Split Dataframes

#for key in DataFrameDict.keys():
#    DataFrameDict[key]["Prior Instances"] = DataFrameDict[key]["Instances"].shift()

In [None]:
len(comp_data["LSOA11NM"].unique())

In [None]:
# Call Lag Function on General Crimes

comp_data = pd.concat(lag_function(comp_data).values())
comp_data

In [None]:
comp_data[comp_data["Crime Density 2022"]==0]

### Specific Crimes

In [None]:
# Call lag Function on Specific Crimes

# Lag every crime-type table per LSOA and stitch the pieces back together
specific_crimes2 = {}

for i in specific_crimes:
    lagged_by_lsoa = lag_function(specific_crimes[i])
    specific_crimes2[i] = pd.concat(lagged_by_lsoa.values())
    

In [None]:
specific_crimes[i]

## Map Movements

In [None]:
# Function to Convert Raw Differences To General Directions

def movement_mapping(datum):
    """Collapse a raw month-on-month difference into a binary direction.

    Returns 1 for a strictly positive difference and 0 for zero or a negative
    one. A value that is neither above nor at-or-below zero (NaN) is returned
    unchanged.
    """
    if datum > 0:
        return 1
    if datum <= 0:
        return 0
    # Not comparable with zero, e.g. NaN from the first lagged row
    return datum

### General Crimes

In [None]:
# Create a 'Differences' Columns for General Crime

comp_data = comp_data.sort_values(by=["Year","Month"]).reset_index(drop=True)
comp_data["Differences"] = comp_data["Instances"] - comp_data["Prior Instances"]

In [None]:
# Rearrange General Crime dataframe

comp_data = comp_data.copy()[["Year","Month","LSOA11NM","Working households (thousands)",
                                "Working households (per cent)","Mixed households (thousands)",
                                "Mixed households (per cent)","Workless households (thousands)",
                                "Workless households (per cent)","COMESTRES","POPDEN",
                                "HHOLDS","AVHHOLDSZ","Area","Crime Density 2022","Differences"]]
comp_data

comp_data.columns = comp_data.columns.astype(str)
comp_data.loc[:,'Differences']

In [None]:
# Show Proportion of Data Where Differences Are 0

(len(comp_data[comp_data["Differences"]==0])/len(comp_data))*100

In [None]:
# Replace The Original Difference Column With Directions

comp_data.loc[:,"Differences"] = comp_data.loc[:,"Differences"].apply(lambda x:movement_mapping(x))

In [None]:
# Resplit Data According to Borough

# One slice of comp_data per borough: rows whose LSOA name contains the
# title-cased borough name
resplit = {
    borough: comp_data[comp_data["LSOA11NM"].str.contains(borough.title())]
    for borough in ("westminster", "southwark", "tower hamlets", "lambeth", "newham")
}

### Specific Crime

In [None]:
# Create a 'Differences' Columns for Specific Crime

for i in specific_crimes2:
    # Chronological order with a fresh 0..n-1 index
    ordered = specific_crimes2[i].sort_values(by=["Year","Month"]).reset_index(drop=True)
    # Change relative to the previous row of the same LSOA
    ordered["Differences"] = ordered["Instances"].sub(ordered["Prior Instances"])
    specific_crimes2[i] = ordered

specific_crimes2[i]

In [None]:
# Rearrange Specific Crime Dataframes

for i in specific_crimes2:
    # Keep the modelling columns in a fixed order
    reordered = specific_crimes2[i].copy()[[
        "Year","Month","LSOA11NM",
        "Working households (thousands)","Working households (per cent)",
        "Mixed households (thousands)","Mixed households (per cent)",
        "Workless households (thousands)","Workless households (per cent)",
        "COMESTRES","POPDEN","HHOLDS","AVHHOLDSZ","Area",
        "Crime Density 2022","Crime Type","Differences",
    ]]

    reordered.columns = reordered.columns.astype(str)
    # Replace raw differences by their direction
    reordered.loc[:,"Differences"] = reordered.loc[:,"Differences"].apply(movement_mapping)
    specific_crimes2[i] = reordered.sort_values(["Year","Month"])

In [None]:
# For every crime type, one slice per borough (LSOA name contains the
# title-cased borough name)
specific_crimes3 = {}

for i in specific_crimes2:
    specific_crimes3[i] = {
        borough: specific_crimes2[i][specific_crimes2[i]["LSOA11NM"].str.contains(borough.title())]
        for borough in ("westminster", "southwark", "tower hamlets", "lambeth", "newham")
    }


## Merge

In [None]:
# Merger Function

def merger(dataframe,iterator,tweet_data=tweet_data):
    """Attach one account's monthly tweet features to a crime table.

    ``tweet_data`` is filtered to the rows whose ``LSOA_Name`` equals
    ``iterator.title()`` and outer-merged with ``dataframe`` on the columns the
    two share. Rows without a "Working households (thousands)" value are then
    dropped from the merged result.
    """
    region_tweets = tweet_data.loc[tweet_data["LSOA_Name"] == iterator.title()]
    combined = pd.merge(dataframe, region_tweets, how="outer")
    has_households = combined["Working households (thousands)"].notna()
    return combined[has_households]


### Total Crime

In [None]:
# Merge General Crime Data with Textual Information

for i in resplit.keys():
    resplit[i] = merger(resplit[i],i)
    #resplit[i] = pd.merge(resplit[i],tweet_data.loc[tweet_data["LSOA_Name"]==i.title()],how="outer")

In [None]:
# Recombine General Crime Data

# Stack the boroughs again and keep only rows that have both an Area and a
# Differences value
new_comp_data = pd.concat(resplit.values(),axis=0)
new_comp_data = new_comp_data.dropna(subset=["Area","Differences"])
new_comp_data

In [None]:
# Check the Missing Values in the Combined DataSet After Adding Columns

for i in new_comp_data.columns:
    print(i,new_comp_data[i].isna().any())

### Specific Crime

In [None]:
# Merge Specific Crime Data with Textual Information

# Attach tweet features to every (crime type, borough) slice
for crime in specific_crimes3:
    for region in specific_crimes3[crime]:
        crime_slice = specific_crimes3[crime][region]
        specific_crimes3[crime][region] = merger(crime_slice, region)

In [None]:
# Confirm That There Is Variance in Crime/Location Combinations

# Print the row count of every (crime type, borough) slice.
# The inner loop variable is `region`, not `p`: `p` is the alias under which
# the `preprocessor` module is imported, and rebinding it to a string here
# breaks any later call to correcter().
for i in specific_crimes3.keys():
    for region in specific_crimes3[i].keys():
        print(i,region,len(specific_crimes3[i][region]))

In [None]:
# Recombine Specific Crime Data
specific_crimes4 = {}

for i in specific_crimes3:
    # Stack the boroughs, then keep rows with both an Area and a Differences value
    stacked = pd.concat(specific_crimes3[i].values(),axis=0)
    specific_crimes4[i] = stacked.dropna(subset=["Area","Differences"])

## Export

### General Crime

In [None]:
full_data = pd.merge(new_comp_data,specific_lsoa_data,on="LSOA11NM")

In [None]:
full_data

In [None]:
full_data.columns

In [None]:
full_data

In [None]:
full_data.head()["text"].iloc[0]

In [None]:
full_data.to_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 4.csv",index=False)

### Specific Crime

In [None]:
# Function to Export and Tag Specific Crime Datasets

def specific_exporter(dataframe,title):
    """Tag ``dataframe`` with its crime type and write it to ``<title>/<title>.csv``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to export; a ``"Crime Type"`` column set to ``title`` is added to
        it in place. Must contain ``Year`` and ``Month``.
    title : str
        Crime type; used as the tag, the sub-folder name and the file name.

    Returns
    -------
    None
        The return value of ``DataFrame.to_csv`` when given a path.
    """
    dataframe["Crime Type"] = title
    dataframe = dataframe.sort_values(by=["Year","Month"])

    # The file name comes from `title`. It used to read the global loop
    # variable `i`, which only worked because every caller happened to loop
    # with `i` equal to the title it passed in.
    return dataframe.to_csv(f"/home/uthlakanyana/Dropbox/Dissertation Code/Specific Crime Dataset/{title}/{title}.csv",index=False)

In [None]:
for i in specific_crimes4.keys():
    specific_crimes4[i] = pd.merge(specific_crimes4[i],specific_lsoa_data,on="LSOA11NM")

In [None]:
# Write every crime-type table to disk. specific_exporter returns None, and the
# previous assignment into `specific_crimes4.copy()[i]` only touched a
# throwaway copy of the dict, so the call is made for its side effect alone.
# The loop variable stays `i` because the exporter's file name has depended on it.
for i in specific_crimes4.keys():
    specific_exporter(specific_crimes4[i],i)

# Topic Modelling

In [None]:
# Import Modules

import little_mallet_wrapper
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn
import glob
from pathlib import Path
import pandas as pd
import random
pd.options.display.max_colwidth = 100
import nltk
from nltk.corpus import stopwords  #stopwords
from nltk.stem import WordNetLemmatizer  
from sklearn.feature_extraction.text import TfidfVectorizer
stop_words=set(nltk.corpus.stopwords.words('english'))
from ast import literal_eval
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Load Data

data = pd.read_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 4.csv")

# Function to Convert the Stringified Lists Into Proper Lists

def fix_list(text):
    """Turn a stringified token list into one space-separated string.

    ``text`` is parsed as a Python literal and its items are joined with single
    spaces. If it cannot be parsed or joined (for example a NaN cell or
    malformed text), it is printed for inspection and ``None`` is returned.
    """
    try:
        # literal_eval only accepts Python literals; eval() would execute any
        # expression stored in the CSV cell.
        tokens = literal_eval(text)
        return " ".join(tokens)
    except (ValueError, SyntaxError, TypeError):
        # ValueError/SyntaxError: not a valid literal (includes non-string
        # input such as NaN); TypeError: parsed value is not a sequence of str.
        print(text)
        return None



data.loc[:,"text"] = data.loc[:,"text"].apply(lambda x: fix_list(x))

In [None]:
# Split Training and Test Sets

training = data[data["Year"]==2022]
test = data[data["Year"]==2023]

In [None]:
# Perform TFIDF Vectorization

vect =TfidfVectorizer(stop_words=list(stop_words),max_features=1000)
vect_text=vect.fit_transform(training['text'])

In [None]:
data.head()

In [None]:
# Generate LDA Model

lda_model=LatentDirichletAllocation(n_components=30,learning_method='online',random_state=1,max_iter=1)
lda_top=lda_model.fit_transform(vect_text)

In [None]:
# Print Topics

print("Document 0: ")
for i,topic in enumerate(lda_top[0]):
  print("Topic ",i,": ",topic*100,"%")

In [None]:
# Generate Topic Features

# Print the ten highest-weighted vocabulary terms of every LDA topic.
vocab = vect.get_feature_names_out()
for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    # All ten words on one line, then a blank line between topics. The previous
    # inner loop printed the literal letter "n" after every single word
    # (a mistyped "\n" placed inside the loop).
    print(" ".join(word for word, _ in sorted_words))
    print()

In [None]:
training["text"]

In [None]:
# MALLET Approach to TM

little_mallet_wrapper.print_dataset_stats(training)

In [None]:
# Set Number of Topics
num_topics = 35

In [None]:
# Change to your desired output directory
output_directory_path = '/home/uthlakanyana/Dropbox/Dissertation Code/Police Tweets Topics'

path_to_mallet = "/usr/bin/mallet"

# No need to change anything below here
# Create the output directory (and any missing parents) if it is not there yet
Path(output_directory_path).mkdir(parents=True, exist_ok=True)

# File locations inside the output directory; model outputs carry the topic
# count as a suffix
path_to_training_data           = output_directory_path + "/training.txt"
path_to_formatted_training_data = output_directory_path + "/mallet.training"
path_to_model                   = f"{output_directory_path}/mallet.model.{num_topics}"
path_to_topic_keys              = f"{output_directory_path}/mallet.topic_keys.{num_topics}"
path_to_topic_distributions     = f"{output_directory_path}/mallet.topic_distributions.{num_topics}"

In [None]:
# little_mallet_wrapper.quick_train_topic_model(path_to_mallet,output_directory_path,num_topics,training)

# Add Weather Data to Dataset

## Loads

In [None]:
# Load Modules
import os
import pandas as pd

In [None]:
 # Load Data

weather = pd.read_csv(
    "/home/uthlakanyana/Dropbox/Dissertation Code/Heathrow Met Data.csv",
    sep=",",
)
# Replace the header of the file with fixed names
weather.columns = ["Year","Month","Max_Temp","Min_Temp","Frost Days","Rain","Sun"]
full_data = pd.read_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 4.csv")

# Report the current working directory, then move into the export folder
d = os.getcwd()
print(f'Current: {d}')
file_location = "/home/uthlakanyana/Dropbox/Dissertation Code/Specific Crime Dataset"
os.chdir(file_location)

# Full path of every entry in the export folder
# (presumably one sub-folder per crime type -- confirm)
locations = [f"{file_location}/{entry}" for entry in os.listdir()]

# Full path of every file found inside each of those entries
dataframes = []
for l in locations:
    os.chdir(l)
    dataframes += [f"{l}/{name}" for name in os.listdir(l)]

# Read every file, keyed by its file name up to the first dot
specific_crimes = {}

for d in dataframes:
    specific_crimes[os.path.basename(d).split(".")[0]] = pd.read_csv(d).sort_values(by=["Year","Month"])


In [None]:
weather.head()

## Weather Linkages

In [None]:
# Link General Crimes with Weather

full_data = pd.merge(full_data,weather,on=["Year","Month"])
full_data.head()

In [None]:
full_data

In [None]:
# Link Specific Crimes with Weather

for i in specific_crimes.keys():
    specific_crimes[i] = pd.merge(specific_crimes[i],weather,on=["Year","Month"])

## Export

### General Crime

In [None]:
full_data.to_csv("/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 5.csv",index=False)

### Specific Crime

In [None]:
# Function to Export and Tag Specific Crime Datasets

def specific_exporter(dataframe,title):
    """Tag ``dataframe`` with its crime type and write it to ``<title>/<title>.csv``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table to export; a ``"Crime Type"`` column set to ``title`` is added to
        it in place. Must contain ``Year`` and ``Month``.
    title : str
        Crime type; used as the tag, the sub-folder name and the file name.

    Returns
    -------
    None
        The return value of ``DataFrame.to_csv`` when given a path.
    """
    dataframe["Crime Type"] = title
    dataframe = dataframe.sort_values(by=["Year","Month"])

    # The file name comes from `title`. It used to read the global loop
    # variable `i`, which only worked because every caller happened to loop
    # with `i` equal to the title it passed in.
    return dataframe.to_csv(f"/home/uthlakanyana/Dropbox/Dissertation Code/Specific Crime Dataset/{title}/{title}.csv",index=False)

In [None]:
# Write every weather-linked crime-type table to disk. specific_exporter
# returns None, and the previous assignment into `specific_crimes.copy()[i]`
# only touched a throwaway copy of the dict, so the call is made for its side
# effect alone. The loop variable stays `i` because the exporter's file name
# has depended on it.
for i in specific_crimes.keys():
    specific_exporter(specific_crimes[i],i)

# Make Predictions

## Loads

In [None]:
# Load Modules

import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn import metrics  
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from xgboost import XGBClassifier
from matplotlib import pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.experimental import enable_halving_search_cv
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import svm
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sns
sns.set_theme()
from matplotlib import pyplot as pyplot
import matplotlib.pyplot as plt
from statsmodels.tsa.arima_model import ARIMA
from xgboost import XGBRegressor
from __future__ import print_function
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers.legacy import SGD
from tensorflow.keras import utils
import matplotlib.pyplot as plt
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from kerastuner import Hyperband
from sklearn.metrics import accuracy_score, f1_score
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import HalvingRandomSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#warnings.simplefilter(action='ignore', category=all)
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE

In [None]:
# Load Data

full_data = pd.read_csv(
    "/home/uthlakanyana/Dropbox/Dissertation Code/Comprehensive Data - 5.csv"
)

# Report the current working directory, then move into the export folder
d = os.getcwd()
print(f'Current: {d}')
file_location = "/home/uthlakanyana/Dropbox/Dissertation Code/Specific Crime Dataset"
os.chdir(file_location)

# Full path of every entry in the export folder
# (presumably one sub-folder per crime type -- confirm)
locations = [f"{file_location}/{entry}" for entry in os.listdir()]

# Full path of every file found inside each of those entries
dataframes = []
for l in locations:
    os.chdir(l)
    dataframes += [f"{l}/{name}" for name in os.listdir(l)]

# Read every file, keyed by its file name up to the first dot
specific_crimes = {}

for d in dataframes:
    specific_crimes[os.path.basename(d).split(".")[0]] = pd.read_csv(d).sort_values(by=["Year","Month"])


In [None]:
for i in full_data.columns:
    print(i,len(full_data[full_data[i].isna()==True]))

In [None]:
for i in specific_crimes:
    print(i,len(specific_crimes[i]))

In [None]:
# Function to Rename Columns

def renamer(dataframe):
    """Return ``dataframe`` with its long descriptive column names shortened.

    Columns listed in the mapping below are renamed to their abbreviation;
    every other column is left as it is. The input frame is not modified.
    """
    short_names = {
        # Household employment
        "Working households (thousands)": "WH",
        "Working households (per cent)": "%WH",
        "Mixed households (thousands)": "MH",
        "Mixed households (per cent)": "%MH",
        "Workless households (thousands)": "WLH",
        "Workless households (per cent)": "%WLH",
        # Tweet engagement
        "bookmark_count": "BKMRK",
        "favorite_count": "FVRT",
        "retweet_count": "RTWT",
        "reply_count": "RPLY",
        "average_sentiment": "AVG_SENT",
        # Population and household composition
        "Total Population": "POP",
        "Area (Hectares)": "HECT",
        "All households": "LSOA_HHOLDS",
        "Couple household with dependent children": "CPHHDC",
        "Couple household without dependent children": "CPHHWDC",
        "Lone parent household": "LPH",
        "One person household": "OPH",
        "% Couple household with dependent children": "%CPHHDC",
        "% Couple household without dependent children": "%CPHHWDC",
        "% Lone parent household": "%LPH",
        "Households with at least one person aged 16 or over with English as a main language": "HHEML",
        # Tenure
        "Owned outright": "OO",
        "Owned with a mortgage or loan": "OWML",
        "Social rented": "SR",
        "Private rented": "PR",
        "Owned outright (%)": "%OO",
        "Owned with a mortgage or loan (%)": "%OWML",
        "Social rented (%)": "%SR",
        "Private rented (%)": "%PR",
        # Employment, access, benefits and income
        "% of households with no adults in employment: With dependent children": "%HWNAEWC",
        "% 0-1 (poor access)": "%PA",
        "Total Number of Families Claiming Benefit": "NFCB",
        "Mean Annual Household Income estimate (£)": "MEAN_AIE",
        "Median Annual Household Income estimate (£)": "MEDIAN_AIE",
    }

    return dataframe.rename(columns=short_names)


In [None]:
# Call renamer function

full_data = renamer(full_data)

for i in specific_crimes.keys():
    specific_crimes[i] = renamer(specific_crimes[i])

## EDA

In [None]:
# Check for Missing Values in full_data
"""

full_data.isna().any()

"""

### Correlation Analysis

In [None]:
# Correlation Plots

"""
sns.set_style('dark')

Correlazioni = full_data

plt.figure(figsize=(16, 10))
corr = Correlazioni.corr()
mask = np.triu(np.ones_like(corr, dtype=np.bool_))
cut_off = 0.5  # only show cells with abs(correlation) at least this value
extreme_1 = 0.75  # show with a star
extreme_2 = 0.85  # show with a second star
extreme_3 = 0.90  # show with a third star
mask |= np.abs(corr) < cut_off
corr = corr[~mask]  # fill in NaN in the non-desired cells

remove_empty_rows_and_cols = True
if remove_empty_rows_and_cols:
    wanted_cols = np.flatnonzero(np.count_nonzero(~mask, axis=1))
    wanted_rows = np.flatnonzero(np.count_nonzero(~mask, axis=0))
    corr = corr.iloc[wanted_cols, wanted_rows]

annot = [[f"{val:.1g}"
          + ('' if abs(val) < extreme_1 else '\n★')  # add one star if abs(val) >= extreme_1
          + ('' if abs(val) < extreme_2 else '★')  # add an extra star if abs(val) >= extreme_2
          + ('' if abs(val) < extreme_3 else '★')  # add yet an extra star if abs(val) >= extreme_3
          for val in row] for row in corr.to_numpy()]
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, annot=True, fmt='.1g', cmap='coolwarm',annot_kws={'size': 7,"color":"black"})
heatmap.set_title('Significant (Above 50%) Correlations Between Features', fontdict={'fontsize': 18}, pad=16)
plt.show()

"""

In [None]:
# ranked_correlations = full_data.corr().unstack().sort_values().drop_duplicates()

In [None]:
# Shoplifting directions per LSOA. The explicit copy makes the column
# assignment below act on an independent frame rather than on a slice of
# specific_crimes["Shoplifting"] (which triggers SettingWithCopyWarning).
shoplifting = specific_crimes["Shoplifting"][["Year","Month","LSOA11NM","Differences"]].copy()
# "Year" is overwritten with a first-of-month timestamp built from Year and Month
shoplifting['Year'] = pd.to_datetime(shoplifting[['Year', 'Month']].assign(DAY=1))
shoplifting = shoplifting.drop(columns=["Month"])


In [None]:
shoplifting2 = shoplifting[shoplifting["LSOA11NM"]=="Westminster 016A"]
shoplifting2 = shoplifting2.reset_index(drop=True)
shoplifting2

In [None]:
# Relabel the direction column: 0 -> "Ebb", anything else -> "Flow".
# A single column-wise map replaces the index loop, which mixed positional
# `.iloc[i]` reads with label-based `.at[i]` writes and pushed strings one cell
# at a time into a numeric column.
shoplifting2["Differences"] = shoplifting2["Differences"].map(
    lambda change: "Ebb" if change == 0 else "Flow"
)

In [None]:
shoplifting2["Crime Tides"] = shoplifting2["Differences"]
shoplifting2 = shoplifting2.drop(columns=["Differences"])
shoplifting2

In [None]:
# Categorical plot of the Ebb/Flow label against the month for the selected LSOA.
ax = sns.catplot(
    data=shoplifting2,
    x="Year",
    y="Crime Tides",
    hue="Crime Tides",
    order=["Flow", "Ebb"],
    palette="flare",
    height=5,
    aspect=2/1,
)
ax = ax.set(title='Westminster 016A Shoplifting Trends')


In [None]:
# Count plot: number of rows per month, split by the Ebb/Flow label.
ax = sns.catplot(
    data=shoplifting2,
    x="Year",
    kind="count",
    hue="Crime Tides",
    palette="flare",
    height=5,
    aspect=2/1,
)
ax = ax.set(title='Westminster 016A Shoplifting Trends')


In [None]:
# Bar chart of the "Differences" column per month for the selected LSOA.
sns.set_theme(style="white")

fig, ax = plt.subplots(figsize=(20, 5))
# NOTE(review): this reads shoplifting2["Differences"], which the "Crime Tides"
# cell above drops - presumably this cell was run before that rename; confirm.
ax = sns.barplot(data=shoplifting2, x="Year", y="Differences",color="darkred",hue="Differences")
# Relabel the x ticks with the sorted unique dates formatted as year-month.
x_dates = shoplifting2['Year'].dt.strftime('%Y-%m').sort_values().unique()
ax.set_xticklabels(labels=x_dates, rotation=45)
# Pin the y axis to the range 0-1.
ax.set_ylim(0,1,auto=False)

## Data Treatment/Splitting

In [None]:
# Set Random State
# Shared seed: passed to train_test_split, SMOTE and the hyperparameter searches below.

random_state = 50

### Train-Test Split

In [None]:
# Combine Specific Crime Into One Dataset

# Stack the per-crime frames into a single frame with a fresh 0..n-1 index.
specific_crimes = pd.concat(specific_crimes.values(), ignore_index=True)

In [None]:
# Split Full Data into X and Y Variables

# Replace the frame with a dict: predictors under "x", the "Differences"
# target under "y".
full_data = dict(
    x=full_data.drop(columns="Differences"),
    y=full_data["Differences"],
)

In [None]:
# Split Specific Crimes Into X and Y Variables

# Replace the frame with a dict: predictors under "x", the "Differences"
# target under "y".
specific_crimes = dict(
    x=specific_crimes.drop(columns="Differences"),
    y=specific_crimes["Differences"],
)

In [None]:
# Split General Crimes Into Train and Test Splits

# Hold out 10% of the all-crime rows for testing, seeded with random_state.
full_train_x, full_test_x, full_train_y, full_test_y = train_test_split(
    full_data["x"],
    full_data["y"],
    test_size=0.10,
    random_state=random_state,
)

In [None]:
# Split Specific Crimes Into Train and Test Splits

# Hold out 10% of the per-crime rows for testing, seeded with random_state.
specific_train_x, specific_test_x, specific_train_y, specific_test_y = train_test_split(
    specific_crimes["x"],
    specific_crimes["y"],
    test_size=0.10,
    random_state=random_state,
)

### Treat Dataframes

In [None]:
# Function to Treat Dataframes

def treat_dataframes(x):
    """Standardise every column of ``x`` except four excluded ones.

    Drops "LSOA11NM", "text", "LSOA_Name" and "LSOA_Code", fits a
    StandardScaler on the remaining values and returns them as a DataFrame
    that carries the index of ``x`` and the remaining column names.

    NOTE: a second ``treat_dataframes`` is defined directly below this one
    and replaces it.
    """
    excluded = ["LSOA11NM", "text", "LSOA_Name", "LSOA_Code"]
    remaining = x.drop(columns=excluded)

    return pd.DataFrame(
        StandardScaler().fit_transform(remaining.values),
        index=x.index,
        columns=remaining.columns,
    )


def treat_dataframes(x):
    """Encode, scale and prune a feature frame before modelling.

    Steps, in order:
      1. drop "text", "LSOA_Code", "LSOA_Name" and "Crime Density 2022";
      2. binary-encode "Year", "Month" and "LSOA11NM";
      3. standardise the columns listed in ``scale_cols``;
      4. drop the columns listed in ``drops``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing every column named above.

    Returns
    -------
    pandas.DataFrame
        The treated frame.

    Notes
    -----
    The encoder and the scaler are fitted on whatever frame is passed in, so
    treating training and test data in separate calls uses a different fit
    for each.
    """
    scale_cols = ['WH', '%WH', 'MH', '%MH', 'WLH', '%WLH',
       'COMESTRES', 'POPDEN', 'HHOLDS', 'AVHHOLDSZ', 'Area',
       'BKMRK', 'FVRT', 'RTWT', 'RPLY', 'AVG_SENT',
       'POP', 'HECT', 'LSOA_HHOLDS',
       'CPHHDC', 'CPHHWDC', 'LPH', 'OPH', '%CPHHDC', '%CPHHWDC', '%LPH',
       'HHEML', 'OO', 'OWML', 'SR', 'PR', '%OO', '%OWML', '%SR', '%PR',
       'Median Price', 'Sales', '%HWNAEWC', '%PA', 'NFCB', 'MEAN_AIE',
       'MEDIAN_AIE', 'Max_Temp', 'Min_Temp', 'Frost Days', 'Rain', 'Sun']

    drops = ['WH', '%WH', 'MH', '%MH', 'WLH', '%WLH',
       'COMESTRES', 'POPDEN', 'HHOLDS', 'AVHHOLDSZ', 'Area',
       'POP', 'HECT', 'LSOA_HHOLDS',
       'CPHHDC', 'CPHHWDC', 'LPH', 'OPH', '%CPHHDC', '%CPHHWDC', '%LPH',
       'HHEML', 'OO', 'OWML', 'SR', 'PR', '%OO', '%OWML', '%SR', '%PR',
       'Median Price', 'Sales', '%HWNAEWC', '%PA', 'NFCB', 'MEAN_AIE',
       'MEDIAN_AIE']

    x = x.drop(columns=["text","LSOA_Code","LSOA_Name","Crime Density 2022"])
    x = ce.BinaryEncoder(cols=["Year","Month","LSOA11NM"]).fit_transform(x)

    # Assign whole columns instead of ``x.loc[:, scale_cols] = ...``: the .loc
    # form writes into the existing columns and can trigger dtype-incompatibility
    # warnings when they are integer typed; this replaces them outright.
    x[scale_cols] = StandardScaler().fit_transform(x[scale_cols])

    return x.drop(columns=drops)


In [None]:
# Preview the column names produced by the preprocessing step.
# NOTE(review): relies on `x` already being bound, which happens in the
# "Data Inputs" cells further down - confirm the intended run order.
treat_dataframes(x).columns

## Make Discrete Predictions

#### Loadings

In [None]:
# Data Inputs (General)

# Select the all-crime train/test split as the active modelling inputs.
crime_type = "General"
x, y = full_train_x, full_train_y
test_x, test_y = full_test_x, full_test_y

In [None]:
# List the distinct values of the "Crime Type" column in the training split.
print(specific_train_x["Crime Type"].unique())

In [None]:
# Crime type to model; the next cell filters the "Crime Type" column on this value.
crime_type = "Other theft"

In [None]:
# Data Inputs (Specific)



# Training rows for the chosen crime type. Boolean indexing already returns a
# new frame, so the split is not copied beforehand.
specific_x = specific_train_x[specific_train_x["Crime Type"] == crime_type].drop(columns="Crime Type")
index = specific_x.index
specific_y = specific_train_y.loc[index]

# Test rows for the same crime type, with the targets aligned on the index.
test_specific_x = specific_test_x[specific_test_x["Crime Type"] == crime_type].drop(columns="Crime Type")
test_index = test_specific_x.index
test_specific_y = specific_test_y.loc[test_index]

# Make the per-crime split the active modelling inputs.
x, y = specific_x, specific_y
test_x, test_y = test_specific_x, test_specific_y

In [None]:
# Print Dimensions and Crime Type
print(crime_type)

# Each line reports "<treated column count> x <row count>" for its own split,
# so the test line takes its column count from the treated test frame.
print("Training Dimensions are", len(treat_dataframes(x).columns), "x", len(x))
print("Test Dimensions are", len(treat_dataframes(test_x).columns), "x", len(test_x))

#### Support Vector Machines

In [None]:
# SVC Hyperparameters

def support_vector_machines_params(x,y):
    """Search for SVC hyperparameters on SMOTE-balanced training data.

    ``x`` is passed through ``treat_dataframes``; the minority class is then
    oversampled with SMOTE and a ``HalvingRandomSearchCV`` over ``svm.SVC``
    is run, scored by macro-averaged F1. SMOTE and the search are both seeded
    with the module-level ``random_state``.

    Alternative tried in this notebook: searching a
    ``BalancedBaggingClassifier`` wrapped around the SVC, which needs the
    same parameter names prefixed with ``base_estimator__``.

    Returns
    -------
    dict
        Copy of the best parameter set, usable as ``svm.SVC(**params)``.
    """
    features = treat_dataframes(x)
    macro_f1 = make_scorer(f1_score, average='macro')

    search_space = {
        "C": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "degree": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        "gamma": ["scale", "auto"],
        "coef0": [0, 1, 2, 3, 4, 5],
        "shrinking": [True, False],
        "probability": [True, False],
    }

    # Balance the classes before searching.
    oversampler = SMOTE(sampling_strategy='minority', random_state=random_state, k_neighbors=5)
    features, labels = oversampler.fit_resample(features, y)

    searcher = HalvingRandomSearchCV(
        svm.SVC(),
        search_space,
        random_state=random_state,
        scoring=macro_f1,
    )
    searcher.fit(features, labels)

    return searcher.best_params_.copy()
# Support Vector Machine Fitting

def support_vector_machines_fitting(x,y,params):
    """Fit ``svm.SVC(**params)`` on the treated version of ``x``.

    ``x`` is passed through ``treat_dataframes``; ``y`` is used unchanged.
    Returns whatever ``SVC.fit`` returns.
    """
    features = treat_dataframes(x)
    classifier = svm.SVC(**params)
    return classifier.fit(features, y)
# Support Vector Machine Prediction

def support_vector_machines_prediction(model,x):
    """Return ``model.predict(x)``; ``x`` is handed to the model as-is."""
    predictions = model.predict(x)
    return predictions

In [None]:
# Load Hyperparameters
# Runs the SVC hyperparameter search on the active training inputs (x, y).

svm_hyperparams = support_vector_machines_params(x,y)

In [None]:
# Show HyperParameters
# Display the parameter dict returned by the search.

svm_hyperparams

In [None]:
# Function Calls
# Fit the SVC with the selected parameters, then treat the test features.
# NOTE(review): treat_dataframes fits its encoder and scaler on the frame it is
# given, so test_x is transformed with its own fit rather than the training
# fit - confirm this is intended.

svm_fitting = support_vector_machines_fitting(x,y,svm_hyperparams)
svm_prediction = treat_dataframes(test_x)

In [None]:
# Predict Test
# Predict on the treated test features and display the predictions.

svm_predict_test = support_vector_machines_prediction(svm_fitting,svm_prediction)
svm_predict_test

In [None]:
# Show SVM Accuracy
# Accuracy of the SVC predictions against the held-out targets.

accuracy_score(y_pred=svm_predict_test,y_true=test_y)

In [None]:
# Generate F1 Score

# f1_score's positional order is (y_true, y_pred); keywords keep the
# predictions out of the y_true slot and match the scoring cells further down.
f1_score(y_true=test_y, y_pred=svm_predict_test)

#### XGBoost

In [None]:
# XGBoost Hyperparameters

def ex_gee_boost_params(x,y):
    """Search for XGBClassifier hyperparameters on SMOTE-balanced training data.

    ``x`` is passed through ``treat_dataframes``; the minority class is then
    oversampled with SMOTE and a ``HalvingRandomSearchCV`` over
    ``xgb.XGBClassifier`` is run, scored by macro-averaged F1. SMOTE and the
    search are both seeded with the module-level ``random_state``.

    Alternative tried in this notebook: searching a
    ``BalancedBaggingClassifier`` wrapped around the classifier, which needs
    the same parameter names prefixed with ``base_estimator__``.

    Returns
    -------
    dict
        Copy of the best parameter set, usable as ``xgb.XGBClassifier(**params)``.
    """
    features = treat_dataframes(x)
    macro_f1 = make_scorer(f1_score, average='macro')

    search_space = {
        "n_estimators": [12, 30, 50, 100, 125, 150, 175, 200, 225, 250,
                         275, 300, 325, 350, 375, 400, 425, 450, 475, 500],
        "objective": ["binary:logistic"],
        "learning_rate": [0.001, 0.003, 0.006, 0.009, 0.01, 0.05, 0.06,
                          0.07, 0.08, 0.09, 0.1, 0.12, 0.15, 0.2],
        "max_depth": [1, 2, 3, 4, 5],
        "gamma": [1, 2, 3, 4, 5],
        "colsample_bylevel": [0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 1],
        "colsample_bytree": [0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 1],
        "base_score": [0.25, 0.5, 0.6],
    }

    # Balance the classes before searching.
    oversampler = SMOTE(sampling_strategy='minority', random_state=random_state, k_neighbors=5)
    features, labels = oversampler.fit_resample(features, y)

    searcher = HalvingRandomSearchCV(
        xgb.XGBClassifier(),
        search_space,
        random_state=random_state,
        scoring=macro_f1,
    )
    searcher.fit(features, labels)

    return searcher.best_params_.copy()
# XGBoost Model Fitting

def ex_gee_boost_fitting(x,y,params):
    """Fit ``xgb.XGBClassifier(**params)`` on the treated version of ``x``.

    ``x`` is passed through ``treat_dataframes``; ``y`` is used unchanged.
    The configured classifier is printed before fitting. Returns whatever
    ``XGBClassifier.fit`` returns.
    """
    features = treat_dataframes(x)
    booster = xgb.XGBClassifier(**params)
    print(booster)
    return booster.fit(features, y)

def ex_gee_boost_prediction(model,x):
    """Return ``model.predict(x)``; ``x`` is handed to the model as-is."""
    predictions = model.predict(x)
    return predictions

In [None]:
# Load Hyperparameters
# Runs the XGBoost hyperparameter search on the active training inputs (x, y).

xgb_hyperparams = ex_gee_boost_params(x,y)


In [None]:
# Show Hyperparameters
# Display the parameter dict returned by the search.

xgb_hyperparams

In [None]:
# Function Calls

# Reuse the parameters stored in xgb_hyperparams instead of running the whole
# search a second time; this mirrors how the SVM and MLP cells are written.
xgb_fitting = ex_gee_boost_fitting(x, y, xgb_hyperparams)
xgb_prediction = treat_dataframes(test_x)

In [None]:
# Predict Test
# Predict on the treated test features and display the predictions.

xgb_predict_test = ex_gee_boost_prediction(xgb_fitting,xgb_prediction)
xgb_predict_test

In [None]:
# Generate Accuracy
# Accuracy of the XGBoost predictions against the held-out targets.

accuracy_score(y_pred=xgb_predict_test,y_true=test_y)

In [None]:
# Generate F1 Score

# f1_score's positional order is (y_true, y_pred); keywords keep the
# predictions out of the y_true slot and match the scoring cells further down.
f1_score(y_true=test_y, y_pred=xgb_predict_test)

#### Multi-Layer Perceptron

In [None]:
# Neural Network Hyperparameters

def multi_layer_perceptron_params(x,y):
    """Search for MLPClassifier hyperparameters on SMOTE-balanced training data.

    ``x`` is passed through ``treat_dataframes``; the minority class is then
    oversampled with SMOTE and a ``HalvingRandomSearchCV`` over
    ``MLPClassifier`` is run, scored by macro-averaged F1. SMOTE and the
    search are both seeded with the module-level ``random_state``.

    Alternative tried in this notebook: searching a
    ``BalancedBaggingClassifier`` wrapped around the classifier, which needs
    the same parameter names prefixed with ``base_estimator__``.

    Returns
    -------
    dict
        Copy of the best parameter set, usable as ``MLPClassifier(**params)``.
        A copy is returned to match the SVM and XGBoost search functions.
    """
    x = treat_dataframes(x)
    f1 = make_scorer(f1_score, average='macro')

    param_distributions = {
        # NOTE: entries such as 5 or 100 are plain ints (the notebook wrote them
        # as ``(5)``, which is not a one-element tuple) - presumably a single
        # hidden layer of that width is intended; confirm.
        "hidden_layer_sizes": [5, (5,5), (5,5,5), 10, 20, 30, 40,
                               (10,10), (20,20), (30,30), (40,40),
                               (10,10,10), (20,20,20), (30,30,30), (40,40,40),
                               (10,30,10), (20,40,10), (10,40,20), (30,10,30),
                               100, 200, (100,100), (100,200), (200,100),
                               (300,200,100)],
        "activation": ["identity", "logistic", "tanh", "relu"],
        "solver": ["lbfgs", "sgd", "adam"],
        "alpha": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05],
        "batch_size": [50, 100, 150, 200, 250, 300, 350],
        "learning_rate": ["constant", "invscaling", "adaptive"],
        "max_iter": [200, 300, 500],
    }

    # Balance the classes before searching.
    smote = SMOTE(sampling_strategy='minority', random_state=random_state, k_neighbors=5)
    x, y = smote.fit_resample(x, y)

    search = HalvingRandomSearchCV(MLPClassifier(), param_distributions,
                                   random_state=random_state, scoring=f1).fit(x, y)

    return search.best_params_.copy()
# Neural Network Fitting

def multi_layer_perceptron_fitting(x,y,params):
    """Fit ``MLPClassifier(**params)`` on the treated version of ``x``.

    ``x`` is passed through ``treat_dataframes``; ``y`` is used unchanged.
    Returns whatever ``MLPClassifier.fit`` returns.
    """
    features = treat_dataframes(x)
    network = MLPClassifier(**params)
    return network.fit(features, y)
# Neural Network Predict

def multi_layer_perceptron_predict(model,x):
    """Return ``model.predict(x)``; ``x`` is handed to the model as-is."""
    predictions = model.predict(x)
    return predictions

In [None]:
# Load Hyperparameters
# Runs the MLP hyperparameter search on the active training inputs (x, y).

mpl_hyperparams = multi_layer_perceptron_params(x,y)

In [None]:
# Show Hyperparameters
# Display the parameter dict returned by the search.

mpl_hyperparams

In [None]:
# Function Calls
# Fit the MLP with the selected parameters, then treat the test features.
# NOTE(review): treat_dataframes fits its encoder and scaler on the frame it is
# given, so test_x is transformed with its own fit rather than the training
# fit - confirm this is intended.

mpl_fitting = multi_layer_perceptron_fitting(x,y,mpl_hyperparams)
mpl_prediction = treat_dataframes(test_x)

In [None]:
# Predict Test
# Predict on the treated test features and display the predictions.

mpl_predict_test = multi_layer_perceptron_predict(mpl_fitting,mpl_prediction)
mpl_predict_test

In [None]:
# Generate Accuracy
# Accuracy of the MLP predictions against the held-out targets.

accuracy_score(y_pred=mpl_predict_test,y_true=test_y)

In [None]:
# Generate F1 Score

# f1_score's positional order is (y_true, y_pred); keywords keep the
# predictions out of the y_true slot and match the scoring cells further down.
f1_score(y_true=test_y, y_pred=mpl_predict_test)

### Ensemble Predictions

In [None]:
# Generate Ensemble Prediction

# Combine the three models: add their predictions element-wise, divide by 3,
# and label a row 1 unless that mean falls below 0.6 (with 0/1 predictions this
# is a two-out-of-three vote - assumes the labels are 0/1).
ensemble = [
    0 if vote_total / 3 < 0.6 else 1
    for vote_total in (mpl_predict_test + svm_predict_test + xgb_predict_test)
]

## Discrete Scoring

In [None]:
# SVC F1 Scores

# Accuracy and F1 of the SVC predictions, each rounded to two decimals.
print(
    f" SVC Accuracy Score:{round(accuracy_score(y_pred=svm_predict_test,y_true=test_y),2)}",
    f"\n SVC F1 Score {round(f1_score(y_pred=svm_predict_test,y_true=test_y),2)}",
)

In [None]:
# XGB Scores

# Accuracy and F1 of the XGBoost predictions, each rounded to two decimals.
print(
    f" XGB Accuracy Score:{round(accuracy_score(y_pred=xgb_predict_test,y_true=test_y),2)}",
    f"\n XGB F1 Score {round(f1_score(y_pred=xgb_predict_test,y_true=test_y),2)}",
)

In [None]:
# MPL F1 Score

# Accuracy and F1 of the MLP predictions, each rounded to two decimals.
print(
    f" MPL Accuracy Score:{round(accuracy_score(y_pred=mpl_predict_test,y_true=test_y),2)}",
    f"\n MPL F1 Score {round(f1_score(y_pred=mpl_predict_test,y_true=test_y),2)}",
)

In [None]:
# Calculate Ensemble Scores

# Accuracy and F1 of the ensemble predictions, each rounded to two decimals.
print(
    f" Ensemble Accuracy Score:{round(accuracy_score(y_pred=ensemble,y_true=test_y),2)}",
    f"\n Ensemble F1 Score {round(f1_score(y_pred=ensemble,y_true=test_y),2)}",
)

## Collective Scoring

### Arrange Datasets for Collective Scoring

# Crime types that each get their own train/test split, in reporting order.
crime_types = [
    "Anti-social behaviour",
    "Theft from the person",
    "Possession of weapons",
    "Other crime",
    "Violence and sexual offences",
    "Bicycle theft",
    "Criminal damage and arson",
    "Robbery",
    "Public order",
    "Burglary",
    "Vehicle crime",
    "Drugs",
    "Shoplifting",
    "Other theft",
]

# Every entry is [train_x, train_y, test_x, test_y].
datasets = {"Full Data": [full_train_x, full_train_y, full_test_x, full_test_y]}

for crime in crime_types:
    # Boolean indexing returns a new frame, so the splits are not copied first.
    crime_train_x = specific_train_x[specific_train_x["Crime Type"] == crime].drop(columns="Crime Type")
    index = crime_train_x.index
    crime_test_x = specific_test_x[specific_test_x["Crime Type"] == crime].drop(columns="Crime Type")
    test_index = crime_test_x.index

    # Targets are aligned to the filtered features through the shared index.
    datasets[crime] = [
        crime_train_x,
        specific_train_y.loc[index],
        crime_test_x,
        specific_test_y.loc[test_index],
    ]


### Call Functions in Loop for 50 Features

# Reduced run: the all-crime data plus the crime types listed here.
crime_types = ["Possession of weapons"]

# Every entry is [train_x, train_y, test_x, test_y].
datasets = {"Full Data": [full_train_x, full_train_y, full_test_x, full_test_y]}

for crime in crime_types:
    # Boolean indexing returns a new frame, so the splits are not copied first.
    crime_train_x = specific_train_x[specific_train_x["Crime Type"] == crime].drop(columns="Crime Type")
    index = crime_train_x.index
    crime_test_x = specific_test_x[specific_test_x["Crime Type"] == crime].drop(columns="Crime Type")
    test_index = crime_test_x.index

    # Targets are aligned to the filtered features through the shared index.
    datasets[crime] = [
        crime_train_x,
        specific_train_y.loc[index],
        crime_test_x,
        specific_test_y.loc[test_index],
    ]

# Results table keyed by dataset name; each value starts as None.
scoring = dict.fromkeys(datasets)

def treat_dataframes(x):
    """Encode and scale a feature frame, keeping every treated column.

    Steps, in order:
      1. drop "text", "LSOA_Code", "LSOA_Name" and "Crime Density 2022";
      2. binary-encode "Year", "Month" and "LSOA11NM";
      3. standardise the columns listed in ``scale_cols``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing every column named above.

    Returns
    -------
    pandas.DataFrame
        The treated frame.

    Notes
    -----
    The encoder and the scaler are fitted on whatever frame is passed in, so
    treating training and test data in separate calls uses a different fit
    for each.
    """
    scale_cols = ['WH', '%WH', 'MH', '%MH', 'WLH', '%WLH',
       'COMESTRES', 'POPDEN', 'HHOLDS', 'AVHHOLDSZ', 'Area',
       'BKMRK', 'FVRT', 'RTWT', 'RPLY', 'AVG_SENT',
       'POP', 'HECT', 'LSOA_HHOLDS',
       'CPHHDC', 'CPHHWDC', 'LPH', 'OPH', '%CPHHDC', '%CPHHWDC', '%LPH',
       'HHEML', 'OO', 'OWML', 'SR', 'PR', '%OO', '%OWML', '%SR', '%PR',
       'Median Price', 'Sales', '%HWNAEWC', '%PA', 'NFCB', 'MEAN_AIE',
       'MEDIAN_AIE', 'Max_Temp', 'Min_Temp', 'Frost Days', 'Rain', 'Sun']

    x = x.drop(columns=["text","LSOA_Code","LSOA_Name","Crime Density 2022"])
    x = ce.BinaryEncoder(cols=["Year","Month","LSOA11NM"]).fit_transform(x)

    # Assign whole columns instead of ``x.loc[:, scale_cols] = ...``: the .loc
    # form writes into the existing columns and can trigger dtype-incompatibility
    # warnings when they are integer typed; this replaces them outright.
    x[scale_cols] = StandardScaler().fit_transform(x[scale_cols])

    return x

# Train, predict and score every model on every dataset, printing a report
# per dataset and storing the rounded metrics in ``scoring``.
for dataset_name, (x, y, test_x, test_y) in datasets.items():

    scoring[dataset_name] = dict()

    # Call SVMs
    svm_hyperparams = support_vector_machines_params(x, y)
    svm_fitting = support_vector_machines_fitting(x, y, svm_hyperparams)
    svm_prediction = treat_dataframes(test_x)
    svm_predict_test = support_vector_machines_prediction(svm_fitting, svm_prediction)

    # Call XGBs: search once and fit once. The third argument must be the
    # parameter dict; a fitted model cannot be unpacked with ``**params``.
    xgb_hyperparams = ex_gee_boost_params(x, y)
    xgb_fitting = ex_gee_boost_fitting(x, y, xgb_hyperparams)
    xgb_prediction = treat_dataframes(test_x)
    xgb_predict_test = ex_gee_boost_prediction(xgb_fitting, xgb_prediction)

    # Call MLPs
    mpl_hyperparams = multi_layer_perceptron_params(x, y)
    mpl_fitting = multi_layer_perceptron_fitting(x, y, mpl_hyperparams)
    mpl_prediction = treat_dataframes(test_x)
    mpl_predict_test = multi_layer_perceptron_predict(mpl_fitting, mpl_prediction)

    # Ensemble: add the three prediction arrays element-wise, divide by 3 and
    # label a row 1 unless that mean falls below 0.6. The comprehension has its
    # own variable so the dataset name is never overwritten.
    ensemble = [
        0 if vote_total / 3 < 0.6 else 1
        for vote_total in (mpl_predict_test + svm_predict_test + xgb_predict_test)
    ]

    # (key used in ``scoring``, label used in the printout, predictions)
    model_outputs = [
        ("SVM", "SVC", svm_predict_test),
        ("XGB", "XGB", xgb_predict_test),
        ("MPL", "MPL", mpl_predict_test),
        ("Ensemble", "Ensemble", ensemble),
    ]

    print(dataset_name)
    print("------------------------------")
    for position, (score_key, label, predicted) in enumerate(model_outputs):
        # Each metric is computed once and used for both the report and the table.
        accuracy_value = round(accuracy_score(y_pred=predicted, y_true=test_y), 2)
        f1_value = round(f1_score(y_pred=predicted, y_true=test_y), 2)

        if position:
            print("\n")
        print(f" {label} Accuracy Score:{accuracy_value}", f"\n {label} F1 Score {f1_value}")

        scoring[dataset_name][score_key] = {"Accuracy": accuracy_value, "F1 Score": f1_value}
    print("==============================")



### Call Functions in Loop for 9 Features

# Start a fresh results table keyed by dataset name (each value starts as None).
# NOTE(review): this rebinds `scoring`, so the results stored by the previous
# loop are no longer reachable through this name - confirm that is intended.
scoring = dict.fromkeys(datasets)

def treat_dataframes(x):
    """Encode and scale a feature frame, then keep nine selected columns.

    Steps, in order:
      1. drop "text", "LSOA_Code", "LSOA_Name" and "Crime Density 2022";
      2. binary-encode "Year", "Month" and "LSOA11NM";
      3. standardise the columns listed in ``scale_cols``;
      4. keep only the columns listed in ``kept_cols``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing every column named above.

    Returns
    -------
    pandas.DataFrame
        The nine selected, standardised columns.

    Notes
    -----
    The scaler is fitted on whatever frame is passed in, so treating training
    and test data in separate calls uses a different fit for each. None of
    the encoded columns are in ``kept_cols``, so the encoding step does not
    affect the returned columns.
    """
    scale_cols = ['WH', '%WH', 'MH', '%MH', 'WLH', '%WLH',
       'COMESTRES', 'POPDEN', 'HHOLDS', 'AVHHOLDSZ', 'Area',
       'BKMRK', 'FVRT', 'RTWT', 'RPLY', 'AVG_SENT',
       'POP', 'HECT', 'LSOA_HHOLDS',
       'CPHHDC', 'CPHHWDC', 'LPH', 'OPH', '%CPHHDC', '%CPHHWDC', '%LPH',
       'HHEML', 'OO', 'OWML', 'SR', 'PR', '%OO', '%OWML', '%SR', '%PR',
       'Median Price', 'Sales', '%HWNAEWC', '%PA', 'NFCB', 'MEAN_AIE',
       'MEDIAN_AIE', 'Max_Temp', 'Min_Temp', 'Frost Days', 'Rain', 'Sun']

    kept_cols = ["AVG_SENT","BKMRK","RTWT","RPLY","Max_Temp","Min_Temp","Frost Days","Rain","Sun"]

    x = x.drop(columns=["text","LSOA_Code","LSOA_Name","Crime Density 2022"])
    x = ce.BinaryEncoder(cols=["Year","Month","LSOA11NM"]).fit_transform(x)

    # Assign whole columns instead of ``x.loc[:, scale_cols] = ...``: the .loc
    # form writes into the existing columns and can trigger dtype-incompatibility
    # warnings when they are integer typed; this replaces them outright.
    x[scale_cols] = StandardScaler().fit_transform(x[scale_cols])

    return x.loc[:, kept_cols]

# Train, predict and score every model on every dataset, printing a report
# per dataset and storing the rounded metrics in ``scoring``.
for dataset_name, (x, y, test_x, test_y) in datasets.items():

    scoring[dataset_name] = dict()

    # Call SVMs
    svm_hyperparams = support_vector_machines_params(x, y)
    svm_fitting = support_vector_machines_fitting(x, y, svm_hyperparams)
    svm_prediction = treat_dataframes(test_x)
    svm_predict_test = support_vector_machines_prediction(svm_fitting, svm_prediction)

    # Call XGBs: search once and fit once. The third argument must be the
    # parameter dict; a fitted model cannot be unpacked with ``**params``.
    xgb_hyperparams = ex_gee_boost_params(x, y)
    xgb_fitting = ex_gee_boost_fitting(x, y, xgb_hyperparams)
    xgb_prediction = treat_dataframes(test_x)
    xgb_predict_test = ex_gee_boost_prediction(xgb_fitting, xgb_prediction)

    # Call MLPs
    mpl_hyperparams = multi_layer_perceptron_params(x, y)
    mpl_fitting = multi_layer_perceptron_fitting(x, y, mpl_hyperparams)
    mpl_prediction = treat_dataframes(test_x)
    mpl_predict_test = multi_layer_perceptron_predict(mpl_fitting, mpl_prediction)

    # Ensemble: add the three prediction arrays element-wise, divide by 3 and
    # label a row 1 unless that mean falls below 0.6. The comprehension has its
    # own variable so the dataset name is never overwritten.
    ensemble = [
        0 if vote_total / 3 < 0.6 else 1
        for vote_total in (mpl_predict_test + svm_predict_test + xgb_predict_test)
    ]

    # (key used in ``scoring``, label used in the printout, predictions)
    model_outputs = [
        ("SVM", "SVC", svm_predict_test),
        ("XGB", "XGB", xgb_predict_test),
        ("MPL", "MPL", mpl_predict_test),
        ("Ensemble", "Ensemble", ensemble),
    ]

    print(dataset_name)
    print("------------------------------")
    for position, (score_key, label, predicted) in enumerate(model_outputs):
        # Each metric is computed once and used for both the report and the table.
        accuracy_value = round(accuracy_score(y_pred=predicted, y_true=test_y), 2)
        f1_value = round(f1_score(y_pred=predicted, y_true=test_y), 2)

        if position:
            print("\n")
        print(f" {label} Accuracy Score:{accuracy_value}", f"\n {label} F1 Score {f1_value}")

        scoring[dataset_name][score_key] = {"Accuracy": accuracy_value, "F1 Score": f1_value}
    print("==============================")

