In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to echo the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import datetime
from folium import LayerControl, Map, Marker, FeatureGroup, Icon
from folium.plugins import FastMarkerCluster
import matplotlib.pyplot as plt
%matplotlib inline

# Stop and Search dataset contains data for the following dates:
## March/2017 - February/2020

In [None]:
# Load the combined, already-cleaned stop-and-search records into `df`.
csv_path = 'stop_and_search.csv'
df = pd.read_csv(csv_path)

# What ethnicities are stopped and searched more often? Is there a large gap between the counts of one ethnicity being stopped over another?

I predicted that there is a noticeable gap between the number of stops for one ethnicity and the others. We measure success by counting the number of people stopped for each ethnicity, then comparing the counts to see whether the gap between any two of them is much larger than the gaps across the rest of the data.

In [None]:
# Tally how often each self-defined ethnicity appears. value_counts() orders the
# result from most to least frequent, so the first rows of the table are the most
# common ethnicities and the last rows are the least common.
ethnicity_counts = df['Self-defined ethnicity'].value_counts()

# labels and their counts, kept as two parallel lists
ethnicities = ethnicity_counts.index.tolist()
counts = ethnicity_counts.tolist()

# two-column table: one row per ethnicity
df_e = pd.DataFrame({'Ethnicity': ethnicities, 'Count': counts})

## Top 10 ethnicities stopped and searched

In [None]:
# Horizontal bar chart of the ten most frequently recorded ethnicities
# (the first ten rows of df_e).
top10 = df_e.head(10)
top10.plot.barh(x='Ethnicity', y='Count', figsize=(10, 5), title='Top 10 Ethnicities Stopped and Searched');

Looking at the top 10 ethnicities stopped and searched, the counts rise fairly steadily from one ethnicity to the next. There is a huge spike, however, for the 'White - English/Welsh/Scottish/Northern Irish/British' ethnicity. It is possible that this ethnicity is an outlier among the rest of the data. We can double-check this by creating a pie chart.

In [None]:
# Pie chart of the first ten rows of df_e; the first wedge is pulled out of the
# pie for emphasis and the other nine stay in place.
top_rows = df_e.head(10)
label, y = top_rows['Ethnicity'], top_rows['Count']
f, a = plt.subplots(figsize=(20, 8))
a.pie(y, labels=label, autopct='%1.1f%%', explode=(0.1,) + (0,) * 9);
plt.title('Percentage of Top 10 Ethnicities Stopped and Searched');
plt.show();

After creating this pie chart, we can now clearly see that the ethnicity 'White - English/Welsh/Scottish/Northern Irish/British' is an outlier and takes up almost half the pie chart.

## Top 10 ethnicities NOT stopped and searched (a.k.a. bottom 10)

In [None]:
# Plot the ten least frequently recorded ethnicities.
# tail(10) takes the last ten rows of df_e whatever its length. The previous
# hard-coded .loc[26:37] slice is label-inclusive, so it selected more than ten
# rows whenever rows 26-36 all exist, and it would silently point at the wrong
# rows if the number of ethnicity categories ever changed.
df_e.tail(10).plot(x = 'Ethnicity', y = 'Count', kind = 'barh', figsize = (10,5), title = 'Top 10 Ethnicities NOT Stopped and Searched');

Looking at the bottom 10 ethnicities stopped and searched it looks like there is a consistent increase in the counts between ethnicities. It doesn't look like there are any outliers, but we can analyze the data further by creating a pie chart.

In [None]:
# Pie chart of the ten least frequently recorded ethnicities.
# tail(10) selects exactly the last ten rows of df_e; the earlier .loc[26:37]
# slice is label-inclusive and hard-coded, so it could include an extra row and
# depended on the table having a fixed number of categories.
bottom10 = df_e.tail(10)
label = bottom10['Ethnicity']
y = bottom10['Count']
f, a = plt.subplots(figsize = (20,8))
a.pie(y, autopct='%1.1f%%', labels=label);
plt.title('Percentage of Top 10 Ethnicities NOT Stopped and Searched');
plt.show();

After creating this pie chart, we can see more clearly that there is in fact a consistent increase in the counts for each ethnicity.

After looking at both the top 10 and the bottom 10, we can see that the only clear gap between the counts (outlier) is for the ethnicity 'White - English/Welsh/Scottish/Northern Irish/British'. From this we can draw two possible explanations for why this ethnicity is an outlier in the number of times its members were stopped and searched: either that ethnicity is the majority in the U.K., or that ethnicity is targeted by police more frequently.

# What age range is more likely to get stopped and searched?

I predicted that people in their early 20s get stopped and searched the most. We measure whether the prediction was correct by checking which age range accounts for the highest percentage of stops and searches.

In [None]:
# Tally how often each age range appears; value_counts() orders the result from
# most to least frequent.
age_counts = df['Age range'].value_counts()

# labels and their counts, kept as two parallel lists
ageRange = age_counts.index.tolist()
count = age_counts.tolist()

# two-column table: one row per age range
df_a = pd.DataFrame({'Age range': ageRange, 'Count': count})

## Bar Graph of Age Ranges Stopped and Searched

In [None]:
# Horizontal bar chart with one bar per age range in df_a.
df_a.plot.barh(x='Age range', y='Count', figsize=(10, 5), title='Age Ranges That Were Stopped and Searched');

After creating this bar graph, you can clearly see that the age range that gets stopped and searched the most is 18-24. Not only are they the most stopped and searched but the gap between this age range and the others is greater than the gaps between the other age ranges.

## Pie Chart of Age Ranges Stopped and Searched

In [None]:
# Pie chart of each age range's share of all recorded stops.
l = df_a['Age range']
y = df_a['Count']
# Pull only the first wedge out of the pie. The offsets are built from the number
# of rows actually present, because matplotlib requires `explode` to have exactly
# one entry per wedge; a fixed 5-tuple fails as soon as the data holds a different
# number of age ranges.
explode = [0.1 if i == 0 else 0 for i in range(len(y))]
x, r = plt.subplots(figsize = (20,8))
r.pie(y, autopct='%1.1f%%', labels=l, explode=explode);
plt.legend(bbox_to_anchor=(1,0.9))
plt.title('Percentage of Age Ranges Stopped and Searched');
plt.show();

After creating the pie chart, you can see clearly that the age range 18-24 takes up more than a third of the total amount of people stopped and searched. All other age ranges take up less than a quarter of the total.

## Locations of Age Ranges Being Stopped and Searched

In [None]:
# Build one coordinate table per age range, discarding rows with missing values.
def _rows_for_age(age_range, columns):
    """Return the rows of `df` whose 'Age range' equals `age_range`.

    Only `columns` are kept, and any row with a missing value in those
    columns is dropped.
    """
    matches = df.loc[df['Age range'] == age_range, columns]
    return matches.dropna()


_lon_lat_cols = ['Age range', 'Longitude', 'Latitude']

df_under10 = _rows_for_age('under 10', _lon_lat_cols)

df_10_17 = _rows_for_age('10-17', _lon_lat_cols)

df_18_24 = _rows_for_age('18-24', _lon_lat_cols)

df_25_34 = _rows_for_age('25-34', _lon_lat_cols)

# same three columns, but with latitude listed before longitude
df_over34 = _rows_for_age('over 34', ['Age range', 'Latitude', 'Longitude'])

In [None]:
# centre of the initial map view, passed to Map as `location`
coor = (53, -3)

# base map
# NOTE(review): recent folium releases reportedly no longer bundle the Stamen
# tile sets - confirm that this tile name still resolves in the installed version.
age_map = Map(location=coor, zoom_start=6, tiles='Stamen Watercolor')


def _cluster_for(frame):
    """Build a FastMarkerCluster from the frame's [Latitude, Longitude] pairs."""
    return FastMarkerCluster(frame[['Latitude', 'Longitude']].values.tolist())


# one marker cluster per age range, in the same order as the frames
under10, r10_17, r18_24, r25_34, over34 = (
    _cluster_for(frame)
    for frame in (df_under10, df_10_17, df_18_24, df_25_34, df_over34)
)

# one named feature group per age range, used for filtering on the map
under10f, r10_17f, r18_24f, r25_34f, over34f = (
    FeatureGroup(name=group_name)
    for group_name in ('under 10', '10-17', '18-24', '25-34', 'over 34')
)

In [None]:
# pair every feature group with the marker cluster it should hold
layer_pairs = [
    (under10f, under10),
    (r10_17f, r10_17),
    (r18_24f, r18_24),
    (r25_34f, r25_34),
    (over34f, over34),
]

# first put each cluster inside its feature group ...
for group, cluster in layer_pairs:
    group.add_child(cluster)

# ... then attach the feature groups to the map, in the same order
for group, _ in layer_pairs:
    age_map.add_child(group)

# layer control goes on last, after all the feature groups
age_map.add_child(LayerControl())

# render the map; the trailing semicolon keeps the cell's echoed output suppressed
display(age_map);

After creating the map, you can see that people in the 'under 10' age range were stopped and searched only in England and Wales; no one under the age of 10 was stopped in Northern Ireland or Scotland. All other age ranges were stopped and searched in all four countries.