In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

In [2]:
shooting_df = pd.read_csv('data/fatal-police-shootings-data.csv')
shooting_df = shooting_df.rename(columns = {'city': 'City'})
shooting_df.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,City,state,signs_of_mental_illness,threat_level,flee,body_camera,longitude,latitude,is_geocoding_exact
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False,-123.122,47.247,True
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False,-122.892,45.487,True
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False,-97.281,37.695,True
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False,-122.422,37.763,True
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False,-104.692,40.384,True


In [3]:
census_df = pd.read_csv('data/full_census_data.csv')
census_df = census_df.drop(columns= {"Unnamed: 0", "Unnamed: 0.1"})
census_df = census_df.rename(columns = {"Geographic Area" : "state"})
census_df.head()

Unnamed: 0,state,City,Median Income,percent_completed_hs,poverty_rate,share_white,share_black,share_native_american,share_asian,share_hispanic
0,AL,Abanda,11207,21.2,78.8,67.2,30.2,0.0,0.0,1.6
1,AL,Abbeville,25615,69.1,29.1,54.4,41.4,0.1,1.0,3.1
2,AL,Adamsville,42575,78.9,25.5,52.3,44.9,0.5,0.3,2.3
3,AL,Addison,37083,81.4,30.7,99.1,0.1,0.0,0.1,0.4
4,AL,Akron,21667,68.6,42.0,13.2,86.5,0.0,0.0,0.3


In [4]:
merged = shooting_df.merge(census_df, how = 'left', on=['City', 'state'])
data = merged
data

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,City,state,...,latitude,is_geocoding_exact,Median Income,percent_completed_hs,poverty_rate,share_white,share_black,share_native_american,share_asian,share_hispanic
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,...,47.247,True,37072,80.1,28.6,78.9,0.8,3.7,1.1,19.2
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,...,45.487,True,65765,88.1,14.9,70.9,2.6,1,8.9,21.1
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,...,37.695,True,45947,87.5,17.3,71.9,11.5,1.2,4.8,15.3
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,...,37.763,True,81294,87,13.2,48.5,6.1,0.5,33.3,15.1
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,...,40.384,True,47791,76.3,16.6,76.5,0.9,1.2,0.9,43.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6091,6651,,2021-02-26,shot and Tasered,gun,,M,,San Antonio,TX,...,29.429,True,46744,81.4,19.8,72.6,6.9,0.9,2.4,63.2
6092,6644,,2021-02-27,shot,undetermined,,M,,Dora,AL,...,33.729,True,40769,82.3,17.2,85.6,12.2,0.3,0,0.7
6093,6645,,2021-02-27,shot,gun,,M,,Sebeka,MN,...,46.616,True,35893,92.6,18,97.9,0.4,0.1,0,0.8
6094,6646,,2021-02-27,shot,undetermined,,M,,Sebeka,MN,...,46.616,True,35893,92.6,18,97.9,0.4,0.1,0,0.8


In [5]:
race_grouping = data.loc[:, ["id", "race"]]
race_grouping = race_grouping.groupby(["race"])["id"].count()
race_grouping = pd.DataFrame(race_grouping)
race_grouping.head(10)

Unnamed: 0_level_0,id
race,Unnamed: 1_level_1
A,100
B,1449
H,1025
N,83
O,47
W,2790


In [6]:
armed_grouping = data.loc[:, ["id", "armed"]]
armed_grouping = armed_grouping.groupby(["armed"])["id"].count()
armed_grouping = pd.DataFrame(armed_grouping)
armed_grouping  = armed_grouping .sort_values("id", ascending=False)
armed_grouping.head()

Unnamed: 0_level_0,id
armed,Unnamed: 1_level_1
gun,3482
knife,893
unarmed,398
toy weapon,205
vehicle,190


In [7]:
reduced_df = data.loc[:, ["id", "race", "armed", "threat_level", "signs_of_mental_illness"]]
reduced_df.head()

Unnamed: 0,id,race,armed,threat_level,signs_of_mental_illness
0,3,A,gun,attack,True
1,4,W,gun,attack,False
2,5,H,unarmed,other,False
3,8,W,toy weapon,attack,True
4,9,H,nail gun,attack,False


In [8]:
reduced_df["threat_level"].value_counts()

attack          3948
other           1892
undetermined     256
Name: threat_level, dtype: int64

In [9]:
reduced_df.loc[reduced_df.race != 'W', 'race'] = "1"
reduced_df.loc[reduced_df.race == 'W', 'race'] = "0"

In [10]:
reduced_df = reduced_df.drop(columns = "id")

In [11]:
reduced_df

Unnamed: 0,race,armed,threat_level,signs_of_mental_illness
0,1,gun,attack,True
1,0,gun,attack,False
2,1,unarmed,other,False
3,0,toy weapon,attack,True
4,1,nail gun,attack,False
...,...,...,...,...
6091,1,gun,attack,False
6092,1,undetermined,undetermined,False
6093,1,gun,attack,False
6094,1,undetermined,attack,False


In [12]:
X = reduced_df.drop("race", axis=1)
y = reduced_df["race"].values.reshape(-1, 1)
print(X.shape, y.shape)

(6096, 3) (6096, 1)


In [13]:
X.loc[X.armed != 'unarmed', 'armed'] = "armed"

In [14]:
X.loc[X.threat_level != 'attack', 'threat_level'] = "other/undefined"

In [15]:
X.loc[X.signs_of_mental_illness != 'True', 'signs_of_mental_illness'] = "False"

  res_values = method(rvalues)


In [16]:
X.loc[X.signs_of_mental_illness == 'True', 'signs_of_mental_illness'] = "True"

In [17]:
X = pd.get_dummies(X)

In [18]:
X

Unnamed: 0,armed_armed,armed_unarmed,threat_level_attack,threat_level_other/undefined,signs_of_mental_illness_False
0,1,0,1,0,1
1,1,0,1,0,1
2,0,1,0,1,1
3,1,0,1,0,1
4,1,0,1,0,1
...,...,...,...,...,...
6091,1,0,1,0,1
6092,1,0,0,1,1
6093,1,0,1,0,1
6094,1,0,1,0,1


In [20]:
X.drop(["armed_armed", "threat_level_other/undefined"], axis=1, inplace=True)

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [22]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)

In [23]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [24]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [25]:
 classifier.fit(X_train_scaled, y_train)

  return f(**kwargs)


LogisticRegression()

In [26]:
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.5402449693788276
Testing Data Score: 0.5485564304461942
