In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Compare killings by police vs intentional homicide rate, across countries

In [None]:
# Input CSVs for the cross-country comparison. Each path is listed together
# with the web page it appears to have been taken from (judging by the URL).

# Intentional homicide rate by country:
#   https://www.indexmundi.com/facts/indicators/VC.IHR.PSRC.P5/rankings
homicide_path = "homicide_data_by_country.csv"

# Killings by police, by country:
#   https://worldpopulationreview.com/country-rankings/police-killings-by-country
shooting_by_police_path = "police_killings_by_country.csv"

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the per-country homicide table and the per-country police-killings table.
h = pd.read_csv(homicide_path)
s = pd.read_csv(shooting_by_police_path)

# Align the join key: only the exact 'country' column is renamed, so other
# columns whose names merely contain "country" are left untouched.
s = s.rename(columns={'country': 'Country'})

# Keep only countries present in both tables.
data_by_country = pd.merge(h, s, on='Country', how='inner')

# Log-transform both rates once, for the rows where a logarithm is defined.
# Rows with a zero, negative or missing rate would give -inf/NaN coordinates,
# which cannot be drawn or annotated, so they are left out of the figure only;
# data_by_country itself is not modified.
country_rate_cols = ['homicide rate', 'ratePer10M']
country_plottable = (data_by_country[country_rate_cols] > 0).all(axis=1)
country_log_rates = np.log(data_by_country.loc[country_plottable, country_rate_cols])
country_labels = data_by_country.loc[country_plottable, 'Country']

fig, ax = plt.subplots(figsize=(20, 12))
ax.scatter(x=country_log_rates['homicide rate'],
           y=country_log_rates['ratePer10M'])

# Label every point with its country name. Iterating over the values directly
# avoids relying on the frame having a 0..n-1 index.
for txt, x_pos, y_pos in zip(country_labels,
                             country_log_rates['homicide rate'],
                             country_log_rates['ratePer10M']):
    ax.annotate(txt, (x_pos, y_pos))

ax.set_xlabel('log(homicide rate)')
ax.set_ylabel('log(ratePer10M)')
ax.set_title('Killings by police vs intentional homicide rate, by country (log-log)')

fig.savefig('pofat_vs_hom_by_country.png')

In [None]:
# Correlation matrix of the two untransformed country-level rates.
data_by_country.loc[:, ['homicide rate', 'ratePer10M']].corr()

In [None]:
# Correlation matrix of the same two country-level rates after a natural-log transform.
data_by_country[['homicide rate', 'ratePer10M']].apply(np.log).corr()

# Same comparison across states

In [None]:

# Full name -> two-letter code for US states, the District of Columbia and
# the territories. The final entry maps 'United States' to the code 'ALL'.
us_state_abbrev = dict((
    ("Alabama", "AL"), ("Alaska", "AK"), ("American Samoa", "AS"),
    ("Arizona", "AZ"), ("Arkansas", "AR"), ("California", "CA"),
    ("Colorado", "CO"), ("Connecticut", "CT"), ("Delaware", "DE"),
    ("District of Columbia", "DC"), ("Florida", "FL"), ("Georgia", "GA"),
    ("Guam", "GU"), ("Hawaii", "HI"), ("Idaho", "ID"),
    ("Illinois", "IL"), ("Indiana", "IN"), ("Iowa", "IA"),
    ("Kansas", "KS"), ("Kentucky", "KY"), ("Louisiana", "LA"),
    ("Maine", "ME"), ("Maryland", "MD"), ("Massachusetts", "MA"),
    ("Michigan", "MI"), ("Minnesota", "MN"), ("Mississippi", "MS"),
    ("Missouri", "MO"), ("Montana", "MT"), ("Nebraska", "NE"),
    ("Nevada", "NV"), ("New Hampshire", "NH"), ("New Jersey", "NJ"),
    ("New Mexico", "NM"), ("New York", "NY"), ("North Carolina", "NC"),
    ("North Dakota", "ND"), ("Northern Mariana Islands", "MP"), ("Ohio", "OH"),
    ("Oklahoma", "OK"), ("Oregon", "OR"), ("Pennsylvania", "PA"),
    ("Puerto Rico", "PR"), ("Rhode Island", "RI"), ("South Carolina", "SC"),
    ("South Dakota", "SD"), ("Tennessee", "TN"), ("Texas", "TX"),
    ("Utah", "UT"), ("Vermont", "VT"), ("Virgin Islands", "VI"),
    ("Virginia", "VA"), ("Washington", "WA"), ("West Virginia", "WV"),
    ("Wisconsin", "WI"), ("Wyoming", "WY"), ("United States", "ALL"),
))

In [None]:
# Input CSVs for the US state-level comparison. Each path is listed together
# with the web page it appears to have been taken from (judging by the URL).

# Police-involved fatalities, one record per incident:
#   https://data.world/awram/us-police-involved-fatalities
police_fatality_wapo_p = "fatal-police-shootings-data-wapo.csv"

# Intentional homicide figures by state:
#   https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_intentional_homicide_rate
homicide_by_state_p = "homicide_by_state_wikipedia.csv"

In [None]:
# state abbreviation dictionary: https://gist.github.com/rogerallen/1583593

# Years aggregated on both sides of the comparison. Column labels are strings.
comp_years = ['2015', '2016', '2017', '2018', '2019']

# --- Police fatalities: incident records -> incidents per state and year ---
police_fatality = pd.read_csv(police_fatality_wapo_p, parse_dates=['date'])
police_fatality['year'] = police_fatality['date'].dt.year
# size() counts rows per (year, state) group regardless of missing values in
# any particular column.
gb = police_fatality.groupby(['year', 'state']).size()

# One row per state, one column per year.
po_fat = gb.unstack(['year'])
# Year labels become plain strings such as '2015'. Going through int() keeps
# the label stable even when the year column was parsed as float (which
# happens when some dates are missing and the label would otherwise be '2015.0').
po_fat.columns = [str(int(elem)) for elem in po_fat.columns]
po_fat = po_fat.reset_index()
# State/year combinations with no incidents are NaN and are skipped by sum().
po_fat['comp_years_pofat'] = po_fat[comp_years].sum(axis=1)

# --- Homicide table by state ---
h_by_s = pd.read_csv(homicide_by_state_p)
h_by_s['state'] = h_by_s['state'].str.strip()
# Strip thousands separators. astype(str) first so the step also works when
# the column was already parsed as integers.
h_by_s['Murder Count'] = (h_by_s['Murder Count']
                          .astype(str)
                          .str.replace(',', '', regex=False)
                          .astype(int))
# Murder count divided by the '2019' column.
# NOTE(review): if '2019' holds a per-capita homicide rate and 'Murder Count'
# is the 2019 count, this is a population-proportional scale factor -- confirm.
h_by_s['relpop'] = h_by_s['Murder Count'] / h_by_s['2019']

# Translate full state names to abbreviations so they match the police
# fatality table. Unknown names still raise KeyError, but the error now
# lists every offending name at once.
state_codes = h_by_s['state'].map(us_state_abbrev)
unmapped_states = h_by_s.loc[state_codes.isna(), 'state']
if not unmapped_states.empty:
    raise KeyError('no abbreviation for state name(s): '
                   + ', '.join(sorted(unmapped_states.astype(str).unique())))
h_by_s['state'] = state_codes
h_by_s['comp_years_homicide'] = h_by_s[comp_years].sum(axis=1)

# Keep only states present in both tables.
data_by_state = pd.merge(h_by_s[['state', 'comp_years_homicide', 'relpop']],
                         po_fat[['state', 'comp_years_pofat']],
                         on='state',
                         how='inner')
# Police fatalities scaled by the same factor as the homicide figures.
data_by_state['pofat_rate'] = data_by_state['comp_years_pofat'] / data_by_state['relpop']

# Log-transform both quantities once, for rows where a logarithm is defined.
# Rows with zero, negative or missing values would give -inf/NaN coordinates,
# so they are left out of the figure only; data_by_state itself is not modified.
state_rate_cols = ['comp_years_homicide', 'pofat_rate']
state_plottable = (data_by_state[state_rate_cols] > 0).all(axis=1)
state_log_rates = np.log(data_by_state.loc[state_plottable, state_rate_cols])
state_labels = data_by_state.loc[state_plottable, 'state']

fig, ax = plt.subplots(figsize=(20, 12))
ax.scatter(x=state_log_rates['comp_years_homicide'],
           y=state_log_rates['pofat_rate'])

# Label every point with its state abbreviation. Iterating over the values
# directly avoids relying on the frame having a 0..n-1 index.
for txt, x_pos, y_pos in zip(state_labels,
                             state_log_rates['comp_years_homicide'],
                             state_log_rates['pofat_rate']):
    ax.annotate(txt, (x_pos, y_pos))

ax.set_xlabel('log(comp_years_homicide)')
ax.set_ylabel('log(pofat_rate)')
ax.set_title('Police fatalities vs homicide by US state, 2015-2019 (log-log)')

fig.savefig('pofat_vs_hom_by_US_state.png')

In [None]:
# Correlation matrix of the two untransformed state-level quantities.
data_by_state.loc[:, ['comp_years_homicide', 'pofat_rate']].corr()

In [None]:
# Correlation matrix of the same two state-level quantities after a natural-log transform.
data_by_state[['pofat_rate', 'comp_years_homicide']].apply(np.log).corr()