In [1]:
import pandas as pd
from glob import glob
import numpy as np

In [2]:
sht_apd = pd.read_csv('./Raw/shootings-append.csv', low_memory=False)

In [3]:
april_files = glob('./Raw/raw dump April2016/*All*')
april_files

['./Raw/raw dump April2016/All complaints during 2013.xls',
 './Raw/raw dump April2016/All complaints during 2012.xls',
 './Raw/raw dump April2016/All complaints during 2014.xls',
 './Raw/raw dump April2016/All complaints during 2016YTD.xls',
 './Raw/raw dump April2016/All complaints during 2015.xls']

In [115]:
df_list = [pd.read_excel(file) for file in april_files]
april_df = pd.concat(df_list, ignore_index=True)

In [5]:
may_excel = pd.ExcelFile('./Raw/raw dump May2016/IPRA Shootings Data.xls')
may_excel.sheet_names

['qry_II_217_Data163_2016',
 'qry_II_217_Data163_2015',
 'qry_II_217_Data163_2014',
 'qry_II_217_Data163_2013',
 'qry_II_217_2012_Incid',
 'qry_II_217_2012_Parties',
 'qry_II_217_2011_Incid',
 'qry_II_217_2011_Parties',
 'qry_II_217_2010_Incid',
 'qry_II_217_2010_Parties',
 'qry_II_217_2009_Incid',
 'qry_II_217_2009_Parties',
 'qry_II_217_200807_Incid',
 'qry_II_217_200807_Parties',
 'Sheet3']

In [116]:
df_list = [may_excel.parse(sheet) for sheet in may_excel.sheet_names[:-1]]
may_df = pd.concat(df_list, ignore_index=True)

In [16]:
april_complaints = april_df.Complaint_Number.dropna().unique()
may_complaints = may_df.Complaint_Number.dropna().unique()
print(len(april_complaints))
print(len(may_complaints))

7175
361


In [18]:
non_intersect = [complaint for complaint in may_complaints
                 if complaint not in april_complaints]

In [19]:
len(non_intersect)

200

# Cleaning / Combining April and May

In [101]:
clean_april = april_df.copy()
clean_may = may_df.copy()

In [102]:
april_cols = [col.lower().replace(' ', '_') for col in clean_april.columns]
may_cols = [col.lower().replace(' ', '_') for col in clean_may.columns]
clean_april.columns = april_cols
clean_may.columns = may_cols

In April, several of there are multiple fname columns due to misspellings of the column headers. These can be combined, as there are no cases where one column has conflicting information with the other column

In [103]:
# confirming there are no conflicting values
sub_df = clean_april[['accused_officer_fname', 'accusedofficer_fname']]
sub_df = sub_df[(~sub_df.accused_officer_fname.isnull()) & 
                 (~sub_df.accusedofficer_fname.isnull())]
print(sum(sub_df.accused_officer_fname != sub_df.accusedofficer_fname))

sub_df = clean_april[['accused_officer_fname', 'accsued_officer_fname']]
sub_df = sub_df[(~sub_df.accused_officer_fname.isnull()) & 
                (~sub_df.accsued_officer_fname.isnull())]
print(sum(sub_df.accused_officer_fname != sub_df.accsued_officer_fname))

sub_df = clean_april[['accsued_officer_fname', 'accusedofficer_fname']]
sub_df = sub_df[(~sub_df.accsued_officer_fname.isnull()) & 
                (~sub_df.accusedofficer_fname.isnull())]
print(sum(sub_df.accsued_officer_fname != sub_df.accusedofficer_fname))

0
0
0


In [104]:
clean_april.accused_officer_fname.fillna(clean_april.accsued_officer_fname, inplace=True)
clean_april.accused_officer_fname.fillna(clean_april.accusedofficer_fname, inplace=True)
clean_april.drop(['accsued_officer_fname', 'accusedofficer_fname'], axis=1, inplace=True)

In [105]:
april_cols = clean_april.columns.tolist()
april_diff = sorted([col for col in april_cols if col not in may_cols])
may_diff = sorted([col for col in may_cols if col not in april_cols])

for i in range(len(may_diff)):
    try: 
        print('{:40}{}'.format(april_diff[i], may_diff[i]))
    except IndexError:
        print('{:40}{}'.format('NA', may_diff[i]))

accused_officer_fname                   accused_assignment
accused_officer_lname                   accused_fname
accused_star                            accused_lname
accused_unit                            accused_star_no
discipline_code                         closedatipra_datetime
involved_officer_age                    complaint_date
involved_officer_detail                 converted_rec_i
involved_officer_fname                  cr_required
involved_officer_lname                  finding_id
involved_officer_position               iad_ops
involved_officer_race                   invovled_party_age
involved_officer_sex                    invovled_party_assignment
involved_officer_unit                   invovled_party_description
ipra_closed_date                        invovled_party_detail
notification_date                       invovled_party_fname
recommended_number_of_days              invovled_party_lname
NA                                      invovled_party_position
NA           

In [106]:
clean_may.columns = [col.replace('party', 'officer').replace('invovled', 'involved')
                     for col in clean_may.columns]
may_cols = clean_may.columns

In [107]:
april_diff = sorted([col for col in april_cols if col not in may_cols])
may_diff = sorted([col for col in may_cols if col not in april_cols])

for i in range(len(may_diff)):
    try: 
        print('{:40}{}'.format(april_diff[i], may_diff[i]))
    except IndexError:
        print('{:40}{}'.format('NA', may_diff[i]))

accused_officer_fname                   accused_assignment
accused_officer_lname                   accused_fname
accused_star                            accused_lname
accused_unit                            accused_star_no
discipline_code                         closedatipra_datetime
involved_officer_unit                   complaint_date
ipra_closed_date                        converted_rec_i
notification_date                       cr_required
recommended_number_of_days              finding_id
NA                                      iad_ops
NA                                      involved_officer_assignment
NA                                      involved_officer_description
NA                                      involved_officer_type
NA                                      ipra_assign_date_time
NA                                      ipra_investigate_begin_date
NA                                      ipra_investigate_end_date
NA                                      ipra_investigator_

In [108]:
# these two columns seems to be the same as well
print(clean_may.penalty_code.unique())
print(clean_april.discipline_code.unique())

[nan 'VIOLATION NOTED' 'REPRIMAND' 'SUSPENSION' 'SEPARATION']
[nan 'SEPARATION' 'SUSPENSION' 'VIOLATION NOTED' 'REPRIMAND']


In [109]:
# these two columns seems to be the same as well
print(sorted(clean_may.involved_officer_assignment.unique()))
print(sorted(clean_april.involved_officer_unit.unique()))

[2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 20.0, 21.0, 23.0, 45.0, 50.0, 57.0, 116.0, 119.0, 120.0, 121.0, 123.0, 124.0, 126.0, 127.0, 140.0, nan, 1.0, 19.0, 22.0, 24.0, 25.0, 44.0, 113.0, 141.0, 142.0, 151.0, 152.0, 153.0, 180.0, 188.0, 189.0, 191.0, 193.0, 211.0, 212.0, 213.0, 214.0, 215.0, 216.0, 253.0, 277.0, 311.0, 312.0, 313.0, 314.0, 315.0, 341.0, 353.0, 384.0, 393.0, 477.0, 601.0, 603.0, 606.0, 610.0, 620.0, 630.0, 640.0, 701.0, 704.0]
[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 22.0, 24.0, 25.0, 26.0, 44.0, 45.0, 50.0, 51.0, 59.0, 79.0, 86.0, 113.0, 116.0, 119.0, 121.0, 123.0, 124.0, 126.0, 127.0, 135.0, 140.0, 141.0, 142.0, 145.0, 153.0, 163.0, 166.0, 171.0, 177.0, 180.0, 189.0, 191.0, 192.0, 193.0, 196.0, 211.0, 212.0, 213.0, 214.0, 215.0, 261.0, 277.0, 311.0, 312.0, 313.0, 314.0, 315.0, 341.0, 353.0, 377.0, 384.0, 393.0, 441.0, 477.0, 606.0, 608.0, 61

In [110]:
# these two columns seems to be the same as well
print(sorted(clean_may.accused_assignment.unique()))
print(sorted(clean_april.accused_unit.unique()))

[nan, 1.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 18.0, 171.0, 177.0, 189.0, 312.0, 610.0, 620.0]
[nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 24.0, 25.0, 26.0, 44.0, 45.0, 50.0, 51.0, 55.0, 59.0, 111.0, 121.0, 123.0, 124.0, 126.0, 141.0, 145.0, 163.0, 166.0, 167.0, 171.0, 177.0, 180.0, 184.0, 189.0, 191.0, 192.0, 193.0, 196.0, 214.0, 277.0, 311.0, 312.0, 313.0, 314.0, 315.0, 341.0, 353.0, 376.0, 377.0, 384.0, 393.0, 442.0, 542.0, 603.0, 606.0, 608.0, 610.0, 620.0, 630.0, 640.0, 650.0, 701.0, 704.0]


In [111]:
replacements = {'accused_fname': 'accused_officer_fname',
                'accused_lname': 'accused_officer_lname',
                'accused_star_no': 'accused_star',
                'no_of_days': 'recommended_number_of_days', 
                'closedatipra_datetime': 'ipra_closed_date',
                'penalty_code': 'discipline_code', 
                'involved_officer_assignment': 'involved_officer_unit',
                'accused_assignment': 'accused_unit'}

clean_may.rename(columns = replacements, inplace=True)

In [112]:
may_cols = clean_may.columns.tolist()
april_diff = sorted([col for col in april_cols if col not in may_cols])
may_diff = sorted([col for col in may_cols if col not in april_cols])

for i in range(len(may_diff)):
    try: 
        print('{:40}{}'.format(april_diff[i], may_diff[i]))
    except IndexError:
        print('{:40}{}'.format('NA', may_diff[i]))

notification_date                       complaint_date
NA                                      converted_rec_i
NA                                      cr_required
NA                                      finding_id
NA                                      iad_ops
NA                                      involved_officer_description
NA                                      involved_officer_type
NA                                      ipra_assign_date_time
NA                                      ipra_investigate_begin_date
NA                                      ipra_investigate_end_date
NA                                      ipra_investigator_type
NA                                      penalty_status
NA                                      police_shooting
NA                                      police_shooting_no
NA                                      report_status


#### At this point it's unclear to me what column in the May dataset the "notification_date" column in the April dataset might correspond to

#### As far as I can tell, "accused_unit" and "involved_officer_unit" are not the same

In [113]:
# there are cases where the accused_unit and involved_officer_unit are both not nan, 
# but are also not equal
sub_df = clean_april[['accused_unit', 'involved_officer_unit']]
sub_df = sub_df[(~sub_df.accused_unit.isnull()) & 
                 (~sub_df.involved_officer_unit.isnull())]
print(sum(sub_df.accused_unit != sub_df.involved_officer_unit))

sub_df = clean_may[['accused_unit', 'involved_officer_unit']]
sub_df2 = sub_df[(~sub_df.accused_unit.isnull()) & 
                 (~sub_df.involved_officer_unit.isnull())]
print(sum(sub_df2.accused_unit != sub_df2.involved_officer_unit))

9141
1527


In [117]:
combined_df = pd.concat([clean_april, clean_may], ignore_index=True)

# Exploring Data

In [120]:
combined_df['full_name'] = combined_df.accused_officer_fname + \
                           '_' + combined_df.accused_officer_lname

In [124]:
officer_complaints = combined_df.groupby('full_name').complaint_number.unique()\
                                .reset_index()

In [148]:
officer_complaints['complaint_counts'] = officer_complaints.complaint_number\
                                                           .apply(lambda x: len(x))

In [153]:
officer_complaints.sort_values(by='complaint_counts', ascending=False)

Unnamed: 0,full_name,complaint_number,complaint_counts
165,ANTHONY_ROSEN,"[1062800.0, 1066667.0, 1053667.0, 1053730.0, 1...",10
1255,JESSE_RODRIGUEZ,"[1061127.0, 1064485.0, 1053629.0, 1056779.0, 1...",9
2615,SAMUEL_TRUESDALE,"[1061304.0, 1061698.0, 1062683.0, 1064009.0, 1...",8
2481,ROBERT_LOBIANCO,"[1063527.0, 1063935.0, 1066371.0, 1054352.0, 1...",8
106,ANDREW_KEMPS,"[1062630.0, 1064964.0, 1057286.0, 1058817.0, 1...",8
2641,SEAN_CAMPBELL,"[1061897.0, 1066578.0, 1054138.0, 1054349.0, 1...",8
1010,HENRY_MORRISON JR,"[1059748.0, 1060890.0, 1062894.0, 1065619.0, 1...",8
236,BARTHOLOM_MURPHY,"[1059948.0, 1061097.0, 1064774.0, 1054788.0, 1...",7
2090,MICHELLE_MURPHY,"[1059574.0, 1060928.0, 1062095.0, 1062377.0, 1...",7
1629,KIRKLAND_CROSSLEY,"[1064357.0, 1057473.0, 1069691.0, 1071044.0, 1...",7


In [214]:
officer_complaints.to_csv('./officer_complaints.csv')

In [210]:
complaint_nums_off = combined_df.groupby('complaint_number').full_name.unique()\
                                .reset_index()
complaint_nums_off['counts'] = complaint_nums_off.full_name\
                                                 .apply(lambda x: len(x))

complaint_nums_off.sort_values(by='counts', ascending=False)

Unnamed: 0,complaint_number,full_name,counts
1085,1055981.0,"[COLIN_MACNIFF, CORY_PETRACCO, DANIEL_FELICIAN...",22
4953,1070415.0,"[nan, ABDULLAH_BEYAH, ADAM_PURICELLI, BRIAN_MC...",16
825,1055068.0,"[ADOLFO_GARCIA, ANNE_ZAMZOW, BENJAMEN_FERN, CH...",14
4074,1067139.0,"[nan, BRIAN_BLACKMAN, DANIEL_SAKO, DARRELL_DAV...",13
3875,1066370.0,"[nan, ADAM_DAOUD, CHARLES_ARTZ, DANIEL_KASPER,...",12
1127,1056113.0,"[nan, ALFRED_CARUSO, ANTHONY_BABICZ, ARLETTA_K...",12
4058,1067084.0,"[nan, ANTHONY_ROSEN, EDWARD_LEIGHTON, ERNEST_C...",12
5586,1072910.0,"[ANTHONY_BABICZ, BRIAN_KINNANE, JAMES_CWICK, J...",12
6962,1078329.0,"[BRIAN_HOOD, CARLOS_MOSTEK, CORDY_FOUCH JR, DA...",12
4134,1067407.0,"[nan, CHRISTOPH_FINDYSZ, EDWARD_NICOL, FRANK_Q...",11
