In [1]:
import pandas as pd
from glob import glob
import csv
import numpy as np

In [2]:
files = sorted(glob('./Raw/raw dump June 2016/*.xlsx'))
files

['./Raw/raw dump June 2016/P046957 - report 1 - all complaints in time frame.xlsx',
 './Raw/raw dump June 2016/P046957 - report 2 - identified accused xi.xlsx',
 './Raw/raw dump June 2016/P046957 - report 3 - police officer witness data xi.xlsx',
 './Raw/raw dump June 2016/P046957 - report 4 - victim data.xlsx',
 './Raw/raw dump June 2016/P046957 - report 5 - complainant (reporting party) data.xlsx']

### P046957 - report 1 - all complaints in time frame.xlsx

** Questions **
- For some of the Investigator rows there is a datetime in Unnamed: 6 - what is this?
- What is the column of values labeled as "Unnamed: 5" for the Investigator

In [3]:
df = pd.read_excel(files[0], skiprows=9)

In [4]:
df.dropna(how='all', inplace=True)
df = df[df['Location Code:'] != 'end of record'].copy()

In [5]:
df['Number:'].fillna(method='ffill', inplace=True)
df['Number:'] = df['Number:'].astype(int)

In [6]:
invest_df = df[df['Beat:'] == 'Investigator with Current Assignment and Rank:'].copy()
df = df[df['Beat:'] != 'Investigator with Current Assignment and Rank:'].copy()

In [7]:
df = df.replace('----', float('nan')).replace('-----', float('nan'))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17045 entries, 0 to 50111
Data columns (total 13 columns):
Number:                 17045 non-null int64
Beat:                   16366 non-null object
Location Code:          16529 non-null object
Address of Incident:    12628 non-null object
Unnamed: 4              12877 non-null object
Unnamed: 5              824 non-null object
Unnamed: 6              13615 non-null object
Incident Date & Time    16531 non-null datetime64[ns]
Complaint Date          16531 non-null datetime64[ns]
Closed Date             14753 non-null datetime64[ns]
Unnamed: 10             0 non-null float64
Unnamed: 11             0 non-null float64
Unnamed: 12             0 non-null float64
dtypes: datetime64[ns](3), float64(3), int64(1), object(6)
memory usage: 1.8+ MB


In [8]:
df.drop(['Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12'], axis=1, inplace=True)

In [9]:
df.head(1)

Unnamed: 0,Number:,Beat:,Location Code:,Address of Incident:,Unnamed: 4,Unnamed: 5,Unnamed: 6,Incident Date & Time,Complaint Date,Closed Date
0,106213,1631,17,3700,N HARLEM AVE,,CHICAGO IL 60634,2015-07-19 21:00:00,2015-07-20,2015-09-21


In [10]:
df.columns = ['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street', 'Apartment', 
              'City_State_Zipcode', 'Incident_Datetime', 'Complaint_Date', 'Closed_Date']
df.head(1)

Unnamed: 0,Complaint_Number,Beat,Location_Code,Address,Street,Apartment,City_State_Zipcode,Incident_Datetime,Complaint_Date,Closed_Date
0,106213,1631,17,3700,N HARLEM AVE,,CHICAGO IL 60634,2015-07-19 21:00:00,2015-07-20,2015-09-21


In [12]:
invest_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16404 entries, 1 to 50085
Data columns (total 13 columns):
Number:                 16404 non-null int64
Beat:                   16404 non-null object
Location Code:          16278 non-null object
Address of Incident:    16278 non-null object
Unnamed: 4              16278 non-null object
Unnamed: 5              11228 non-null object
Unnamed: 6              15714 non-null object
Incident Date & Time    0 non-null datetime64[ns]
Complaint Date          0 non-null datetime64[ns]
Closed Date             0 non-null datetime64[ns]
Unnamed: 10             0 non-null float64
Unnamed: 11             0 non-null float64
Unnamed: 12             0 non-null float64
dtypes: datetime64[ns](3), float64(3), int64(1), object(6)
memory usage: 1.8+ MB


In [18]:
invest_df.dropna(how='all', axis=1, inplace=True)
invest_df.drop('Beat:', axis=1, inplace=True)

In [19]:
invest_df.head()

Unnamed: 0,Number:,Location Code:,Address of Incident:,Unnamed: 4,Unnamed: 5,Unnamed: 6
1,106213,"KLIMAS, ROBERT",121,COMMANDER,0.0,2008-08-04 00:00:00
16,1038595,"DAUN, SHERRY",113,SUPERVISING INV IPRA,,2008-12-01 00:00:00
19,1039179,"JONES, VINCENT",113,INVESTIGATOR 2 IPRA,,
22,1053492,"MC GUIRE, TERRENCE",3,LIEUTENANT OF POLICE,376.0,1991-11-18 00:00:00
25,1053493,"BREIMON, GERALD",14,SERGEANT OF POLICE,869.0,1993-11-22 00:00:00


** Unnamed: 5 and 6 **

Looking at the full list of CPD officers it seems that Unnamed: 5 correspond to the officer's star number, while 6 corresponds to his/her appointment date. I wanted to confirm this, though, so I went ahead and looked at the subset of investigator names that appear in the full list of CPD officers

In [16]:
sworn = pd.read_csv('./all_sworn_officer.csv')
sworn_names = (sworn.LAST_NME + ', ' + sworn.FIRST_NME).tolist()
mask = invest_df['Location Code:'].drop_duplicates().apply(lambda x: x in sworn_names)
matches = invest_df.drop_duplicates(subset='Location Code:')[mask]

#the year value in the APPOINTED_DATE column is ambiguous, so it needs to be fixed
def fix_time(x):
    split = x.split('-')
    if split[-1][0] == '0' or split[-1][0] == '1':
        split[-1] = '20' + split[-1]
    else:
        split[-1] = '19' + split[-1]
    return '-'.join(split)

sworn.APPOINTED_DATE = pd.to_datetime(sworn.APPOINTED_DATE.astype(str).apply(fix_time), 
                                      errors='coerce')
matches['Unnamed: 6'] = matches['Unnamed: 6'].dt.normalize()

def which_matches(x):
    lname = x['Location Code:'].values[0].split(', ')[0]
    fname = x['Location Code:'].values[0].split(', ')[1]
    
    sub_sworn = sworn[(sworn.LAST_NME==lname)&(sworn.FIRST_NME==fname)]
    
    star = x['Unnamed: 5'].values[0]
    date = x['Unnamed: 6'].values[0]
    star_matches = False
    date_matches = False
    sworn_stars = ['STAR%s'%num for num in range(1, 11)]
    if star in sub_sworn[sworn_stars].dropna(axis=1).values:
        star_matches = True  
    if date in sub_sworn.APPOINTED_DATE.values:
        date_matches = True
    
    if star_matches and date_matches:
        return 'Both'
    elif star_matches or date_matches:
        return "One"
    else:
        return "Neither"
    
matches.dropna().groupby(level=0).apply(which_matches).value_counts()

Both    1050
One      154
dtype: int64

In [20]:
invest_df.columns = ['Complaint_Number', 'Investigator_Name', 
                     'Investigator_Current_Assignment', 'Investigator_Rank', 
                     'Investigator_Star', 'Investigator_Appointed_Date']

In [21]:
report1 = pd.merge(df, invest_df, on='Complaint_Number', how='outer')

### P046957 - report 2 - identified accused xi.xlsx

** Questions **
- Does "Unnamed: 11" and "Unnamed: 13" correspond to "Recommended Discipline" and "Discipline" (columns to the left of those)?

In [22]:
df = pd.read_excel(files[1], skiprows=8)

In [23]:
df.head()

Unnamed: 0,Number:,Accused:,Birth Yr:,Gender:,Race Code:,Date of Appt:,Current Unit:,Current Rank:,Star:,Complaint Category,Finding & Recommended Discipline,Unnamed: 11,Final Finding & Discipline,Unnamed: 13
0,107901.0,,,,,NaT,,,,,,,,
1,,"GONZALES, ROBIN",1981,F,S,2008-04-28,18.0,PO,5137.0,,,,,
2,,,end of record,,,NaT,,,,,,,,
3,108026.0,,,,,NaT,,,,,,,,
4,,"BECKER, JOHN",1970,M,WHI,2000-01-24,17.0,PO,4734.0,,,,,


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38571 entries, 0 to 38570
Data columns (total 14 columns):
Number:                             10086 non-null float64
Accused:                            18399 non-null object
Birth Yr:                           28485 non-null object
Gender:                             18399 non-null object
Race Code:                          18399 non-null object
 Date of Appt:                      18399 non-null datetime64[ns]
Current Unit:                       18399 non-null float64
Current Rank:                       18297 non-null object
Star:                               17371 non-null float64
Complaint Category                  16945 non-null object
Finding &
Recommended Discipline    6295 non-null object
Unnamed: 11                         15234 non-null float64
Final Finding & 
Discipline         5892 non-null object
Unnamed: 13                         14818 non-null float64
dtypes: datetime64[ns](1), float64(5), object(8)
memory usage: 4.1+ 

In [25]:
df['Number:'].fillna(method='ffill', inplace=True)
df['Number:'] = df['Number:'].astype(int)
df.head()

Unnamed: 0,Number:,Accused:,Birth Yr:,Gender:,Race Code:,Date of Appt:,Current Unit:,Current Rank:,Star:,Complaint Category,Finding & Recommended Discipline,Unnamed: 11,Final Finding & Discipline,Unnamed: 13
0,107901,,,,,NaT,,,,,,,,
1,107901,"GONZALES, ROBIN",1981,F,S,2008-04-28,18.0,PO,5137.0,,,,,
2,107901,,end of record,,,NaT,,,,,,,,
3,108026,,,,,NaT,,,,,,,,
4,108026,"BECKER, JOHN",1970,M,WHI,2000-01-24,17.0,PO,4734.0,,,,,


In [26]:
subset = df.columns.tolist()[1:]
df.dropna(subset=subset, how='all', inplace=True)
df = df[df['Birth Yr:'] != 'end of record']
df.head()

Unnamed: 0,Number:,Accused:,Birth Yr:,Gender:,Race Code:,Date of Appt:,Current Unit:,Current Rank:,Star:,Complaint Category,Finding & Recommended Discipline,Unnamed: 11,Final Finding & Discipline,Unnamed: 13
1,107901,"GONZALES, ROBIN",1981,F,S,2008-04-28,18.0,PO,5137.0,,,,,
4,108026,"BECKER, JOHN",1970,M,WHI,2000-01-24,17.0,PO,4734.0,,,,,
5,108026,"MARKHAM, SEAN",1971,M,I,2000-06-19,17.0,PO,19054.0,,,,,
8,1038595,"WELLS, OTIS",1967,M,BLK,2007-04-02,6.0,PO,5385.0,05D-NO ARREST,EX,600.0,EX,600.0
9,1038595,"MCCLAY, CHARLES",1983,M,BLK,2007-10-29,3.0,PO,4735.0,05D-NO ARREST,EX,600.0,EX,600.0


In [27]:
df.columns = [col.strip().replace(':', '').replace(' ', '_') for col in df.columns]
df.rename(columns={'Number': 'Complaint_Number',
                   'Accused': 'Name',
                   'Finding_&\nRecommended_Discipline': 'Finding',
                   'Unnamed_11': 'Recommended_Discipline',
                   'Final_Finding_&_\nDiscipline': 'Final_Finding',
                   'Unnamed_13': 'Discipline'}, inplace=True)
df.head()

Unnamed: 0,Complaint_Number,Name,Birth_Yr,Gender,Race_Code,Date_of_Appt,Current_Unit,Current_Rank,Star,Complaint_Category,Finding,Recommended_Discipline,Final_Finding,Discipline
1,107901,"GONZALES, ROBIN",1981,F,S,2008-04-28,18.0,PO,5137.0,,,,,
4,108026,"BECKER, JOHN",1970,M,WHI,2000-01-24,17.0,PO,4734.0,,,,,
5,108026,"MARKHAM, SEAN",1971,M,I,2000-06-19,17.0,PO,19054.0,,,,,
8,1038595,"WELLS, OTIS",1967,M,BLK,2007-04-02,6.0,PO,5385.0,05D-NO ARREST,EX,600.0,EX,600.0
9,1038595,"MCCLAY, CHARLES",1983,M,BLK,2007-10-29,3.0,PO,4735.0,05D-NO ARREST,EX,600.0,EX,600.0


In [28]:
report2 = df.copy()

### P046957 - report 3 - police officer witness data xi.xlsx

In [31]:
df = pd.read_excel(files[2], skiprows=9)
df.head()

Unnamed: 0.1,Unnamed: 0,Gender,Race,Star,Birth Year,Date Appointed,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,Number:,1053502,,,,NaT,,,,,,
1,"HARRIS, KAL",M,BLK,14236.0,1974.0,2001-05-29,,,,,,
2,,,end of record,,,NaT,,,,,,
3,Number:,1053509,,,,NaT,,,,,,
4,"BUKOWSKIBUS, GEORGE",M,WHI,11982.0,1966.0,1996-12-02,,,,,,


In [32]:
df['Complaint_Number'] = df[df['Unnamed: 0']=='Number:']['Gender']

In [33]:
cols = ['Complaint_Number'] + [col for col in df.columns if col != 'Complaint_Number']
df = df[cols].copy()
df.head()

Unnamed: 0.1,Complaint_Number,Unnamed: 0,Gender,Race,Star,Birth Year,Date Appointed,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,1053502.0,Number:,1053502,,,,NaT,,,,,,
1,,"HARRIS, KAL",M,BLK,14236.0,1974.0,2001-05-29,,,,,,
2,,,,end of record,,,NaT,,,,,,
3,1053509.0,Number:,1053509,,,,NaT,,,,,,
4,,"BUKOWSKIBUS, GEORGE",M,WHI,11982.0,1966.0,1996-12-02,,,,,,


In [34]:
df.Complaint_Number.fillna(method='ffill', inplace=True)
df = df[(df['Unnamed: 0'] != 'Number:') & (df['Race'] != 'end of record')]
df.head()

Unnamed: 0.1,Complaint_Number,Unnamed: 0,Gender,Race,Star,Birth Year,Date Appointed,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
1,1053502,"HARRIS, KAL",M,BLK,14236.0,1974.0,2001-05-29,,,,,,
4,1053509,"BUKOWSKIBUS, GEORGE",M,WHI,11982.0,1966.0,1996-12-02,,,,,,
5,1053509,"HEINICHEN, WALTER",M,WHI,410.0,1968.0,1996-12-02,,,,,,
6,1053509,"LAMEKA, MARGARET",F,WHI,,1958.0,1986-06-16,,,,,,
9,1053545,"SCALES, MONICA",F,BLK,15769.0,1968.0,2006-09-25,,,,,,


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3432 entries, 1 to 6372
Data columns (total 13 columns):
Complaint_Number    3432 non-null object
Unnamed: 0          3432 non-null object
Gender              3432 non-null object
Race                3432 non-null object
Star                3220 non-null object
Birth Year          3415 non-null float64
Date Appointed      3415 non-null datetime64[ns]
Unnamed: 6          0 non-null float64
Unnamed: 7          0 non-null float64
Unnamed: 8          0 non-null float64
Unnamed: 9          0 non-null float64
Unnamed: 10         0 non-null float64
Unnamed: 11         0 non-null float64
dtypes: datetime64[ns](1), float64(7), object(5)
memory usage: 375.4+ KB


In [36]:
df.drop(['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 
         'Unnamed: 10', 'Unnamed: 11'], axis=1, inplace=True)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3432 entries, 1 to 6372
Data columns (total 7 columns):
Complaint_Number    3432 non-null object
Unnamed: 0          3432 non-null object
Gender              3432 non-null object
Race                3432 non-null object
Star                3220 non-null object
Birth Year          3415 non-null float64
Date Appointed      3415 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 214.5+ KB


In [38]:
df.rename(columns={'Unnamed: 0': 'Name'}, inplace=True)
df.head()

Unnamed: 0,Complaint_Number,Name,Gender,Race,Star,Birth Year,Date Appointed
1,1053502,"HARRIS, KAL",M,BLK,14236.0,1974.0,2001-05-29
4,1053509,"BUKOWSKIBUS, GEORGE",M,WHI,11982.0,1966.0,1996-12-02
5,1053509,"HEINICHEN, WALTER",M,WHI,410.0,1968.0,1996-12-02
6,1053509,"LAMEKA, MARGARET",F,WHI,,1958.0,1986-06-16
9,1053545,"SCALES, MONICA",F,BLK,15769.0,1968.0,2006-09-25


In [39]:
report3 = df.copy()

### P046957 - report 4 - victim data.xlsx

In [40]:
df = pd.read_excel(files[3], skiprows=7)
df.head()

Unnamed: 0,Number,Unnamed: 1,Gender,Age,Unnamed: 4,Race Desc,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,1039179.0,,,,,,,,,,,,
1,,,F,25.832877,,BLACK,,,,,,,
2,,,M,38.0,,BLACK,,,,,,,
3,,,,,,end of record,,,,,,,
4,1053502.0,,,,,,,,,,,,


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19821 entries, 0 to 19820
Data columns (total 13 columns):
Number         6271 non-null float64
Unnamed: 1     0 non-null float64
Gender         7279 non-null object
Age            5889 non-null float64
Unnamed: 4     0 non-null float64
Race Desc      13550 non-null object
Unnamed: 6     0 non-null float64
Unnamed: 7     0 non-null float64
Unnamed: 8     0 non-null float64
Unnamed: 9     0 non-null float64
Unnamed: 10    0 non-null float64
Unnamed: 11    0 non-null float64
Unnamed: 12    0 non-null float64
dtypes: float64(11), object(2)
memory usage: 2.0+ MB


In [42]:
df.dropna(axis=1, how='all', inplace=True)
df.head()

Unnamed: 0,Number,Gender,Age,Race Desc
0,1039179.0,,,
1,,F,25.832877,BLACK
2,,M,38.0,BLACK
3,,,,end of record
4,1053502.0,,,


In [43]:
df['Number'].fillna(method='ffill', inplace=True)
df['Number'] = df['Number'].astype(int)
df.head()

Unnamed: 0,Number,Gender,Age,Race Desc
0,1039179,,,
1,1039179,F,25.832877,BLACK
2,1039179,M,38.0,BLACK
3,1039179,,,end of record
4,1053502,,,


In [44]:
subset = df.columns.tolist()[1:]
df.dropna(subset=subset, how='all', inplace=True)
df.head()

Unnamed: 0,Number,Gender,Age,Race Desc
1,1039179,F,25.832877,BLACK
2,1039179,M,38.0,BLACK
3,1039179,,,end of record
5,1053502,F,,BLACK
6,1053502,,,end of record


In [45]:
df = df[df['Race Desc'] != 'end of record']
df.head()

Unnamed: 0,Number,Gender,Age,Race Desc
1,1039179,F,25.832877,BLACK
2,1039179,M,38.0,BLACK
5,1053502,F,,BLACK
8,1053505,F,18.378082,BLACK
11,1053509,F,42.747945,WHITE HISPANIC


In [46]:
df.rename(columns={'Number': 'Complaint_Number', 'Race Desc': 'Race_Desc'}, inplace=True)
df.head()

Unnamed: 0,Complaint_Number,Gender,Age,Race_Desc
1,1039179,F,25.832877,BLACK
2,1039179,M,38.0,BLACK
5,1053502,F,,BLACK
8,1053505,F,18.378082,BLACK
11,1053509,F,42.747945,WHITE HISPANIC


In [47]:
report4 = df.copy()

### P046957 - report 5 - complainant (reporting party) data

In [48]:
df = pd.read_excel(files[4], skiprows=7)
df.head()

Unnamed: 0,Number,Unnamed: 1,Gender,Age,Unnamed: 4,Race Desc,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,1038595.0,,,,,,,,,,,
1,,,F,63.805479,,WHITE,,,,,,
2,,,,,,end of record,,,,,,
3,1039179.0,,,,,,,,,,,
4,,,M,50.449315,,BLACK,,,,,,


In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39917 entries, 0 to 39916
Data columns (total 12 columns):
Number         13238 non-null float64
Unnamed: 1     0 non-null float64
Gender         13441 non-null object
Age            11675 non-null float64
Unnamed: 4     0 non-null float64
Race Desc      26679 non-null object
Unnamed: 6     0 non-null float64
Unnamed: 7     0 non-null float64
Unnamed: 8     0 non-null float64
Unnamed: 9     0 non-null float64
Unnamed: 10    0 non-null float64
Unnamed: 11    0 non-null float64
dtypes: float64(10), object(2)
memory usage: 3.7+ MB


In [50]:
df.dropna(axis=1, how='all', inplace=True)
df.head()

Unnamed: 0,Number,Gender,Age,Race Desc
0,1038595.0,,,
1,,F,63.805479,WHITE
2,,,,end of record
3,1039179.0,,,
4,,M,50.449315,BLACK


In [51]:
df['Number'].fillna(method='ffill', inplace=True)
df['Number'] = df['Number'].astype(int)
df.head()

Unnamed: 0,Number,Gender,Age,Race Desc
0,1038595,,,
1,1038595,F,63.805479,WHITE
2,1038595,,,end of record
3,1039179,,,
4,1039179,M,50.449315,BLACK


In [52]:
subset = df.columns.tolist()[1:]
df.dropna(subset=subset, how='all', inplace=True)
df.head()

Unnamed: 0,Number,Gender,Age,Race Desc
1,1038595,F,63.805479,WHITE
2,1038595,,,end of record
4,1039179,M,50.449315,BLACK
5,1039179,M,51.391781,BLACK
6,1039179,F,34.641096,BLACK


In [53]:
df = df[df['Race Desc'] != 'end of record']
df.head()

Unnamed: 0,Number,Gender,Age,Race Desc
1,1038595,F,63.805479,WHITE
4,1039179,M,50.449315,BLACK
5,1039179,M,51.391781,BLACK
6,1039179,F,34.641096,BLACK
9,1053492,F,51.364384,BLACK


In [55]:
df.rename(columns={'Number': 'Complaint_Number', 'Race Desc': 'Race_Desc'}, inplace=True)
df.head()

Unnamed: 0,Complaint_Number,Gender,Age,Race_Desc
1,1038595,F,63.805479,WHITE
4,1039179,M,50.449315,BLACK
5,1039179,M,51.391781,BLACK
6,1039179,F,34.641096,BLACK
9,1053492,F,51.364384,BLACK


In [56]:
report5 = df.copy()

### Merging datasets

In [57]:
report1.columns

Index(['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street',
       'Apartment', 'City_State_Zipcode', 'Incident_Datetime',
       'Complaint_Date', 'Closed_Date', 'Investigator_Name',
       'Investigator_Current_Assignment', 'Investigator_Rank',
       'Investigator_Star', 'Investigator_Appointed_Date'],
      dtype='object')

In [58]:
report2.columns

Index(['Complaint_Number', 'Name', 'Birth_Yr', 'Gender', 'Race_Code',
       'Date_of_Appt', 'Current_Unit', 'Current_Rank', 'Star',
       'Complaint_Category', 'Finding', 'Recommended_Discipline',
       'Final_Finding', 'Discipline'],
      dtype='object')

In [59]:
report3.columns

Index(['Complaint_Number', 'Name', 'Gender', 'Race', 'Star', 'Birth Year',
       'Date Appointed'],
      dtype='object')

In [60]:
report4.columns

Index(['Complaint_Number', 'Gender', 'Age', 'Race_Desc'], dtype='object')

In [61]:
report5.columns

Index(['Complaint_Number', 'Gender', 'Age', 'Race_Desc'], dtype='object')

In [62]:
report1.rename(columns={'Investigator_Appointed_Date': 'Investigator_Date_Appointed'})
report2.rename(columns={'Date_of_Appt': 'Date_Appointed'})

cols = ['Accused_' + col.strip().replace(' ', '_') for col in report2.columns[1:]]
report2.columns = ['Complaint_Number'] + cols

cols = ['PO_Witness_' + col.strip().replace(' ', '_') for col in report3.columns[1:]]
report3.columns = ['Complaint_Number'] + cols

cols = ['Victim_' + col.strip().replace(' ', '_') for col in report4.columns[1:]]
report4.columns = ['Complaint_Number'] + cols

cols = ['Complainant_' + col.strip().replace(' ', '_') for col in report5.columns[1:]]
report5.columns = ['Complaint_Number'] + cols

In [63]:
report1.columns

Index(['Complaint_Number', 'Beat', 'Location_Code', 'Address', 'Street',
       'Apartment', 'City_State_Zipcode', 'Incident_Datetime',
       'Complaint_Date', 'Closed_Date', 'Investigator_Name',
       'Investigator_Current_Assignment', 'Investigator_Rank',
       'Investigator_Star', 'Investigator_Appointed_Date'],
      dtype='object')

In [64]:
report2.columns

Index(['Complaint_Number', 'Accused_Name', 'Accused_Birth_Yr',
       'Accused_Gender', 'Accused_Race_Code', 'Accused_Date_of_Appt',
       'Accused_Current_Unit', 'Accused_Current_Rank', 'Accused_Star',
       'Accused_Complaint_Category', 'Accused_Finding',
       'Accused_Recommended_Discipline', 'Accused_Final_Finding',
       'Accused_Discipline'],
      dtype='object')

In [65]:
report3.columns

Index(['Complaint_Number', 'PO_Witness_Name', 'PO_Witness_Gender',
       'PO_Witness_Race', 'PO_Witness_Star', 'PO_Witness_Birth_Year',
       'PO_Witness_Date_Appointed'],
      dtype='object')

In [66]:
report4.columns

Index(['Complaint_Number', 'Victim_Gender', 'Victim_Age', 'Victim_Race_Desc'], dtype='object')

In [67]:
report5.columns

Index(['Complaint_Number', 'Complainant_Gender', 'Complainant_Age',
       'Complainant_Race_Desc'],
      dtype='object')

In [68]:
merged = pd.merge(report1, report2, on='Complaint_Number', how='outer')
merged = pd.merge(merged, report3, on='Complaint_Number', how='outer')
merged = pd.merge(merged, report4, on='Complaint_Number', how='outer')
merged = pd.merge(merged, report5, on='Complaint_Number', how='outer')

In [69]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31597 entries, 0 to 31596
Data columns (total 40 columns):
Complaint_Number                   31597 non-null object
Beat                               26935 non-null object
Location_Code                      27149 non-null object
Address                            19684 non-null object
Street                             20110 non-null object
Apartment                          1530 non-null object
City_State_Zipcode                 21396 non-null object
Incident_Datetime                  27151 non-null datetime64[ns]
Complaint_Date                     27151 non-null datetime64[ns]
Closed_Date                        24203 non-null datetime64[ns]
Investigator_Name                  27753 non-null object
Investigator_Current_Assignment    27753 non-null object
Investigator_Rank                  27753 non-null object
Investigator_Star                  19661 non-null object
Investigator_Appointed_Date        26836 non-null object
Accused_Name 

### Saving everything

In [70]:
report1.to_csv('./Clean/June2016/report1_all_complaints.csv')
report2.to_csv('./Clean/June2016/report2_accused.csv')
report3.to_csv('./Clean/June2016/report3_po_witness.csv')
report4.to_csv('./Clean/June2016/report4_victim.csv')
report5.to_csv('./Clean/June2016/report5_complainant.csv')
merged.to_csv('./Clean/June2016/june2016_all.csv')

report1.to_excel('./Clean/June2016/report1_all_complaints.xlsx')
report2.to_excel('./Clean/June2016/report2_accused.xlsx')
report3.to_excel('./Clean/June2016/report3_po_witness.xlsx')
report4.to_excel('./Clean/June2016/report4_victim.xlsx')
report5.to_excel('./Clean/June2016/report5_complainant.xlsx')
merged.to_excel('./Clean/June2016/june2016_all.xlsx')