### Task
Step two - Students to merge the disciplinary action database with the BPD financial contributions data. Verify common names using the Race/Ethnicity BPD personnel dataset.
First merge the disciplinary action database with the entire "All_Police_Contributions.csv" dataset, then filter for "Boston Police" under the "Employer" column.

### Merge with Fuzzy Matching using Fuzzy merge template

In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from fuzzywuzzy import fuzz 

# Disciplinary (internal affairs) records; matched below on the 'Name' column.
boston = pd.read_csv(filepath_or_buffer="processedBostonPoliceInternalAffairs.csv")
# Contribution records; matched below on the 'Contributor' column.
police = pd.read_csv(filepath_or_buffer="processedPoliceContributions.csv")




In [2]:
# Install Dask as a library using the following code:
import sys
# Invoke pip with the same Python interpreter that is running this notebook (sys.executable).
!{sys.executable} -m pip install "dask[complete]"

# Dask is an additional library that works alongside pandas; the intent here is to
# speed up the merging and other data processing on dataframes.
# NOTE(review): dask.dataframe is already imported in the first cell, so on a fresh
# kernel this install has to have run before that import can succeed.



In [3]:
# String similarity between the two fields
def FuzzySimilarity(row):
    """Score how similar a row's 'Name' and 'Contributor' strings are.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Name' and 'Contributor', e.g. one row of the
        merged dataframe.

    Returns
    -------
    The ``fuzz.ratio`` score of the two strings, or 0 when either field is
    not a string (for example the NaN a left merge leaves behind for a row
    without a match), so such rows count as "no similarity" instead of
    raising.
    """
    officer_name = row['Name']
    contributor_name = row['Contributor']

    # Non-string placeholders (NaN/None) are not names; score them 0 rather
    # than passing them on to fuzz.ratio.
    if not isinstance(officer_name, str) or not isinstance(contributor_name, str):
        return 0

    return fuzz.ratio(officer_name, contributor_name)

In [4]:
# Create a new column, lastNameCh, that holds the first letter of the last name.

def getLastCh(s):
    """Return the first character of the last name token in ``s``.

    ``s`` is a value from the Name/Contributor columns of the datasets.
    The value is split on whitespace, suffix tokens (jr, jr., sr, sr., i, ii,
    iii) are discarded wherever they appear, and the first character of the
    final remaining token is returned.

    Returns an empty string when ``s`` is not a string (e.g. NaN from a
    missing cell) or when no token is left after removing suffixes, instead
    of raising.

    NOTE(review): the last remaining token is treated as the last name, so a
    value written surname-first yields the initial of the given name.
    """
    suffixes = {'jr', 'jr.', 'sr', 'sr.', 'i', 'ii', 'iii'}

    if not isinstance(s, str):
        return ''

    # Tokens produced by split() are never empty, so indexing [0] below is safe.
    name_tokens = [token for token in s.split() if token not in suffixes]
    if not name_tokens:
        return ''

    return name_tokens[-1][0]

# Derive the last-name initial for every row of both datasets.
boston['lastNameCh'] = boston['Name'].map(getLastCh)
police['lastNameCh'] = police['Contributor'].map(getLastCh)

In [5]:
# Spot-check the derived initial against the first five disciplinary-record names.
print(boston.loc[:, ['Name', 'lastNameCh']].head(5))

                  Name lastNameCh
0     joseph abasciano          a
1     joseph abasciano          a
2     joseph abasciano          a
3     joseph abasciano          a
4  ramadani abdul-aziz          a


In [6]:
# Spot-check the derived initial against the first five contributor names.
print(police.loc[:, ['Contributor', 'lastNameCh']].head(5))

          Contributor lastNameCh
0  allan l ciccone jr          c
1     linskey michael          m
2     haffner william          w
3      maglio matthew          m
4       colbert donna          d


Things to take note of:

1. Some names are entered in an inconsistent format. For example, "Gannetti, iii, Salvatore" was instead entered as "Gannetti, Salvatore iii", which preprocessing turns into "salvatore iii gannetti".
2. Some suffixes are followed by a period and others are not (e.g. "jr" and "jr.").

We removed suffixes before identifying the last name character.

In [7]:
# This cell merges the two dataframes one last-name initial at a time, scores
# every Name/Contributor pair with a string-similarity ratio, and keeps only the
# pairs above the threshold. Each initial's matches are written to their own CSV.

# Minimum similarity (exclusive) for a pair to count as a name match.
# You can adjust this number for a more selective fuzzy similarity merge.
FUZZY_THRESHOLD = 85

unique_names = list(boston['lastNameCh'].unique())

for name in unique_names:
    df1_sub_zip = boston[boston['lastNameCh'] == name]
    df2_sub_zip = police[police['lastNameCh'] == name]

    # Both inputs are in-memory pandas frames, so merge with pandas directly.
    # Every row in a batch shares the same key value, so each disciplinary row
    # is paired with each contributor row for this initial.
    df_merge = pd.merge(df1_sub_zip, df2_sub_zip, how='left', left_on='lastNameCh', right_on='lastNameCh')

    # A left merge leaves Contributor as NaN when no contributor has this
    # initial; score those rows 0 instead of handing NaN to the scorer.
    # Walking the two columns also avoids building a Series for every row.
    df_merge['Fuzzy Similarity'] = [
        FuzzySimilarity({'Name': officer, 'Contributor': contributor})
        if isinstance(officer, str) and isinstance(contributor, str)
        else 0
        for officer, contributor in zip(df_merge['Name'], df_merge['Contributor'])
    ]

    Fuzzy_Filter = df_merge[df_merge['Fuzzy Similarity'] > FUZZY_THRESHOLD]

    # One CSV per initial; the file is written even when no pair passes the filter.
    title = "Merge_df_name_" + name + ".csv"
    Fuzzy_Filter.to_csv(title, encoding="utf-8")
    

# Each last-name initial's matches are written to their own CSV; these batch files are all merged together in the following cells.

In [8]:
# Build the list of per-initial batch file names, one per entry in unique_names.
List_of_csv_titles = ["Merge_df_name_" + initial + ".csv" for initial in unique_names]
print(List_of_csv_titles)

['Merge_df_name_a.csv', 'Merge_df_name_b.csv', 'Merge_df_name_c.csv', 'Merge_df_name_d.csv', 'Merge_df_name_s.csv', 'Merge_df_name_e.csv', 'Merge_df_name_f.csv', 'Merge_df_name_g.csv', 'Merge_df_name_h.csv', 'Merge_df_name_i.csv', 'Merge_df_name_j.csv', 'Merge_df_name_l.csv', 'Merge_df_name_k.csv', 'Merge_df_name_m.csv', 'Merge_df_name_n.csv', 'Merge_df_name_o.csv', 'Merge_df_name_p.csv', 'Merge_df_name_q.csv', 'Merge_df_name_r.csv', 'Merge_df_name_t.csv', 'Merge_df_name_v.csv', 'Merge_df_name_w.csv', 'Merge_df_name_x.csv', 'Merge_df_name_y.csv', 'Merge_df_name_z.csv']


In [9]:
# Merging all the batches

# Read every per-initial batch file, then concatenate once. DataFrame.append
# was removed in pandas 2.0, and calling it in a loop re-copied the accumulated
# frame on every pass.
batch_frames = [pd.read_csv(csv_title) for csv_title in List_of_csv_titles]

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
# Row labels are kept per file (no ignore_index), as the append loop did.
df_merge_final = pd.concat(batch_frames) if batch_frames else pd.DataFrame()
df_merge_final

Unnamed: 0.1,Unnamed: 0,Unnamed: 0_x,Name,Rank,Race,Year,CaseID,TypeOfMisconduct,Outcome,lastNameCh,...,Recipient,Record Type Description,Record Type ID,Source Description,State,Tender Type Description,Tender Type ID,UUID,Zip,Fuzzy Similarity
0,30596,1746,john fitzpatrick,Police Officer,White,2016,IAD2016-0231,Citizen complaint,,f,...,"DeMaria Jr., Carlo",Individual,201,2017 Pre-election Report (MUN),MA,,0,cc22c79d-dee8-575a-8f08-7224cac98aeb,2149,89
1,30765,1747,richard h fitzpatrick,Sergeant,White,2011,IAD2011-0581,Citizen complaint,,f,...,"Ciommo, Mark",Individual,201,10/10/12 Deposit Report,MA,Check,1,a343c2ab-c29f-5782-a85c-a341bb196317,2030,86
2,30773,1747,richard h fitzpatrick,Sergeant,White,2011,IAD2011-0581,Citizen complaint,,f,...,"Ciommo, Mark",Individual,201,5/2/13 Deposit Report,MA,Check,1,a89564d5-07a0-5752-9685-df7a3b89a73c,2030,86
3,30801,1747,richard h fitzpatrick,Sergeant,White,2011,IAD2011-0581,Citizen complaint,,f,...,"Ciommo, Mark",Individual,201,11/8/13 Deposit Report,MA,Check,1,f3a17c78-772a-50e4-bf62-4f33d72a0b70,2030,86
4,30837,1747,richard h fitzpatrick,Sergeant,White,2011,IAD2011-0581,Citizen complaint,,f,...,"Ciommo, Mark",Individual,201,12/16/14 Deposit Report,MA,Check,1,41815222-250b-50f9-b7b9-10532f0adffa,2030,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28,251505,4818,john k rogers,Police Officer,White,2019,IAD2019-0506,Internal investigation,,r,...,"Walsh, Martin J.",Individual,201,10/2/17 Deposit Report,MA,Check,1,2344b24c-1e8a-5498-b1d4-cf7a49036f2e,02190-2842,87
29,251643,4818,john k rogers,Police Officer,White,2019,IAD2019-0506,Internal investigation,,r,...,"Walsh, Martin J.",Individual,201,12/6/19 Deposit Report,MA,Check,1,e5cc1e53-ff05-584c-8b99-6d070c5bd5a3,02190-2842,87
30,252315,4819,john k rogers,Police Officer,White,2019,IAD2019-0506,Internal investigation,,r,...,"Walsh, Martin J.",Individual,201,3/14/17 Deposit Report,MA,Check,1,6fd55787-1379-526f-a663-41eac2cd6e29,02190-2842,87
31,252359,4819,john k rogers,Police Officer,White,2019,IAD2019-0506,Internal investigation,,r,...,"Walsh, Martin J.",Individual,201,10/2/17 Deposit Report,MA,Check,1,2344b24c-1e8a-5498-b1d4-cf7a49036f2e,02190-2842,87


In [10]:
# Persist the combined matches from every batch to a single CSV.
df_merge_final.to_csv(path_or_buf="df_merge_final.csv")