### Task
Step two - Students to merge the disciplinary action database with the BPD financial contributions data. Verify common names using the Race/Ethnicity BPD personnel dataset.
First merge the disciplinary action database with the entire "All_Police_Contributions.csv" dataset, then filter for "Boston Police" under the "Employer" column.

### Merge with Fuzzy Matching using Fuzzy merge template

In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from fuzzywuzzy import fuzz 

boston = pd.read_csv("Final BostonPoliceInternalAffairs.csv")
police = pd.read_csv("Final Police Contributions.csv")




In [2]:
# Install Dask as a library using the following code:
import sys
!{sys.executable} -m pip install "dask[complete]"

# Dask is an additional library for Pandas that parrellizes the memory when handling dataframes, this greatly 
# speeds up the merging and other data processing.



In [3]:
# String similarity between the two fields
def FuzzySimilarity(row):
    address_1 = row['Name']
    address_2 = row['Contributor']
    fuzzy_ratio = fuzz.ratio(address_1,address_2)
    return fuzzy_ratio

In [5]:
# Create a new column called lastName character that has the first letter of the last name as its separate column

def getLastCh(s):
    "s is a string of the Name/Contributor Columns of the datasets"
    s_list = s.split()
    suffixes = ['jr','jr.','sr','sr.','i','ii','iii']

    # remove suffixes in s_list
    for i in reversed(range(len(s_list))):
        if s_list[i] in suffixes:
            s_list.pop(i)

    lastName = s_list[-1]
    firstCh = lastName[0]
    return firstCh

boston['lastNameCh'] = [getLastCh(s) for s in boston['Name']]
police['lastNameCh'] = [getLastCh(s) for s in police['Contributor']]

In [6]:
print(boston[['Name','lastNameCh']].head())

                  Name lastNameCh
0     joseph abasciano          a
1     joseph abasciano          a
2     joseph abasciano          a
3     joseph abasciano          a
4  ramadani abdul-aziz          a


In [7]:
print(police[['Contributor', 'lastNameCh']].head())

          Contributor lastNameCh
0  allan l ciccone jr          c
1     michael linskey          l
2     william haffner          h
3      matthew maglio          m
4       donna colbert          c


Things to take note of:

1. Some names are entered in the incorrect format. For example, "Gannetti, iii, Salvatore" was instead enterred as "Gannetti, Salvatore iii" creating a case in preprocessing to result in salvatore iii gannetti.
2. Some suffixes have . after and others don't (i.e. jr and jr.)

We removed suffixes before identifying the last name character.

In [8]:
# This program will merge thr two dataframes using their lastName characters then apply a string similarity score
# for each row then we'll filter the string similarity value to create the final dataframe with name matches.

unique_names = list(boston['lastNameCh'].unique())

for name in unique_names:
    df1_sub_zip = boston[boston['lastNameCh'] == name]
    df2_sub_zip = police[police['lastNameCh'] == name]

    df_merge = dd.merge(df1_sub_zip, df2_sub_zip, how='left', left_on='lastNameCh', right_on='lastNameCh')
    
    df_merge['Fuzzy Similarity'] = df_merge.apply(lambda row: FuzzySimilarity(row), axis=1)
    
    # You can adjust this number for a more selective fuzzy similarity merge
    Fuzzy_Filter = df_merge[df_merge['Fuzzy Similarity'] > 85]
    
    title = "Merge_df_name_" + name + ".csv"
        #with ExcelWriter(title) as writer:
    df_merge.to_csv(title, encoding = "utf-8")
    

# We write each lastName character to a CSV, this will then all be m 

In [9]:
# This will create the list
List_of_csv_titles = []
for name in unique_names:
    title = "Merge_df_name_" + name + ".csv"
    List_of_csv_titles.append(title)
print(List_of_csv_titles)

['Merge_df_name_a.csv', 'Merge_df_name_b.csv', 'Merge_df_name_c.csv', 'Merge_df_name_d.csv', 'Merge_df_name_s.csv', 'Merge_df_name_e.csv', 'Merge_df_name_f.csv', 'Merge_df_name_g.csv', 'Merge_df_name_h.csv', 'Merge_df_name_i.csv', 'Merge_df_name_j.csv', 'Merge_df_name_l.csv', 'Merge_df_name_k.csv', 'Merge_df_name_m.csv', 'Merge_df_name_n.csv', 'Merge_df_name_o.csv', 'Merge_df_name_p.csv', 'Merge_df_name_q.csv', 'Merge_df_name_r.csv', 'Merge_df_name_t.csv', 'Merge_df_name_v.csv', 'Merge_df_name_w.csv', 'Merge_df_name_x.csv', 'Merge_df_name_y.csv', 'Merge_df_name_z.csv']


In [11]:
# Merging all the batches

df_merge_final = pd.DataFrame()
for files in List_of_csv_titles:
    data = pd.read_csv(files)
    df_merge_final = df_merge_final.append(data)
df_merge_final

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0.1,Unnamed: 0,Unnamed: 0_x,Name,Rank,Race,Year,CaseID,TypeOfMisconduct,Outcome,lastNameCh,...,Recipient,Record Type Description,Record Type ID,Source Description,State,Tender Type Description,Tender Type ID,UUID,Zip,Fuzzy Similarity
0,0,0,joseph abasciano,Police Officer,White,2011,IAD2011-0182,Citizen complaint,,a,...,"Sutter, C. Samuel",Individual,201,2/20/10 Deposit Report,MA,Check,1,5eace80d-8740-5505-a230-0a72e2d82f2f,02747,29
1,1,0,joseph abasciano,Police Officer,White,2011,IAD2011-0182,Citizen complaint,,a,...,"Balser, Ruth B.",Individual,201,2010 Pre-primary Report (ND),MA,Not Specified,0,1e50a41f-6ff1-5f55-8183-7e3d65da5559,024652136,32
2,2,0,joseph abasciano,Police Officer,White,2011,IAD2011-0182,Citizen complaint,,a,...,"Flaherty, Timothy",Individual,201,2010 Pre-election Report (Special) (ND),MA,Not Specified,0,66aea717-06fa-51f5-9b80-e9f1d39f6ec0,02445,28
3,3,0,joseph abasciano,Police Officer,White,2011,IAD2011-0182,Citizen complaint,,a,...,"Doherty, Christian L.",Individual,201,2010 Pre-primary Report (ND),MA,Not Specified,0,b65bd67b-89a2-5914-b314-4c3856229a4a,01830-3357,36
4,4,0,joseph abasciano,Police Officer,White,2011,IAD2011-0182,Citizen complaint,,a,...,"Sutter, C. Samuel",Individual,201,5/19/10 Deposit Report,MA,Check,1,a87aa7f5-d163-522c-a8a1-75eca93fb2de,02747,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,685,5659,peter a zographos,Police Officer,White,2012,IAD2012-0336,Citizen complaint,,z,...,"Gulluni, Anthony Domenic",Individual,201,9/9/14 Deposit Report,MA,Credit Card,3,67cae092-688c-55d8-9d21-c7b3094b0af6,01129,34
686,686,5659,peter a zographos,Police Officer,White,2012,IAD2012-0336,Citizen complaint,,z,...,"Liang, Nina",Individual,201,10/23/15 Deposit Report,MA,Check,1,c9e0aa38-6d7c-5aff-9f22-acf054e7e241,02351,34
687,687,5659,peter a zographos,Police Officer,White,2012,IAD2012-0336,Citizen complaint,,z,...,"Gulluni, Anthony Domenic",Individual,201,8/30/17 Deposit Report,MA,Check,1,287c4aa2-4408-5ad9-ae97-e1cdcab22a73,01129,34
688,688,5659,peter a zographos,Police Officer,White,2012,IAD2012-0336,Citizen complaint,,z,...,"Gulluni, Anthony Domenic",Individual,201,3/27/18 Deposit Report,MA,Cash,6,e14ad1ae-4132-51e3-9dd7-c555754af1e5,01129,34


In [12]:
df_merge_final.to_csv("df_merge_final.csv")