In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from fuzzywuzzy import fuzz 



## Import the merged dataset

In [2]:
df = pd.read_csv('mergedEARNINGS.csv')

print(df)

      Unnamed: 0  Unnamed: 0_x              Name            Rank   Race  \
0              0          11.0  joseph abasciano  Police Officer  White   
1              1          13.0  joseph abasciano  Police Officer  White   
2              2          26.0  joseph abasciano  Police Officer  White   
3              3          28.0  joseph abasciano  Police Officer  White   
4              4          41.0  joseph abasciano  Police Officer  White   
...          ...           ...               ...             ...    ...   
8903        8903           NaN  robert twitchell             NaN    NaN   
8904        8904           NaN        royce veal             NaN    NaN   
8905        8905           NaN    emmet t. walsh             NaN    NaN   
8906        8906           NaN  robert g. warren             NaN    NaN   
8907        8907           NaN   william woodley             NaN    NaN   

        Year        CaseID   TypeOfMisconduct                   Allegation  \
0     2011.0  IAD2011

In [3]:
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0_x', 'Unnamed: 0_y'])

print(df.shape)

(8908, 32)


## Import employee earnings dataset (general)

In [4]:
employeeDF = pd.read_csv('EmployeeEarnings.csv')

print(employeeDF)

                                   NAME           DEPARTMENT_NAME  \
0                        Abadi,Kidani A      Assessing Department   
1      Abajue Umeh,Kenechukwu Stephanie   BPS Boston Arts Academy   
2                      Abasciano,Joseph  Boston Police Department   
3                        Abatzis,Kelley   BPS Health and Wellness   
4                Abban,Christopher John    Boston Fire Department   
...                                 ...                       ...   
21853            Zuckert,Samuel Ellliot      BPS Welcome Services   
21854              Zukowski III,Charles  Boston Police Department   
21855                    Zuniga,Kevin Y    BPS Counseling Service   
21856            Zwarich,Maralene Zoann                 Eliot K-8   
21857                   Zwerdling,Laura     BPS Special Education   

                              TITLE        REGULAR RETRO        OTHER  \
0      Property Utilization Officer    $78,836.75    NaN     $175.00    
1                        

## Make subset of the earnings dataset with only Boston Police Department employees

In [5]:
polOnlyDF = employeeDF[employeeDF['DEPARTMENT_NAME'].str.contains('police', case=False, na=False)]

print(polOnlyDF)

                          NAME           DEPARTMENT_NAME  \
2             Abasciano,Joseph  Boston Police Department   
19         Abdul-Aziz,Ramadani  Boston Police Department   
22                   ABEL,KENY  Boston Police Department   
32     Abrahamson,Patrick Olaf  Boston Police Department   
38       Abreu,Carlos De jesus  Boston Police Department   
...                        ...                       ...   
21837           Zingg,Robert M  Boston Police Department   
21843       Zographos,Peter A.  Boston Police Department   
21847                   Zou,Zi  Boston Police Department   
21849        Zubrin,William W.  Boston Police Department   
21854     Zukowski III,Charles  Boston Police Department   

                             TITLE        REGULAR RETRO        OTHER  \
2                   Police Officer    $96,890.67    NaN     $850.00    
19                  Police Officer            NaN   NaN   $9,924.38    
22                  Cadet (Police)    $15,599.70    NaN        

## clean names in new dataset to match format of our other datasets

In [6]:
def fixname(name):
    name = name.lower()
    name = name.split(",")
    for i in range(len(name)):
        name[i] = name[i].strip()
    if len(name) == 2:
        return name[1] + ' ' + name[0]
    if len(name) == 3:
        return name[1] + ' ' + name[2] + ' ' + name[0]
    else:
        return name
    
fixname(polOnlyDF['NAME'][57])

'dora luz acevedo'

In [7]:
names = polOnlyDF['NAME'].apply(fixname)
polOnlyDF['NAME'] = names

print(polOnlyDF)

                          NAME           DEPARTMENT_NAME  \
2             joseph abasciano  Boston Police Department   
19         ramadani abdul-aziz  Boston Police Department   
22                   keny abel  Boston Police Department   
32     patrick olaf abrahamson  Boston Police Department   
38       carlos de jesus abreu  Boston Police Department   
...                        ...                       ...   
21837           robert m zingg  Boston Police Department   
21843       peter a. zographos  Boston Police Department   
21847                   zi zou  Boston Police Department   
21849        william w. zubrin  Boston Police Department   
21854     charles zukowski iii  Boston Police Department   

                             TITLE        REGULAR RETRO        OTHER  \
2                   Police Officer    $96,890.67    NaN     $850.00    
19                  Police Officer            NaN   NaN   $9,924.38    
22                  Cadet (Police)    $15,599.70    NaN        

## Do Fuzzy Matching!!!

In [8]:
# String similarity between the two fields

def fuzzySimilarity(row):
    name1 = row['Name']
    name2 = row['NAME']
    fuzzy_ratio = fuzz.token_set_ratio(name1, name2)
    return fuzzy_ratio

In [9]:
# Create a new column called lastName character that has the first letter of the last name as its separate column

def getLastCh(s):
    "s is a string of the Name/Contributor Columns of the datasets"
    s_list = s.split()
    suffixes = ['jr','jr.','sr','sr.','i','ii','iii']

    # remove suffixes in s_list
    for i in reversed(range(len(s_list))):
        if s_list[i] in suffixes:
            s_list.pop(i)

    lastName = s_list[-1]
    firstCh = lastName[0]
    return firstCh

polOnlyDF['lastNameCh'] = [getLastCh(s) for s in polOnlyDF['NAME']]
df['lastNameCh'] = [getLastCh(s) for s in df['Name']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  polOnlyDF['lastNameCh'] = [getLastCh(s) for s in polOnlyDF['NAME']]


In [10]:
# This program will merge the two dataframes using their lastName characters then apply a string similarity score for each row then we'll filter the string similarity value to create the final dataframe with name matches.

unique_names = list(df['lastNameCh'].unique())

for name in unique_names:
    df2_sub_zip = polOnlyDF[polOnlyDF['lastNameCh'] == name]
    df1_sub_zip = df[df['lastNameCh'] == name]

    df_merge = dd.merge(df1_sub_zip, df2_sub_zip, how='left', left_on='lastNameCh', right_on='lastNameCh')
    
    df_merge['Fuzzy Similarity'] = df_merge.apply(lambda row: fuzzySimilarity(row), axis=1)
    
    # You can adjust this number for a more selective fuzzy similarity merge
    Fuzzy_Filter = df_merge[df_merge['Fuzzy Similarity'] > 85]
    
    title = "./fuzzyDatasets/merge_df_name_" + name + ".csv"
    Fuzzy_Filter.to_csv(title, encoding = "utf-8")

In [11]:
# This will create the list

list_of_csv_titles = []

for name in unique_names:
    title = "./fuzzyDatasets/merge_df_name_" + name + ".csv"
    list_of_csv_titles.append(title)

In [12]:
# Merging all the batches

df_merge_final = pd.DataFrame()

for files in list_of_csv_titles:
    data = pd.read_csv(files)
    df_merge_final = df_merge_final.append(data)

In [13]:
print(df_merge_final)

    Unnamed: 0               Name            Rank   Race    Year  \
0            0   joseph abasciano  Police Officer  White  2011.0   
1           92   joseph abasciano  Police Officer  White  2011.0   
2          184   joseph abasciano  Police Officer  White  2011.0   
3          276   joseph abasciano  Police Officer  White  2011.0   
4          368   joseph abasciano  Police Officer  White  2013.0   
..         ...                ...             ...    ...     ...   
1            3    vladimir xavier  Police Officer  Black  2012.0   
2            5    vladimir xavier  Police Officer  Black  2014.0   
0            0  anthony d ierardi        Sergeant  White  2017.0   
0            5     robert m zingg       Detective  White  2011.0   
1           15     robert m zingg       Detective  White  2012.0   

          CaseID        TypeOfMisconduct                   Allegation  \
0   IAD2011-0182       Citizen complaint  Neg.Duty/Unreasonable Judge   
1   IAD2011-0182       Citizen compla

In [14]:
print(list(df_merge_final))

['Unnamed: 0', 'Name', 'Rank', 'Race', 'Year', 'CaseID', 'TypeOfMisconduct', 'Allegation', 'Finding', 'Outcome', 'Address', 'Amount', 'CPF ID', 'City', 'Contributor', 'Date', 'Datetime', 'Employer', 'Occupation', 'Principal Officer', 'Recipient', 'Record Type Description', 'Record Type ID', 'Source Description', 'State', 'Tender Type Description', 'Tender Type ID', 'UUID', 'Zip', 'DATE ADDED', 'AGENCY', 'STATUS', 'INFORMATION REGARDING LEAD ENTRY', 'lastNameCh', 'NAME', 'DEPARTMENT_NAME', 'TITLE', 'REGULAR', 'RETRO', 'OTHER', 'OVERTIME', 'INJURED', 'DETAIL', 'QUINN / EDUCATION INCENTIVE', 'TOTAL EARNINGS', 'POSTAL', 'Fuzzy Similarity']


In [15]:
df_merge_final = df_merge_final.drop(columns=['Unnamed: 0', 'lastNameCh', 'Fuzzy Similarity'])

print(list(df_merge_final))

['Name', 'Rank', 'Race', 'Year', 'CaseID', 'TypeOfMisconduct', 'Allegation', 'Finding', 'Outcome', 'Address', 'Amount', 'CPF ID', 'City', 'Contributor', 'Date', 'Datetime', 'Employer', 'Occupation', 'Principal Officer', 'Recipient', 'Record Type Description', 'Record Type ID', 'Source Description', 'State', 'Tender Type Description', 'Tender Type ID', 'UUID', 'Zip', 'DATE ADDED', 'AGENCY', 'STATUS', 'INFORMATION REGARDING LEAD ENTRY', 'NAME', 'DEPARTMENT_NAME', 'TITLE', 'REGULAR', 'RETRO', 'OTHER', 'OVERTIME', 'INJURED', 'DETAIL', 'QUINN / EDUCATION INCENTIVE', 'TOTAL EARNINGS', 'POSTAL']


In [16]:
df_merge_final.shape

(8821, 44)

## Save & export new dataset :) 

In [19]:
df_merge_final.to_csv("mergedEARNINGS.csv")