lets use fuzzywuzzy to clean the names in the incident data

In [20]:
import os
import json
import pandas as pd
import altair as alt
from fuzzywuzzy import process

cwd = os.getcwd()
data_dir = os.path.join(cwd, 'data')
source_dir = os.path.join(data_dir,'source')
manual_dir = os.path.join(data_dir, 'manual')
processed_dir = os.path.join(data_dir, 'processed')

# the discipline data that is one-row-per-incident
incident_xlsx = os.path.join(source_dir, '2010-2020.xlsx')
incident_df = pd.read_excel(
    incident_xlsx,
    parse_dates = ['FINAL DISP DATE']
)

incident_df['full_name'] = incident_df['EMPLOYEE FIRST NAME'] + ' ' + incident_df['EMPLOYEE LAST NAME']
incident_df['clean_name'] = incident_df['full_name'].str.strip().str.lower()

Take the list of unique names, take each name and compare to the rest of the names

In [24]:
# unique_names = incident_df.clean_name.dropna().unique()

'''for name in unique_names:
    results = process.extract(
        name,
        unique_names,
        limit=5
    )
    
    second_best_result_score = results[1][1] # [1][1] gets the score of the second best match for the name. First best will be the name 
    if 85 < second_best_result_score < 100:
        #print(results)
        pass'''

'for name in unique_names:\n    results = process.extract(\n        name,\n        unique_names,\n        limit=5\n    )\n    \n    second_best_result_score = results[1][1] # [1][1] gets the score of the second best match for the name. First best will be the name \n    if 85 < second_best_result_score < 100:\n        #print(results)\n        pass'

In [54]:
pd.isna(pd.na)

AttributeError: module 'pandas' has no attribute 'na'

In [70]:
def strip_text(df):
    name = df['name']
    name = name.lower().replace(',',' ').replace("'"," ").replace("."," ").replace('(', ' ').replace(')',' ').strip()
    clean_name = " ".join(sorted(name.split())) # this splits the name into words, alphebetizes and rejoins
 
    return clean_name

In [71]:
def clean_race_ethnicity(df):
    race = df.race_ethnicity
    
    if pd.isna(race):
        return pd.NA
    else:
        race = race.strip().lower()
        race_ethnicity_cleaner = {
            'white (not of hispanic origin)': 'white',
            'black': 'black',
            'asian or pacific islander': 'asian/pacific islander',
            'hispanic': 'hispanic',
            'american indian/alaskan native': 'native american',
            'unknown': 'unknown',
            'black': 'black',
            'hispanic': 'hispanic',
            'asian or pacific islander': 'asian/pacific islander',
            'unknown': 'unknown',
            'two or more races': 'multiracial',
            'asian': 'asian/pacific islander',
            'hawaiian or pacific islander': 'asian/pacific islander',
            'white': 'white',
            'american indian or alaskan native': 'native american',
            'black or african american': 'black',
            'hispanic or latino of any race': 'hispanic',
            'two or more races': 'multiracial',
            'asian': 'asian/pacific islander',
            'native hawaiian or other pacific': 'asian/pacific islander',
        }
    
    return race_ethnicity_cleaner[race]

In [72]:
staff_roster_csv = os.path.join(processed_dir, 'staff_roster_cleaned.csv')
staff_roster_df = pd.read_csv(staff_roster_csv)


staff_roster_df['clean_name'] = staff_roster_df.apply(
    strip_text,
    axis=1
)
staff_roster_df['clean_race_ethnicity'] = staff_roster_df.apply(clean_race_ethnicity, axis=1)

staff_roster_df.groupby(
    ['clean_name', 'gender', 'clean_race_ethnicity']
).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,clean_name,gender,clean_race_ethnicity,count
0,a adrienne ewing-roush,F,black,8
1,a aldea lopez luis,M,hispanic,7
2,a alexander judy,F,white,1
3,a ali boone j james,M,black,2
4,a alisha meeks,F,white,1
...,...,...,...,...
3854,todd w watson,M,white,12
3855,trevor vondohlen w,M,white,9
3856,vanbergen william zachary,M,white,3
3857,waters william z,M,white,12


In [86]:
staff_roster_df.gender.value_counts()

M    17829
F     5816
Name: gender, dtype: int64

In [74]:
staff_roster_df.clean_race_ethnicity.value_counts()

white                     18250
black                      4384
hispanic                    423
asian/pacific islander      284
multiracial                 257
native american              30
unknown                      15
Name: clean_race_ethnicity, dtype: int64

In [82]:
from fuzzywuzzy import fuzz

In [85]:
incident_unique_names = incident_df.clean_name.dropna().unique()

roster_unique_names = staff_roster_df.clean_name.dropna().unique()

for name in incident_unique_names[0:20]:
    results = process.extract(
        name,
        roster_unique_names,
        limit=5,
        scorer=fuzz.token_sort_ratio
    )
    print(name)
    print(results)
    print('----')

daniel bowling
[('bowling daniel j', 93), ('brown daniel j', 79), ('bowling t wade', 71), ('bossing janice k', 67), ('cole daniel e', 67)]
----
kevin breeding
[('breeding kevin l', 93), ('j keating kevin', 69), ('frei kevin l', 69), ('brown kevin m', 67), ('b coleman kevin', 62)]
----
aaron thomas
[('aaron sr thomas', 89), ('aaron ii thomas', 89), ('adam r thomas', 80), ('a jackson thomas', 79), ('h rowan thomas', 77)]
----
aaron jones
[('aaron jones l', 92), ('anton jones x', 75), ('a gregory jones', 69), ('dhana jones k', 67), ('g jonathan jones', 67)]
----
aaron wigginton
[('aaron r wigginton', 94), ('aaron ii thomas', 67), ('arnold n whitney', 58), ('allen jerron washington', 58), ('aaron jones l', 57)]
----
jeffrey moseley
[('jeffery moseley t', 88), ('jeremy moseley t', 84), ('jeffrey massey s', 77), ('jeffrey l poole', 73), ('jeffrey mitchell t', 67)]
----
aaron thomas ii
[('aaron ii thomas', 100), ('aaron sr thomas', 87), ('anthony e ii thomas', 76), ('barnes e thomas', 73), ('