### String Matching Practice

**(April 11, 2024)**

In [1]:
from tqdm.notebook import tqdm

#### Custom String Matching Score Functions

In [2]:
import string
import numpy as np

### Loading in Data

In [3]:
import pandas as pd

In [5]:
# Load the election data with every column kept as a string.
# The CSV is not checked into this repository, so it must be supplied separately.
election_data = pd.read_csv(
    'raw_feb_23_city_wide.csv',
    dtype=str,
)

In [6]:
# preview of the loaded table (the rendered output shows only the first and last rows)
# NOTE(review): the rendered output contains real names and addresses -- consider clearing it before sharing
election_data

Unnamed: 0,Last_Name,First_Name,Middle_Name,Name_Style,Street_Number,Street_Name,Street_Type,Street_Dir_Suffix,Unit_Type,Apartment_Number,...,SPECIAL ELECTION WARD 4 AND 8(Apr/28/2015),DISTRICT OF COLUMBIA GENERAL ELECTI(Nov/04/2014),SPECIAL BD OF ED ELECTION WARD 8(Jul/15/2014),DC MAYORAL PRIMARY 2014(Apr/01/2014),DIST OF COLUMBIA AT-LARGE SPECIAL E(Apr/23/2013),DISTRICT OF COLUMBIA GENERAL ELECTI(Nov/06/2012),SPECIAL ELECTION WARD 5 COUNCIL(May/15/2012),DC PRESIDENTIAL PREF/COUNCIL PRIMAR(Apr/03/2012),RECALL SPECIAL ELECTION 4B04(Feb/28/2012),2011 DISTRICT OF COLUMBIA SPECIAL E(Apr/26/2011)
0,A Blessing,Rebecca,Ann,,1722,19th,ST,NW,APT,706,...,,,,,,,,,,
1,A-Jaoudi,Edward,,,2503,Ralph Ellison,Way,NE,,,...,,,,,,,,,,
2,Aaberg,Elizabeth,Margaret,,3019,15th,ST,NW,APT,4,...,,,,,,,,,,
3,Aaby,Erik,,,1300,4th,St,SE,,UNIT 715,...,N,,N,,,N,,,,
4,Aaby,Katherine,Diane,,1854,Park,Rd,NW,,,...,N,,N,N,,A,N,N,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520556,Zytnick,Jonathon,,,425,L,ST,NW,,305,...,,,,,,,,,,
520557,Zytnick,Kiva,Keane,,1303,Newton,St,NE,,,...,N,Y,N,Y,,P,N,,N,A
520558,Zywokarte,Michael,Donald,,1265,4th,St,SW,,,...,N,A,N,A,Y,A,N,,N,A
520559,Zywokarte,Sandra,Leigh,,1265,4th,St,SW,,,...,N,A,N,A,Y,A,N,,N,A


In [7]:
# Build one "First Last" string per row, keeping the row order of election_data
# so that a position in full_name_list is also a row position in the table.
first_name_list = election_data['First_Name'].tolist()
last_name_list = election_data['Last_Name'].tolist()
full_name_list = [
    ' '.join((str(first_name), str(last_name)))
    for first_name, last_name in zip(first_name_list, last_name_list)
]

#### Rapid Fuzzy Match

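RapidFuzz `fuzz` module documentation. The link below points at `partial_ratio`; the scoring function in this notebook uses `fuzz.ratio`, which is documented on the same page.

https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#rapidfuzz.fuzz.partial_ratio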

In [8]:
from rapidfuzz import fuzz

In [9]:
# https://rapidfuzz.github.io/RapidFuzz/Usage/fuzz.html#rapidfuzz.fuzz.partial_ratio

In [10]:
def score_function_fuzz(guess_full_name, full_name_list, top_n=5):
    """Rank the entries of ``full_name_list`` by similarity to ``guess_full_name``.

    Both the query and every candidate are lower-cased before scoring, so the
    comparison is case-insensitive. Each score is ``fuzz.ratio(...)`` divided
    by 100.

    Parameters
    ----------
    guess_full_name : str
        The name to search for.
    full_name_list : sequence
        Candidate names. Each entry is passed through ``str()`` before it is
        compared, so non-string entries are accepted.
    top_n : int, optional
        How many of the best matches to return. Defaults to 5, the value that
        was previously hard-coded.

    Returns
    -------
    list of (int, float)
        ``(position, score)`` pairs for the ``top_n`` highest scores, best
        first, where ``position`` is the entry's index in ``full_name_list``.
        Entries with equal scores keep their original list order. Fewer than
        ``top_n`` pairs are returned when the list is shorter than that.
    """
    # lower the query once, not once per candidate
    guess_lowered = guess_full_name.lower()

    # score every candidate, remembering its position in the list;
    # tqdm wraps the list itself so the progress bar still knows the total
    indices_scores_list = [
        (idx, fuzz.ratio(guess_lowered, str(name_row).lower()) / 100)
        for idx, name_row in enumerate(tqdm(full_name_list))
    ]

    # highest score first; the sort is stable, so ties stay in list order
    indices_scores_list.sort(key=lambda item: item[1], reverse=True)

    # keep only the best top_n (position, score) pairs
    return indices_scores_list[:top_n]

#### Finding Similar Database Entries

In [12]:
# displaying only the last- and first-name columns (the two fields combined into full_name_list)
election_data[['Last_Name', 'First_Name']]

Unnamed: 0,Last_Name,First_Name
0,A Blessing,Rebecca
1,A-Jaoudi,Edward
2,Aaberg,Elizabeth
3,Aaby,Erik
4,Aaby,Katherine
...,...,...
520556,Zytnick,Jonathon
520557,Zytnick,Kiva
520558,Zywokarte,Michael
520559,Zywokarte,Sandra


In [13]:
# finding elements in the election database that are similar to a given string;
# the result is a list of (row position, score) pairs, best match first
score_function_fuzz('Alexandra Karabatos', full_name_list)

  0%|          | 0/520561 [00:00<?, ?it/s]

[(247654, 0.9743589743589743),
 (247699, 0.85),
 (407872, 0.8333333333333335),
 (261560, 0.8235294117647058),
 (261711, 0.8235294117647058)]

In [14]:
# looking up the top-scoring row (position 247654 in the search output above) to check the match
election_data.iloc[247654]

Last_Name                                           Karabatsos
First_Name                                           Alexandra
Middle_Name                                              Maria
Name_Style                                                 NaN
Street_Number                                             1300
Street_Name                                                  K
Street_Type                                                 ST
Street_Dir_Suffix                                           SE
Unit_Type                                                  APT
Apartment_Number                                           302
City_Name                                           Washington
Zip_Code                                                 20003
Registration_Date                                   2021-10-16
Party                                                      DEM
Precinct                                                 91.04
SMD                                                    