# OCR Matching Algorithm Results & Benchmarks

This notebook documents the performance of the matching algorithm, which checks the values returned by OCR against the voter registry dataframe. It illustrates the results along two main axes: the search function and the scorer used by the fuzzy matching function.

In [None]:
# libraries used for matching
import pandas as pd
from rapidfuzz import fuzz, process, utils
import time
from loguru  import logger
import sys
import json
import glob

# Load the voter registry; dtype=str makes pandas read every column as text
# instead of inferring numeric types.
voter_records_2023_df = pd.read_csv('../data/raw_feb_23_city_wide.csv', dtype=str)

# Add standard output as a logger sink so INFO-level messages appear in the
# notebook's cell output. (The attribute is `sys.stdout`; `sys.standout` does
# not exist and raises AttributeError.)
logger.add(sys.stdout, level="INFO")

## Vanilla

### By Full Name

### By Ward

## Vectorized Columns

In [None]:
# Build the combined text columns once, using vectorized string concatenation,
# and time how long the construction takes.
# NOTE(review): `+` on string columns yields NaN for a row when any operand is
# missing -- confirm the street columns are always populated, or fill them
# with '' first, otherwise those rows get no 'Full Address' / 'OCR' value.
start_time = time.time()
voter_records_2023_df['Full Name'] = voter_records_2023_df["First_Name"] + ' ' + voter_records_2023_df['Last_Name']
voter_records_2023_df['Full Address'] =  voter_records_2023_df["Street_Number"] + " " + voter_records_2023_df["Street_Name"] + " " + voter_records_2023_df["Street_Type"] + " " + voter_records_2023_df["Street_Dir_Suffix"]
voter_records_2023_df['OCR'] = voter_records_2023_df["Full Name"] + ' ' + voter_records_2023_df["Full Address"]
end_time = time.time()

# Elapsed time is end minus start; the reverse order logs a negative duration.
logger.info(f"Initialized columns in: {end_time - start_time}")

### By Name + Address

### By Ward

### By Entire Registry

## Hierarchical Search

In [None]:
def tiered_search(name, address, ward=None, threshold=85.0):
    """Find the best registry match for one OCR record using tiered fallbacks.

    Tiers, in order:
      1. Name + address against the 'OCR' column of records in the given ward.
      2. Name + address against the 'OCR' column of the entire registry.
      3. Name only against the 'Full Name' column, scored with ``fuzz.ratio``.

    Args:
        name: Name text returned by OCR.
        address: Address text returned by OCR.
        ward: Ward value returned by OCR. When omitted, the ward is read from
            the module-level loop variable ``dict_`` (``dict_['Ward']``), which
            is how existing two-argument calls behave.
        threshold: Minimum score for a name + address match to be accepted
            without trying the name-only tier.

    Returns:
        A ``(matched_record, score, index)`` tuple: the first name + address
        match if it reaches ``threshold`` or outscores the name-only match;
        otherwise the name-only match with its registry address appended.

    Note:
        ``score_fuzzy_match_slim`` is defined elsewhere; this function assumes
        it returns a best-first sequence of ``(text, score, index)`` tuples
        and that ``index`` is a label usable with ``DataFrame.loc``.
    """
    name_address_combo = f"{name} {address}"
    # Legacy fallback: callers that do not pass `ward` rely on the global
    # loop variable `dict_` being set to the current OCR record.
    ward_value = dict_['Ward'] if ward is None else ward

    # Tier 1: search only within the ward returned by OCR. The WARD column is
    # compared against "<ward>.0", matching how the values are stored as text.
    ward_records = voter_records_2023_df[voter_records_2023_df['WARD'] == f"{ward_value}.0"]["OCR"]
    ward_matches = score_fuzzy_match_slim(name_address_combo, ward_records)
    # A misread ward can select no records at all; treat that as "no match"
    # rather than failing on an empty result.
    best_match = ward_matches[0] if len(ward_matches) else None

    # Tier 2: no valid ward match, so search the entire registry.
    if best_match is None or best_match[1] < threshold:
        best_match = score_fuzzy_match_slim(name_address_combo, voter_records_2023_df["OCR"])[0]

    if best_match[1] >= threshold:
        return best_match

    # Tier 3: still no valid match, so search using only the full name.
    matched_full_names = score_fuzzy_match_slim(name, voter_records_2023_df["Full Name"], scorer_=fuzz.ratio)
    full_name, full_name_score, full_name_id = matched_full_names[0]

    # Keep whichever candidate scored higher; ties go to the name-only match.
    if best_match[1] > full_name_score:
        return best_match

    # Append the registry address so the returned text has the same
    # "name address" shape as the other tiers.
    matched_address = voter_records_2023_df.loc[full_name_id, 'Full Address']
    return (f"{full_name} {matched_address}", full_name_score, full_name_id)

In [None]:
# Run the tiered search for every OCR record and collect one result row each.
matched_list = []
start_time = time.time()

with open('../data/processed_ocr_data.json', 'r', encoding='utf-8') as file:
    resulting_data = json.load(file)

# The loop variable must stay named `dict_`: tiered_search reads the current
# record's ward from that module-level name.
for dict_ in resulting_data:
    name_, score_, _ = tiered_search(dict_['Name'], dict_['Address'])
    matched_list.append({
        'OCR RECORD': f"{dict_['Name']} {dict_['Address']}",
        'MATCHED RECORD': name_,
        'SCORE': score_,
        # Same cutoff as tiered_search (>= 85.0), so a score of exactly 85.0
        # is not returned as a match yet flagged invalid.
        'VALID': score_ >= 85.0,
    })
    # No progress-bar update here: `matching_bar` is not defined or imported
    # in the notebook cells shown, so calling it raised NameError.

## Editable Table
match_df = pd.DataFrame(matched_list, columns=["OCR RECORD", "MATCHED RECORD", "SCORE", "VALID"])

end_time = time.time()
total_records = len(match_df)
valid_matches = match_df["VALID"].sum()
# Avoid dividing by zero when the OCR input contained no records.
valid_percent = valid_matches / total_records * 100 if total_records else 0.0
# `.2f` gives two decimals; the previous `2f` was a width of 2 with the
# default six decimals.
logger.info(f"OCR and Match Time {end_time-start_time:.3f} secs | Matched Records: {valid_matches} of {total_records} - {valid_percent:.2f}%")

# A bare expression is only rendered when it is the last statement in a cell,
# so the table is displayed here rather than mid-cell.
match_df
