### Notebook Demo for Processing Fake Data

**Jan 8, 2025**

In [2]:
# from ocr_matching_functions_2024_09 import *
import sys
sys.path.append('../..')  # Add parent directory to Python path
from app.ocr_helper import *
from app.fuzzy_match_helper import *

import nest_asyncio
nest_asyncio.apply()

#### Performing Optical Character Recognition on Ballot Petitions

In [2]:
# Time the OCR stage end-to-end. perf_counter() is a monotonic, high-resolution
# clock intended for measuring intervals, so the result cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Run OCR over the fake petition PDF; the result is displayed as a table in the next cell.
ocr_df = create_ocr_df(filedir="fake_data_collection", filename="fake_signed_petitions.pdf")

# Report how long the OCR stage took.
print(f'\nElapsed Time: {time.perf_counter()-start_time:.3f} secs')

Converting PDF file to Image Format

Cropping Images and Converting to Bytes Objects


  0%|          | 0/100 [00:00<?, ?it/s]


Files Successfully Converted to Bytes
Performing OCR to read Names and Addresses
Processing batches in 10 pages per batch


  0%|          | 0/10 [00:00<?, ?it/s]


Processing batch 1 of 10

Processing batch 2 of 10

Processing batch 3 of 10

Processing batch 4 of 10

Processing batch 5 of 10

Processing batch 6 of 10

Processing batch 7 of 10

Processing batch 8 of 10

Processing batch 9 of 10

Processing batch 10 of 10

Elapsed Time: 135.864 secs


In [3]:
# preview the first ten OCR rows
ocr_df.head(n=10)

Unnamed: 0,OCR Name,OCR Address,Date,OCR Ward,Page Number,Row Number,Filename
0,Alexis Walter,23407 Hawkins Lock,1/15,4,1,1,fake_signed_petitions.pdf
1,Jenny Jones,82424 Rachel Views,1/15,5,1,2,fake_signed_petitions.pdf
2,Jack Stewart,756 Ashley Court Suite 408,1/15,2,1,3,fake_signed_petitions.pdf
3,Brady Herrera,88242 Gray Well,1/15,5,1,4,fake_signed_petitions.pdf
4,Robert Ayala,6563 Charles Greens Apt. 636,1/15,1,1,5,fake_signed_petitions.pdf
5,Erik Acosta,734 William Oval,1/15,2,2,1,fake_signed_petitions.pdf
6,William Hopkins,1869 Catty Mews Apt. 061,1/15,2,2,2,fake_signed_petitions.pdf
7,Debbie Rosa,995 Johnson Lane Apt. 926,1/15,4,2,3,fake_signed_petitions.pdf
8,Kenneth Ho,777 Rivera Mews Suite 338,1/15,6,2,4,fake_signed_petitions.pdf
9,Christina Goodman,543 Melissa Ferry,1/15,5,2,5,fake_signed_petitions.pdf


#### Voter Records

In [4]:
# load the fake voter records table from disk
voter_records = pd.read_csv(
    filepath_or_buffer="fake_data_collection/fake_voter_records.csv"
)

In [5]:
# preview the first ten rows of the raw voter records
voter_records.head(n=10)

Unnamed: 0,First_Name,Last_Name,Street_Number,Street_Name,Street_Type,Street_Dir_Suffix
0,Erica,Massey,6071,Martin Island,,
1,Terry,Osborne,395,Kathryn Mall,,
2,David,Holmes,30154,Tara Ports Apt. 314,,
3,Michele,Ballard,310,Landry Hills,,
4,Mary,Wiggins,26734,Susan Cliffs Suite 119,,
5,Audrey,Smith,1814,Murphy Loaf,,
6,Willie,Davis,62934,Crystal Summit Suite 401,,
7,Candace,Jones,7496,Melissa Neck Suite 811,,
8,Patricia,Hayes,948,Michael Mountains Suite 010,,
9,Deborah,Davies,93187,Brooks Corners Apt. 910,,


In [6]:
# Reduce the raw voter records to the fields used later for matching.
# The helper comes from the project modules imported at the top of the notebook;
# judging by the displayed output it yields "Full Name" / "Full Address" columns
# — NOTE(review): confirm against the helper's definition.
select_voter_records = create_select_voter_records(voter_records)
# Bare expression so the notebook renders the resulting frame.
select_voter_records

Unnamed: 0,Full Name,Full Address
0,Erica Massey,6071 Martin Island
1,Terry Osborne,395 Kathryn Mall
2,David Holmes,30154 Tara Ports Apt. 314
3,Michele Ballard,310 Landry Hills
4,Mary Wiggins,26734 Susan Cliffs Suite 119
...,...,...
99995,Tabitha Hooper,7084 Tammy Bypass Apt. 864
99996,Robin Mccormick,13203 Thompson Path
99997,Ana Drake,159 Erica Island
99998,Donna Morgan,1497 Lucas Port Suite 139


In [7]:
# Time the matching stage. perf_counter() is a monotonic, high-resolution clock
# intended for measuring intervals, unlike the wall-clock time.time().
start_time = time.perf_counter()

# Match each OCR row against the selected voter records; 85 is passed as the
# match threshold (the output below shows a "Match Score" and a "Valid" flag per row).
ocr_matched_df = create_ocr_matched_df(ocr_df, select_voter_records, threshold=85)

# Report how long the matching stage took.
print(f'\nElapsed Time: {time.perf_counter()-start_time:.3f} secs')

  0%|          | 0/1 [00:00<?, ?it/s]


Elapsed Time: 13.041 secs


In [8]:
# render the matched frame (pandas truncates the middle rows on display)
ocr_matched_df

Unnamed: 0,OCR Name,OCR Address,Matched Name,Matched Address,Date,Match Score,Valid,Page Number,Row Number,Filename
0,Alexis Walter,23407 Hawkins Lock,Alexis Walter,23407 Hawkins Lock,1/15,94.736842,True,1,1,fake_signed_petitions.pdf
1,Jenny Jones,82424 Rachel Views,Jenny Jones,82424 Rachel Views,1/15,94.736842,True,1,2,fake_signed_petitions.pdf
2,Jack Stewart,756 Ashley Court Suite 408,Jack Stewart,756 Ashley Court Suite 408,1/15,96.296296,True,1,3,fake_signed_petitions.pdf
3,Brady Herrera,88242 Gray Well,Brady Herrera,88242 Gray Well,1/15,93.750000,True,1,4,fake_signed_petitions.pdf
4,Robert Ayala,6563 Charles Greens Apt. 636,Robert Ayala,6563 Charles Greens Apt. 636,1/15,96.551724,True,1,5,fake_signed_petitions.pdf
...,...,...,...,...,...,...,...,...,...,...
495,Susan Mayo Md,258 Little Lakes Apt. 986,Susan Mayo MD,258 Little Lakes Apt. 986,1/15,92.449923,True,100,1,fake_signed_petitions.pdf
496,Kimberly Reilly,93811 Kent Plains,Kimberly Reilly,93811 Kent Plains,1/15,94.444444,True,100,2,fake_signed_petitions.pdf
497,Mark Aguilar,783 Styx Landing,Mark Aguilar,783 Stephanie Landing,1/15,81.159420,False,100,3,fake_signed_petitions.pdf
498,Chelsea Tran,233 Glover Motorway,Chelsea Branch,47 Aaron Meadow,1/15,45.993031,False,100,4,fake_signed_petitions.pdf


In [9]:
# Count the rows flagged as valid. Summing the boolean column directly is the
# vectorized pandas idiom and avoids materializing a Python list first;
# int() keeps the printed value a plain integer.
validated_no = int(ocr_matched_df["Valid"].sum())
print(f"Number of Valid Signatures: {validated_no}")

Number of Valid Signatures: 392


#### Checking Metrics

Here we check how well the OCR model is performing. 

In [13]:
# Ground truth: every petition signer, with a "First Last" key for name comparison.
truth_data = pd.read_csv("fake_data_collection/all_petition_signers.csv").assign(
    **{"Full Name": lambda signers: signers['First_Name'] + ' ' + signers['Last_Name']}
)

# Signers listed as spurious, keyed the same way.
spurious_data = pd.read_csv("fake_data_collection/spurious_signers.csv").assign(
    **{"Full Name": lambda signers: signers['First_Name'] + ' ' + signers['Last_Name']}
)

# A signer counts as valid when their full name does not appear in the spurious list.
truth_data = truth_data.assign(
    Valid=lambda signers: ~signers['Full Name'].isin(spurious_data['Full Name'])
)

truth_data.head(10)

Unnamed: 0,First_Name,Last_Name,Street_Number,Street_Name,Street_Type,Street_Dir_Suffix,Full Name,Valid
0,Adam,Welch,5211,Shaw Wall,,,Adam Welch,True
1,Jody,Compton,37705,Raymond Gardens,,,Jody Compton,True
2,Jordan,Mcdonald,165,Madison Ramp,,,Jordan Mcdonald,True
3,Tanya,Wilkinson,813,Kimberly Cliff Apt. 762,,,Tanya Wilkinson,True
4,Tracie,Meyer,95915,Terry Ford Apt. 548,,,Tracie Meyer,True
...,...,...,...,...,...,...,...,...
495,Jeremy,Figueroa,3890,Edward Parkways,,,Jeremy Figueroa,False
496,Richard,Klein,23923,Newton Pines Suite 528,,,Richard Klein,False
497,Peter,Harvey,12239,Victoria Mountains Suite 294,,,Peter Harvey,False
498,George,Vargas Jr.,435,Nelson Fork,,,George Vargas Jr.,False


In [16]:
# Ground-truth label for each matched row: True only when the matched name belongs
# to a signer that truth_data marks as Valid (i.e. not in the spurious list).
# Testing membership against every name in truth_data would also label spurious
# signers as valid and leave the Valid column computed above unused.
ocr_matched_df["Truth Labels"] = ocr_matched_df["Matched Name"].isin(
    truth_data.loc[truth_data["Valid"], "Full Name"]
)

In [15]:
# Score the pipeline's "Valid" flag against the ground-truth labels.
predictions = ocr_matched_df["Valid"]
truth = ocr_matched_df["Truth Labels"]

# share of rows where prediction and truth agree
accuracy = predictions.eq(truth).mean()
# (TP / N) divided by (predicted positives / N)  ->  TP / (TP + FP)
precision = (predictions & truth).mean() / predictions.mean()
# (TP / N) divided by (actual positives / N)     ->  TP / (TP + FN)
recall = (predictions & truth).mean() / truth.mean()
# harmonic mean of precision and recall
f1_score = 2 * (precision * recall) / (precision + recall)

print(
    f"Accuracy: {accuracy:.3f}",
    f"Precision: {precision:.3f}",  # What percentage of "valid" predictions are correct?
    f"Recall: {recall:.3f}",  # What percentage of truly valid signatures were correctly identified?
    f"F1 Score: {f1_score:.3f}",  # Harmonic Mean of Precision and Recall
    sep="\n",
)

Accuracy: 0.982
Precision: 0.997
Recall: 0.980
F1 Score: 0.989


In [20]:
# read the exported Helicone metrics and add up the per-request cost column
helicone_costs = pd.read_csv("helicone_metrics.csv")
total_cost = helicone_costs['cost'].sum()
total_cost

0.0802213499999959

In [21]:
# Performance Metrics
# One summary record for this run: model quality, spend, and runtime.
dict_metrics = dict(
    [
        ("Accuracy", accuracy),
        ("Precision", precision),
        ("Recall", recall),
        ("F1 Score", f1_score),
        ("Cost", total_cost),  # in USD
        # NOTE(review): hard-coded string, not derived from a measured value — confirm
        ("Computation Time", "120 s"),
    ]
)
dict_metrics


{'Accuracy': 0.982,
 'Precision': 0.9974489795918368,
 'Recall': 0.9799498746867168,
 'F1 Score': 0.988621997471555,
 'Cost': 0.0802213499999959,
 'Computation Time': '120 s'}

#### From Helicone Results (https://us.helicone.ai/requests)
- **Processing Time:** ~4 ($\pm$ 2) minutes for 100 pages using `gpt-4o-mini` (Typically 2 minutes but can be longer)
- **Processing Cost:** ~0.08 USD per 100 pages using `gpt-4o-mini` (that's about 0.08 cents per page)
