In [347]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import humanize

In [281]:
# eavs = pd.read_csv('/Users/will/Repos/voting-equipment/eavs/2024_EAVS_for_Public_Release_nolabel_V1.csv')
eavs = pd.read_excel('/Users/will/Repos/voting-equipment/eavs/2024_EAVS_for_Public_Release_V1_xlsx.xlsx')
codebook = pd.read_excel('/Users/will/Repos/voting-equipment/eavs/2024_EAVS_Codebook.xlsx', index_col=0)

vv = pd.read_csv('data/vv_capture_2025-07-23_14-12-58/cleaned_verifier_data.csv', converters={'FIPS code': str}).convert_dtypes()


  vv = pd.read_csv('data/vv_capture_2025-07-23_14-12-58/cleaned_verifier_data.csv', converters={'FIPS code': str}).convert_dtypes()


In [282]:
# Simulation parameters:
# - vvsg2_multiplier: applies a cost bump to all equipment types to reflect VVSG 2.x
#       ACET says 20-50% more, but we've heard lower and higher estimates, so let's use 1.2.
# - bmd_cost: base cost of Ballot Marking Devices. $3500 is what Brennan used,
#       and aligns with Caulfield's findings for ExpressVote pricing.
# - hand_fed_scanner_cost: cost of precinct-count scanners like DS200.
#       $5750 replicates Caulfield's estimate for DS200:
#       "We present summary statistics on pricing trends for a number of different voting
#        machines. The most common one in our sample—ES&S’s Model DS200 optical
#        scanner—had a strikingly consistent mode and median unit price of $5,750, which
#        is consistent with previous assessments"
#       $5000 is what Brennan used
# - batch_fed_scanner_cost: cost of high-speed scanners like DS850.
#       "These scan­ners can cost anywhere between $50,000 and $100,000 each,"
#       according to https://verifiedvoting.org/voting-machines-at-risk-in-2022-a-joint-analysis-from-the-brennan-center-and-verified-voting/
# - n_hand_fed_scanners_per_precinct: number of hand-fed scanners per precinct.
# - n_bmds_per_precinct: number of BMDs per precinct.
# - n_bmds_for_all_per_precinct: number of BMDs per precinct in jurisdictions
#       that use BMDs or DREs for all voters.
# - n_voters_per_batch_scanner: number of voters required per batch-fed scanner.
#   could get some anchors and adjust this by rate of mail-in voting.

parameters = {
    "vvsg2_multiplier": 1.2,
    "bmd_cost": 3500,
    "hand_fed_scanner_cost": 5000,
    "batch_fed_scanner_cost": 75000,
    "n_hand_fed_scanners_per_precinct": 1,
    "n_bmds_per_precinct": 2,
    "n_bmds_for_all_per_precinct": 4,
    "n_voters_per_batch_scanner": 10000,
}


goals:
  - estimate number of hand fed scanners per polling place
  - estimate number of BMDs per polling place, whether BMD for all or not
  - estimate number of batch-fed scanners per registered voter

sanity checks:
  - are total number of registered voters in line with VV dataset?
    - VV: 222 M; EAVS: 235 M. ✔️
  - and what about precincts?
    - VV: 184 K; EAVS: 178 K. ✔️
    - but why does VV use precincts and not polling places? there are 95.8 k polling places.

In [283]:
# convert to int if numeric
def safe_pos_float(x):
    try:
        x = float(x)
        if x > 0:
            return x
        else:
            return 0
    except ValueError:
        return 0

In [354]:
humanize.intword(eavs['A1a'].apply(safe_pos_float).sum()) # registered voters

'234.5 million'

In [355]:
humanize.intword(eavs['D1a'].apply(safe_pos_float).sum()) # precincts

'177.7 thousand'

In [356]:
humanize.intword(eavs['A4k'].apply(safe_pos_float).sum()) # polling places total

'2.2 million'

In [357]:
humanize.intword(eavs['D2a'].apply(safe_pos_float).sum()) # polling places 2024 general election

'95.8 thousand'

In [None]:
humanize.intword(eavs['D3a'].apply(safe_pos_float).sum()) # polling places 2024 election day

'94.9 thousand'

In [320]:
df = eavs.copy()
def coerce_column(col):
    col = pd.to_numeric(col, errors='coerce')
    col.loc[col < 0] = np.nan  # set negative values to NaN
    return col

numeric_columns = ['F5c_1', 'D1a', 'A1a', 'D2a', 'F1a', 'F1b', 'F1e', 'F1f', 'F1g']
for col in numeric_columns:
    df[col] = coerce_column(df[col])

In [342]:
has_bmd = df['F5a'] == 'Yes'
bmd_for_all_fips = vv[vv['Election Day Marking Method'] == 'Ballot Marking Devices for all voters']['FIPS code'].apply(int).unique()

bmd_for_all = (df['F5a'] == 'Yes') & (df['FIPSCode'].isin(bmd_for_all_fips))
n_bmds = df[bmd_for_all]['F5c_1'].apply(safe_pos_float).sum() # BMDs for all voters
n_precincts = df[bmd_for_all]['D1a'].apply(safe_pos_float).sum()
n_polling_places = df[bmd_for_all]['D2a'].apply(safe_pos_float).sum() # polling places total

n_bmds / n_polling_places # BMDs per precinct in jurisdictions that use BMDs for all voters

np.float64(6.3921419196409675)

In [343]:
not_bmd_for_all = (df['F5a'] == 'Yes') & (~df['FIPSCode'].isin(bmd_for_all_fips))
n_bmds = df[not_bmd_for_all]['F5c_1'].apply(safe_pos_float).sum() # BMDs for not all voters
n_precincts = df[not_bmd_for_all]['D1a'].apply(safe_pos_float).sum()
n_polling_places = df[not_bmd_for_all]['D2a'].apply(safe_pos_float).sum() # polling places total

n_bmds / n_polling_places

np.float64(1.2365954069738605)

In [323]:
# need to sort models by which ones are central scanners

In [324]:
scanners = {
    'AccuVote-OS (Premier)': 'hand-fed',
    'BallotNow (Hart)': 'batch-fed',
    'Chatsworth ACP (MicroVotes)': 'hand-fed',
    'ClearCast (Clear Ballot)': 'hand-fed',
    'ClearCount (Clear Ballot)': 'batch-fed',
    'DS200 (ES&S)': 'hand-fed',
    'DS300 (ES&S)': 'hand-fed',
    'DS450 (ES&S)': 'batch-fed',
    'DS850 (ES&S)': 'batch-fed',
    'DS950 (ES&S)': 'batch-fed',
    'ExpressVote Tabulator (ES&S)': 'hand-fed',
    'ExpressVoteXL (ES&S)': 'hand-fed',
    'IBML (Los Angeles County)': 'batch-fed',
    'ImageCast Central/ICC (Dominion)': 'batch-fed',
    'ImageCast Evolution/ICE (Dominion)': 'hand-fed',
    'ImageCast Precinct/ICP (Dominion)': 'hand-fed',
    'ImageCast Precint2/ICP2': 'hand-fed',
    'M100 (ES&S)': 'hand-fed',
    'M650 (ES&S)': 'batch-fed',
    'OpenElect Freedom Vote Scan (Unisyn)': 'hand-fed',
    'OpenElect Voting Central Scan/OVCS (Unisyn)': 'batch-fed',
    'OpenElect Voting Optical Scan/OVO (Unisyn)': 'hand-fed',
    'Verity Central (Hart)': 'batch-fed',
    'Verity Scan (Hart)': 'hand-fed',
    'eScan (Hart)': 'hand-fed'
}


In [325]:
# Function to compute hand-fed and batch-fed totals for a row
def count_scanners(row):
    hand_total = 0
    batch_total = 0
    has_hand = False
    has_batch = False
    has_data = False

    for i in range(1, 4):
        model = row.get(f'F6b_{i}')
        count = row.get(f'F6c_{i}')
        if pd.isna(model) or pd.isna(count):
            continue
        has_data = True

        feed_type = scanners.get(model, 'unknown')
        if feed_type == 'unknown':
            return pd.Series({'hand_fed_total': np.nan, 'batch_fed_total': np.nan})
        elif feed_type == 'hand-fed':
            hand_total += count
            has_hand = True
        elif feed_type == 'batch-fed':
            batch_total += count
            has_batch = True

    if not has_data:
        return pd.Series({'hand_fed_total': np.nan, 'batch_fed_total': np.nan})

    return pd.Series({
        'hand_fed_total': hand_total if has_hand else np.nan,
        'batch_fed_total': batch_total if has_batch else np.nan
    })


# Ensure scanner counts are numeric (in case they're strings)
for i in range(1, 4):
    col = f'F6c_{i}'
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Apply function row-wise
df[['hand_fed_total', 'batch_fed_total']] = df.apply(count_scanners, axis=1)


In [326]:
has_hand_fed_scanners = df['hand_fed_total'] > 0
n_precincts = df[has_hand_fed_scanners]['D1a'].apply(safe_pos_float).sum()
df[has_hand_fed_scanners]['hand_fed_total'].sum() / n_precincts


np.float64(0.9561009677741196)

In [327]:
has_batch_fed_scanners = df['batch_fed_total'] > 0
n_voters = df[has_batch_fed_scanners]['A1a'].apply(safe_pos_float).sum()
n_voters / df[has_batch_fed_scanners]['batch_fed_total'].sum()

np.float64(51811.12229539041)

In [328]:
df['batch_fed_total'].sum()

np.float64(2126.0)

In [332]:
has_scanner = df['F6a'] == 'Yes'

df[has_scanner]['F6b_1'].value_counts() + df[has_scanner]['F6b_2'].value_counts() + df[has_scanner]['F6b_3'].value_counts()

AccuVote-OS (Premier)                             NaN
BallotNow (Hart)                                  NaN
Chatsworth ACP (MicroVotes)                       NaN
ClearCast (Clear Ballot)                          NaN
ClearCount (Clear Ballot)                        49.0
DS200 (ES&S)                                   2170.0
DS300 (ES&S)                                    107.0
DS450 (ES&S)                                    413.0
DS850 (ES&S)                                    176.0
DS950 (ES&S)                                     39.0
Data not available                              378.0
Does not apply                                    NaN
ExpressVote Tabulator (ES&S)                     13.0
ExpressVoteXL (ES&S)                              NaN
IBML (Los Angeles County)                         NaN
ImageCast Central/ICC (Dominion)                407.0
ImageCast Evolution/ICE (Dominion)                NaN
ImageCast Precinct/ICP (Dominion)                 NaN
ImageCast Precint2/ICP2     

In [None]:
df[df.apply(lambda x: x.str.contains('ExpressVote').any(), axis=1)]

Unnamed: 0,FIPSCode,Jurisdiction_Name,State_Full,State_Abbr,A1a,A1b,A1c,A1d_Other,A1d,A1Comments,...,F11Comments,F12a,F12b,F12c,F12d,F12e,F12Comments,F13,hand_fed_total,batch_fed_total
69,400100000,APACHE COUNTY,ARIZONA,AZ,57277.0,53735,3542,,0,,...,,Central location,Central location,Central location,Central location,Central location,,,,2.0
70,400300000,COCHISE COUNTY,ARIZONA,AZ,93667.0,82242,11425,,,,...,,Precinct or polling location,Both central and precinct location,Central location,Central location,Central location,,,20.0,3.0
71,400500000,COCONINO COUNTY,ARIZONA,AZ,107747.0,90681,17066,,,,...,,Central location,Central location,Central location,Central location,Central location,,,,3.0
72,400700000,GILA COUNTY,ARIZONA,AZ,38962.0,34653,4309,,,,...,,Central location,Central location,Central location,Central location,Central location,,,,1.0
73,400900000,GRAHAM COUNTY,ARIZONA,AZ,23295.0,20714,2581,,,,...,,Precinct or polling location,Precinct or polling location,Central location,Central location,Central location,,,8.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6456,5603700000,SWEETWATER COUNTY,WYOMING,WY,18032.0,18032,Does not apply,DOES NOT APPLY,Does not apply,"IN WYOMING, VOTERS DESIGNATED AS “INACTIVE” AR...",...,,Central location,Central location,Central location,Central location,Central location,,,32.0,1.0
6457,5603900000,TETON COUNTY,WYOMING,WY,15390.0,15390,Does not apply,DOES NOT APPLY,Does not apply,"IN WYOMING, VOTERS DESIGNATED AS “INACTIVE” AR...",...,,Central location,Central location,Central location,Central location,Central location,,,12.0,1.0
6458,5604100000,UINTA COUNTY,WYOMING,WY,10041.0,10041,Does not apply,DOES NOT APPLY,Does not apply,"IN WYOMING, VOTERS DESIGNATED AS “INACTIVE” AR...",...,,Central location,Central location,Central location,Central location,Central location,,,11.0,
6459,5604300000,WASHAKIE COUNTY,WYOMING,WY,4203.0,4203,Does not apply,DOES NOT APPLY,Does not apply,"IN WYOMING, VOTERS DESIGNATED AS “INACTIVE” AR...",...,,Central location,Central location,Central location,Central location,Central location,,,7.0,


do batch-fed scanners as a function of number of mail-in ballots

In [301]:
pd.to_numeric(df['C8a'], errors='coerce').sum()

np.float64(47629437.0)

In [302]:
df[['F1b', 'F1e', 'F1f', 'F1g']] = df[['F1b', 'F1e', 'F1f', 'F1g']].fillna(0)

In [303]:
df['ballots_counted_centrally'] = df.loc[df['F12a'] == 'Central location', 'F1b'] + \
    df.loc[df['F12c'] == 'Central location', 'F1e'] + \
    df.loc[df['F12d'] == 'Central location', 'F1f'] + \
    df.loc[df['F12e'] == 'Central location', 'F1g']

df['ballots_counted_centrally'].sum() / 1e6

np.float64(55.069413)

State_Full
CALIFORNIA                  16.164330
TEXAS                       11.488820
FLORIDA                     10.999125
NEW YORK                     8.389626
PENNSYLVANIA                 7.074875
OHIO                         5.851625
NORTH CAROLINA               5.756106
ILLINOIS                     5.717147
MICHIGAN                     5.706503
GEORGIA                      5.297500
VIRGINIA                     4.511853
NEW JERSEY                   4.321921
WASHINGTON                   3.949810
MASSACHUSETTS                3.512930
ARIZONA                      3.477975
WISCONSIN                    3.434185
MINNESOTA                    3.271069
COLORADO                     3.240754
MISSOURI                     3.126837
TENNESSEE                    3.090161
MARYLAND                     3.028813
INDIANA                      2.986839
SOUTH CAROLINA               2.566404
ALABAMA                      2.272911
OREGON                       2.269608
KENTUCKY                     2.086090
L

In [309]:
(df.groupby('State_Full')['ballots_counted_centrally'].sum().sort_values(ascending=False) / df.groupby('State_Full')['F1a'].sum().sort_values(ascending=False)).sort_values(ascending=False)*100

State_Full
HAWAII                      100.000000
GUAM                         99.494766
NEVADA                       99.248939
COLORADO                     99.005355
WEST VIRGINIA                96.870669
LOUISIANA                    93.874914
OKLAHOMA                     93.468207
DELAWARE                     93.129419
NORTHERN MARIANA ISLANDS     92.743854
NEW YORK                     89.417085
NEW MEXICO                   87.397446
WYOMING                      85.416951
CALIFORNIA                   83.172522
ILLINOIS                     81.783869
UTAH                         74.920785
MARYLAND                     74.809571
NEBRASKA                     70.935766
TEXAS                        68.043176
SOUTH DAKOTA                 52.736615
KENTUCKY                     46.229741
MISSOURI                     42.897311
OHIO                         35.855801
ARKANSAS                     22.367007
FLORIDA                      13.312423
ARIZONA                       9.818386
INDIANA       

In [267]:
df[['F1b', 'F1e', 'F1f', 'F1g']].sum().sum()/1e6

np.float64(127.95959)

In [254]:
0 + np.nan

nan

In [None]:
df['F12a'].value_counts() #where votes were tallied for ballots cast in person on Election Day
# aligns with F1b: total voters in person on election day

# b ballots cast on accessible equipment for voters with disabilities
??

# c provisional ballots
# aligns with F1e

# d in person before eletion day
# F1f

# e mail ballots
# F1g



F12a
Precinct or polling location          4296
Central location                      1088
Both central and precinct location     736
Data not available                     195
Does not apply                          79
Name: count, dtype: int64

In [218]:
df['F12e'].value_counts()

F12e
Precinct or polling location          2888
Central location                      2707
Both central and precinct location     719
Data not available                      72
Does not apply                           8
Name: count, dtype: int64