In [449]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import humanize

In [450]:
# eavs = pd.read_csv('/Users/will/Repos/voting-equipment/eavs/2024_EAVS_for_Public_Release_nolabel_V1.csv')
eavs = pd.read_excel('/Users/will/Repos/voting-equipment/eavs/2024_EAVS_for_Public_Release_V1_xlsx.xlsx')
codebook = pd.read_excel('/Users/will/Repos/voting-equipment/eavs/2024_EAVS_Codebook.xlsx', index_col=0)

vv = pd.read_csv('data/vv_capture_2025-07-23_14-12-58/cleaned_verifier_data.csv', converters={'FIPS code': str}).convert_dtypes()


  vv = pd.read_csv('data/vv_capture_2025-07-23_14-12-58/cleaned_verifier_data.csv', converters={'FIPS code': str}).convert_dtypes()


goals:
  - estimate number of hand fed scanners per polling place
  - estimate number of BMDs per polling place, whether BMD for all or not
  - estimate number of batch-fed scanners per registered voter

sanity checks:
  - are total number of registered voters in line with VV dataset?
    - VV: 222 M; EAVS: 235 M. ✔️
  - and what about precincts?
    - VV: 184 K; EAVS: 178 K. ✔️
    - but why does VV use precincts and not polling places? there are 95.8 k polling places.


update: maybe just get amounts of equipment directly from EAVS

In [451]:
# convert to int if numeric
def safe_pos_float(x):
    try:
        x = float(x)
        if x > 0:
            return x
        else:
            return 0
    except ValueError:
        return 0
    
def coerce_column(col):
    col = pd.to_numeric(col, errors='coerce')
    col.loc[col < 0] = np.nan  # set negative values to NaN
    return col

numeric_columns = ['F5c_1', 'D1a', 'A1a', 'D2a', 'F1a', 'F1b', 'F1d', 'F1e', 'F1f', 'F1g']
for col in numeric_columns:
    eavs[col] = coerce_column(eavs[col])

In [452]:
print(f'{humanize.intword(eavs["A1a"].apply(safe_pos_float).sum())} registered voters')
print(f'{humanize.intword(eavs["F1a"].apply(safe_pos_float).sum())} ballots cast in person on election day')
print(f'{humanize.intword(eavs["D1a"].apply(safe_pos_float).sum())} precincts')
print(f'{humanize.intword(eavs["A4k"].apply(safe_pos_float).sum())} polling places total')
print(f'{humanize.intword(eavs["D2a"].apply(safe_pos_float).sum())} polling places 2024 general election')
print(f'{humanize.intword(eavs["D3a"].apply(safe_pos_float).sum())} polling places 2024 election day')

234.5 million registered voters
158.2 million ballots cast in person on election day
177.7 thousand precincts
2.2 million polling places total
95.8 thousand polling places 2024 general election
94.9 thousand polling places 2024 election day



EAVS question format on equipment

F[#] Codes:

F3 → DRE without VVPAT

F4 → DRE with VVPAT

F5 → Ballot Marking Device (BMD)

F6 → Scanner
```
F[#] — Device Category
├── a           → Is this device type used? (Yes/No)
├── b_1         → Model of first device (if any)
│   └── c_1     → Quantity of that model
├── b_2         → Model of second device (if any)
│   └── c_2     → Quantity of that model
└── b_3         → Model of third device (if any)
    └── c_3     → Quantity of that model
```

In [453]:
# Ensure equipment counts are numeric 
for equipment in ['3', '4', '5', '6']:
    for i in range(1, 4):
        col = f'F{equipment}c_{i}'
        if col in eavs.columns:
            eavs[col] = coerce_column(eavs[col])


In [None]:
scanners = {
    'AccuVote-OS (Premier)': 'hand-fed',
    'BallotNow (Hart)': 'batch-fed',
    'Chatsworth ACP (MicroVotes)': 'hand-fed',
    'ClearCast (Clear Ballot)': 'hand-fed',
    'ClearCount (Clear Ballot)': 'batch-fed',
    'DS200 (ES&S)': 'hand-fed',
    'DS300 (ES&S)': 'hand-fed',
    'DS450 (ES&S)': 'batch-fed',
    'DS850 (ES&S)': 'batch-fed',
    'DS950 (ES&S)': 'batch-fed',
    'ExpressVote Tabulator (ES&S)': 'hand-fed',
    'ExpressVoteXL (ES&S)': 'hand-fed',
    'IBML (Los Angeles County)': 'batch-fed',
    'ImageCast Central/ICC (Dominion)': 'batch-fed',
    'ImageCast Evolution/ICE (Dominion)': 'hand-fed',
    'ImageCast Precinct/ICP (Dominion)': 'hand-fed',
    'ImageCast Precint2/ICP2': 'hand-fed',
    'M100 (ES&S)': 'hand-fed',
    'M650 (ES&S)': 'batch-fed',
    'OpenElect Freedom Vote Scan (Unisyn)': 'hand-fed',
    'OpenElect Voting Central Scan/OVCS (Unisyn)': 'batch-fed',
    'OpenElect Voting Optical Scan/OVO (Unisyn)': 'hand-fed',
    'Verity Central (Hart)': 'batch-fed',
    'Verity Scan (Hart)': 'hand-fed',
    'eScan (Hart)': 'hand-fed'
}


In [498]:
def count_devices(row, prefix):
    total = 0
    has_data = False

    for i in range(1, 4):
        model = row.get(f'{prefix}b_{i}')
        count = row.get(f'{prefix}c_{i}')
        if pd.isna(model) or pd.isna(count):
            continue
        has_data = True
        total += count

    if not has_data:
        return np.nan
    return total

equipment_codes = {'dre_wo_vvpat': 'F3',
                   'dre_w_vvpat': 'F4',
                   'bmd': 'F5',
                   'scanner': 'F6'}

for equipment, prefix in equipment_codes.items():
    eavs[f'{equipment}_total'] = eavs.apply(lambda row: count_devices(row, prefix), axis=1)

# Function to compute hand-fed and batch-fed totals for a row
def count_scanners(row):
    hand_total = 0
    batch_total = 0
    has_hand = False
    has_batch = False
    has_data = False

    for i in range(1, 4):
        model = row.get(f'F6b_{i}')
        count = row.get(f'F6c_{i}')
        if pd.isna(model) or pd.isna(count):
            continue
        has_data = True

        feed_type = scanners.get(model, 'unknown')
        if feed_type == 'unknown':
            return pd.Series({'hand_fed_total': np.nan, 'batch_fed_total': np.nan})
        elif feed_type == 'hand-fed':
            hand_total += count
            has_hand = True
        elif feed_type == 'batch-fed':
            batch_total += count
            has_batch = True

    if not has_data:
        return pd.Series({'hand_fed_total': np.nan, 'batch_fed_total': np.nan})

    return pd.Series({
        'hand_fed_total': hand_total if has_hand else np.nan,
        'batch_fed_total': batch_total if has_batch else np.nan
    })

# Apply function row-wise
eavs[['hand_fed_total', 'batch_fed_total']] = eavs.apply(count_scanners, axis=1)


In [503]:
eavs[eavs.columns[-6:]].sum()

bmd_total             222062.0
dre_wo_vvpat_total     10642.0
dre_w_vvpat_total      24604.0
scanner_total          99712.0
hand_fed_total         95830.0
batch_fed_total         2126.0
dtype: float64

In [408]:
has_hand_fed_scanners = df['hand_fed_total'] > 0
n_precincts = df[has_hand_fed_scanners]['D1a'].apply(safe_pos_float).sum()
df[has_hand_fed_scanners]['hand_fed_total'].sum() / n_precincts


np.float64(0.9561009677741196)

In [409]:
has_batch_fed_scanners = df['batch_fed_total'] > 0
n_voters = df[has_batch_fed_scanners]['A1a'].apply(safe_pos_float).sum()
n_voters / df[has_batch_fed_scanners]['batch_fed_total'].sum()

np.float64(51811.12229539041)

In [410]:
df['batch_fed_total'].sum()

np.float64(2126.0)

In [411]:
has_scanner = df['F6a'] == 'Yes'

df[has_scanner]['F6b_1'].value_counts() + df[has_scanner]['F6b_2'].value_counts() + df[has_scanner]['F6b_3'].value_counts()

AccuVote-OS (Premier)                             NaN
BallotNow (Hart)                                  NaN
Chatsworth ACP (MicroVotes)                       NaN
ClearCast (Clear Ballot)                          NaN
ClearCount (Clear Ballot)                        49.0
DS200 (ES&S)                                   2170.0
DS300 (ES&S)                                    107.0
DS450 (ES&S)                                    413.0
DS850 (ES&S)                                    176.0
DS950 (ES&S)                                     39.0
Data not available                              378.0
Does not apply                                    NaN
ExpressVote Tabulator (ES&S)                     13.0
ExpressVoteXL (ES&S)                              NaN
IBML (Los Angeles County)                         NaN
ImageCast Central/ICC (Dominion)                407.0
ImageCast Evolution/ICE (Dominion)                NaN
ImageCast Precinct/ICP (Dominion)                 NaN
ImageCast Precint2/ICP2     

do batch-fed scanners as a function of number of mail-in ballots?

In [412]:
pd.to_numeric(df['C8a'], errors='coerce').sum()

np.float64(47629437.0)

In [413]:
df[['F1b', 'F1d', 'F1e', 'F1f', 'F1g']] = df[['F1b', 'F1d', 'F1e', 'F1f', 'F1g']].fillna(0)

In [436]:
df['ballots_counted_centrally'] = (df.loc[df['F12a'] == 'Central location', 'F1b'] + # ballots cast in person on election day
    df.loc[df['F12c'] == 'Central location', 'F1e'] + # provisional ballots
    df.loc[df['F12d'] == 'Central location', 'F1f'] + # ballots cast early in person
    df.loc[df['F12e'] == 'Central location', 'F1d'] + # mail ballots, jurisdictions that do not send mail ballots to all voters
    df.loc[df['F12e'] == 'Central location', 'F1g']) #  mail ballots, jurisdictions that send mail ballots to all voters

humanize.intword(df['ballots_counted_centrally'].sum())


# F1b/e/f/g (ballots counted) * F12a/c/d/e

'60.8 million'

In [None]:
# % of total ballots (F1a) counted centrally. this is way off because jurisdictions are incorrectly reporting a central scan
(df.groupby('State_Full')['ballots_counted_centrally'].sum().sort_values(ascending=False) / 
  df.groupby('State_Full')['F1a'].sum().sort_values(ascending=False)).sort_values(ascending=False)*100

State_Full
NORTHERN MARIANA ISLANDS    100.000000
HAWAII                      100.000000
GUAM                         99.838193
LOUISIANA                    99.795705
WEST VIRGINIA                99.779773
OKLAHOMA                     99.732087
DELAWARE                     99.673191
ILLINOIS                     99.558609
WYOMING                      99.512767
NEW YORK                     99.393537
MARYLAND                     99.381705
NEW MEXICO                   99.377642
NEVADA                       99.248939
COLORADO                     99.005355
NEBRASKA                     97.631651
CALIFORNIA                   83.172522
SOUTH DAKOTA                 82.570530
UTAH                         74.920785
TEXAS                        70.363092
KENTUCKY                     49.051815
MISSOURI                     45.564064
OHIO                         44.184359
ARIZONA                      35.581653
ARKANSAS                     22.799520
FLORIDA                      18.397282
INDIANA       

think about one factor where the number of batch-fed scanners would vary. or do we even have to do this since EAVS has the actual models? maybe EAVS is really the better baseline to work from, vs. VV.

In [None]:
df = eavs.copy()

has_bmd = df['F5a'] == 'Yes'
bmd_for_all_fips = vv[vv['Election Day Marking Method'] == 'Ballot Marking Devices for all voters']['FIPS code'].apply(int).unique()

bmd_for_all = (df['F5a'] == 'Yes') & (df['FIPSCode'].isin(bmd_for_all_fips))
n_bmds = df[bmd_for_all]['F5c_1'].apply(safe_pos_float).sum() # BMDs for all voters
n_precincts = df[bmd_for_all]['D1a'].apply(safe_pos_float).sum()
n_polling_places = df[bmd_for_all]['D2a'].apply(safe_pos_float).sum() # polling places total

n_bmds / n_polling_places # BMDs per precinct in jurisdictions that use BMDs for all voters

np.float64(6.3921419196409675)

In [None]:
not_bmd_for_all = (df['F5a'] == 'Yes') & (~df['FIPSCode'].isin(bmd_for_all_fips))
n_bmds = df[not_bmd_for_all]['F5c_1'].apply(safe_pos_float).sum() # BMDs for not all voters
n_precincts = df[not_bmd_for_all]['D1a'].apply(safe_pos_float).sum()
n_polling_places = df[not_bmd_for_all]['D2a'].apply(safe_pos_float).sum() # polling places total

n_bmds / n_polling_places

np.float64(1.2365954069738605)