In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import humanize

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)


### Pre-processing 

In [34]:
eavs = pd.read_excel('data/eavs/2024_EAVS_for_Public_Release_V1_xlsx.xlsx')
codebook = pd.read_excel('data/eavs/2024_EAVS_Codebook.xlsx', index_col=0)

vv = pd.read_csv('data/vv_capture_2025-07-23_14-12-58/cleaned_verifier_data.csv', converters={'FIPS code': str}).convert_dtypes()
vv = vv[vv['Year'] == 2026]


eavs.rename(columns={
    'Jurisdiction_Name': 'juris',
    'FIPSCode': 'fips'
}, inplace=True)

vv.rename(columns={
    'Jurisdiction': 'juris',
    'FIPS code': 'fips'
}, inplace=True)

eavs['fips'] = eavs['fips'].apply(lambda x: str(x).zfill(10))


  vv = pd.read_csv('data/vv_capture_2025-07-23_14-12-58/cleaned_verifier_data.csv', converters={'FIPS code': str}).convert_dtypes()


In [3]:
# check how good the FIPS merging will be between VV and EAVS

def missing_fips_coverage(vv, eavs, verbose=False):
    # See how many vv FIPS codes are in eavs
    vv['in_eavs'] = vv['fips'].isin(eavs['fips'])
    coverage_vv = vv['in_eavs'].mean()  # proportion of vv matched
    if verbose:
        print(f"Coverage of vv in eavs: {coverage_vv:.1%}")
        print(vv['in_eavs'].value_counts())
        print('\n')

    # See how many eavs FIPS codes are in vv
    eavs['in_vv'] = eavs['fips'].isin(vv['fips'])
    coverage_eavs = eavs['in_vv'].mean()  # proportion of eavs matched
    if verbose:
        print(f"Coverage of eavs in vv: {coverage_eavs:.1%}")
        print(eavs['in_vv'].value_counts())
        print('\n')

    # Codes in vv but not in eavs
    vv_missing = vv.loc[~vv['in_eavs'], 'fips'].unique()
    return vv_missing

vv_missing = missing_fips_coverage(vv, eavs);

In [4]:
# match WI on counties
wi = vv['State'] == 'Wisconsin'
vv['wi_county'] = vv[wi]['juris'].apply(lambda x: x.split('(')[1].split('County')[0].strip().upper())
vv['wi_jurisdiction'] = vv[wi]['juris'].apply(lambda x: x.split('(')[0].strip().upper())

wi = eavs['State_Full'] == 'WISCONSIN'
eavs.loc[wi, 'wi_county'] = eavs.loc[wi, 'juris'].apply(lambda x: x.split(' - ')[1].split('COUNTY')[0].strip())
eavs.loc[wi, 'wi_jurisdiction'] = eavs.loc[wi, 'juris'].apply(lambda x: x.split(' - ')[0].strip())

county_fips_dict = (
    vv.groupby('wi_county')['fips']
      .apply(lambda x: list({code[:5] for code in x if pd.notnull(code)})[0])
      .to_dict()
)

for name, fips in county_fips_dict.items():
    mask = eavs['wi_county'] == name
    eavs.loc[mask, 'fips'] = fips + eavs.loc[mask, 'fips'].str[5:]

vv_missing = missing_fips_coverage(vv, eavs);

In [5]:
# match WI on jurisdictions
juris_fips_dict = (
    vv.groupby('wi_jurisdiction')['fips']
      .apply(lambda x: list({code for code in x if pd.notnull(code)})[0])
      .to_dict()
)

for name, fips in juris_fips_dict.items():
    if fips in vv_missing:
        mask = eavs['wi_jurisdiction'] == name
        eavs.loc[mask, 'fips'] = fips

vv_missing = missing_fips_coverage(vv, eavs);


In [6]:
# some small fixes
fixes = {
    '2501943790': '2501900000', # Nantucket, MA
    '1100100000': '1100000000', # Washington, DC
}
eavs['fips'] = eavs['fips'].replace(fixes)

# fix NH
for fips in vv[vv['fips'].isin(vv_missing)].drop_duplicates('fips')['fips']:
    eavs.loc[eavs['fips'].str.contains(fips[:-2]), 'fips'] = fips


In [7]:
bmd_for_all = ['Ballot Marking Devices for all voters']
dre_for_all = ['DREs with VVPAT for all voters', 'DREs without VVPAT for all voters']


bmd_for_all_fips = vv[vv['Election Day Marking Method'].isin(bmd_for_all)]['fips'].unique()
eavs['bmd_for_all'] = eavs['fips'].isin(bmd_for_all_fips)

dre_for_all_fips = vv[vv['Election Day Marking Method'].isin(dre_for_all)]['fips'].unique()
eavs['dre_for_all'] = eavs['fips'].isin(dre_for_all_fips)

eavs['bmd_or_dre_for_all'] = eavs['bmd_for_all'] | eavs['dre_for_all']

In [8]:
# coerce non-equipment columns

# convert to int if numeric
def safe_pos_float(x):
    try:
        x = float(x)
        if x > 0:
            return x
        else:
            return 0
    except ValueError:
        return 0
    
def coerce_column(col):
    col = pd.to_numeric(col, errors='coerce')
    col.loc[col < 0] = np.nan  # set negative values to NaN
    return col

numeric_columns = ['D1a', 'A1a', 'D2a', 'F1a', 'F1b', 'F1d', 'F1e', 'F1f', 'F1g']
for col in numeric_columns:
    eavs[col] = coerce_column(eavs[col])

# Coerce equipment count columns
for equipment in ['3', '4', '5', '6']:
    for i in range(1, 4):
        col = f'F{equipment}c_{i}'
        if col in eavs.columns:
            eavs[col] = coerce_column(eavs[col])



In [9]:
# merge in eligible voters for ND

nd = eavs["State_Full"] == "NORTH DAKOTA"

# Normalize county names
# Remove " COUNTY", make lowercase, strip spaces
eavs.loc[nd, 'nd_county'] = eavs.loc[nd, 'juris'].str.replace(" COUNTY", "", case=False).str.strip().str.lower()

nd_voters = pd.read_excel('data/nd_voters.xlsx', skiprows=6)
for _, row in nd_voters[:-1].iterrows():
    eavs.loc[(eavs['State_Full'] == "NORTH DAKOTA") & (eavs['nd_county'] == row['County'].lower()), 'A1a'] = row['Eligible Voters']

In [10]:
print(f'{humanize.intword(eavs["A1a"].apply(safe_pos_float).sum())} registered voters')
print(f'{humanize.intword(eavs["F1a"].apply(safe_pos_float).sum())} ballots cast')
print(f'{humanize.intword(eavs["D1a"].apply(safe_pos_float).sum())} precincts')
print(f'{humanize.intword(eavs["A4k"].apply(safe_pos_float).sum())} polling places total')
print(f'{humanize.intword(eavs["D2a"].apply(safe_pos_float).sum())} polling places 2024 general election')
print(f'{humanize.intword(eavs["D3a"].apply(safe_pos_float).sum())} polling places 2024 election day')

235.1 million registered voters
158.2 million ballots cast
177.7 thousand precincts
2.2 million polling places total
95.8 thousand polling places 2024 general election
94.9 thousand polling places 2024 election day


In [11]:
# indicate whether scanners are hand-fed or batch-fed

scanners = {
    'AccuVote-OS (Premier)': 'hand-fed',
    'BallotNow (Hart)': 'batch-fed',
    'Chatsworth ACP (MicroVotes)': 'hand-fed',
    'ClearCast (Clear Ballot)': 'hand-fed',
    'ClearCount (Clear Ballot)': 'batch-fed',
    'DS200 (ES&S)': 'hand-fed',
    'DS300 (ES&S)': 'hand-fed',
    'DS450 (ES&S)': 'batch-fed',
    'DS850 (ES&S)': 'batch-fed',
    'DS950 (ES&S)': 'batch-fed',
    'ExpressVote Tabulator (ES&S)': 'hand-fed',
    'ExpressVoteXL (ES&S)': 'hand-fed',
    'IBML (Los Angeles County)': 'batch-fed',
    'ImageCast Central/ICC (Dominion)': 'batch-fed',
    'ImageCast Evolution/ICE (Dominion)': 'hand-fed',
    'ImageCast Precinct/ICP (Dominion)': 'hand-fed',
    'ImageCast Precint2/ICP2': 'hand-fed',
    'M100 (ES&S)': 'hand-fed',
    'M650 (ES&S)': 'batch-fed',
    'OpenElect Freedom Vote Scan (Unisyn)': 'hand-fed',
    'OpenElect Voting Central Scan/OVCS (Unisyn)': 'batch-fed',
    'OpenElect Voting Optical Scan/OVO (Unisyn)': 'hand-fed',
    'Verity Central (Hart)': 'batch-fed',
    'Verity Scan (Hart)': 'hand-fed',
    'eScan (Hart)': 'hand-fed'
}


EAVS question format on equipment

F[#] Codes:

F3 → DRE without VVPAT

F4 → DRE with VVPAT

F5 → Ballot Marking Device (BMD)

F6 → Scanner
```
F[#] — Device Category
├── a           → Is this device type used? (Yes/No)
├── b_1         → Model of first device (if any)
│   └── c_1     → Quantity of that model
├── b_2         → Model of second device (if any)
│   └── c_2     → Quantity of that model
└── b_3         → Model of third device (if any)
    └── c_3     → Quantity of that model
├── d_1         → This equipment supported
                  In-precinct Election Day
                  regular ballot marking
                  and/or counting (used by all voters)
├── d_2         → This equipment supported
                  In-precinct accessible voting
                  primarily for voters with disabilities
├── d_3         → This equipment supported
                  Provisional ballot marking and/or
                  counting
├── d_4         → This equipment supported
                  In-person early voting
                  ballot marking and/or counting
├── d_5         → This equipment supported
                  Mail ballot counting only
                  (F5 and F6: BMDs and Scanners only)
```

- Dominion ICE? it's a hybrid scanner/BMD. can we tell if most jurisdictions just use it as a scanner? and assume that?
    - some jurisdictions treat it as a BMD and some as a scanner
- Same with the hart A/T.
    - eScan is always treated as a scanner
- ExpressVote XL: let's treat it as just a BMD.
    - Sometimes treated as a BMD and sometimes as a scanner
- Let's exclude hybrids from analysis.

In [12]:
models = [f'F{equipment}b_{number}{suffix}' for equipment in range(3,7) for number in range(1,4) for suffix in ['', 'other']]
models = [f'F{equipment}b_{number}{suffix}' for equipment in range(3,7) for number in range(1,4) for suffix in ['']]

all_models = pd.DataFrame(eavs[models].melt()['value'].value_counts())

all_models


Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
Valid skip,40927
Does not apply,7676
Data not available,2630
DS200 (ES&S),2170
ExpressVote (ES&S),1651
Other (use text box to describe),493
ImageCast Precinct/ICP (Dominion),449
AutoMARK (ES&S),424
DS450 (ES&S),413
ImageCast Central/ICC (Dominion),407


In [13]:
# models = [f'F{equipment}b_{number}{suffix}' for equipment in range(3,7) for number in range(1,4) for suffix in ['', 'other']]
models = [f'F{equipment}b_{number}{suffix}' for equipment in range(3,7) for number in range(1,4) for suffix in ['other']]
pd.set_option('display.max_rows', None)

all_models = pd.DataFrame(eavs[models].melt()['value'].value_counts())
all_models.head(200)


Unnamed: 0_level_0,count
value,Unnamed: 1_level_1
VALID SKIP,41058
DOES NOT APPLY,1511
DNA (DATA NOT AVAILABLE),705
"ACCUVOTE/A,B,C,D-1.96.13, DOMINION",177
MICROVOTE EMS 4.41 VOTING SYSTEM,56
MICROVOTE INFINITY VP-1 WITH VVPAT,44
DS 850'S ARE USED AS A CENTRAL COUNT FOR MAIL AND PROVISIONAL BALLOTS 3 MACHINES SERVICE ALL MUNICIPALITIES.,39
VERITY DUO,36
SOFTWARE VERSION: 2.2 VENDOR: ADKINS,22
VOTINGWORKS VXSUITE 3.1.2,21


In [14]:
# count  devices for each type of equipment

def count_devices(row, prefix):
    total = 0
    has_data = False

    for i in range(1, 4):
        model = row.get(f'{prefix}b_{i}')
        count = row.get(f'{prefix}c_{i}')
        if pd.isna(model) or pd.isna(count):
            continue
        has_data = True
        total += count

    if not has_data:
        return np.nan
    return total

equipment_codes = {'dre_wo_vvpat': 'F3',
                   'dre_w_vvpat': 'F4',
                   'bmd': 'F5',
                   'scanner': 'F6'}

for equipment, prefix in equipment_codes.items():
    eavs[f'{equipment}_total'] = eavs.apply(lambda row: count_devices(row, prefix), axis=1)

In [15]:

# count hand-fed and batch-fed totals for a row
def count_scanners(row):
    hand_total = 0
    batch_total = 0
    has_hand = False
    has_batch = False
    has_data = False

    for i in range(1, 4):
        model = row.get(f'F6b_{i}')
        count = row.get(f'F6c_{i}')
        if pd.isna(model) or pd.isna(count):
            continue
        has_data = True

        feed_type = scanners.get(model, 'unknown')
        if feed_type == 'unknown':
            return pd.Series({'hand_fed_total': np.nan, 'batch_fed_total': np.nan})
        elif feed_type == 'hand-fed':
            hand_total += count
            has_hand = True
        elif feed_type == 'batch-fed':
            batch_total += count
            has_batch = True

    if not has_data:
        return pd.Series({'hand_fed_total': np.nan, 'batch_fed_total': np.nan})

    return pd.Series({
        'hand_fed_total': hand_total if has_hand else np.nan,
        'batch_fed_total': batch_total if has_batch else np.nan
    })

# Apply function row-wise
eavs[['hand_fed_total', 'batch_fed_total']] = eavs.apply(count_scanners, axis=1)


In [16]:
def incomplete_equipment_data(row):
    incomplete = False
    # identifies rows where equipment is marked as 'Yes' but no count is provided

    for equipment in ['3', '4', '5', '6']:
        if row[f'F{equipment}a'] == 'Yes':
            # only check the first model for each equipment type
            if pd.isna(row[f'F{equipment}c_1']): # to require model, also use pd.isna(row[f'F{equipment}b_1'])
                incomplete = True

    return incomplete

eavs['incomplete_equipment'] = eavs.apply(incomplete_equipment_data, axis=1)

equipment_counts = [key + '_total' for key in equipment_codes.keys()][:-1] + ['hand_fed_total', 'batch_fed_total']
eavs['total_equipment_counts'] = eavs[equipment_counts].sum(axis=1)

eavs['equipment_per_voter'] = eavs['total_equipment_counts'] / eavs['A1a']

## Apply filters and describe filtering proess

In [17]:
hybrid_models = ['ExpressVoteXL (ES&S)', 'ES&S EXPRESSVOTE XL', 'ImageCast Evolution/ICE (Dominion)', 'eScan (Hart)']

drop_masks = {'complete equipment': eavs['incomplete_equipment'],
            'zero equipment reports': eavs['total_equipment_counts'] == 0,
            'hybrid models': eavs.apply(lambda r: r.isin(hybrid_models).any(), axis=1),
            'DRE for all': eavs['dre_for_all']
            }

removed_mask = pd.Series(False, index=eavs.index)

for i, (name, mask) in enumerate(drop_masks.items(), start=1):
    delta = mask & ~removed_mask
    print(f"Mask {i} ({name}) removes an additional: {delta.sum()} jurisdictions, representing {humanize.intword(eavs[delta]['A1a'].sum())} registered voters")
    removed_mask |= mask


Mask 1 (complete equipment) removes an additional: 2443 jurisdictions, representing 35.3 million registered voters
Mask 2 (zero equipment reports) removes an additional: 401 jurisdictions, representing 4.9 million registered voters
Mask 3 (hybrid models) removes an additional: 199 jurisdictions, representing 17.0 million registered voters
Mask 4 (DRE for all) removes an additional: 189 jurisdictions, representing 11.3 million registered voters


In [18]:
eavs['use_for_analysis'] = ~removed_mask
eq = eavs[eavs['use_for_analysis']]

In [19]:
grouped = eq.groupby('bmd_for_all')[equipment_counts + ['scanner_total'] + ['A1a']].sum()
per_voter_rates = grouped[equipment_counts].div(grouped['A1a'], axis=0)
per_voter_rates


Unnamed: 0_level_0,dre_wo_vvpat_total,dre_w_vvpat_total,bmd_total,hand_fed_total,batch_fed_total
bmd_for_all,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,3e-06,3e-06,0.000499,0.000509,1.2e-05
True,1.4e-05,1.3e-05,0.002817,0.00046,9e-06


Things to check:
- Could I use EAVS instead of VV to determine BMD for all vs no?
- Is EAVS equipment type usable? as in mail vs others... *hmm. wouldn't change things. i think we need a caveat that says we ignore mail-in policies and also what the equipment was used for.*

For hazard analysis: try to pull from EAVS into VV because VV is more like the form we want to use. Try for each jurisdiction to get a count.

Check whether the jurisdictions really are match-able between EAVS and VV.


## extrapolate findings to the rows that were not used for analysis

In [20]:
eavs = eavs.copy()

# Apply idealized estimates only to rows not used for analysis
mask = ~eavs['use_for_analysis']

# Estimate based on whether the jurisdiction uses BMDs for all voters
for bmd_flag in [True, False]:
    rate_row = per_voter_rates.loc[bmd_flag]
    submask = mask & (eavs['bmd_or_dre_for_all'] == bmd_flag)
    for equipment in equipment_counts:
        eavs.loc[submask, f'{equipment}_ideal'] = eavs.loc[submask, 'A1a'] * rate_row[equipment]

# # For rows used in analysis, copy actuals into the _ideal columns
for equipment in equipment_counts:
    eavs.loc[eavs['use_for_analysis'], f'{equipment}_ideal'] = eavs.loc[eavs['use_for_analysis'], equipment]

eavs['bmd_total_ideal'] = (
    eavs.get('bmd_total_ideal', 0).fillna(0) +
    eavs.get('dre_wo_vvpat_total_ideal', 0).fillna(0) +
    eavs.get('dre_w_vvpat_total_ideal', 0).fillna(0)
)


In [None]:
cbo = pd.read_excel('data/cbo_projections.xlsx',
                    sheet_name='1. Econ Vars_Annual Rates',
                    header=7, index_col=0).T

inflation_rates = cbo['Growth of the CPI-U'] / 100

def inflation_multiplier(inflation_series, start_year, end_year):
    """Calculate compounded inflation multiplier from start_year to end_year (inclusive)."""
    relevant_years = inflation_series.loc[start_year:end_year]
    multiplier = (1 + relevant_years).prod()
    return multiplier


In [26]:
# vvsg_multiplier = 1.2  # multiplier from old 20-50% range

start_year = 2026

replacement_year = 2028

inflation = inflation_multiplier(inflation_rates, start_year, replacement_year)

# cost_multiplier = vvsg_multiplier * inflation

# starting point: median prices on caulfield estimates; see historical_prices.ipynb

base_2021 = {
    "BMD": 3665,
    "Hand-fed scanner": 5611,
    "Batch-fed scanner": 80157,
}

# acet ranges for 2026: scanners: 20-30% more expensive than in 2021
#                       BMDs: 35-50% more expensive due to display requirements
ranges_2026 = {
    "BMD": (1.35, 1.5),
    "Hand-fed scanner": (1.2, 1.3),
    "Batch-fed scanner": (1.2, 1.3),
}

rows = []
for equip in ["BMD", "Hand-fed scanner", "Batch-fed scanner"]:
    base = base_2021[equip]
    low_mult, high_mult = ranges_2026[equip]
    low  = base * low_mult  * inflation
    high = base * high_mult * inflation
    mean = base * np.mean([low_mult, high_mult]) * inflation

    rows.append({
        "equipment": equip.replace("_", " "),
        "2021 mean": base,
        f"{replacement_year} low": low,
        f"{replacement_year} high": high,
        f"{replacement_year} mean": mean,
    })

df = pd.DataFrame(rows).set_index("equipment")

# Pretty display columns (currency + range)
def fmt(x): return f"${x:,.0f}"

df_display = pd.DataFrame({
    "2021 mean": df["2021 mean"].map(fmt),
    f"{replacement_year} estimate": (df[f"{replacement_year} low"].map(fmt) + " – " + df[f"{replacement_year} high"].map(fmt)),
    f"{replacement_year} mean": df[f"{replacement_year} mean"].map(fmt),
})

df_display


Unnamed: 0_level_0,2021 mean,2028 estimate,2028 mean
equipment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BMD,"$3,665","$5,303 – $5,892","$5,598"
Hand-fed scanner,"$5,611","$7,217 – $7,818","$7,517"
Batch-fed scanner,"$80,157","$103,093 – $111,684","$107,388"


In [28]:

# Build table rows
rows = []

for label, group in eavs.groupby('bmd_or_dre_for_all'):
    jurisdictions = len(group)

    n_registered_voters = group['A1a'].sum()

    bmd_count = group['bmd_total_ideal'].sum()
    hand_fed_count = group['hand_fed_total_ideal'].sum()
    batch_fed_count = group['batch_fed_total_ideal'].sum()

    bmd_total = bmd_count * df.loc['BMD', f'{replacement_year} mean']
    hand_fed_total = hand_fed_count * df.loc['Hand-fed scanner', f'{replacement_year} mean']
    batch_fed_total = batch_fed_count * df.loc['Batch-fed scanner', f'{replacement_year} mean']
    total_cost = bmd_total + hand_fed_total + batch_fed_total

    rows.append({
        'bmd_or_dre_for_all': label,
        '# Jurisdictions': jurisdictions,
        '# Registered Voters': n_registered_voters,
        '# BMDs': int(bmd_count),
        'BMD Total Cost': bmd_total,
        '# Hand-Fed Scanners': int(hand_fed_count),
        'Hand-Fed Scanner Total Cost': hand_fed_total,
        '# Batch-Fed Scanners': int(batch_fed_count),
        'Batch-Fed Scanner Total Cost': batch_fed_total,
        'Total Cost': total_cost,
    })

# Create DataFrame
cost_table = pd.DataFrame(rows)

# Add total row
totals = {
    'bmd_or_dre_for_all': 'Total',
    '# Registered Voters': cost_table['# Registered Voters'].sum(),
    '# Jurisdictions': cost_table['# Jurisdictions'].sum(),
    '# BMDs': cost_table['# BMDs'].sum(),
    'BMD Total Cost': cost_table['BMD Total Cost'].sum(),
    '# Hand-Fed Scanners': cost_table['# Hand-Fed Scanners'].sum(),
    'Hand-Fed Scanner Total Cost': cost_table['Hand-Fed Scanner Total Cost'].sum(),
    '# Batch-Fed Scanners': cost_table['# Batch-Fed Scanners'].sum(),
    'Batch-Fed Scanner Total Cost': cost_table['Batch-Fed Scanner Total Cost'].sum(),
    'Total Cost': cost_table['Total Cost'].sum(),
}
cost_table = pd.concat([cost_table, pd.DataFrame([totals])], ignore_index=True)

# Custom formatter: shows M or B with 1 decimal when needed
def number_formatter(x, pos=None):
    units = [(1e12, 'T'), (1e9, 'B'), (1e6, 'M'), (1e3, 'K')]

    for factor, suffix in units:
        if x >= factor:
            return f'{x / factor:.3g} {suffix}'.rstrip('0').rstrip('.')

    return f'{x:.0f}'

# Format cost columns
for col in ['BMD Total Cost', 'Hand-Fed Scanner Total Cost', 'Batch-Fed Scanner Total Cost', 'Total Cost']:
    # cost_table[col] = cost_table[col].map('${:,.0f}'.format)
    # cost_table[col] = cost_table[col].apply(lamda x: "humanize.intword)
    cost_table[col] = cost_table[col].apply(lambda x: f'${number_formatter(x)}')

# Format number columns
for col in ['# Jurisdictions', '# Registered Voters', '# BMDs', '# Hand-Fed Scanners', '# Batch-Fed Scanners']:
    cost_table[col] = cost_table[col].apply(number_formatter)

cost_table

cost_table = cost_table.set_index('bmd_or_dre_for_all')
cost_table.index = cost_table.index.map({
    True: 'BMD/DRE-for-all jurisdictions',
    False: 'HMPB+BMD jurisdictions',
    'Total': 'Total'
})
cost_table.index.name = None  # Remove index title

# Reorder rows
cost_table = cost_table.reindex([
    'BMD/DRE-for-all jurisdictions',
    'HMPB+BMD jurisdictions',
    'Total'
])

cost_table.index.name = None  # Remove index title

cost_table

Unnamed: 0,# Jurisdictions,# Registered Voters,# BMDs,BMD Total Cost,# Hand-Fed Scanners,Hand-Fed Scanner Total Cost,# Batch-Fed Scanners,Batch-Fed Scanner Total Cost,Total Cost
BMD/DRE-for-all jurisdictions,839,70.9 M,202 K,$1.13 B,32.6 K,$245 M,620,$66.7 M,$1.44 B
HMPB+BMD jurisdictions,5.62 K,164 M,83 K,$465 M,83.5 K,$628 M,2.04 K,$219 M,$1.31 B
Total,6.46 K,235 M,285 K,$1.59 B,116 K,$873 M,2.66 K,$286 M,$2.75 B


In [33]:
cost_table[['# Jurisdictions', '# Registered Voters', 'BMD Total Cost', 'Hand-Fed Scanner Total Cost', 'Batch-Fed Scanner Total Cost', 'Total Cost']]

Unnamed: 0,# Jurisdictions,# Registered Voters,BMD Total Cost,Hand-Fed Scanner Total Cost,Batch-Fed Scanner Total Cost,Total Cost
BMD/DRE-for-all jurisdictions,839,70.9 M,$1.13 B,$245 M,$66.7 M,$1.44 B
HMPB+BMD jurisdictions,5.62 K,164 M,$465 M,$628 M,$219 M,$1.31 B
Total,6.46 K,235 M,$1.59 B,$873 M,$286 M,$2.75 B


## Analysis (flawed) of central vs precinct count

can't really use this.

In [11]:
eavs['batch_fed_total'].sum()

np.float64(2126.0)

In [12]:
has_scanner = eavs['F6a'] == 'Yes'

eavs[has_scanner]['F6b_1'].value_counts() + eavs[has_scanner]['F6b_2'].value_counts() + eavs[has_scanner]['F6b_3'].value_counts()

AccuVote-OS (Premier)                             NaN
BallotNow (Hart)                                  NaN
Chatsworth ACP (MicroVotes)                       NaN
ClearCast (Clear Ballot)                          NaN
ClearCount (Clear Ballot)                        49.0
DS200 (ES&S)                                   2170.0
DS300 (ES&S)                                    107.0
DS450 (ES&S)                                    413.0
DS850 (ES&S)                                    176.0
DS950 (ES&S)                                     39.0
Data not available                              378.0
Does not apply                                    NaN
ExpressVote Tabulator (ES&S)                     13.0
ExpressVoteXL (ES&S)                              NaN
IBML (Los Angeles County)                         NaN
ImageCast Central/ICC (Dominion)                407.0
ImageCast Evolution/ICE (Dominion)                NaN
ImageCast Precinct/ICP (Dominion)                 NaN
ImageCast Precint2/ICP2     

do batch-fed scanners as a function of number of mail-in ballots?

In [13]:
pd.to_numeric(eavs['C8a'], errors='coerce').sum()

np.float64(47629437.0)

In [None]:
eavs['ballots_counted_centrally'] = (eavs.loc[eavs['F12a'] == 'Central location', 'F1b'] + # ballots cast in person on election day
    eavs.loc[eavs['F12c'] == 'Central location', 'F1e'] + # provisional ballots
    eavs.loc[eavs['F12d'] == 'Central location', 'F1f'] + # ballots cast early in person
    eavs.loc[eavs['F12e'] == 'Central location', 'F1d'] + # mail ballots, jurisdictions that do not send mail ballots to all voters
    eavs.loc[eavs['F12e'] == 'Central location', 'F1g']) #  mail ballots, jurisdictions that send mail ballots to all voters

humanize.intword(eavs['ballots_counted_centrally'].sum())


# F1b/e/f/g (ballots counted) * F12a/c/d/e

'60.8 million'

In [None]:
# % of total ballots (F1a) counted centrally. this is way off because jurisdictions are incorrectly reporting a central scan
(eavs.groupby('State_Full')['ballots_counted_centrally'].sum().sort_values(ascending=False) / 
  eavs.groupby('State_Full')['F1a'].sum().sort_values(ascending=False)).sort_values(ascending=False)*100

State_Full
NORTHERN MARIANA ISLANDS    100.000000
HAWAII                      100.000000
GUAM                         99.838193
LOUISIANA                    99.795705
WEST VIRGINIA                99.779773
OKLAHOMA                     99.732087
DELAWARE                     99.673191
ILLINOIS                     99.558609
WYOMING                      99.512767
NEW YORK                     99.393537
MARYLAND                     99.381705
NEW MEXICO                   99.377642
NEVADA                       99.248939
COLORADO                     99.005355
NEBRASKA                     97.631651
CALIFORNIA                   83.172522
SOUTH DAKOTA                 82.570530
UTAH                         74.920785
TEXAS                        70.363092
KENTUCKY                     49.051815
MISSOURI                     45.564064
OHIO                         44.184359
ARIZONA                      35.581653
ARKANSAS                     22.799520
FLORIDA                      18.397282
INDIANA       