In [2]:
import pandas as pd 

In [170]:
sov_path = "data/raw/c0{county}_g{year}_sov_data_by_g{year}_srprec.csv"
sor_path = "data/raw/c0{county}_g{year}_voters_by_g{year}_srprec.csv"

sov_san_luis = []
sov_santa_barbara = []
sor_san_luis = []
sor_santa_barbara= []

df_sov_sb_combined = None
df_sov_sl_combined = None

df_election_totals = pd.DataFrame()
df_registration_totals = pd.DataFrame()

# Iterate over election cycles.
for year in ["12", "14", "16", "18", "20"]:
    
    # Read sov data for each county.
    df_sov_sb = pd.read_csv(sov_path.format(county="83", year=year))
    df_sov_sl = pd.read_csv(sov_path.format(county="79", year=year))

    # Read sor data for each county.
    df_sor_sb = pd.read_csv(sor_path.format(county="83", year=year))
    df_sor_sl = pd.read_csv(sor_path.format(county="83", year=year))

    # Filter precincts to only those that are in the correct district.
    df_sov_sb = df_sov_sb[df_sov_sb["addist"] == 35]
    df_sov_sl = df_sov_sl[df_sov_sl["addist"] == 35]

    # SOR files don't contain information about what AD they belong to. 
    # So we need to filter them by the related SOV file. 
    df_sor_sb = df_sor_sb[df_sor_sb["srprec"].isin(df_sov_sb["srprec"])]
    df_sor_sl = df_sor_sl[df_sor_sl["srprec"].isin(df_sor_sl["srprec"])]

    # Get totals. We can't use the reported county totals - because they don't
    # necessarily reflect the totals of assembly districts.
    sr_sov_sb_totals = df_sov_sb.iloc[:, 6:].sum()
    sr_sov_sl_totals = df_sov_sl.iloc[:, 6:].sum()
    sr_cycle_totals = sr_sov_sb_totals + sr_sov_sl_totals

    # Add the cycle year to summated Series. 
    sr_year = pd.Series(data=["20" + year], index=["year"])
    sr_cycle_totals = sr_cycle_totals.append(sr_year)

    # Total information about registration.
    sr_sor_sb_totals = df_sor_sb.iloc[:, 3:].sum()
    sr_sor_sl_totals = df_sor_sl.iloc[:, 3:].sum()
    sr_reg_totals = sr_sor_sb_totals + sr_sor_sl_totals

    sr_reg_totals = sr_reg_totals.append(sr_year)

    # Add summated totals to election total
    df_election_totals = df_election_totals.append(sr_cycle_totals, ignore_index=True)
    df_registration_totals = df_registration_totals.append(sr_reg_totals, ignore_index=True)
   
    # Determine the previous cycle - used to suffix columns labels during merge
    prev = str(int(year) - 2)

    # Merge datasets by precinct.  
    df_sov_sb_combined = df_sov_sb if df_sov_sb_combined is None else \
        df_sov_sb_combined.merge(
            right=df_sov_sb, 
            on="srprec", 
            how="inner", 
            suffixes=["_{}".format(prev), "_{}".format(year)])

    df_sov_sl_combined = df_sov_sl if df_sov_sl_combined is None else \
        df_sov_sl_combined.merge(
            right=df_sov_sl, 
            on="srprec", 
            how="inner", 
            suffixes=["_{}".format(prev), "_{}".format(year)])

    sov_san_luis.append(df_sov_sl)
    sov_santa_barbara.append(df_sov_sb)

df_election_totals.set_index("year", inplace=True)
df_registration_totals.set_index("year", inplace=True)
# df_election_totals

df_registration_totals


Unnamed: 0_level_0,aip,chidcl,chidem,chioth,chirep,dcl,dclf1824,dclf2534,dclf3544,dclf4554,...,rreg5g,rreg6g,rreg7g,rreg8g,rreg9g,totreg_r,vietdcl,vietdem,vietoth,vietrep
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012,5193.0,565.0,524.0,70.0,293.0,40798.0,4169.0,3516.0,2976.0,3423.0,...,8745.0,3966.0,5522.0,3547.0,26283.0,214522.0,215.0,227.0,28.0,101.0
2014,3695.0,286.0,297.0,46.0,224.0,27070.0,1487.0,1872.0,1739.0,2341.0,...,2859.0,5965.0,2953.0,4132.0,23905.0,153383.0,84.0,105.0,14.0,73.0
2016,5942.0,731.0,857.0,66.0,341.0,50586.0,4403.0,4823.0,3880.0,3933.0,...,5706.0,2347.0,6773.0,2857.0,27881.0,242385.0,253.0,417.0,29.0,125.0
2018,4974.0,550.0,675.0,50.0,267.0,44239.0,3127.0,3607.0,3384.0,3412.0,...,2322.0,3728.0,1579.0,4802.0,23008.0,203251.0,189.0,278.0,20.0,78.0
2020,8263.0,633.0,945.0,79.0,334.0,55274.0,3907.0,5295.0,4579.0,3740.0,...,2242.0,1448.0,2535.0,984.0,18759.0,276972.0,221.0,411.0,53.0,168.0


In [56]:
df_concat_sb = pd.concat(sov_santa_barbara, axis=0)
df_filtered_sb = df_concat_sb.groupby("srprec").filter(lambda x : len(x) > 2)
print(df_filtered_sb.shape)

df_concat_slo = pd.concat(sov_san_luis, axis=0)
df_filtered_slo = df_concat_slo.groupby("srprec").filter(lambda x : len(x) > 4)
print("Filtered precincts {} vs {} total precincts".format(df_filtered_slo.shape[0], df_concat_slo.shape[0]))

(171, 191)
Filtered precincts 380 vs 725 total precincts


## Validating Results   

We can perform a basic sanity check on the results of our filtered data, by comparing our own summated results of the assembly races with the certified totals. These totals were retrieved from BallotPedia 

In [153]:

confirmed_totals = { '2020' : [103206, 126579], 
#                     '2018' : [76994, 97749],
                     '2016' : [87168, 105247], 
                     '2014' : [46126, 77452],
                     '2012' : [65500, 103762]}

print(df_election_totals.loc[2018]["ASSREP01"])
for yr in confirmed_totals:
    demVotes = df_election_totals.loc[yr]["ASSDEM01"]
    repVotes = df_election_totals.loc[yr]["ASSREP01"]
    demExpected = confirmed_totals[yr][0]
    repExpected = confirmed_totals[yr][1]
    assert demExpected == demVotes, "({}) Data was {} but expected: {}".format(yr, demVotes, demExpected)
    assert repExpected == repVotes, "({}) Data was {} but expected: {}".format(yr, repVotes, repExpected)

Interesting, and rather frustratingly, it appears that Santa Barbara's 2018 county file does not actually contain information about the assembly's votes.

## Exporting Results

In [174]:
df_election_totals.to_csv("data/processed/election_totals.csv")
df_registration_totals.to_csv("data/processed/registration_totals.csv")