# North Carolina Scorecard

This notebook generates the county, school district, and statewide "future voter scorecards" for NC. It is generalize to be updated every month, with minimal changes.

Scorecard outputs (tables) are written back to BigQuery, where they are then read into Google Sheets for formatting

In [27]:
import pandas as pd
import numpy as np
import pandas_gbq

## Inputs
Update the fields below each month

In [28]:
# Inputs
as_of_data_date = pd.Timestamp("2025-05-03")
acs_year = '2022' # 2022 for 2024 scorecards. ACS vintages trail by 2 years


## Outputs
Run the cells below, without edits each month

In [29]:
# Derived variables
latest_18_yob = as_of_data_date.year - 18 # 2006 for 2024 scorecards
earliest_18_yob = latest_18_yob - 1 # 2005 for 2024 scorecards

latest_45_yob = as_of_data_date.year - 45 # 1979 for 2024 scorecards
earliest_45_yob = latest_45_yob - 1 # 1978 for 2024 scorecards

data_date_suffix = str(as_of_data_date.year) + str(as_of_data_date.month).rjust(2, "0") + str(as_of_data_date.day).rjust(2, "0")  # this is the rolling "as of" date, where we snap the line for 18yos

# Define columns

    # Voter file
REG_YOB_LATE_YEAR = 'REG_YOB_' + str(latest_18_yob) # col name for number of registrants with given birth year (2006 for 2024 scorecards)
REG_YOB_EARLY_YEAR = 'REG_YOB_' + str(earliest_18_yob) # col name for number of registrants with given birth year (2005 for 2024 scorecards)

REG_45_PLUS_YOB_LATE_YEAR = 'REG_45_PLUS_YOB_' + str(latest_45_yob) # col name for number of registrants with given birth year (1979 for 2024 scorecards)
REG_45_PLUS_YOB_EARLY_YEAR = 'REG_45_PLUS_YOB_' + str(earliest_45_yob) # col name for number of registrants with given birth year (1978 for 2024 scorecards)

    # ACS
EST_18_YO_THIS_YEAR = 'EST_18_YO_' + str(as_of_data_date.year) # col name for number of 18 yos this "as_of" year estimated from ACS
EST_18_AND_19_YO_THIS_YEAR = 'EST_18_AND_19_YO_' + str(as_of_data_date.year) # col name for number of 18 and 19 yos this "as_of" year estimated from ACS

EST_45_PLUS_YO_THIS_YEAR = 'EST_45_PLUS_YO_' + str(as_of_data_date.year) # col name for number of 45+ yos in the "as of" year estimated from ACS


### County Scorecard

In [30]:
# Define table names
voter_file_table = data_date_suffix + "_scorecard_nc"
voter_source_table = data_date_suffix + "_nc_voter_registration"

acs_S0101_table = "S0101_us_counties_acs5y_" + acs_year

#### Query from BQ
This query:
* Summarizes the voter file by county, counting the number of registrants in a given birth year.
* Then, left joins the county estimates for the total number of 18 (and 19) yos from the ACS
    * The estimates for the total number of 18 (and 19) yos are derived from the raw estimates of 15-17 yos, **assuming a uniform distribution of population across 15, 16, and 17 year olds.**
    * Since the ACS trails by 2 years, the ACS estimate of 15-17yos is used as a proxy for the number of 17-19yos today. (This means we are intentionally *not* trying to count the college student or "group quarters" population in our denominator)

In [31]:
# Define GCP project
project_id = "tcc-research"

# Define query, including variables and column names that adjust with time
sql = """
WITH addresses AS(
    SELECT
    ncid as VOTER_ID,
    CASE 
        -- WATAUGA 
        WHEN county_id = 95 AND (mail_addr1 LIKE 'ASU %') THEN FARM_FINGERPRINT('APPALACHIAN STATE UNIVERSITY DORM')
        -- ORANGE 
        WHEN county_id = 68 AND res_city_desc = 'CHAPEL HILL' THEN FARM_FINGERPRINT(CONCAT(COALESCE(REGEXP_REPLACE(res_street_address, '( #.*)$', '') ,''), COALESCE(res_city_desc,''), COALESCE(state_cd,''), COALESCE(zip_code,'')))
   
        ELSE FARM_FINGERPRINT(CONCAT(COALESCE(res_street_address,''), COALESCE(res_city_desc,''), COALESCE(state_cd,''), COALESCE(zip_code,'')))
    END as ADDRESS_ID
    FROM `tcc-research.nc_sources.""" + voter_source_table + """`


), young_voters AS(
    
    SELECT  
    a.*,
    ADDRESS_ID 
    FROM `tcc-research.nc_production.""" + voter_file_table + """` a
    LEFT JOIN addresses on addresses.VOTER_ID = a.VOTER_ID
    WHERE VOTER_STATUS IN ('ACTIVE')

),address_count_18 AS(    
    SELECT  
    ADDRESS_ID,
    COUNT(VOTER_ID) AS N_18_VOTERS_AT_ADDRESS
    FROM young_voters a
    WHERE YEAR_OF_BIRTH IN (""" + str(latest_18_yob) + ", " + str(earliest_18_yob) + """)
    GROUP BY ADDRESS_ID

),voter_file_county AS(
    SELECT
    COUNTY_FIPS,
    COUNTY_NAME,
    COUNT(VOTER_ID) AS N_VOTERS,
    COUNTIF(YEAR_OF_BIRTH = """ + str(latest_18_yob) + ") AS " + REG_YOB_LATE_YEAR + """,
    COUNTIF(YEAR_OF_BIRTH = """ + str(earliest_18_yob) + ") AS " + REG_YOB_EARLY_YEAR + """,
    COUNTIF(YEAR_OF_BIRTH <= """ + str(latest_45_yob) + ") AS " + REG_45_PLUS_YOB_LATE_YEAR + """,
    COUNTIF(YEAR_OF_BIRTH <= """ + str(earliest_45_yob) + ") AS " + REG_45_PLUS_YOB_EARLY_YEAR + """,
    FROM young_voters
    LEFT JOIN address_count_18 on young_voters.ADDRESS_ID = address_count_18.ADDRESS_ID
    WHERE COALESCE(N_18_VOTERS_AT_ADDRESS, 0)<4
    GROUP BY COUNTY_FIPS, COUNTY_NAME

), acs_county AS(
    SELECT
    COUNTY_FIPS,
    EST_15_TO_17_YO,
    MOE_15_TO_17_YO,
    EST_15_TO_17_YO / 3 AS """ + EST_18_YO_THIS_YEAR + """,
    EST_15_TO_17_YO * 2 / 3 AS """ + EST_18_AND_19_YO_THIS_YEAR + """,
    EST_45_TO_49_YO + EST_50_TO_54_YO + EST_55_TO_59_YO + EST_55_TO_59_YO + EST_60_AND_OVER AS """ + EST_45_PLUS_YO_THIS_YEAR + """
    FROM `tcc-research.acs_sources.""" + acs_S0101_table + """`
    WHERE STATE_FIPS = "37"

)

SELECT
voter_file_county.*,
acs_county.EST_15_TO_17_YO,
acs_county.MOE_15_TO_17_YO,
acs_county.""" + EST_18_YO_THIS_YEAR + """,
acs_county.""" + EST_18_AND_19_YO_THIS_YEAR + """,
acs_county.""" + EST_45_PLUS_YO_THIS_YEAR + """

FROM voter_file_county LEFT JOIN acs_county ON voter_file_county.COUNTY_FIPS = acs_county.COUNTY_FIPS
"""
# Query
df = pandas_gbq.read_gbq(sql, project_id=project_id)

Downloading: 100%|[32m██████████[0m|


In [32]:
# Preview
df.head()

Unnamed: 0,COUNTY_FIPS,COUNTY_NAME,N_VOTERS,REG_YOB_2007,REG_YOB_2006,REG_45_PLUS_YOB_1980,REG_45_PLUS_YOB_1979,EST_15_TO_17_YO,MOE_15_TO_17_YO,EST_18_YO_2025,EST_18_AND_19_YO_2025,EST_45_PLUS_YO_2025
0,81,GUILFORD,324115,1191,5336,190906,186058,21093,*****,7031.0,14062.0,253633
1,1,ALAMANCE,101623,399,1709,61909,60499,6617,42,2205.666667,4411.333333,85865
2,59,DAVIE,29676,109,457,19704,19325,1693,86,564.333333,1128.666667,24638
3,71,GASTON,136158,526,2071,82670,80665,9089,34,3029.666667,6059.333333,115639
4,127,NASH,60199,226,919,38373,37574,3786,40,1262.0,2524.0,50632


In [33]:
df_reg_est = df.copy()

#### Metric 1: Estimated registration rate of 18 year olds as of a rolling date (i.e. latest month)
Ex: In March 2024, we consider the registration rate among those born between March 2nd 2005 and March 1st 2006


Notes:
- The MI voter file does *not* include full birth dates for registrants – only year of birth is included
- 18 yos as of a given date in the middle of the calendar year can have 2 potential years of birth. We refer to these as the "later 18 yo year of birth" (2006 for 2024 scorecards) and the "earlier 18 yo year of birth" (2005 for 2024 scorecards)
- MI voter file includes 18yo registrants only: Those who are 18 years old prior to March 1st 2024. This was confirmed by the SOS. This means we can assume everyone born in the "later 18 year old year" is 18 (and there are no 17 year old).
- We still need to discount those born in the "earlier 18-year old year", because some of those born in that year are already 19
    - Ex: in March 2024, those born in Jan (31 days) and Feb (28 days) 2005 are already 19, so only those born March through December 2005 are 18

Estimation:

To estimate the number of 18 yos as of a rolling date, we "pro-rate" the number of registrants born in a given year based on the share of days in the year that could be 18yo birthdays. There are two steps:
- For the later 18 yo year of birth: Assume all are 18
- For the earlier 18 yo year of birth: Estimate the number of days that could be 18yo birthdays. Calculate the number of total potential birthdays included in the voter file (just ~365). Calculate the ratio of these numbers.


Assumptions:
- Voter file is as of 1st of the month (confirmed by SOS)
- Even distribution of birthdays across all days of year
- Uniform registration rates among older 18 yos, and younger 19yos

In [34]:
# Define column names
EST_REG_18_YO_AS_OF_ROLLING = 'EST_REG_18_YO_AS_OF_' + data_date_suffix # col name for estimated 18yo as of rolling date

# Birthday splits 18 yo vs. 19 yo in voter file (hypothetical)
earliest_bday_18 = as_of_data_date - pd.tseries.offsets.DateOffset(years=19) + pd.tseries.offsets.DateOffset(days=1) # earliest possible bday for 18yo

n_bdays_of_18_early = pd.Timestamp(str(earliest_18_yob) +"-12-31") - earliest_bday_18 # number of possible bdays of 18 yos in earlier 18 yo year of birth
n_total_days_of_early_year = pd.Timestamp(str(earliest_18_yob) +"-12-31") - pd.Timestamp(str(earliest_18_yob) +"-01-01") # number of total birthdays in earlier 18 yo year of birth (should be 365)

# Discounts
    # Share of 18yo in late year
share_18_late_year = 1

    # Share of 18yo in early year
share_18_early_year = n_bdays_of_18_early / n_total_days_of_early_year

    # CHECKS
print("share of 18 yo in late year: {}".format(share_18_late_year))
print("share of 18 yo in early year: {}".format(share_18_early_year))

share of 18 yo in late year: 1
share of 18 yo in early year: 0.6620879120879121


In [35]:
# Calculate numerator (registrants)
df_reg_est[EST_REG_18_YO_AS_OF_ROLLING] = df_reg_est[REG_YOB_LATE_YEAR] * share_18_late_year  + df_reg_est[REG_YOB_EARLY_YEAR] * share_18_early_year

# Calculate estimated registration rate
EST_REG_RATE_18_YO_AS_OF_ROLLING = 'EST_REG_RATE_18_YO_AS_OF_' + data_date_suffix # col name for estimated 18yo as of rolling date
df_reg_est[EST_REG_RATE_18_YO_AS_OF_ROLLING] = df_reg_est[EST_REG_18_YO_AS_OF_ROLLING] / df_reg_est[EST_18_YO_THIS_YEAR] # estimated registered 18yo over ACS 18yo population estimate

In [36]:
df_reg_est = df_reg_est.sort_values('N_VOTERS', ascending=False)
df_reg_est.head()

Unnamed: 0,COUNTY_FIPS,COUNTY_NAME,N_VOTERS,REG_YOB_2007,REG_YOB_2006,REG_45_PLUS_YOB_1980,REG_45_PLUS_YOB_1979,EST_15_TO_17_YO,MOE_15_TO_17_YO,EST_18_YO_2025,EST_18_AND_19_YO_2025,EST_45_PLUS_YO_2025,EST_REG_18_YO_AS_OF_20250503,EST_REG_RATE_18_YO_AS_OF_20250503
77,183,WAKE,741387,3323,12584,404034,391096,47558,30,15852.666667,31705.333333,499266,11654.714286,0.73519
39,119,MECKLENBURG,677094,2780,10538,347946,336817,42911,18,14303.666667,28607.333333,465789,9757.082418,0.682139
0,81,GUILFORD,324115,1191,5336,190906,186058,21093,*****,7031.0,14062.0,253633,4723.901099,0.671868
69,67,FORSYTH,229574,896,3599,137137,133873,15559,27,5186.333333,10372.666667,186737,3278.854396,0.632211
71,63,DURHAM,198669,665,2508,101667,98638,10224,39,3408.0,6816.0,139455,2325.516484,0.68237


#### Metric 2: Estimated registration rate of 45 year olds as of a rolling date (i.e. latest month)
To count the 45+ yo as of a rolling date, we need to discount some folks born in the latest year of 45 year olds, because they are still 44

Assumptions:
- Even distribution of birthdays across all days of year
- Uniform registration rates among older 44 yos, and younger 45yos

In [37]:
# Define column names
EST_REG_45_PLUS_YO_AS_OF_ROLLING = 'EST_REG_45_PLUS_YO_AS_OF_' + data_date_suffix # col name for estimated 45yo as of rolling date

# Birthday splits 44 yo vs. 45 yo in voter file (hypothetical)
lastest_bday_45 = as_of_data_date - pd.tseries.offsets.DateOffset(years=45)  # latest possible bday for 45yo

n_bdays_of_45 = lastest_bday_45 - pd.Timestamp(str(latest_45_yob) +"-01-01") # number of possible bdays of 45 yos in later 45yo year of birth
n_total_days_of_late_year = pd.Timestamp(str(latest_45_yob) +"-12-31") - pd.Timestamp(str(latest_45_yob) +"-01-01") # number of total birthdays in later 45yo year of birth


    # Share of 45yo in early year
share_45_late_year = n_bdays_of_45 / n_total_days_of_late_year

    # CHECKS
print("share of 45 yo in early year: {}".format(share_45_late_year))

share of 45 yo in early year: 0.336986301369863


In [38]:
# Calculate numerator (registrants)
df_reg_est[EST_REG_45_PLUS_YO_AS_OF_ROLLING] = ((df_reg_est[REG_45_PLUS_YOB_LATE_YEAR] - df_reg_est[REG_45_PLUS_YOB_EARLY_YEAR]) * share_45_late_year)  + df_reg_est[REG_45_PLUS_YOB_EARLY_YEAR]

# Calculate estimated registration rate
EST_REG_RATE_45_PLUS_YO_AS_OF_ROLLING = 'EST_REG_RATE_45_PLUS_YO_AS_OF_' + data_date_suffix # col name for estimated 45yo as of rolling date
df_reg_est[EST_REG_RATE_45_PLUS_YO_AS_OF_ROLLING] = df_reg_est[EST_REG_45_PLUS_YO_AS_OF_ROLLING] / df_reg_est[EST_45_PLUS_YO_THIS_YEAR] # estimated registered 45yo over ACS 45yo population estimate

In [39]:
df_reg_est = df_reg_est.sort_values('N_VOTERS', ascending=False)
df_reg_est.head(20)

Unnamed: 0,COUNTY_FIPS,COUNTY_NAME,N_VOTERS,REG_YOB_2007,REG_YOB_2006,REG_45_PLUS_YOB_1980,REG_45_PLUS_YOB_1979,EST_15_TO_17_YO,MOE_15_TO_17_YO,EST_18_YO_2025,EST_18_AND_19_YO_2025,EST_45_PLUS_YO_2025,EST_REG_18_YO_AS_OF_20250503,EST_REG_RATE_18_YO_AS_OF_20250503,EST_REG_45_PLUS_YO_AS_OF_20250503,EST_REG_RATE_45_PLUS_YO_AS_OF_20250503
77,183,WAKE,741387,3323,12584,404034,391096,47558,30,15852.666667,31705.333333,499266,11654.714286,0.73519,395455.928767,0.792075
39,119,MECKLENBURG,677094,2780,10538,347946,336817,42911,18,14303.666667,28607.333333,465789,9757.082418,0.682139,340567.320548,0.731162
0,81,GUILFORD,324115,1191,5336,190906,186058,21093,*****,7031.0,14062.0,253633,4723.901099,0.671868,187691.709589,0.740013
69,67,FORSYTH,229574,896,3599,137137,133873,15559,27,5186.333333,10372.666667,186737,3278.854396,0.632211,134972.923288,0.722797
71,63,DURHAM,198669,665,2508,101667,98638,10224,39,3408.0,6816.0,139455,2325.516484,0.68237,99658.731507,0.71463
16,21,BUNCOMBE,180596,587,2079,110990,108050,8663,38,2887.666667,5775.333333,144295,1963.480769,0.679954,109040.739726,0.755679
30,51,CUMBERLAND,173100,732,2972,98069,95386,12860,36,4286.666667,8573.333333,129853,2699.725275,0.629796,96290.134247,0.741532
63,129,NEW HANOVER,157088,484,2097,94539,92285,7279,81,2426.333333,4852.666667,115498,1872.398352,0.771699,93044.567123,0.805595
88,179,UNION,156262,936,3351,94704,91780,12970,5,4323.333333,8646.666667,115787,3154.656593,0.729682,92765.347945,0.801172
31,101,JOHNSTON,143230,652,2598,83048,80604,9984,98,3328.0,6656.0,103011,2372.104396,0.712772,81427.594521,0.790475


In [40]:
df_reg_est = df_reg_est.sort_values('EST_REG_RATE_18_YO_AS_OF_20250503', ascending=False)
df_reg_est

Unnamed: 0,COUNTY_FIPS,COUNTY_NAME,N_VOTERS,REG_YOB_2007,REG_YOB_2006,REG_45_PLUS_YOB_1980,REG_45_PLUS_YOB_1979,EST_15_TO_17_YO,MOE_15_TO_17_YO,EST_18_YO_2025,EST_18_AND_19_YO_2025,EST_45_PLUS_YO_2025,EST_REG_18_YO_AS_OF_20250503,EST_REG_RATE_18_YO_AS_OF_20250503,EST_REG_45_PLUS_YO_AS_OF_20250503,EST_REG_RATE_45_PLUS_YO_AS_OF_20250503
41,003,ALEXANDER,23225,242,344,15139,14847,1341,121,447.000000,894.000000,20246,469.758242,1.050913,14945.4,0.73819
72,189,WATAUGA,34169,119,408,17762,17333,1202,29,400.666667,801.333333,23535,389.131868,0.971211,17477.567123,0.74262
73,147,PITT,102343,406,2186,56124,54624,6294,28,2098.000000,4196.000000,71236,1853.324176,0.883377,55129.479452,0.773899
17,017,BLADEN,19690,74,355,12947,12679,1060,104,353.333333,706.666667,17034,309.041209,0.874645,12769.312329,0.749637
67,011,AVERY,10444,37,134,7334,7211,439,31,146.333333,292.666667,10289,125.71978,0.859133,7252.449315,0.704874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,185,WARREN,11434,22,143,8282,8122,681,70,227.000000,454.000000,11456,116.678571,0.514003,8175.917808,0.71368
86,173,SWAIN,8223,26,108,5504,5366,584,98,194.666667,389.333333,7158,97.505495,0.500884,5412.50411,0.756148
89,103,JONES,6173,13,77,4222,4123,392,71,130.666667,261.333333,5813,63.980769,0.489649,4156.361644,0.715011
74,033,CASWELL,13381,25,152,9230,9069,783,46,261.000000,522.000000,13432,125.637363,0.481369,9123.254795,0.679218


#### Output
Write back to BQ

##### Wide

In [41]:
# Flag largest counties
df_reg_est['is_in_10_largest'] = np.where(df_reg_est.COUNTY_NAME.isin(df_reg_est.nlargest(10, columns='EST_18_YO_2025').COUNTY_NAME),1,0)

In [42]:
# write
project_id = "tcc-research"
table_id = 'nc_output.' + data_date_suffix+ '_nc_county_scorecard_output'

pandas_gbq.to_gbq(df_reg_est, table_id, project_id=project_id, if_exists='replace')

100%|██████████| 1/1 [00:00<00:00, 3806.08it/s]


### Statewide Scorecard

In [43]:
# Define table names
voter_file_table = data_date_suffix + "_scorecard_nc"
acs_S0101_table = "S0101_us_states_acs5y_" + acs_year

#### Query from BQ
This query:
* Summarizes the voter file for NC state, counting the number of registrants in a given birth year.
* Then, left joins the statewide estimates for the total number of 18 (and 19) yos from the ACS
    * The estimates for the total number of 18 (and 19) yos is derived from the raw estimates of 15-17 yos, **assuming a uniform distribution of population across 15, 16, and 17 year olds.**
    * Since the ACS trails by 2 years, the ACS estimate of 15-17yos is used as a proxy for the number of 17-19yos today. (This means we are intentionally *not* trying to count the college student or "group quarters" population in our denominator)

In [44]:
# Define GCP project
project_id = "tcc-research"

# Define query, including variables and column names that adjust with time
sql = """
WITH addresses AS(
    SELECT
    ncid as VOTER_ID,
    CASE 
        -- WATAUGA 
        WHEN county_id = 95 AND (mail_addr1 LIKE 'ASU %') THEN FARM_FINGERPRINT('APPALACHIAN STATE UNIVERSITY DORM')
        -- ORANGE 
        WHEN county_id = 68 AND res_city_desc = 'CHAPEL HILL' THEN FARM_FINGERPRINT(CONCAT(COALESCE(REGEXP_REPLACE(res_street_address, '( #.*)$', '') ,''), COALESCE(res_city_desc,''), COALESCE(state_cd,''), COALESCE(zip_code,'')))
   
        ELSE FARM_FINGERPRINT(CONCAT(COALESCE(res_street_address,''), COALESCE(res_city_desc,''), COALESCE(state_cd,''), COALESCE(zip_code,'')))
    END as ADDRESS_ID
    FROM `tcc-research.nc_sources.""" + voter_source_table + """`


), young_voters AS(
    
    SELECT  
    a.*,
    ADDRESS_ID 
    FROM `tcc-research.nc_production.""" + voter_file_table + """` a
    LEFT JOIN addresses on addresses.VOTER_ID = a.VOTER_ID
    WHERE VOTER_STATUS IN ('ACTIVE')

),address_count_18 AS(    
    SELECT  
    ADDRESS_ID,
    COUNT(VOTER_ID) AS N_18_VOTERS_AT_ADDRESS
    FROM young_voters a
    WHERE YEAR_OF_BIRTH IN (""" + str(latest_18_yob) + ", " + str(earliest_18_yob) + """)
    GROUP BY ADDRESS_ID

),voter_file_nc AS(
    SELECT
    STATE_FIPS,
    COUNT(VOTER_ID) AS N_VOTERS,
    COUNTIF(YEAR_OF_BIRTH = """ + str(latest_18_yob) + ") AS " + REG_YOB_LATE_YEAR + """,
    COUNTIF(YEAR_OF_BIRTH = """ + str(earliest_18_yob) + ") AS " + REG_YOB_EARLY_YEAR + """,
    COUNTIF(YEAR_OF_BIRTH <= """ + str(latest_45_yob) + ") AS " + REG_45_PLUS_YOB_LATE_YEAR + """,
    COUNTIF(YEAR_OF_BIRTH <= """ + str(earliest_45_yob) + ") AS " + REG_45_PLUS_YOB_EARLY_YEAR + """,
    FROM young_voters
    LEFT JOIN address_count_18 on young_voters.ADDRESS_ID = address_count_18.ADDRESS_ID
    WHERE COALESCE(N_18_VOTERS_AT_ADDRESS, 0)<4
    GROUP BY STATE_FIPS

), acs_nc AS(
    SELECT
    STATE_FIPS,
    EST_15_TO_17_YO,
    MOE_15_TO_17_YO,
    EST_15_TO_17_YO / 3 AS """ + EST_18_YO_THIS_YEAR + """,
    EST_15_TO_17_YO * 2 / 3 AS """ + EST_18_AND_19_YO_THIS_YEAR + """,
    EST_45_TO_49_YO + EST_50_TO_54_YO + EST_55_TO_59_YO + EST_55_TO_59_YO + EST_60_AND_OVER AS """ + EST_45_PLUS_YO_THIS_YEAR + """
    FROM `tcc-research.acs_sources.""" + acs_S0101_table + """`
    WHERE STATE_FIPS = "37"

)

SELECT
voter_file_nc.*,
acs_nc.EST_15_TO_17_YO,
acs_nc.MOE_15_TO_17_YO,
acs_nc.""" + EST_18_YO_THIS_YEAR + """,
acs_nc.""" + EST_18_AND_19_YO_THIS_YEAR + """,
acs_nc.""" + EST_45_PLUS_YO_THIS_YEAR + """

FROM voter_file_nc LEFT JOIN acs_nc ON voter_file_nc.STATE_FIPS = acs_nc.STATE_FIPS
"""
# Query
df = pandas_gbq.read_gbq(sql, project_id=project_id)

Downloading: 100%|[32m██████████[0m|


In [45]:
# Preview
df

Unnamed: 0,STATE_FIPS,N_VOTERS,REG_YOB_2007,REG_YOB_2006,REG_45_PLUS_YOB_1980,REG_45_PLUS_YOB_1979,EST_15_TO_17_YO,MOE_15_TO_17_YO,EST_18_YO_2025,EST_18_AND_19_YO_2025,EST_45_PLUS_YO_2025
0,37,6507793,25712,102191,3922257,3825319,404849,794,134949.666667,269899.333333,5139416


In [46]:
df_reg_est = df.copy()

#### Metric 1: Estimated registration rate of 18 year olds as of a rolling date (i.e. latest month)
Ex: In March 2024, we consider the registration rate among those born between March 2nd 2005 and March 1st 2006


Notes:
- The MI voter file does *not* include full birth dates for registrants – only year of birth is included
- 18 yos as of a given date in the middle of the calendar year can have 2 potential years of birth. We refer to these as the "later 18 yo year of birth" (2006 for 2024 scorecards) and the "earlier 18 yo year of birth" (2005 for 2024 scorecards)
- MI voter file includes 18yo registrants only: Those who are 18 years old prior to March 1st 2024. This was confirmed by the SOS. This means we can assume everyone born in the "later 18 year old year" is 18 (and there are no 17 year old).
- We still need to discount those born in the "earlier 18-year old year", because some of those born in that year are already 19
    - Ex: in March 2024, those born in Jan (31 days) and Feb (28 days) 2005 are already 19, so only those born March through December 2005 are 18

Estimation:

To estimate the number of 18 yos as of a rolling date, we "pro-rate" the number of registrants born in a given year based on the share of days in the year that could be 18yo birthdays. There are two steps:
- For the later 18 yo year of birth: Assume all are 18
- For the earlier 18 yo year of birth: Estimate the number of days that could be 18yo birthdays. Calculate the number of total potential birthdays included in the voter file (just ~365). Calculate the ratio of these numbers.


Assumptions:
- Voter file is as of 1st of the month (confirmed by SOS)
- Even distribution of birthdays across all days of year
- Uniform registration rates among older 18 yos, and younger 19yos

In [47]:
# Define column names
EST_REG_18_YO_AS_OF_ROLLING = 'EST_REG_18_YO_AS_OF_' + data_date_suffix # col name for estimated 18yo as of rolling date

# Birthday splits 18 yo vs. 19 yo in voter file (hypothetical)
earliest_bday_18 = as_of_data_date - pd.tseries.offsets.DateOffset(years=19) + pd.tseries.offsets.DateOffset(days=1) # earliest possible bday for 18yo

n_bdays_of_18_early = pd.Timestamp(str(earliest_18_yob) +"-12-31") - earliest_bday_18 # number of possible bdays of 18 yos in earlier 18 yo year of birth
n_total_days_of_early_year = pd.Timestamp(str(earliest_18_yob) +"-12-31") - pd.Timestamp(str(earliest_18_yob) +"-01-01") # number of total birthdays in earlier 18 yo year of birth (should be 365)

# Discounts
    # Share of 18yo in late year
share_18_late_year = 1

    # Share of 18yo in early year
share_18_early_year = n_bdays_of_18_early / n_total_days_of_early_year

    # CHECKS
print("share of 18 yo in late year: {}".format(share_18_late_year))
print("share of 18 yo in early year: {}".format(share_18_early_year))

share of 18 yo in late year: 1
share of 18 yo in early year: 0.6620879120879121


In [48]:
# Calculate numerator (registrants)
df_reg_est[EST_REG_18_YO_AS_OF_ROLLING] = df_reg_est[REG_YOB_LATE_YEAR] * share_18_late_year  + df_reg_est[REG_YOB_EARLY_YEAR] * share_18_early_year

# Calculate estimated registration rate
EST_REG_RATE_18_YO_AS_OF_ROLLING = 'EST_REG_RATE_18_YO_AS_OF_' + data_date_suffix # col name for estimated 18yo as of rolling date
df_reg_est[EST_REG_RATE_18_YO_AS_OF_ROLLING] = df_reg_est[EST_REG_18_YO_AS_OF_ROLLING] / df_reg_est[EST_18_YO_THIS_YEAR] # estimated registered 18yo over ACS 18yo population estimate

In [49]:
df_reg_est = df_reg_est.sort_values('N_VOTERS', ascending=False)
df_reg_est.head()

Unnamed: 0,STATE_FIPS,N_VOTERS,REG_YOB_2007,REG_YOB_2006,REG_45_PLUS_YOB_1980,REG_45_PLUS_YOB_1979,EST_15_TO_17_YO,MOE_15_TO_17_YO,EST_18_YO_2025,EST_18_AND_19_YO_2025,EST_45_PLUS_YO_2025,EST_REG_18_YO_AS_OF_20250503,EST_REG_RATE_18_YO_AS_OF_20250503
0,37,6507793,25712,102191,3922257,3825319,404849,794,134949.666667,269899.333333,5139416,93371.425824,0.691898


#### Metric 2: Estimated registration rate of 45 year olds as of a rolling date (i.e. latest month)
To count the 45+ yo as of a rolling date, we need to discount some folks born in the latest year of 45 year olds, because they are still 44

Assumptions:
- Even distribution of birthdays across all days of year
- Uniform registration rates among older 44 yos, and younger 45yos

In [50]:
# Define column names
EST_REG_45_PLUS_YO_AS_OF_ROLLING = 'EST_REG_45_PLUS_YO_AS_OF_' + data_date_suffix # col name for estimated 45yo as of rolling date

# Birthday splits 44 yo vs. 45 yo in voter file (hypothetical)
lastest_bday_45 = as_of_data_date - pd.tseries.offsets.DateOffset(years=45)  # latest possible bday for 45yo

n_bdays_of_45 = lastest_bday_45 - pd.Timestamp(str(latest_45_yob) +"-01-01") # number of possible bdays of 45 yos in later 45yo year of birth
n_total_days_of_late_year = pd.Timestamp(str(latest_45_yob) +"-12-31") - pd.Timestamp(str(latest_45_yob) +"-01-01") # number of total birthdays in later 45yo year of birth


    # Share of 45yo in early year
share_45_late_year = n_bdays_of_45 / n_total_days_of_late_year

    # CHECKS
print("share of 45 yo in early year: {}".format(share_45_late_year))

share of 45 yo in early year: 0.336986301369863


In [51]:
# Calculate numerator (registrants)
df_reg_est[EST_REG_45_PLUS_YO_AS_OF_ROLLING] = ((df_reg_est[REG_45_PLUS_YOB_LATE_YEAR] - df_reg_est[REG_45_PLUS_YOB_EARLY_YEAR]) * share_45_late_year)  + df_reg_est[REG_45_PLUS_YOB_EARLY_YEAR]

# Calculate estimated registration rate
EST_REG_RATE_45_PLUS_YO_AS_OF_ROLLING = 'EST_REG_RATE_45_PLUS_YO_AS_OF_' + data_date_suffix # col name for estimated 45yo as of rolling date
df_reg_est[EST_REG_RATE_45_PLUS_YO_AS_OF_ROLLING] = df_reg_est[EST_REG_45_PLUS_YO_AS_OF_ROLLING] / df_reg_est[EST_45_PLUS_YO_THIS_YEAR] # estimated registered 45yo over ACS 45yo population estimate

In [52]:
df_reg_est = df_reg_est.sort_values('N_VOTERS', ascending=False)
df_reg_est.head()

Unnamed: 0,STATE_FIPS,N_VOTERS,REG_YOB_2007,REG_YOB_2006,REG_45_PLUS_YOB_1980,REG_45_PLUS_YOB_1979,EST_15_TO_17_YO,MOE_15_TO_17_YO,EST_18_YO_2025,EST_18_AND_19_YO_2025,EST_45_PLUS_YO_2025,EST_REG_18_YO_AS_OF_20250503,EST_REG_RATE_18_YO_AS_OF_20250503,EST_REG_45_PLUS_YO_AS_OF_20250503,EST_REG_RATE_45_PLUS_YO_AS_OF_20250503
0,37,6507793,25712,102191,3922257,3825319,404849,794,134949.666667,269899.333333,5139416,93371.425824,0.691898,3857985.778082,0.750666


#### Output
Write back to BQ

In [53]:
# write
project_id = "tcc-research"
table_id = 'nc_output.' + data_date_suffix+ '_nc_statewide_scorecard_output'

pandas_gbq.to_gbq(df_reg_est, table_id, project_id=project_id, if_exists='replace')

100%|██████████| 1/1 [00:00<00:00, 4809.98it/s]
