# North Carolina Scorecard

This notebook generates the county and statewide "future voter scorecards" for NY. It is generalized to be updated every month, with minimal changes.

Scorecard outputs (tables) are written back to BigQuery, where they are then read into Google Sheets for formatting

In [2]:
import pandas as pd
import numpy as np
import pandas_gbq

## Inputs
Update the fields below each month

In [3]:
# Inputs
as_of_data_date = pd.Timestamp("2025-05-03")
as_of_election_date = None
acs_year = '2022' # 2022 for 2023 scorecards. ACS vintages trail by 2 years


## Outputs
Run the cells below, without edits each month

In [4]:
# Derived variables 

data_date_suffix = str(as_of_data_date.year) + str(as_of_data_date.month).rjust(2, "0") + str(as_of_data_date.day).rjust(2, "0")  # this is the rolling "as of" date, where we snap the line for 18yos

end_of_year_suffix = str(as_of_data_date.year) + str(12) # this is the "end-of-year" cohort date, where we alternatively snap the line for 18yos

# Define columns
    # ACS
EST_17_OR_18_OR_19_YO_THIS_YEAR = 'EST_17_OR_18_OR_19_YO_' + str(as_of_data_date.year) # col name for number of 17/18/19 yos in the "as of" year estimated from ACS
EST_45_PLUS_YO_THIS_YEAR = 'EST_45_PLUS_YO_' + str(as_of_data_date.year) # col name for number of 45+ yos in the "as of" year estimated from ACS


### County Scorecard

In [5]:
# Define table names
voter_file_table = data_date_suffix + "_scorecard_nc"
acs_S0101_table = "S0101_us_counties_acs5y_" + acs_year

#### Query from BQ
This query:
* Summarizes the voter file by county, counting the number of registrants at a given age
* Then, left joins the county estimates for the total number of 17, 18, and 19yos from the ACS
    * The estimates for the total number of 17-19yo yos are derived from the raw estimates of 15-17 yos, **assuming a uniform distribution of population across 15, 16, and 17 year olds.**
    * Since the ACS trails by 2 years, the ACS estimate of 15-17yos is used as a proxy for the number of 17-19yos today. (This means we are intentionally *not* trying to count the college student or "group quarters" population in our denominator)

* Ages are calculated as of the date of the voter file and as of the next general election 

In [6]:
str(as_of_data_date.day)

'3'

In [7]:
voter_file_table

'20250503_scorecard_nc'

In [8]:
sql = f"""
SELECT COUNT(*) 
FROM `tcc-research.nc_production.{voter_file_table}`
WHERE VOTER_STATUS IN ('A', 'I')
"""

In [9]:
# Define GCP project
project_id = "tcc-research"


# Define query, including variables and column names that adjust with time
as_of_date_str = as_of_data_date.strftime('%Y-%m-%d')

sql = f"""
WITH young_voters AS (
  SELECT  
    *,
    FLOOR(DATE_DIFF(
  DATE '2025-05-03', 
  DATE(CAST(YEAR_OF_BIRTH AS INT64), 1, 1), 
  DAY
) / 365.25) AS AGE_TODAY
  FROM `tcc-research.nc_production.{voter_file_table}`
  WHERE VOTER_STATUS IN ('ACTIVE', 'INACTIVE')
), young_voter_sum AS (
  SELECT
    COUNTY_FIPS,
    COUNTY_NAME,
    COUNTIF(AGE_TODAY = 18) AS N_VOTERS_18_AS_OF_{data_date_suffix},
    COUNTIF(AGE_TODAY = 19) AS N_VOTERS_19_AS_OF_{data_date_suffix},
    COUNTIF(AGE_TODAY >= 45) AS N_VOTERS_45_PLUS_AS_OF_{data_date_suffix}
  FROM young_voters
  GROUP BY COUNTY_FIPS, COUNTY_NAME
), acs_county AS (
  SELECT 
    LPAD(CAST(COUNTY_FIPS AS STRING), 5, '0') AS COUNTY_FIPS,
    EST_15_TO_17_YO,
    EST_15_TO_17_YO / 3 AS {EST_17_OR_18_OR_19_YO_THIS_YEAR},
    EST_45_TO_49_YO + EST_50_TO_54_YO + EST_55_TO_59_YO + EST_55_TO_59_YO + EST_60_AND_OVER AS {EST_45_PLUS_YO_THIS_YEAR}
  FROM `tcc-research.acs_sources.S0101_us_counties_acs5y_2022` 
  WHERE STATE_FIPS = '37'
)
SELECT
  young_voter_sum.*,
  acs_county.EST_15_TO_17_YO,
  acs_county.{EST_17_OR_18_OR_19_YO_THIS_YEAR},
  acs_county.{EST_45_PLUS_YO_THIS_YEAR}
FROM young_voter_sum
LEFT JOIN acs_county
  ON LPAD(CAST(young_voter_sum.COUNTY_FIPS AS STRING), 5, '0') = acs_county.COUNTY_FIPS
"""

# Query
df = pandas_gbq.read_gbq(sql, project_id=project_id)

Downloading: 100%|[32m██████████[0m|


In [10]:
# Preview
df.head()

Unnamed: 0,COUNTY_FIPS,COUNTY_NAME,N_VOTERS_18_AS_OF_20250503,N_VOTERS_19_AS_OF_20250503,N_VOTERS_45_PLUS_AS_OF_20250503,EST_15_TO_17_YO,EST_17_OR_18_OR_19_YO_2025,EST_45_PLUS_YO_2025
0,177,TYRRELL,0,15,1598,36,12.0,1839
1,59,DAVIE,109,459,21123,1693,564.333333,24638
2,197,YADKIN,65,344,15915,1415,471.666667,21399
3,193,WILKES,120,563,29048,2590,863.333333,38460
4,1,ALAMANCE,399,1954,67289,6617,2205.666667,85865


In [11]:
df_reg_est = df.copy()

#### Metric 1: Estimated registration rate of 18 year olds as of a rolling date (i.e. latest month)


In [12]:
# Calculate estimated registration rate

N_VOTERS_18_AS_OF_DATA_DATE = 'N_VOTERS_18_AS_OF_'+ data_date_suffix # col name for estimated 18yo voters as of rolling date


EST_REG_RATE_18_YO_AS_OF_DATA_DATE = 'EST_REG_RATE_18_YO_AS_OF_' + data_date_suffix # col name for estimated 18yo as of rolling date

df_reg_est[EST_REG_RATE_18_YO_AS_OF_DATA_DATE] = df_reg_est[N_VOTERS_18_AS_OF_DATA_DATE] / df_reg_est[EST_17_OR_18_OR_19_YO_THIS_YEAR] # estimated registered 18yo over ACS 18yo population estimate

df_reg_est.sort_values('EST_15_TO_17_YO', ascending=False)

Unnamed: 0,COUNTY_FIPS,COUNTY_NAME,N_VOTERS_18_AS_OF_20250503,N_VOTERS_19_AS_OF_20250503,N_VOTERS_45_PLUS_AS_OF_20250503,EST_15_TO_17_YO,EST_17_OR_18_OR_19_YO_2025,EST_45_PLUS_YO_2025,EST_REG_RATE_18_YO_AS_OF_20250503
80,183,WAKE,3326,13306,431805,47558,15852.666667,499266,0.209807
12,119,MECKLENBURG,2784,11408,391501,42911,14303.666667,465789,0.194635
26,081,GUILFORD,1194,6346,210449,21093,7031.000000,253633,0.169819
81,067,FORSYTH,898,4315,149748,15559,5186.333333,186737,0.173147
30,179,UNION,936,3385,102614,12970,4323.333333,115787,0.2165
...,...,...,...,...,...,...,...,...,...
38,073,GATES,27,106,5116,349,116.333333,6345,0.232092
49,005,ALLEGHANY,23,76,5736,328,109.333333,6936,0.210366
70,075,GRAHAM,17,74,4032,297,99.000000,4616,0.171717
64,095,HYDE,6,33,2136,121,40.333333,2830,0.14876


#### Metric 3: Estimated registration rate of 45+ year olds as of a rolling date (i.e. latest month)


In [13]:
# Calculate estimated registration rate

N_VOTERS_45_PLUS_AS_OF_DATA_DATE = 'N_VOTERS_45_PLUS_AS_OF_'+ data_date_suffix # col name for estimated 18yo voters as of rolling date

EST_REG_RATE_45_PLUS_YO_AS_OF_DATA_DATE = 'EST_REG_RATE_45_PLUS_YO_AS_OF_' + data_date_suffix # col name for estimated 18yo as of rolling date

df_reg_est[EST_REG_RATE_45_PLUS_YO_AS_OF_DATA_DATE] = df_reg_est[N_VOTERS_45_PLUS_AS_OF_DATA_DATE] / df_reg_est[EST_45_PLUS_YO_THIS_YEAR] # estimated registered 18yo over ACS 18yo population estimate

df_reg_est.sort_values(EST_45_PLUS_YO_THIS_YEAR, ascending=False)

Unnamed: 0,COUNTY_FIPS,COUNTY_NAME,N_VOTERS_18_AS_OF_20250503,N_VOTERS_19_AS_OF_20250503,N_VOTERS_45_PLUS_AS_OF_20250503,EST_15_TO_17_YO,EST_17_OR_18_OR_19_YO_2025,EST_45_PLUS_YO_2025,EST_REG_RATE_18_YO_AS_OF_20250503,EST_REG_RATE_45_PLUS_YO_AS_OF_20250503
80,183,WAKE,3326,13306,431805,47558,15852.666667,499266,0.209807,0.86488
12,119,MECKLENBURG,2784,11408,391501,42911,14303.666667,465789,0.194635,0.840511
26,081,GUILFORD,1194,6346,210449,21093,7031.000000,253633,0.169819,0.829738
81,067,FORSYTH,898,4315,149748,15559,5186.333333,186737,0.173147,0.801919
51,021,BUNCOMBE,587,2160,123486,8663,2887.666667,144295,0.203278,0.855788
...,...,...,...,...,...,...,...,...,...,...
14,103,JONES,13,78,4549,392,130.666667,5813,0.09949,0.782556
39,029,CAMDEN,42,122,4883,480,160.000000,5604,0.2625,0.871342
70,075,GRAHAM,17,74,4032,297,99.000000,4616,0.171717,0.873484
64,095,HYDE,6,33,2136,121,40.333333,2830,0.14876,0.75477


#### Output
Write back to BQ

In [14]:
# write
project_id = "tcc-research"
table_id = 'nc_output.' + data_date_suffix+ '_nc_county_scorecard_output'

pandas_gbq.to_gbq(df_reg_est, table_id, project_id=project_id, if_exists='replace')

100%|██████████| 1/1 [00:00<?, ?it/s]


### Statewide Scorecard

In [15]:
# Define table names
voter_file_table = data_date_suffix + "_scorecard_nc"
acs_S0101_table = "S0101_us_states_acs5y_" + acs_year

#### Query from BQ
This query:
* Summarizes the voter file for NY, counting the number of registrants at a given age
* Then, left joins the state estimates for the total number of 17, 18, and 19yos from the ACS
    * The estimates for the total number of 17-19yo yos are derived from the raw estimates of 15-17 yos, **assuming a uniform distribution of population across 15, 16, and 17 year olds.**
    * Since the ACS trails by 2 years, the ACS estimate of 15-17yos is used as a proxy for the number of 17-19yos today. (This means we are intentionally *not* trying to count the college student or "group quarters" population in our denominator)

* Ages are calculated as of the date of the voter file and as of the next general election 

In [26]:
# Define GCP project
project_id = "tcc-research"

# Define query, including variables and column names that adjust with time
sql = """
WITH young_voters AS (
  SELECT  
    *,
    FLOOR(DATE_DIFF(
  DATE '2025-05-03', 
  DATE(CAST(YEAR_OF_BIRTH AS INT64), 1, 1), 
  DAY
  
) / 365.25) AS AGE_TODAY
  FROM `nc_production.20250503_scorecard_nc`
  WHERE VOTER_STATUS IN ('ACTIVE', 'INACTIVE')

), young_voter_sum AS(

SELECT
"37" AS STATE_FIPS ,
COUNTIF(AGE_TODAY = 18) AS N_VOTERS_18_AS_OF_""" + data_date_suffix + """,
COUNTIF(AGE_TODAY = 19) AS N_VOTERS_19_AS_OF_""" + data_date_suffix + """,
COUNTIF(AGE_TODAY >= 45) AS N_VOTERS_45_PLUS_AS_OF_""" + data_date_suffix + """,
FROM young_voters
GROUP BY STATE_FIPS

), acs_state AS(

SELECT 
STATE_FIPS,
EST_15_TO_17_YO,
EST_15_TO_17_YO / 3 AS  """ + EST_17_OR_18_OR_19_YO_THIS_YEAR + """,
EST_45_TO_49_YO + EST_50_TO_54_YO + EST_55_TO_59_YO + EST_55_TO_59_YO + EST_60_AND_OVER AS """ + EST_45_PLUS_YO_THIS_YEAR + """
FROM `tcc-research.acs_sources.S0101_us_states_acs5y_2022` 
WHERE STATE_FIPS = "37"

)

SELECT
young_voter_sum.*,
acs_state.EST_15_TO_17_YO,
acs_state.""" + EST_17_OR_18_OR_19_YO_THIS_YEAR + """,
acs_state.""" + EST_45_PLUS_YO_THIS_YEAR + """,
FROM young_voter_sum LEFT JOIN acs_state ON young_voter_sum.STATE_FIPS = acs_state.STATE_FIPS
"""
# Query
df = pandas_gbq.read_gbq(sql, project_id=project_id)

Downloading: 100%|[32m██████████[0m|


In [27]:
# Preview
df.head()

Unnamed: 0,STATE_FIPS,N_VOTERS_18_AS_OF_20250503,N_VOTERS_19_AS_OF_20250503,N_VOTERS_45_PLUS_AS_OF_20250503,EST_15_TO_17_YO,EST_17_OR_18_OR_19_YO_2025,EST_45_PLUS_YO_2025
0,37,25758,110374,4310598,404849,134949.666667,5139416


In [28]:
df_reg_est = df.copy()

#### Metric 1: Estimated registration rate of 18 year olds as of a rolling date (i.e. latest month)


In [30]:
# Calculate estimated registration rate

N_VOTERS_18_AS_OF_DATA_DATE = 'N_VOTERS_18_AS_OF_'+ data_date_suffix # col name for estimated 18yo voters as of rolling date


EST_REG_RATE_18_YO_AS_OF_DATA_DATE = 'EST_REG_RATE_18_YO_AS_OF_' + data_date_suffix # col name for estimated 18yo as of rolling date

df_reg_est[EST_REG_RATE_18_YO_AS_OF_DATA_DATE] = df_reg_est[N_VOTERS_18_AS_OF_DATA_DATE] / df_reg_est[EST_17_OR_18_OR_19_YO_THIS_YEAR] # estimated registered 18yo over ACS 18yo population estimate

df_reg_est

Unnamed: 0,STATE_FIPS,N_VOTERS_18_AS_OF_20250503,N_VOTERS_19_AS_OF_20250503,N_VOTERS_45_PLUS_AS_OF_20250503,EST_15_TO_17_YO,EST_17_OR_18_OR_19_YO_2025,EST_45_PLUS_YO_2025,EST_REG_RATE_18_YO_AS_OF_20250503
0,37,25758,110374,4310598,404849,134949.666667,5139416,0.190871


#### Metric 3: Estimated registration rate of 45+ year olds as of a rolling date (i.e. latest month)


In [31]:
# Calculate estimated registration rate

N_VOTERS_45_PLUS_AS_OF_DATA_DATE = 'N_VOTERS_45_PLUS_AS_OF_'+ data_date_suffix # col name for estimated 18yo voters as of rolling date

EST_REG_RATE_45_PLUS_YO_AS_OF_DATA_DATE = 'EST_REG_RATE_45_PLUS_YO_AS_OF_' + data_date_suffix # col name for estimated 18yo as of rolling date

df_reg_est[EST_REG_RATE_45_PLUS_YO_AS_OF_DATA_DATE] = df_reg_est[N_VOTERS_45_PLUS_AS_OF_DATA_DATE] / df_reg_est[EST_45_PLUS_YO_THIS_YEAR] # estimated registered 18yo over ACS 18yo population estimate

df_reg_est.sort_values(EST_45_PLUS_YO_THIS_YEAR, ascending=False)

Unnamed: 0,STATE_FIPS,N_VOTERS_18_AS_OF_20250503,N_VOTERS_19_AS_OF_20250503,N_VOTERS_45_PLUS_AS_OF_20250503,EST_15_TO_17_YO,EST_17_OR_18_OR_19_YO_2025,EST_45_PLUS_YO_2025,EST_REG_RATE_18_YO_AS_OF_20250503,EST_REG_RATE_45_PLUS_YO_AS_OF_20250503
0,37,25758,110374,4310598,404849,134949.666667,5139416,0.190871,0.838733


#### Output
Write back to BQ

In [32]:
# write
project_id = "tcc-research"
table_id = 'nc_output.' + data_date_suffix+ '_nc_statewide_scorecard_output'

pandas_gbq.to_gbq(df_reg_est, table_id, project_id=project_id, if_exists='replace')

100%|██████████| 1/1 [00:00<?, ?it/s]
