# Statistical Analysis & Data Preparation
**Antoni Czolgowski** — CSCI 5502 Data Mining, Spring 2026

This notebook covers data loading, cleaning, integration, and statistical analysis for the swing state election project.

## Step 1 — Setup & Data Inventory

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 200)

In [2]:
ant_raw = pd.read_csv('../data/raw/county_demographics_panel.csv')
will_raw = pd.read_csv('../data/raw/ACSData.csv')
elec_raw = pd.read_csv('../data/raw/countypres_2000-2024.csv')

ant = ant_raw.copy()
will = will_raw.copy()
elec = elec_raw.copy()

print(f"ant:  {ant.shape}")
print(f"will: {will.shape}")
print(f"elec: {elec.shape}")

ant:  (750, 35)
will: (750, 24)
elec: (94151, 12)


In [3]:
ant.info()
ant.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 35 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   county_fips              750 non-null    int64  
 1   gisjoin                  750 non-null    object 
 2   state                    750 non-null    object 
 3   county_name              750 non-null    object 
 4   election_year            750 non-null    int64  
 5   total_population         750 non-null    int64  
 6   median_age               750 non-null    float64
 7   pct_white                750 non-null    float64
 8   pct_black                750 non-null    float64
 9   pct_asian                750 non-null    float64
 10  pct_two_or_more_races    750 non-null    float64
 11  pct_hispanic             750 non-null    float64
 12  pct_non_hispanic_white   750 non-null    float64
 13  pct_hs_or_higher         750 non-null    float64
 14  pct_bachelors_plus       7

Unnamed: 0,county_fips,gisjoin,state,county_name,election_year,total_population,median_age,pct_white,pct_black,pct_asian,pct_two_or_more_races,pct_hispanic,pct_non_hispanic_white,pct_hs_or_higher,pct_bachelors_plus,pct_no_hs_diploma,pct_below_poverty,median_household_income,pct_income_under_25k,pct_income_50k_100k,pct_income_over_100k,unemployment_rate,pct_owner_occupied,pct_renter_occupied,median_gross_rent,median_home_value,pct_drive_alone,pct_carpool,pct_public_transit,pct_work_from_home,pct_family_households,pct_married_couple,pct_living_alone,land_area_sqmi,population_density
0,26001,G2600010,MI,Alcona County,2016,10461,57.4,97.35207,0.248542,0.411051,1.309626,1.386101,96.138037,88.587465,14.45276,11.412535,14.930016,38160,30.484617,28.252564,7.440177,10.297297,88.417454,11.582546,592,97500,82.127396,8.379716,0.21645,5.225727,62.718681,52.785039,33.259602,674.655,15.505703
1,26003,G2600030,MI,Alger County,2016,9396,49.0,85.483184,7.322265,0.276713,4.129417,1.351639,84.514687,89.385783,18.095238,10.614217,13.053835,41270,28.633848,31.284591,10.195165,7.619048,86.425867,13.574133,610,118500,72.304774,10.433133,1.802087,8.662662,64.608214,54.937373,31.255462,915.034,10.268471


In [4]:
will.info()
will.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               750 non-null    int64  
 1   NAME                     750 non-null    object 
 2   Total Population         750 non-null    float64
 3   Pop White Alone          750 non-null    float64
 4   Pop Black Alone          750 non-null    float64
 5   Pop Hispanic or Latino   750 non-null    float64
 6   Median Household Income  750 non-null    float64
 7   Edu Bachelors or Higher  750 non-null    float64
 8   Age 65-66                750 non-null    float64
 9   Age 67-69                750 non-null    float64
 10  Age 70-74                750 non-null    float64
 11  Age 75-79                750 non-null    float64
 12  Age 80-84                750 non-null    float64
 13  Age 85 Plus              750 non-null    float64
 14  Age 18-19                7

Unnamed: 0.1,Unnamed: 0,NAME,Total Population,Pop White Alone,Pop Black Alone,Pop Hispanic or Latino,Median Household Income,Edu Bachelors or Higher,Age 65-66,Age 67-69,Age 70-74,Age 75-79,Age 80-84,Age 85 Plus,Age 18-19,Age 20,Age 21,Age 22-24,Owner Occupied Units,Renter Occupied Units,Foreign Born Pop,state,county,year
0,0,"Pike County, Pennsylvania",56210.0,50416.0,3259.0,5657.0,61199.0,6187.0,733.0,1178.0,1460.0,909.0,709.0,543.0,761.0,342.0,330.0,1015.0,17793.0,3342.0,1657.0,42,103,2016
1,1,"Snyder County, Pennsylvania",40246.0,38891.0,394.0,864.0,51110.0,2824.0,375.0,624.0,746.0,592.0,382.0,266.0,916.0,448.0,415.0,677.0,10600.0,3928.0,279.0,42,109,2016


In [5]:
elec.info()
elec.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94151 entries, 0 to 94150
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           94151 non-null  object 
 1   county_name     94151 non-null  object 
 2   year            94151 non-null  int64  
 3   state_po        94151 non-null  object 
 4   county_fips     94099 non-null  float64
 5   office          94151 non-null  object 
 6   candidate       94151 non-null  object 
 7   party           93650 non-null  object 
 8   candidatevotes  94114 non-null  float64
 9   totalvotes      94151 non-null  int64  
 10  version         94151 non-null  int64  
 11  mode            91356 non-null  object 
dtypes: float64(2), int64(3), object(7)
memory usage: 8.6+ MB


Unnamed: 0,state,county_name,year,state_po,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode
0,ALABAMA,AUTAUGA,2024,AL,1001.0,US PRESIDENT,OTHER,OTHER,293.0,28281,20260211,TOTAL
1,ALABAMA,AUTAUGA,2024,AL,1001.0,US PRESIDENT,CHASE OLIVER,LIBERTARIAN,65.0,28281,20260211,TOTAL


In [6]:
for name, df in [('ant', ant), ('will', will), ('elec', elec)]:
    print(f"\n{'='*60}")
    print(f"{name.upper()} — {df.shape[0]} rows × {df.shape[1]} cols")
    print(f"{'='*60}")
    print(f"Columns: {list(df.columns)}")
    print(f"Dtypes:\n{df.dtypes.value_counts().to_string()}")
    print(f"Nulls: {df.isnull().sum().sum()} total ({df.isnull().any().sum()} cols affected)")


ANT — 750 rows × 35 cols
Columns: ['county_fips', 'gisjoin', 'state', 'county_name', 'election_year', 'total_population', 'median_age', 'pct_white', 'pct_black', 'pct_asian', 'pct_two_or_more_races', 'pct_hispanic', 'pct_non_hispanic_white', 'pct_hs_or_higher', 'pct_bachelors_plus', 'pct_no_hs_diploma', 'pct_below_poverty', 'median_household_income', 'pct_income_under_25k', 'pct_income_50k_100k', 'pct_income_over_100k', 'unemployment_rate', 'pct_owner_occupied', 'pct_renter_occupied', 'median_gross_rent', 'median_home_value', 'pct_drive_alone', 'pct_carpool', 'pct_public_transit', 'pct_work_from_home', 'pct_family_households', 'pct_married_couple', 'pct_living_alone', 'land_area_sqmi', 'population_density']
Dtypes:
float64    26
int64       6
object      3
Nulls: 0 total (0 cols affected)

WILL — 750 rows × 24 cols
Columns: ['Unnamed: 0', 'NAME', 'Total Population', 'Pop White Alone', 'Pop Black Alone', 'Pop Hispanic or Latino', 'Median Household Income', 'Edu Bachelors or Higher', 'A

## Step 2 — Pre-Merge: Cross-Check Antoni vs Will

In [7]:
# build matching FIPS key in Will's data
will['county_fips'] = will['state'].astype(str).str.zfill(2) + will['county'].astype(str).str.zfill(3)
will['county_fips'] = will['county_fips'].astype(int)

ant['key'] = ant['county_fips'].astype(str) + '_' + ant['election_year'].astype(str)
will['key'] = will['county_fips'].astype(str) + '_' + will['year'].astype(str)

keys_ant = set(ant['key'])
keys_will = set(will['key'])

print(f"Shared keys: {len(keys_ant & keys_will)}")
print(f"Only in ant:  {len(keys_ant - keys_will)}")
print(f"Only in will: {len(keys_will - keys_ant)}")

Shared keys: 750
Only in ant:  0
Only in will: 0


In [8]:
# compare overlapping fields on matched rows
merged_check = ant.merge(will, on='key', suffixes=('_ant', '_will'))

comparisons = {
    'total_population': ('total_population', 'Total Population'),
    'median_hh_income': ('median_household_income', 'Median Household Income'),
    'pct_bachelors':    ('pct_bachelors_plus', 'Edu Bachelors or Higher'),  # rate vs count
}

for label, (ant_col, will_col) in comparisons.items():
    a = merged_check[ant_col]
    w = merged_check[will_col]

    if label == 'pct_bachelors':
        # Will has raw count, convert to pct for comparison
        w = (w / merged_check['Total Population']) * 100

    diff = (a - w).abs()
    print(f"\n{label}:")
    print(f"  max abs diff:  {diff.max():.4f}")
    print(f"  mean abs diff: {diff.mean():.4f}")
    print(f"  rows with diff > 1%: {(diff > 1).sum()}")


total_population:
  max abs diff:  0.0000
  mean abs diff: 0.0000
  rows with diff > 1%: 0

median_hh_income:
  max abs diff:  0.0000
  mean abs diff: 0.0000
  rows with diff > 1%: 0

pct_bachelors:
  max abs diff:  44.6009
  mean abs diff: 12.9731
  rows with diff > 1%: 750


In [9]:
merged_check['pop_diff'] = (merged_check['total_population'] - merged_check['Total Population']).abs()
top_diffs = merged_check.nlargest(10, 'pop_diff')[['county_name', 'election_year', 'total_population', 'Total Population', 'pop_diff']]
print(top_diffs.to_string(index=False))

   county_name  election_year  total_population  Total Population  pop_diff
 Alcona County           2016             10461           10461.0       0.0
  Alger County           2016              9396            9396.0       0.0
Allegan County           2016            113666          113666.0       0.0
 Alpena County           2016             28929           28929.0       0.0
 Antrim County           2016             23215           23215.0       0.0
 Arenac County           2016             15327           15327.0       0.0
 Baraga County           2016              8612            8612.0       0.0
  Barry County           2016             59316           59316.0       0.0
    Bay County           2016            106107          106107.0       0.0
 Benzie County           2016             17462           17462.0       0.0


In [10]:
# identify columns unique to Will (not conceptually in Antoni data)
print("Will's unique columns to extract:")
print("  Age brackets (senior):  Age 65-66, 67-69, 70-74, 75-79, 80-84, 85 Plus")
print("  Age brackets (young):   Age 18-19, 20, 21, 22-24")
print("  Foreign Born Pop")

# cleanup temp columns
ant.drop(columns='key', inplace=True, errors='ignore')
will.drop(columns='key', inplace=True, errors='ignore')

Will's unique columns to extract:
  Age brackets (senior):  Age 65-66, 67-69, 70-74, 75-79, 80-84, 85 Plus
  Age brackets (young):   Age 18-19, 20, 21, 22-24
  Foreign Born Pop


## Step 3 — Prepare Election Data

In [11]:
states = ['MICHIGAN', 'NORTH CAROLINA', 'PENNSYLVANIA']
years = [2016, 2020, 2024]

elec = elec[(elec['state'].isin(states)) & (elec['year'].isin(years))].copy()
elec['county_fips'] = pd.to_numeric(elec['county_fips'], errors='coerce').astype('Int64')
elec = elec.dropna(subset=['county_fips', 'candidatevotes'])

# NC has no "TOTAL" mode — aggregate across modes for all states to be safe
elec_agg = (elec
    .groupby(['state', 'county_name', 'county_fips', 'year', 'party'], as_index=False)
    ['candidatevotes'].sum()
)

print(f"After filtering & aggregation: {elec_agg.shape}")
print(f"Unique counties: {elec_agg['county_fips'].nunique()}")
print(f"\nCounties per state-year:")
print(elec_agg.groupby(['state', 'year'])['county_fips'].nunique())

After filtering & aggregation: (2866, 6)
Unique counties: 250

Counties per state-year:
state           year
MICHIGAN        2016     83
                2020     83
                2024     83
NORTH CAROLINA  2016    100
                2020    100
                2024    100
PENNSYLVANIA    2016     67
                2020     67
                2024     67
Name: county_fips, dtype: int64


In [12]:
# pivot to DEM / REP per county-year
major = elec_agg[elec_agg['party'].isin(['DEMOCRAT', 'REPUBLICAN'])]
pivot = major.pivot_table(index=['state', 'county_name', 'county_fips', 'year'],
                          columns='party', values='candidatevotes', aggfunc='sum').reset_index()
pivot.columns.name = None
pivot.rename(columns={'DEMOCRAT': 'dem_votes', 'REPUBLICAN': 'rep_votes'}, inplace=True)

# total votes = sum of ALL parties (not just D+R)
totals = elec_agg.groupby(['county_fips', 'year'])['candidatevotes'].sum().reset_index()
totals.rename(columns={'candidatevotes': 'total_votes'}, inplace=True)

elec_clean = pivot.merge(totals, on=['county_fips', 'year'])
elec_clean['dem_pct'] = elec_clean['dem_votes'] / elec_clean['total_votes'] * 100
elec_clean['rep_pct'] = elec_clean['rep_votes'] / elec_clean['total_votes'] * 100
elec_clean['dem_margin'] = elec_clean['dem_pct'] - elec_clean['rep_pct']

print(f"elec_clean: {elec_clean.shape}")
elec_clean.head()

elec_clean: (750, 10)


Unnamed: 0,state,county_name,county_fips,year,dem_votes,rep_votes,total_votes,dem_pct,rep_pct,dem_margin
0,MICHIGAN,ALCONA,26001,2016,1732.0,4201.0,6198.0,27.944498,67.779929,-39.835431
1,MICHIGAN,ALCONA,26001,2020,2142.0,4848.0,7064.0,30.322763,68.629672,-38.306908
2,MICHIGAN,ALCONA,26001,2024,2140.0,5257.0,7483.0,28.598156,70.252572,-41.654417
3,MICHIGAN,ALGER,26003,2016,1663.0,2585.0,4518.0,36.808322,57.215582,-20.40726
4,MICHIGAN,ALGER,26003,2020,2053.0,3014.0,5128.0,40.035101,58.775351,-18.74025


In [13]:
# verify 250 counties × 3 years
expected = elec_clean.groupby('state')['county_fips'].nunique()
print(expected)
print(f"\nTotal rows: {len(elec_clean)} (expected 750)")

state
MICHIGAN           83
NORTH CAROLINA    100
PENNSYLVANIA       67
Name: county_fips, dtype: int64

Total rows: 750 (expected 750)


## Step 4 — Data Integration: Merge Will's Unique Columns

In [14]:
# compute rates from Will's raw counts
will['pct_senior_65plus'] = (
    will[['Age 65-66','Age 67-69','Age 70-74','Age 75-79','Age 80-84','Age 85 Plus']].sum(axis=1)
    / will['Total Population'] * 100
)
will['pct_young_adult_18_24'] = (
    will[['Age 18-19','Age 20','Age 21','Age 22-24']].sum(axis=1)
    / will['Total Population'] * 100
)
will['pct_foreign_born'] = will['Foreign Born Pop'] / will['Total Population'] * 100

will_to_merge = will[['county_fips', 'year', 'pct_senior_65plus', 'pct_young_adult_18_24', 'pct_foreign_born']]
will_to_merge.head()

Unnamed: 0,county_fips,year,pct_senior_65plus,pct_young_adult_18_24,pct_foreign_born
0,42103,2016,9.841665,4.355097,2.947874
1,42109,2016,7.416886,6.10247,0.693237
2,42115,2016,9.440142,4.011283,0.69803
3,42039,2016,8.399692,4.753697,0.590621
4,42049,2016,6.842258,5.457255,2.486986


In [15]:
ant = ant.merge(will_to_merge, left_on=['county_fips', 'election_year'],
                right_on=['county_fips', 'year'], how='left')
ant.drop(columns='year', inplace=True)

print(f"Data after merge: {ant.shape}")
print(f"New nulls: {ant[['pct_senior_65plus','pct_young_adult_18_24','pct_foreign_born']].isnull().sum().to_string()}")

Data after merge: (750, 38)
New nulls: pct_senior_65plus        0
pct_young_adult_18_24    0
pct_foreign_born         0


## Step 5 — Merge Election Results

In [16]:
df = ant.merge(elec_clean[['county_fips', 'year', 'dem_votes', 'rep_votes', 'total_votes',
                            'dem_pct', 'rep_pct', 'dem_margin']],
               left_on=['county_fips', 'election_year'],
               right_on=['county_fips', 'year'], how='left')
df.drop(columns='year', inplace=True)

print(f"Master dataset: {df.shape}")
print(f"Election nulls: {df[['dem_votes','rep_votes','total_votes']].isnull().sum().to_string()}")
print(f"\nStates: {df['state'].unique()}")
print(f"Years:  {df['election_year'].unique()}")
df.head(2)

Master dataset: (750, 44)
Election nulls: dem_votes      0
rep_votes      0
total_votes    0

States: ['MI' 'NC' 'PA']
Years:  [2016 2020 2024]


Unnamed: 0,county_fips,gisjoin,state,county_name,election_year,total_population,median_age,pct_white,pct_black,pct_asian,pct_two_or_more_races,pct_hispanic,pct_non_hispanic_white,pct_hs_or_higher,pct_bachelors_plus,pct_no_hs_diploma,pct_below_poverty,median_household_income,pct_income_under_25k,pct_income_50k_100k,...,median_gross_rent,median_home_value,pct_drive_alone,pct_carpool,pct_public_transit,pct_work_from_home,pct_family_households,pct_married_couple,pct_living_alone,land_area_sqmi,population_density,pct_senior_65plus,pct_young_adult_18_24,pct_foreign_born,dem_votes,rep_votes,total_votes,dem_pct,rep_pct,dem_margin
0,26001,G2600010,MI,Alcona County,2016,10461,57.4,97.35207,0.248542,0.411051,1.309626,1.386101,96.138037,88.587465,14.45276,11.412535,14.930016,38160,30.484617,28.252564,...,592,97500,82.127396,8.379716,0.21645,5.225727,62.718681,52.785039,33.259602,674.655,15.505703,17.32148,2.619252,0.630915,1732.0,4201.0,6198.0,27.944498,67.779929,-39.835431
1,26003,G2600030,MI,Alger County,2016,9396,49.0,85.483184,7.322265,0.276713,4.129417,1.351639,84.514687,89.385783,18.095238,10.614217,13.053835,41270,28.633848,31.284591,...,610,118500,72.304774,10.433133,1.802087,8.662662,64.608214,54.937373,31.255462,915.034,10.268471,11.430396,5.1192,0.521499,1663.0,2585.0,4518.0,36.808322,57.215582,-20.40726


## Step 6 — Data Quality Assessment

In [17]:
# missing values
na_counts = df.isnull().sum()
print("Columns with missing values:")
print(na_counts[na_counts > 0] if na_counts.sum() > 0 else "None — dataset is complete.")

Columns with missing values:
None — dataset is complete.


In [18]:
# duplicate check
dupes = df.duplicated(subset=['county_fips', 'election_year']).sum()
print(f"Duplicate county-year rows: {dupes}")

Duplicate county-year rows: 0


In [19]:
# dtype cleanup
df['county_fips'] = df['county_fips'].astype(str).str.zfill(5)
df['election_year'] = df['election_year'].astype(int)
df['state'] = df['state'].astype('category')

# drop metadata columns not needed for analysis
df.drop(columns=['gisjoin'], inplace=True)

print(df.dtypes)
print(f"\nFinal shape: {df.shape}")

county_fips                  object
state                      category
county_name                  object
election_year                 int64
total_population              int64
median_age                  float64
pct_white                   float64
pct_black                   float64
pct_asian                   float64
pct_two_or_more_races       float64
pct_hispanic                float64
pct_non_hispanic_white      float64
pct_hs_or_higher            float64
pct_bachelors_plus          float64
pct_no_hs_diploma           float64
pct_below_poverty           float64
median_household_income       int64
pct_income_under_25k        float64
pct_income_50k_100k         float64
pct_income_over_100k        float64
unemployment_rate           float64
pct_owner_occupied          float64
pct_renter_occupied         float64
median_gross_rent             int64
median_home_value             int64
pct_drive_alone             float64
pct_carpool                 float64
pct_public_transit          

## Step 7 — Outlier & Anomaly Detection

In [20]:
numeric_cols = df.select_dtypes(include='number').columns.drop(['election_year', 'dem_votes', 'rep_votes', 'total_votes'])

outlier_report = []
for col in numeric_cols:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_out = ((df[col] < lower) | (df[col] > upper)).sum()
    if n_out > 0:
        outlier_report.append({'column': col, 'n_outliers': n_out, 'pct': round(n_out/len(df)*100, 1),
                               'lower': round(lower, 2), 'upper': round(upper, 2)})

outlier_df = pd.DataFrame(outlier_report).sort_values('n_outliers', ascending=False)
print(outlier_df.to_string(index=False))

                 column  n_outliers  pct      lower     upper
              pct_asian          92 12.3      -0.93      2.66
     population_density          82 10.9    -217.53    495.05
       total_population          72  9.6 -142442.75 305701.25
  pct_young_adult_18_24          69  9.2       2.07      6.41
              pct_black          66  8.8     -18.48     33.91
     pct_public_transit          64  8.5      -0.70      1.54
      median_home_value          36  4.8   -6275.00 320525.00
     pct_work_from_home          34  4.5      -2.54     14.71
     pct_bachelors_plus          33  4.4       0.76     43.71
              pct_white          33  4.4      44.36    123.02
      median_gross_rent          31  4.1     333.00   1251.00
        pct_drive_alone          30  4.0      69.39     90.53
       pct_foreign_born          28  3.7      -2.79      6.57
  pct_two_or_more_races          25  3.3      -2.46      9.38
      unemployment_rate          24  3.2       0.30     12.11
      pc

In [21]:
# decision: flag outliers, don't remove — real data, not errors
# population density and public transit have structural outliers (urban counties)
# these are valid and important for the analysis

top_density = df.nlargest(5, 'population_density')[['county_name', 'state', 'election_year', 'population_density']]
print("Top 5 population density (expected urban outliers):")
print(top_density.to_string(index=False))

Top 5 population density (expected urban outliers):
        county_name state  election_year  population_density
Philadelphia County    PA           2020        11773.914015
Philadelphia County    PA           2024        11760.327564
Philadelphia County    PA           2016        11613.162107
    Delaware County    PA           2024         3151.233896
    Delaware County    PA           2020         3075.644150


## Step 8 — Feature Classification & Transformations

In [22]:
identifiers = ['county_fips', 'state', 'county_name', 'election_year']
continuous = [c for c in df.select_dtypes(include='number').columns if c not in ['election_year']]
categorical = ['state', 'election_year']

print(f"Identifiers:  {len(identifiers)} — {identifiers}")
print(f"Continuous:   {len(continuous)}")
print(f"Categorical:  {len(categorical)}")
print(f"\nRates/pct (ready for cross-county comparison):")
print([c for c in continuous if c.startswith('pct_') or c.endswith('_pct') or c.endswith('_rate')])
print(f"\nRaw counts (use with caution, scale-dependent):")
print([c for c in continuous if c in ['total_population','dem_votes','rep_votes','total_votes']])

Identifiers:  4 — ['county_fips', 'state', 'county_name', 'election_year']
Continuous:   39
Categorical:  2

Rates/pct (ready for cross-county comparison):
['pct_white', 'pct_black', 'pct_asian', 'pct_two_or_more_races', 'pct_hispanic', 'pct_non_hispanic_white', 'pct_hs_or_higher', 'pct_bachelors_plus', 'pct_no_hs_diploma', 'pct_below_poverty', 'pct_income_under_25k', 'pct_income_50k_100k', 'pct_income_over_100k', 'unemployment_rate', 'pct_owner_occupied', 'pct_renter_occupied', 'pct_drive_alone', 'pct_carpool', 'pct_public_transit', 'pct_work_from_home', 'pct_family_households', 'pct_married_couple', 'pct_living_alone', 'pct_senior_65plus', 'pct_young_adult_18_24', 'pct_foreign_born', 'dem_pct', 'rep_pct']

Raw counts (use with caution, scale-dependent):
['total_population', 'dem_votes', 'rep_votes', 'total_votes']


In [26]:
import os
os.makedirs('../data/processed', exist_ok=True)
df.to_csv('../data/processed/master_dataset.csv', index=False)
print(f"Saved: data/processed/master_dataset.csv — {df.shape}")
df.head(2)

Saved: data/processed/master_dataset.csv — (750, 43)


Unnamed: 0,county_fips,state,county_name,election_year,total_population,median_age,pct_white,pct_black,pct_asian,pct_two_or_more_races,pct_hispanic,pct_non_hispanic_white,pct_hs_or_higher,pct_bachelors_plus,pct_no_hs_diploma,pct_below_poverty,median_household_income,pct_income_under_25k,pct_income_50k_100k,pct_income_over_100k,...,median_gross_rent,median_home_value,pct_drive_alone,pct_carpool,pct_public_transit,pct_work_from_home,pct_family_households,pct_married_couple,pct_living_alone,land_area_sqmi,population_density,pct_senior_65plus,pct_young_adult_18_24,pct_foreign_born,dem_votes,rep_votes,total_votes,dem_pct,rep_pct,dem_margin
0,26001,MI,Alcona County,2016,10461,57.4,97.35207,0.248542,0.411051,1.309626,1.386101,96.138037,88.587465,14.45276,11.412535,14.930016,38160,30.484617,28.252564,7.440177,...,592,97500,82.127396,8.379716,0.21645,5.225727,62.718681,52.785039,33.259602,674.655,15.505703,17.32148,2.619252,0.630915,1732.0,4201.0,6198.0,27.944498,67.779929,-39.835431
1,26003,MI,Alger County,2016,9396,49.0,85.483184,7.322265,0.276713,4.129417,1.351639,84.514687,89.385783,18.095238,10.614217,13.053835,41270,28.633848,31.284591,10.195165,...,610,118500,72.304774,10.433133,1.802087,8.662662,64.608214,54.937373,31.255462,915.034,10.268471,11.430396,5.1192,0.521499,1663.0,2585.0,4518.0,36.808322,57.215582,-20.40726


## Step 9 — Basic Statistical Analysis

In [27]:
# summary stats across all numeric columns
desc = df.select_dtypes(include='number').describe().T
desc['skew'] = df.select_dtypes(include='number').skew()
desc['kurtosis'] = df.select_dtypes(include='number').kurtosis()
desc.round(3)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,skew,kurtosis
election_year,750.0,2020.0,3.268,2016.0,2016.0,2020.0,2024.0,2024.0,0.0,-1.502
total_population,750.0,132821.637,234273.181,2102.0,25611.25,54794.0,137647.25,1772259.0,4.108,19.787
median_age,750.0,43.603,5.281,26.4,40.6,43.3,46.775,60.0,0.128,0.688
pct_white,750.0,81.467,16.158,24.174,73.86,88.242,93.524,99.217,-1.254,0.741
pct_black,750.0,10.477,13.61,0.0,1.163,3.938,14.261,62.035,1.679,2.118
pct_asian,750.0,1.252,1.601,0.0,0.419,0.632,1.316,9.234,2.623,7.041
pct_two_or_more_races,750.0,3.792,2.485,0.018,1.981,3.123,4.94,15.287,1.38,2.362
pct_hispanic,750.0,5.606,4.441,0.095,2.17,4.349,7.534,27.977,1.558,2.852
pct_non_hispanic_white,750.0,79.025,16.719,23.252,69.951,85.249,92.255,97.677,-1.121,0.346
pct_hs_or_higher,750.0,88.765,4.408,72.049,86.35,89.547,91.777,97.118,-0.849,0.526


In [28]:
# summary by state
for state in ['MI', 'NC', 'PA']:
    print(f"\n{'='*40} {state} {'='*40}")
    sub = df[df['state'] == state]
    print(sub[['dem_pct', 'rep_pct', 'median_household_income', 'pct_bachelors_plus',
               'population_density', 'pct_non_hispanic_white']].describe().round(2).to_string())


       dem_pct  rep_pct  median_household_income  pct_bachelors_plus  population_density  pct_non_hispanic_white
count   249.00   249.00                   249.00              249.00              249.00                  249.00
mean     37.44    59.71                 55097.24               23.10              190.84                   86.95
std       9.35     9.45                 12955.37                9.14              407.21                    9.00
min      21.25    25.99                 30824.00                9.05                3.89                   47.93
25%      31.47    54.97                 45184.00               16.78               34.11                   82.98
50%      34.99    61.95                 52732.00               20.58               60.32                   90.65
75%      41.72    66.21                 63304.00               26.69              160.96                   93.45
max      72.61    77.21                103039.00               58.71             2896.61       

In [31]:
# correlation matrix — top correlated pairs
corr_cols = [c for c in df.select_dtypes(include='number').columns
             if c not in ['election_year', 'dem_votes', 'rep_votes', 'total_votes']]
corr = df[corr_cols].corr()

# extract top pairs
pairs = (corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
         .stack().reset_index())
pairs.columns = ['var1', 'var2', 'r']
pairs['abs_r'] = pairs['r'].abs()

print("Top 50 correlations:")
print(pairs.nlargest(50, 'abs_r').to_string(index=False))

Top 50 correlations:
                   var1                    var2         r    abs_r
       pct_hs_or_higher       pct_no_hs_diploma -1.000000 1.000000
     pct_owner_occupied     pct_renter_occupied -1.000000 1.000000
                rep_pct              dem_margin -0.997907 0.997907
                dem_pct              dem_margin  0.997889 0.997889
                dem_pct                 rep_pct -0.991600 0.991600
              pct_white  pct_non_hispanic_white  0.990732 0.990732
median_household_income    pct_income_over_100k  0.973989 0.973989
              pct_white               pct_black -0.934407 0.934407
              pct_black  pct_non_hispanic_white -0.926596 0.926596
             median_age       pct_senior_65plus  0.921577 0.921577
median_household_income    pct_income_under_25k -0.907357 0.907357
  pct_family_households        pct_living_alone -0.883489 0.883489
   pct_income_over_100k       median_home_value  0.870118 0.870118
   pct_income_under_25k    pct_income_ove

In [32]:
# panel coherence — compare distributions across election years
print("Mean of key variables by election year:\n")
panel_check = df.groupby('election_year')[['dem_pct', 'median_household_income',
    'pct_bachelors_plus', 'pct_non_hispanic_white', 'population_density']].mean()
print(panel_check.round(2).to_string())

Mean of key variables by election year:

               dem_pct  median_household_income  pct_bachelors_plus  pct_non_hispanic_white  population_density
election_year                                                                                                  
2016             36.55                 46045.69               21.11                   80.37              270.73
2020             38.41                 53480.83               23.41                   79.37              275.36
2024             37.34                 66340.84               25.89                   77.33              280.39


In [33]:
drop_cols = ['pct_renter_occupied', 'pct_no_hs_diploma', 'pct_white', 'land_area_sqmi']
df.drop(columns=drop_cols, inplace=True)

print(f"Dropped: {drop_cols}")
print(f"Shape: {df.shape}")
print(f"\nRemaining columns: {list(df.columns)}")

Dropped: ['pct_renter_occupied', 'pct_no_hs_diploma', 'pct_white', 'land_area_sqmi']
Shape: (750, 39)

Remaining columns: ['county_fips', 'state', 'county_name', 'election_year', 'total_population', 'median_age', 'pct_black', 'pct_asian', 'pct_two_or_more_races', 'pct_hispanic', 'pct_non_hispanic_white', 'pct_hs_or_higher', 'pct_bachelors_plus', 'pct_below_poverty', 'median_household_income', 'pct_income_under_25k', 'pct_income_50k_100k', 'pct_income_over_100k', 'unemployment_rate', 'pct_owner_occupied', 'median_gross_rent', 'median_home_value', 'pct_drive_alone', 'pct_carpool', 'pct_public_transit', 'pct_work_from_home', 'pct_family_households', 'pct_married_couple', 'pct_living_alone', 'population_density', 'pct_senior_65plus', 'pct_young_adult_18_24', 'pct_foreign_born', 'dem_votes', 'rep_votes', 'total_votes', 'dem_pct', 'rep_pct', 'dem_margin']


In [34]:
df.to_csv('../data/processed/master_dataset.csv', index=False)
print(f"Updated: data/processed/master_dataset.csv — {df.shape}")

Updated: data/processed/master_dataset.csv — (750, 39)


## Step 9c — Final Validation

In [35]:
assert df.shape == (750, 39), "Wrong shape"
assert df.duplicated(subset=['county_fips', 'election_year']).sum() == 0, "Duplicates found"
assert df.isnull().sum().sum() == 0, "Nulls found"
assert set(df['election_year'].unique()) == {2016, 2020, 2024}, "Wrong years"
assert df.groupby('state')['county_fips'].nunique().to_dict() == {'MI': 83, 'NC': 100, 'PA': 67}, "Wrong county counts"

# sanity: percentages in 0-100 range
pct_cols = [c for c in df.columns if c.startswith('pct_')]
for col in pct_cols:
    assert df[col].min() >= 0, f"{col} has negative values"
    assert df[col].max() <= 100, f"{col} exceeds 100"

# sanity: votes make sense
assert (df['dem_votes'] + df['rep_votes'] <= df['total_votes']).all(), "Votes don't add up"

print("All checks passed ✓")
print(f"Shape: {df.shape}")
print(f"States: {sorted(df['state'].unique())}")
print(f"Counties: {df['county_fips'].nunique()}")
print(f"Years: {sorted(df['election_year'].unique())}")

All checks passed ✓
Shape: (750, 39)
States: ['MI', 'NC', 'PA']
Counties: 250
Years: [np.int64(2016), np.int64(2020), np.int64(2024)]


  assert df.groupby('state')['county_fips'].nunique().to_dict() == {'MI': 83, 'NC': 100, 'PA': 67}, "Wrong county counts"


## Step 10 — Transformations for Modeling

In [36]:
from sklearn.preprocessing import StandardScaler

df_model = df.copy()

# log transform right-skewed variables
log_cols = ['total_population', 'population_density', 'median_household_income', 'median_home_value', 'median_gross_rent']
for col in log_cols:
    df_model[f'log_{col}'] = np.log1p(df_model[col])
    print(f"{col}: skew {df[col].skew():.2f} → {df_model[f'log_{col}'].skew():.2f}")

df_model.drop(columns=log_cols, inplace=True)

total_population: skew 4.11 → 0.27
population_density: skew 11.09 → 0.36
median_household_income: skew 1.05 → 0.28
median_home_value: skew 1.49 → 0.39
median_gross_rent: skew 1.36 → 0.68


In [37]:
# standardize all continuous features
exclude = ['county_fips', 'state', 'county_name', 'election_year', 'dem_votes', 'rep_votes', 'total_votes']
scale_cols = [c for c in df_model.select_dtypes(include='number').columns if c not in exclude]

scaler = StandardScaler()
df_model[scale_cols] = scaler.fit_transform(df_model[scale_cols])

print(f"Scaled {len(scale_cols)} features")
print(f"Mean check: {df_model[scale_cols].mean().mean():.6f}")
print(f"Std check:  {df_model[scale_cols].std().mean():.6f}")

Scaled 32 features
Mean check: -0.000000
Std check:  1.000667


In [38]:
df_model.to_csv('../data/processed/master_dataset_scaled.csv', index=False)
print(f"Saved: data/processed/master_dataset_scaled.csv — {df_model.shape}")

Saved: data/processed/master_dataset_scaled.csv — (750, 39)
