# Machine Learning
This notebook houses logic for training models and performing machine learning.


## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline

In [2]:
data_dir = "data/processed/"

# Import precinct data
df_precincts = pd.read_csv(data_dir + "precincts_with_registration.csv")
df_precincts.drop("Unnamed: 0", axis=1, inplace=True)

# Import election total data
df_election_totals = pd.read_csv(data_dir + "election_totals.csv")
df_election_totals.set_index("year", inplace=True)

df_registration_totals = pd.read_csv(data_dir + "registration_totals.csv")
df_registration_totals.set_index("year", inplace=True)

df_elections_with_registration = df_election_totals.join(df_registration_totals, how="inner")


## Using Precinct Data to Predict a Democratic Assembly Candidate's Performance


#### Normalizing characteristics of training and test data

In [3]:
party_labels = ["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]

# Normalize party registration per year.
df_elections_with_registration[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]] = \
    df_elections_with_registration[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]].div(df_elections_with_registration["totreg_r"], axis=0)

# Normalize assembly candidate performance
df_elections_with_registration[["ASSDEM01", "ASSREP01"]] = \
    df_elections_with_registration[["ASSDEM01", "ASSREP01"]].div(df_elections_with_registration[["ASSDEM01", "ASSREP01"]].sum(axis=1), axis=0)

# Print the actual data we're hoping to predict
print("Actual Results: ", df_elections_with_registration["ASSDEM01"])

# Normalize party registration per year.
df_precincts[party_labels] = \
    df_precincts[party_labels].div(df_precincts["totreg_r"], axis=0)

# Normalize assembly candidate performance
df_precincts[["ASSDEM01", "ASSREP01"]] = \
    df_precincts[["ASSDEM01", "ASSREP01"]].div(df_precincts[["ASSDEM01", "ASSREP01"]].sum(axis=1), axis=0)


# Dem age and gender cols
dem_cols = [col for col in df_registration_totals.columns if col.startswith('dem') and len(col) > 3]
# Rep age and gender cols
rep_cols = [col for col in df_registration_totals.columns if col.startswith('rep') and len(col) > 3]
# Dcl age and gender cols
dcl_cols = [col for col in df_registration_totals.columns if col.startswith('dcl') and len(col) > 3]

# Normalize party affiliation, gender, and age groups.
df_precincts[dem_cols] = df_precincts[dem_cols].div(df_precincts["dem"], axis=0)
df_precincts[rep_cols] = df_precincts[rep_cols].div(df_precincts["rep"], axis=0)
df_precincts[dcl_cols] = df_precincts[dcl_cols].div(df_precincts["dcl"], axis=0)

df_elections_with_registration[dem_cols] = df_elections_with_registration[dem_cols].div(df_elections_with_registration["dem"], axis=0)
df_elections_with_registration[rep_cols] = df_elections_with_registration[rep_cols].div(df_elections_with_registration["rep"], axis=0)
df_elections_with_registration[dcl_cols] = df_elections_with_registration[dcl_cols].div(df_elections_with_registration["dcl"], axis=0)


party_gender_age = [dem_cols, rep_cols , dcl_cols]

# Determine cols for different demographics
chi_cols = [col for col in df_registration_totals.columns if col.startswith('chi')]
jpn_cols = [col for col in df_registration_totals.columns if col.startswith('jpn')]
hisp_cols = [col for col in df_registration_totals.columns if col.startswith('hisp')]
ai_cols = [col for col in df_registration_totals.columns if col.startswith('ai')]
kor_cols = [col for col in df_registration_totals.columns if col.startswith('kor')]
viet_cols = [col for col in df_registration_totals.columns if col.startswith('viet')]
fil_cols = [col for col in df_registration_totals.columns if col.startswith('fil')]
jew_cols = [col for col in df_registration_totals.columns if col.startswith('jew')]

demos = [chi_cols, jpn_cols, hisp_cols, ai_cols, kor_cols, viet_cols, fil_cols, jew_cols]

# Normalize accross each race demographic
for demo in demos:
    df_precincts[demo] = df_precincts[demo].div(df_precincts[demo].sum(axis=1), axis=0)
    df_precincts[demo] = df_precincts[demo].fillna(0)
    df_elections_with_registration[demo] = df_elections_with_registration[demo].div(df_elections_with_registration[demo].sum(axis=1), axis=0)
    df_elections_with_registration[demo] = df_elections_with_registration[demo].fillna(0)




Actual Results:  year
2012    0.386974
2014    0.373254
2016    0.453021
2018    0.454786
2020    0.449142
Name: ASSDEM01, dtype: float64


In [4]:
# Compile a list of different combinations of training labels. They'll be used to determine/construct the best model for fitting our data.
voter_reg_train_labels = party_labels
party_gender_age_labels = list(np.concatenate(party_gender_age).flat)
demo_reg_labels = list(np.concatenate(demos).flat)

voter_and_party_labels = list(np.concatenate([voter_reg_train_labels, demo_reg_labels]).flat)
party_and_demo_labels = list(np.concatenate([demo_reg_labels, party_gender_age_labels]).flat)
voter_and_demo_labels = list(np.concatenate([voter_reg_train_labels, demo_reg_labels]).flat)

all_labels = list(np.concatenate([voter_reg_train_labels, party_gender_age_labels, demo_reg_labels]).flat)

# Create every combination of labels to evaluate.
labels = [ voter_reg_train_labels, party_gender_age_labels, demo_reg_labels,
    voter_and_party_labels, party_and_demo_labels, voter_and_demo_labels, all_labels
]

y_train = df_precincts["ASSDEM01"]
y_test = df_elections_with_registration["ASSDEM01"]


#### Determining the best model and training parameters
Below is a function that accepts a number of neighbors a scaling method and a list of training labels, to determine the mean squared error of a resulting model.

In [5]:
def make_and_test_pipeline(neighbors, scaleMethod, train_labels):

    X_train = df_precincts[train_labels]
    X_test = df_elections_with_registration[train_labels]

    pipeline = make_pipeline(
        scaleMethod,
        KNeighborsRegressor(n_neighbors=neighbors)
    )

    pipeline.fit(X=X_train, y=y_train)

    _y_test = pipeline.predict(X=X_test)
    mse = mean_squared_error(_y_test, y_test)

    return mse

best_k = best_method = None
minMSE = 10000
best_labels = None

for label_group in labels:
    for n in [5, 10, 20, 30]:
        for sM in [StandardScaler(), MinMaxScaler(), Normalizer()]:
            best_n = n if n == None else best_k 
            best_method = sM if best_method == None else best_method
            
            curMSE = make_and_test_pipeline(n, sM, label_group)

            if curMSE < minMSE:
                best_k = n
                bset_method = sM 
                best_labels = label_group
                minMSE = curMSE

print("Best combination of labels: {}, Best k: {}, Best Scaling Method: {} produced a model with a MSE of {}".format(best_labels, best_n, best_method, minMSE))


Best combination of labels: ['dem', 'rep', 'aip', 'paf', 'msc', 'lib', 'nlp', 'grn', 'ref', 'dcl'], Best k: 5, Best Scaling Method: StandardScaler() produced a model with a MSE of 0.00018202591438314287


Given a range of possible k values, and different scaling methods, we found that the combination that minimized the mean squared error of our model, was a KNeighborsRegressor with 5 neighbors, and with training labels that only took into consideration the registration proportions of a precinct.

### Using the model to predict general election results

In [6]:
# Use the model to predict election results. 
X_train = df_precincts[best_labels]
X_test = df_elections_with_registration[best_labels]

pipeline = make_pipeline(
    best_method,
    KNeighborsRegressor(n_neighbors=best_n)
)

pipeline.fit(X=X_train, y=y_train)

_y_test = pipeline.predict(X=X_test)

df_predicted = pd.DataFrame(data=_y_test, index=["2012", "2014", "2016", "2018", "2020"])

print("Predicted Results: ", df_predicted)


Predicted Results:               0
2012  0.360598
2014  0.400715
2016  0.436573
2018  0.439017
2020  0.428631


### Finding Scenarios That Would Predict Democratic Gains

Working from 2020 results: Say Democrats wished to reach people who've previously registered as "Decline to State", in hopes of expanding their margins. What percent of DCL's would Democrats have to convert, before our model would predict a favorable outcome for the Democratic assembly candidate?

In [7]:
# Assume we start from the previous election registration.
prev_performance = df_elections_with_registration.loc[2020]

high_dem_registration = prev_performance[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]]

point_swing = 0.0
while (pipeline.predict(X=[high_dem_registration])[0] < 0.5):
    point_swing = point_swing + 0.01
    high_dem_registration["dem"] = high_dem_registration["dem"] + point_swing
    high_dem_registration["dcl"] = high_dem_registration["dcl"] - point_swing

pred = pipeline.predict(X=[high_dem_registration])
print("{} point gain predicted results: {}%".format(point_swing, pred[0]))


0.03 point gain predicted results: 0.5314638424486151%


Answer: Our model predicts that about 3% of DCL's would need to be re-registered as a Democrat, before we could see a favorable outcome for the Democratic assembly candidate.

Say Democrats decide they wish to make make inroads with registered Republican voters: what percent of registered Republicans would need to be flipped for the Democratic nominee to take a lead in the vote share.

In [8]:
# Assume we start from the previous election registration.
prev_performance = df_elections_with_registration.loc[2020]

high_dem_registration = prev_performance[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]]

point_swing = 0.0
while (pipeline.predict(X=[high_dem_registration])[0] < 0.5):
    point_swing = point_swing + 0.005
    high_dem_registration["dem"] = high_dem_registration["dem"] + point_swing
    high_dem_registration["rep"] = high_dem_registration["rep"] - point_swing

pred = pipeline.predict(X=[high_dem_registration])

print("{} point gain predicted results: {}%".format(point_swing, pred[0]))


0.03 point gain predicted results: 0.5516549857516135%


Answer: Our model predicts that flipping 3% of registered Republicans would lead to the Democratic candidate winning the majority of the vote.