# Machine Learning
This notebook houses logic for training models and performing machine learning.


## Imports

In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline

In [2]:
data_dir = "data/processed/"

# Import precinct data
df_precincts = pd.read_csv(data_dir + "precincts_with_registration.csv")
df_precincts.drop("Unnamed: 0", axis=1, inplace=True)

# Import election total data
df_election_totals = pd.read_csv(data_dir + "election_totals.csv")
df_election_totals.set_index("year", inplace=True)

df_registration_totals = pd.read_csv(data_dir + "registration_totals.csv")
df_registration_totals.set_index("year", inplace=True)

df_elections_with_registration = df_election_totals.join(df_registration_totals, how="inner")
df_elections_with_registration


Unnamed: 0_level_0,ABSVOTE,AIPREG,AIPVOTE,ASSDEM01,ASSDEM02,ASSIND01,ASSPAF01,ASSREP01,ASSREP02,CNGDEM01,...,rreg5g,rreg6g,rreg7g,rreg8g,rreg9g,totreg_r,vietdcl,vietdem,vietoth,vietrep
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012,0.0,0.0,0.0,65500.0,0.0,0.0,0.0,103762.0,0.0,86907.0,...,10967.0,4656.0,5996.0,3922.0,23554.0,219403.0,70.0,82.0,18.0,89.0
2014,0.0,0.0,0.0,46126.0,0.0,0.0,,77452.0,0.0,59214.0,...,4937.0,9098.0,3954.0,5177.0,24026.0,216298.0,93.0,94.0,24.0,80.0
2016,0.0,0.0,0.0,87168.0,0.0,0.0,,105247.0,0.0,89992.0,...,6849.0,3395.0,7910.0,3179.0,25474.0,249413.0,143.0,174.0,21.0,92.0
2018,0.0,0.0,0.0,56257.0,0.0,0.0,,67443.0,,91599.0,...,3537.0,4856.0,2506.0,5984.0,22393.0,248773.0,158.0,165.0,23.0,93.0
2020,0.0,0.0,0.0,103206.0,0.0,0.0,,126579.0,0.0,120883.0,...,3025.0,1985.0,2737.0,1425.0,17286.0,279110.0,164.0,232.0,51.0,152.0


## Using Party Registration to Predict Democratic Candidate Performance


In [3]:
# Normalize party registration per year.
df_elections_with_registration[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]] = \
    df_elections_with_registration[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]].div(df_elections_with_registration["totreg_r"], axis=0)

# Normalize assembly candidate performance
df_elections_with_registration[["ASSDEM01", "ASSREP01"]] = \
    df_elections_with_registration[["ASSDEM01", "ASSREP01"]].div(df_elections_with_registration[["ASSDEM01", "ASSREP01"]].sum(axis=1), axis=0)


# Print the actual data we're hoping to pr
print("Actual Results: ", df_elections_with_registration["ASSDEM01"])

Actual Results:  year
2012    0.386974
2014    0.373254
2016    0.453021
2018    0.454786
2020    0.449142
Name: ASSDEM01, dtype: float64


In [4]:
df_precincts[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]] = \
    df_precincts[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]].div(df_precincts["totreg_r"], axis=0)

# Normalize assembly candidate performance
df_precincts[["ASSDEM01", "ASSREP01"]] = \
    df_precincts[["ASSDEM01", "ASSREP01"]].div(df_precincts[["ASSDEM01", "ASSREP01"]].sum(axis=1), axis=0)

In [5]:
# 
train_labels = ["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]
X_train = df_precincts[train_labels]
X_test = df_elections_with_registration[train_labels]

y_train = df_precincts["ASSDEM01"]
y_test = df_elections_with_registration["ASSDEM01"]


### Determining the best model
Below is a function that accepts a number of neighbors and a scaling method, to determine the mean squared error of a resulting model.

In [11]:
def make_and_test_pipeline(neighbors, scaleMethod):

    pipeline = make_pipeline(
        scaleMethod,
        KNeighborsRegressor(n_neighbors=20)
    )

    pipeline.fit(X=X_train, y=y_train)

    _y_test = pipeline.predict(X=X_test)
    mse = mean_squared_error(_y_test, y_test)

    return mse

best_k = best_method = None
minMSE = 10000

for n in [5, 10, 20, 30]:
    for sM in [StandardScaler(), MinMaxScaler(), Normalizer()]:
        best_n = n if n == None else best_k 
        best_method = sM if best_method == None else best_method
        
        curMSE = make_and_test_pipeline(n, sM)

        if curMSE < minMSE:
            best_k = n
            bset_method = sM 
            minMSE = curMSE

print("Best k: {}, Best Scaling Method: {}".format(best_n, best_method))


Best k: 5, Best Scaling Method: StandardScaler()


Given a range of possible k values, and different scaling methods, we found that the combination that minimized the mean squared error of our model, was a KNeighborsRessor that used 5 

### Using the model to predict general election results

In [7]:
# Use the model to predict election results. 

pipeline = make_pipeline(
    best_method,
    KNeighborsRegressor(n_neighbors=best_n)
)

pipeline.fit(X=X_train, y=y_train)

_y_test = pipeline.predict(X=X_test)

print("Mean Squared Error {}".format(minMSE))

df_predicted = pd.DataFrame(data=_y_test, index=["2012", "2014", "2016", "2018", "2020"])

print("Predicted Results: ", df_predicted)


Mean Squared Error 0.0005701971959409406
Predicted Results:               0
2012  0.368530
2014  0.396668
2016  0.432859
2018  0.442137
2020  0.430095


### Finding Scenarios That Would Predict Democratic Gains

Working from 2020 results: Say Democrats registered 2.5 percent of DCL's, to roughly match the average registraion percents of Republicans. What would the Democrat's predicted share of the vote be?

In [8]:
# Assume we start from the previous election registration.
prev_performance = df_elections_with_registration.loc[2020]

high_dem_registration = prev_performance[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]]

high_dem_registration["dem"] = high_dem_registration["dem"] + 0.025
high_dem_registration["dcl"] = high_dem_registration["dcl"] - 0.025

pred = pipeline.predict(X=[high_dem_registration])

print("2.5 point gain predicted results: {}%".format(pred[0]))

high_dem_registration["dem"] = high_dem_registration["dem"] + 0.08
high_dem_registration["dcl"] = high_dem_registration["dcl"] - 0.08

pred = pipeline.predict(X=[high_dem_registration])
print("3.3 point gain predicted results: {}%".format(pred[0]))

2.5 point gain predicted results: 0.42046123220333714%
3.3 point gain predicted results: 0.5089716618188358%


Answer: Not quite enough. However, a 3.3 percent improvement from 2020 turnout might be enought to tip the scales.

Say Democrats decide they wish to make make inroads with registered Republican voters: what percent of registered Republicans would need to be flipped for the Democratic nominee to take a lead in the vote share.

In [9]:
# Assume we start from the previous election registration.
prev_performance = df_elections_with_registration.loc[2020]

high_dem_registration = prev_performance[["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]]

point_swing = 0.0
while (pipeline.predict(X=[high_dem_registration])[0] < 0.5):
    point_swing = point_swing + 0.01
    high_dem_registration["dem"] = high_dem_registration["dem"] + point_swing
    high_dem_registration["rep"] = high_dem_registration["rep"] - point_swing

pred = pipeline.predict(X=[high_dem_registration])

print("{} point gain predicted results: {}%".format(point_swing, pred[0]))


0.03 point gain predicted results: 0.5288883286378189%


Answer: Our model predicts that flipping 3% of registered Republicans would lead to the Democratic candidate winning the majority of the vote.