# Machine Learning
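This notebook houses tasks related to machine learning on election data: predicting Democratic candidate performance from party registration and from voter demographics, and finding model precincts.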


In [106]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

In [107]:
data_dir = "data/processed/"

# Precinct-level table. The CSV carries an "Unnamed: 0" column
# (presumably an index written out by an earlier to_csv -- TODO confirm),
# which is discarded right after loading.
df_precincts = pd.read_csv(data_dir + "precincts_with_registration.csv")
df_precincts = df_precincts.drop(columns="Unnamed: 0")

# Per-year election totals and per-year registration totals, both keyed by "year".
df_election_totals = pd.read_csv(data_dir + "election_totals.csv").set_index("year")
df_registration_totals = pd.read_csv(data_dir + "registration_totals.csv").set_index("year")

# Inner join on the "year" index: only years present in both tables are kept.
df_elections_with_registration = df_election_totals.join(df_registration_totals, how="inner")
df_elections_with_registration


Unnamed: 0_level_0,ABSVOTE,AIPREG,AIPVOTE,ASSDEM01,ASSDEM02,ASSIND01,ASSPAF01,ASSREP01,ASSREP02,CNGDEM01,...,rreg5g,rreg6g,rreg7g,rreg8g,rreg9g,totreg_r,vietdcl,vietdem,vietoth,vietrep
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012,0.0,0.0,0.0,65500.0,0.0,0.0,0.0,103762.0,0.0,86907.0,...,10967.0,4656.0,5996.0,3922.0,23554.0,219403.0,70.0,82.0,18.0,89.0
2014,0.0,0.0,0.0,46126.0,0.0,0.0,,77452.0,0.0,59214.0,...,4937.0,9098.0,3954.0,5177.0,24026.0,216298.0,93.0,94.0,24.0,80.0
2016,0.0,0.0,0.0,87168.0,0.0,0.0,,105247.0,0.0,89992.0,...,6849.0,3395.0,7910.0,3179.0,25474.0,249413.0,143.0,174.0,21.0,92.0
2018,0.0,0.0,0.0,56257.0,0.0,0.0,,67443.0,,91599.0,...,3537.0,4856.0,2506.0,5984.0,22393.0,248773.0,158.0,165.0,23.0,93.0
2020,0.0,0.0,0.0,103206.0,0.0,0.0,,126579.0,0.0,120883.0,...,3025.0,1985.0,2737.0,1425.0,17286.0,279110.0,164.0,232.0,51.0,152.0


## Using Party Registration to Predict Democratic Candidate Performance


In [108]:
# Convert raw counts in the per-year frame into shares.
# Both steps assign back into df_elections_with_registration, so this cell is
# meant to run once after the load cell: running it again would divide the
# registration shares by totreg_r a second time.
registration_cols = ["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]
assembly_cols = ["ASSDEM01", "ASSREP01"]

# Normalize party registration per year (each column divided by that year's totreg_r).
df_elections_with_registration[registration_cols] = \
    df_elections_with_registration[registration_cols].div(df_elections_with_registration["totreg_r"], axis=0)

# Normalize assembly candidate performance (each column divided by ASSDEM01 + ASSREP01 for the row).
df_elections_with_registration[assembly_cols] = \
    df_elections_with_registration[assembly_cols].div(df_elections_with_registration[assembly_cols].sum(axis=1), axis=0)

# A cell only renders its final bare expression, so the registration table was
# silently dropped before. `display` (provided by the IPython kernel) shows both.
display(df_elections_with_registration[registration_cols])
display(df_elections_with_registration[assembly_cols])

Unnamed: 0_level_0,ASSDEM01,ASSREP01
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2012,0.386974,0.613026
2014,0.373254,0.626746
2016,0.453021,0.546979
2018,0.454786,0.545214
2020,0.449142,0.550858


In [109]:
# Same share conversion as for the per-year frame, applied to the precinct rows.
precinct_registration_cols = ["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]
precinct_assembly_cols = ["ASSDEM01", "ASSREP01"]

# Registration columns: divide every row by that row's totreg_r.
precinct_registration_shares = df_precincts[precinct_registration_cols].div(
    df_precincts["totreg_r"], axis="index"
)
df_precincts[precinct_registration_cols] = precinct_registration_shares

# Normalize assembly candidate performance: divide by the row-wise ASSDEM01 + ASSREP01 total.
precinct_assembly_totals = df_precincts[precinct_assembly_cols].sum(axis="columns")
df_precincts[precinct_assembly_cols] = df_precincts[precinct_assembly_cols].div(
    precinct_assembly_totals, axis="index"
)

In [110]:
# Train on the precinct rows and evaluate against the per-year totals,
# using the normalized registration columns as features and ASSDEM01 as the target.
train_labels = ["dem","rep","aip","paf","msc","lib","nlp","grn","ref","dcl"]
X_train = df_precincts[train_labels]
X_test = df_elections_with_registration[train_labels]

y_train = df_precincts["ASSDEM01"]
y_test = df_elections_with_registration["ASSDEM01"]

# StandardScaler runs before the 20-nearest-neighbour regressor inside one pipeline,
# so the scaling fitted on X_train is the one applied to X_test at predict time.
pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=20)
)

pipeline.fit(X=X_train, y=y_train)

# Predict with the pipeline fitted just above. This previously called
# `model.predict`, but no `model` is defined in the cells shown here: on a fresh
# kernel that raises NameError, and with a stale `model` left in the kernel the
# predictions would come from a different estimator than the one fitted here.
_y_test = pipeline.predict(X=X_test)
print(_y_test)
mean_squared_error(y_test, _y_test)


[0.3922748  0.39704532 0.41196982 0.42671006 0.43819921]


0.000637456747098332

## Using Voter Demographics To Predict Democratic Candidate Performance


In [111]:
# The registration columns of df_precincts were already divided by totreg_r in the
# party-registration section above, and that result was assigned back into
# df_precincts. Repeating the division here would turn each share into
# share / totreg_r, so the registration normalization is intentionally not re-applied.

# Normalize assembly candidate performance
# (kept from the original cell; on rows that are already two-party shares the
# row sum is ~1, so this leaves the values effectively unchanged).
df_precincts[["ASSDEM01", "ASSREP01"]] = \
    df_precincts[["ASSDEM01", "ASSREP01"]].div(df_precincts[["ASSDEM01", "ASSREP01"]].sum(axis=1), axis=0)

## Finding Model Precincts