In [1]:
import pandas as pd
import numpy as np
import csv
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras
from keras.utils import to_categorical

## Voting results based on 2016 Data from MIT
MIT Election Data and Science Lab, 2018, "County Presidential Election Returns 2000-2016", https://doi.org/10.7910/DVN/VOQCHQ, Harvard Dataverse, V6, UNF:6:ZZe1xuZ5H2l4NUiSRcRf8Q== [fileUNF]

In [110]:
pres_results = pd.read_csv("../data/presidential_results_by_county_2000-2016.csv")

In [111]:
cleaned_results = pres_results[pres_results["year"] == 2016].dropna()

In [215]:
test = cleaned_results[["state", "county", "party", "candidatevotes"]]

In [216]:
test = test.groupby(["state", "county"]).agg({'candidatevotes':'max', 'party': 'count'})

In [217]:
test.drop("party", axis=1, inplace=True)

In [218]:
test

Unnamed: 0_level_0,Unnamed: 1_level_0,candidatevotes
state,county,Unnamed: 2_level_1
Alabama,Autauga,18172.0
Alabama,Baldwin,72883.0
Alabama,Barbour,5454.0
Alabama,Bibb,6738.0
Alabama,Blount,22859.0
...,...,...
Wyoming,Sweetwater,12154.0
Wyoming,Teton,7314.0
Wyoming,Uinta,6154.0
Wyoming,Washakie,2911.0


In [233]:
party_votes = cleaned_results[["party", "candidatevotes"]]

In [234]:
party_final = test.merge(cleaned_results, how="inner", on="candidatevotes").drop_duplicates()

In [235]:
party_final.reset_index(inplace=True, drop=True)

In [236]:
party_final.drop(columns={"year", "FIPS", "office", "candidate", "totalvotes", "version", "candidatevotes", "state_po"}, inplace=True)

In [237]:
data = pd.read_csv("../data/county-cases-latest-master.csv")
data.drop(columns={"Unnamed: 0"}, inplace=True)

In [238]:
data.columns

Index(['date', 'state', 'county', 'fips', 'cases', 'deaths', 'code',
       '2019_population_est', 'poverty_est_all_ages',
       'median_household_income', 'less_than_high_school_diploma_2015-19',
       'high_school_diploma_only_2015-19',
       'some_college_or_associate_ degree_2015-19',
       'bachelor_degree_or_higher_2015-19'],
      dtype='object')

In [240]:
analysis_df = data.merge(party_final, left_on=["state", "county"], right_on=["state", "county"]).drop_duplicates()

In [242]:
analysis_df.drop(columns={"date", "fips",}, inplace=True)

In [244]:
analysis_df = analysis_df[["state", "county", "party", "cases", "deaths", "median_household_income", "2019_population_est"]]

In [247]:
analysis_df = pd.get_dummies(analysis_df, columns=["party"])

In [251]:
analysis_df.drop(columns={"party_republican"}, inplace=True)

In [252]:
analysis_df

Unnamed: 0,state,county,cases,deaths,median_household_income,2019_population_est,party_democrat
0,Alabama,Autauga,6364.0,92.0,58233.0,55869.0,0
1,Alabama,Baldwin,19942.0,289.0,59871.0,223234.0,0
2,Alabama,Barbour,2143.0,51.0,35972.0,24686.0,0
3,Alabama,Bibb,2464.0,58.0,47918.0,22394.0,0
4,Alabama,Blount,6256.0,128.0,52902.0,57826.0,0
...,...,...,...,...,...,...,...
3464,Wyoming,Teton,3397.0,9.0,98837.0,23464.0,1
3465,Wyoming,Uinta,2076.0,12.0,70756.0,20226.0,1
3466,Wyoming,Uinta,2076.0,12.0,70756.0,20226.0,0
3467,Wyoming,Washakie,887.0,26.0,55122.0,7805.0,0


# Training  / Testing Model

In [259]:
# PLot each of these factors agains coivd, what trends factor by factor
# Python grid plot
# Possibly just remove some ind. variables w/ high correlation to each other. 
X = analysis_df.iloc[:, 4:]
X.shape

(3460, 3)

In [260]:
y = analysis_df["cases"].values.reshape(-1, 1)
y.shape

(3460, 1)

In [263]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [264]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X_train, y_train)
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.6370047059047098
Testing Score: 0.570606458650952


In [None]:
census_county_data = pd.read_csv("../data/census-bureau-population-by-county.csv")
state_keys = pd.read_csv("../data/state-names-codes.csv")
state_keys.rename(columns={"State": "state"}, inplace=True)

In [None]:
census_county_data["county"] = census_county_data["county"].str.rstrip()
census_county_data = census_county_data.merge(state_keys, how="inner", on="state")
census_county_data.rename(columns={"Code": "code"}, inplace=True)

In [None]:
poverty_income_data = pd.read_csv("../data/poverty-and-median-household-income-data-by-us-county-2019.csv")
poverty_income_data = poverty_income_data.rename(columns={"Postal Code": "code"})
poverty_income_data = poverty_income_data[["code", "county", "Poverty Estimate, All Ages", "Median Household Income"]]

In [None]:
master = poverty_income_data.merge(census_county_data, how="inner", left_on=["code", "county"], right_on=["code", "county"])
master = master[["code", "county", "Poverty Estimate, All Ages", "Median Household Income", "state", "2019_population_est"]]

In [None]:
latest_state_data = pd.read_csv("../data/county-cases-latest-master.csv")
latest_state_data.drop("Unnamed: 0", axis=1, inplace=True)

In [None]:
education = pd.read_csv("../data/county-level-education-stats-2015-2019.csv")
education.rename(columns={"state": "code"}, inplace=True)
education["county"] = education["county"].str.rstrip()

In [None]:
master = master.merge(education, how="inner", left_on=["code", "county"], right_on=["code", "county"])
master.rename(columns={"Poverty Estimate, All Ages": "poverty_est_all_ages", "Median Household Income": "median_household_income"}, inplace=True)

In [None]:
missing = master[~master.county.isin(latest_state_data.county)]
missing = missing[~(missing['county'].isin(state_keys.state))]
missing = pd.get_dummies(missing)

In [None]:
missing

In [None]:
# predict_missing = missing[["2019_population_est", "poverty_est_all_ages", "median_household_income", "less_than_high_school_diploma_2015-19", "high_school_diploma_only_2015-19", "some_college_or_associate_ degree_2015-19", "bachelor_degree_or_higher_2015-19"]]

In [None]:
# predict_missing

In [None]:
predict_counties = model.predict(missing)
predict_counties

In [None]:
county_predictions_df = pd.DataFrame(predict_counties)
county_predictions_df.rename(columns={0: "cases"}, inplace=True)

In [None]:
missing.reset_index(inplace=True)

In [None]:
missing_results = missing.merge(county_predictions_df, how="inner", left_index=True, right_index=True)

In [None]:
missing_results = missing_results[["code", "county", "2019_population_est", "cases"]]

In [None]:
missing_results

In [None]:
data_X = data["median_household_income"].values.reshape(-1,1)
data_y = data["cases"].values.reshape(-1,1)

In [None]:
data_X.shape
data_y.shape

In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(data_X, data_y)

In [None]:
results = model.predict(predict_missing)

In [None]:
pd.DataFrame(results)