In [1]:
import pandas as pd
import numpy as np
import csv
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras
from keras.utils import to_categorical

## Voting results based on 2016 Data from MIT
MIT Election Data and Science Lab, 2018, "County Presidential Election Returns 2000-2016", https://doi.org/10.7910/DVN/VOQCHQ, Harvard Dataverse, V6, UNF:6:ZZe1xuZ5H2l4NUiSRcRf8Q== [fileUNF]

In [231]:
pres_results = pd.read_csv("../data/presidential_results_by_county_2000-2016.csv")

In [232]:
cleaned_results = pres_results[pres_results["year"] == 2016].dropna()

In [233]:
cleaned_results.rename(columns={"FIPS": "fips"}, inplace=True)

In [234]:
counties_df = cleaned_results.drop_duplicates(subset=["state", "county"])[["state", "state_po", "county"]]

In [235]:
cleaned_results = cleaned_results[["state", "county", "fips", "party", "candidatevotes"]]

In [236]:
cleaned_results

Unnamed: 0,state,county,fips,party,candidatevotes
40517,Alabama,Autauga,1001.0,democrat,5936.0
40518,Alabama,Autauga,1001.0,republican,18172.0
40520,Alabama,Baldwin,1003.0,democrat,18458.0
40521,Alabama,Baldwin,1003.0,republican,72883.0
40523,Alabama,Barbour,1005.0,democrat,4871.0
...,...,...,...,...,...
50513,Alaska,District 38,2038.0,republican,1143.0
50515,Alaska,District 39,2039.0,democrat,3142.0
50516,Alaska,District 39,2039.0,republican,1405.0
50518,Alaska,District 40,2040.0,democrat,2338.0


In [237]:
voting_df = cleaned_results.groupby(["state", "county", "fips"]).agg({'candidatevotes':'max', 'party': 'count'})
voting_df.drop("party", axis=1, inplace=True)
voting_df.reset_index(inplace=True)

In [238]:
voting_df

Unnamed: 0,state,county,fips,candidatevotes
0,Alabama,Autauga,1001.0,18172.0
1,Alabama,Baldwin,1003.0,72883.0
2,Alabama,Barbour,1005.0,5454.0
3,Alabama,Bibb,1007.0,6738.0
4,Alabama,Blount,1009.0,22859.0
...,...,...,...,...
3148,Wyoming,Sweetwater,56037.0,12154.0
3149,Wyoming,Teton,56039.0,7314.0
3150,Wyoming,Uinta,56041.0,6154.0
3151,Wyoming,Washakie,56043.0,2911.0


In [239]:
party_votes = cleaned_results[["state", "county", "fips", "candidatevotes"]]
political_leaning_df = pd.merge(voting_df, cleaned_results, how="inner", on=["state", "county", "candidatevotes"])

In [240]:
political_leaning_df = pd.get_dummies(political_leaning_df, columns=["party"])

In [241]:
political_leaning_df.drop(columns={"party_democrat"}, inplace=True)

In [242]:
political_leaning_df.drop(columns={"fips_y"}, inplace=True)
political_leaning_df.rename(columns={"fips_x": "fips"}, inplace=True)

In [243]:
data = pd.read_csv("../data/county-cases-latest-master.csv")
data.drop(columns={"Unnamed: 0"}, inplace=True)

In [244]:
data.drop(columns={"date", "deaths", "code", "poverty_est_all_ages", "less_than_high_school_diploma_2015-19", "high_school_diploma_only_2015-19", "some_college_or_associate_ degree_2015-19", "bachelor_degree_or_higher_2015-19"}, inplace=True)

In [245]:
data

Unnamed: 0,state,county,fips,cases,2019_population_est,median_household_income
0,Alabama,Autauga,1001.0,6364.0,55869.0,58233.0
1,Alabama,Baldwin,1003.0,19942.0,223234.0,59871.0
2,Alabama,Barbour,1005.0,2143.0,24686.0,35972.0
3,Alabama,Bibb,1007.0,2464.0,22394.0,47918.0
4,Alabama,Blount,1009.0,6256.0,57826.0,52902.0
...,...,...,...,...,...,...
3102,Wyoming,Sweetwater,56037.0,3853.0,42343.0,80639.0
3103,Wyoming,Teton,56039.0,3397.0,23464.0,98837.0
3104,Wyoming,Uinta,56041.0,2076.0,20226.0,70756.0
3105,Wyoming,Washakie,56043.0,887.0,7805.0,55122.0


In [246]:
analysis_df = political_leaning_df.merge(data, how="inner")

In [247]:
analysis_df = analysis_df[["state", "county", "fips", "cases", "party_republican", "candidatevotes", "2019_population_est", "median_household_income"]]

In [248]:
analysis_df.head()

Unnamed: 0,state,county,fips,cases,party_republican,candidatevotes,2019_population_est,median_household_income
0,Alabama,Autauga,1001.0,6364.0,1,18172.0,55869.0,58233.0
1,Alabama,Baldwin,1003.0,19942.0,1,72883.0,223234.0,59871.0
2,Alabama,Barbour,1005.0,2143.0,1,5454.0,24686.0,35972.0
3,Alabama,Bibb,1007.0,2464.0,1,6738.0,22394.0,47918.0
4,Alabama,Blount,1009.0,6256.0,1,22859.0,57826.0,52902.0


In [516]:
X = analysis_df.iloc[:,4:]
X.shape

(3039, 4)

In [517]:
y = analysis_df["cases"].values.reshape(-1, 1)
y.shape

(3039, 1)

# Training  / Testing Model

In [518]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [519]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X_train, y_train)
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.8822114911410123
Testing Score: 0.794706584622583


## Find Missing Data

In [345]:
# missing = analysis_df[~analysis_df[["state", "county"]].isin(counties_df[["state", "county"]])]
# missing = missing[~(missing['county'].isin(state_keys.state))]
# missing = pd.get_dummies(missing)

In [385]:
analysis_df.head()

Unnamed: 0,state,county,fips,cases,party_republican,candidatevotes,2019_population_est,median_household_income
0,Alabama,Autauga,1001.0,6364.0,1,18172.0,55869.0,58233.0
1,Alabama,Baldwin,1003.0,19942.0,1,72883.0,223234.0,59871.0
2,Alabama,Barbour,1005.0,2143.0,1,5454.0,24686.0,35972.0
3,Alabama,Bibb,1007.0,2464.0,1,6738.0,22394.0,47918.0
4,Alabama,Blount,1009.0,6256.0,1,22859.0,57826.0,52902.0


In [386]:
missing = political_leaning_df[~political_leaning_df.county.isin(analysis_df.county)]
missing.head()

Unnamed: 0,state,county,fips,candidatevotes,party_republican
67,Alaska,District 1,2001.0,3180.0,1
68,Alaska,District 10,2010.0,6255.0,1
69,Alaska,District 11,2011.0,6444.0,1
70,Alaska,District 12,2012.0,6629.0,1
71,Alaska,District 13,2013.0,4028.0,1


In [395]:
fips = pd.read_csv("../data/fips-codes.csv")
fips = fips[["fips", "county_name", "state_name"]]
fips.rename(columns={"county_name": "county"}, inplace=True)

In [391]:
population_data = pd.read_csv("../data/census-bureau-population-by-county.csv")

In [403]:
fips_final = population_data.merge(fips).drop_duplicates(subset=["state", "county"])
fips_final.drop(columns={"state_name"}, inplace=True)

In [414]:
missing["fips"] = missing["fips"].astype("int64")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [422]:
missing_df = pd.merge(missing, fips, how="inner", on="fips")

In [423]:
missing_df

Unnamed: 0,state,county_x,fips,candidatevotes,party_republican,county_y,state_name
0,Alaska,District 13,2013,4028.0,1,Aleutians East Borough,Alaska
1,Alaska,District 16,2016,3294.0,0,Aleutians West Census Area,Alaska
2,Alaska,District 20,2020,4151.0,0,Anchorage Municipality,Alaska
3,Florida,Desoto,12027,6778.0,1,DeSoto,Florida
4,Illinois,LaSalle,17099,26689.0,1,LaSalle,Illinois
...,...,...,...,...,...,...,...
77,Virginia,Radford,51750,2925.0,0,Radford city,Virginia
78,Virginia,Staunton,51790,5333.0,0,Staunton city,Virginia
79,Virginia,Virginia Beach,51810,98224.0,1,Virginia Beach city,Virginia
80,Virginia,Waynesboro,51820,4801.0,1,Waynesboro city,Virginia


In [431]:
fips.rename(columns={"state_name": "state"}, inplace=True)

In [434]:
population_df = fips.merge(population_data, how="inner", left_on=["state", "county"], right_on=["state", "county"])

In [437]:
merged_df = population_df.merge(missing_df, how="inner", on="fips")

In [440]:
merged_df = merged_df[["fips", "county", "state_x", "2019_population_est", "candidatevotes", "party_republican"]]

In [442]:
merged_df.rename(columns={"state_x": "state"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [443]:
merged_df

Unnamed: 0,fips,county,state,2019_population_est,candidatevotes,party_republican
0,2013,Aleutians East Borough,Alaska,3337,4028.0,1
1,2016,Aleutians West Census Area,Alaska,5634,3294.0,0
2,2020,Anchorage Municipality,Alaska,288000,4151.0,0
3,12027,DeSoto,Florida,38001,6778.0,1
4,17099,LaSalle,Illinois,108669,26689.0,1
...,...,...,...,...,...,...
75,51750,Radford city,Virginia,18249,2925.0,0
76,51790,Staunton city,Virginia,24932,5333.0,0
77,51810,Virginia Beach city,Virginia,449974,98224.0,1
78,51820,Waynesboro city,Virginia,22630,4801.0,1


In [466]:
income_data = pd.read_csv("../data/poverty-and-median-household-income-data-by-us-county-2019.csv")
income_data["county"] = income_data["county"].str.rstrip()
# income_data.rename(columns={"Name": "state"}, inplace=True)

In [467]:
income_data = income_data[["Postal Code", "county", "Median Household Income"]]

In [468]:
income_data.rename(columns={"Postal Code": "code"}, inplace=True)

In [472]:
state_keys = pd.read_csv("../data/state-names-codes.csv")
state_keys.rename(columns={"State": "state", "Code": "code"}, inplace=True)

In [473]:
income_data = state_keys.merge(income_data, how="inner", left_on=["code"], right_on=["code"])
income_data

Unnamed: 0,state,Abbrev,code,county,Median Household Income
0,Alabama,Ala.,AL,Alabama,51771
1,Alabama,Ala.,AL,Autauga,58233
2,Alabama,Ala.,AL,Baldwin,59871
3,Alabama,Ala.,AL,Barbour,35972
4,Alabama,Ala.,AL,Bibb,47918
...,...,...,...,...,...
3188,Wyoming,Wyo.,WY,Sweetwater,80639
3189,Wyoming,Wyo.,WY,Teton,98837
3190,Wyoming,Wyo.,WY,Uinta,70756
3191,Wyoming,Wyo.,WY,Washakie,55122


In [483]:
predict_df = merged_df.merge(income_data)

In [486]:
predict_df.drop(columns={"Abbrev", "code"}, inplace=True)

In [504]:
predict_df.rename(columns={"Median Household Income": "median_household_income"}, inplace=True)

In [505]:
predict_df = predict_df[["fips", "county", "state", "party_republican", "candidatevotes", "2019_population_est", "median_household_income"]]

In [506]:
predict_df.iloc[:, 3:]

Unnamed: 0,party_republican,candidatevotes,2019_population_est,median_household_income
0,1,4028.0,3337,66923
1,0,3294.0,5634,84726
2,1,21162.0,62045,44728
3,1,36143.0,126604,83072
4,1,6714.0,21891,51763
...,...,...,...,...
61,0,2925.0,18249,41530
62,0,5333.0,24932,54296
63,1,98224.0,449974,78491
64,1,4801.0,22630,44619


In [514]:
# model.fit(X, y)

In [515]:
# model.score(X, y)

In [520]:
missing_x = predict_df.iloc[:, 3:]

In [521]:
predictions = model.predict(missing_x)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 4 is different from 5)

In [522]:
for x in predictions:
    print(x[0])

-1812.6304745576226
-10313.003635033438
9156.664469483261
8106.868361122899
2275.158191545329
6625.543044702783
3729.7440123530796
3873.4686819193785
13351.974382689532
21181.90251513743
3789.5741848058788
4731.176995496586
3660.1956865694674
36153.235397812176
-458.83795405312503
2166.8452436957623
6832.0604614772
8644.130897178842
-2584.8119359610655
11634.93767663106
5430.098526265143
3741.128824015578
659.5152090375996
3004.125264995697
14675.449503946404
5031.608883974851
-2423.435693096935
-3814.2164655333345
-900.0901272043229
10754.156575654277
7092.098196023151
7369.615630248092
31015.095452656773
14155.24425034904
-954.7945527130123
12303.569699485028
-129.64604064960622
2881.5816236170667
-1500.0062760967285
3046.323603289896
73551.77400545168
36001.61658103162
7138.543214300389
222.36353234093895
17297.787302988298
732.1742285950877
2068.3250899729674
-2225.6685897919024
-19313.358053362528
-4492.44438967415
3023.290542276093
-264.4364821717727
-784.493608128224
7345.479167

In [526]:
predictions_list = []

for x in predictions:
    if x < 0:
        x = 0
        predictions_list.append(x)
    else:
        predictions_list.append(round(x[0]))
predict_df["predicted_cases"] = predictions_list

In [527]:
predict_df

Unnamed: 0,fips,county,state,party_republican,candidatevotes,2019_population_est,median_household_income,predicted_cases
0,2013,Aleutians East Borough,Alaska,1,4028.0,3337,66923,0
1,2016,Aleutians West Census Area,Alaska,0,3294.0,5634,84726,0
2,22001,Acadia Parish,Louisiana,1,21162.0,62045,44728,9157
3,22005,Ascension Parish,Louisiana,1,36143.0,126604,83072,8107
4,22007,Assumption Parish,Louisiana,1,6714.0,21891,51763,2275
...,...,...,...,...,...,...,...,...
61,51750,Radford city,Virginia,0,2925.0,18249,41530,0
62,51790,Staunton city,Virginia,0,5333.0,24932,54296,0
63,51810,Virginia Beach city,Virginia,1,98224.0,449974,78491,35553
64,51820,Waynesboro city,Virginia,1,4801.0,22630,44619,2994
