In [1]:
import pandas as pd
import numpy as np
import csv
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import tensorflow.keras
from keras.utils import to_categorical

## Voting results based on 2016 Data from MIT
MIT Election Data and Science Lab, 2018, "County Presidential Election Returns 2000-2016", https://doi.org/10.7910/DVN/VOQCHQ, Harvard Dataverse, V6, UNF:6:ZZe1xuZ5H2l4NUiSRcRf8Q== [fileUNF]

In [2]:
pres_results = pd.read_csv("../data/presidential_results_by_county_2000-2016.csv")

In [3]:
cleaned_results = pres_results[pres_results["year"] == 2016].dropna()

In [4]:
cleaned_results.rename(columns={"FIPS": "fips"}, inplace=True)

In [5]:
counties_df = cleaned_results.drop_duplicates(subset=["state", "county"])[["state", "state_po", "county"]]

In [6]:
cleaned_results = cleaned_results[["state", "county", "fips", "party", "candidatevotes"]]

In [7]:
cleaned_results

Unnamed: 0,state,county,fips,party,candidatevotes
40517,Alabama,Autauga,1001.0,democrat,5936.0
40518,Alabama,Autauga,1001.0,republican,18172.0
40520,Alabama,Baldwin,1003.0,democrat,18458.0
40521,Alabama,Baldwin,1003.0,republican,72883.0
40523,Alabama,Barbour,1005.0,democrat,4871.0
...,...,...,...,...,...
50513,Alaska,District 38,2038.0,republican,1143.0
50515,Alaska,District 39,2039.0,democrat,3142.0
50516,Alaska,District 39,2039.0,republican,1405.0
50518,Alaska,District 40,2040.0,democrat,2338.0


In [8]:
voting_df = cleaned_results.groupby(["state", "county", "fips"]).agg({'candidatevotes':'max', 'party': 'count'})
voting_df.drop("party", axis=1, inplace=True)
voting_df.reset_index(inplace=True)

In [9]:
voting_df

Unnamed: 0,state,county,fips,candidatevotes
0,Alabama,Autauga,1001.0,18172.0
1,Alabama,Baldwin,1003.0,72883.0
2,Alabama,Barbour,1005.0,5454.0
3,Alabama,Bibb,1007.0,6738.0
4,Alabama,Blount,1009.0,22859.0
...,...,...,...,...
3148,Wyoming,Sweetwater,56037.0,12154.0
3149,Wyoming,Teton,56039.0,7314.0
3150,Wyoming,Uinta,56041.0,6154.0
3151,Wyoming,Washakie,56043.0,2911.0


In [10]:
party_votes = cleaned_results[["state", "county", "fips", "candidatevotes"]]
political_leaning_df = pd.merge(voting_df, cleaned_results, how="inner", on=["state", "county", "candidatevotes"])

In [11]:
political_leaning_df = pd.get_dummies(political_leaning_df, columns=["party"])

In [12]:
political_leaning_df.drop(columns={"party_democrat"}, inplace=True)

In [13]:
political_leaning_df.drop(columns={"fips_y"}, inplace=True)
political_leaning_df.rename(columns={"fips_x": "fips"}, inplace=True)

In [27]:
data = pd.read_csv("../data/county-cases-latest-master.csv")
data.drop(columns={"Unnamed: 0"}, inplace=True)

In [28]:
data

Unnamed: 0,date,state,county,fips,cases,deaths,code,population_est,poverty_est_all_ages,median_household_income,less_than_high_school_diploma_2015-19,high_school_diploma_only_2015-19,some_college_or_associate_ degree_2015-19,bachelor_degree_or_higher_2015-19
0,2021-03-12,Alabama,Autauga,1001.0,6409.0,95.0,AL,55869.0,6723.0,58233.0,4291.0,12551.0,10596.0,9929.0
1,2021-03-12,Alabama,Baldwin,1003.0,20072.0,294.0,AL,223234.0,22360.0,59871.0,13893.0,41797.0,47274.0,48148.0
2,2021-03-12,Alabama,Barbour,1005.0,2175.0,52.0,AL,24686.0,5909.0,35972.0,4812.0,6396.0,4676.0,2080.0
3,2021-03-12,Alabama,Bibb,1007.0,2475.0,58.0,AL,22394.0,4101.0,47918.0,3386.0,7256.0,3848.0,1678.0
4,2021-03-12,Alabama,Blount,1009.0,6282.0,129.0,AL,57826.0,9324.0,52902.0,7763.0,13299.0,13519.0,5210.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3102,2021-03-12,Wyoming,Sweetwater,56037.0,3871.0,36.0,WY,42343.0,3453.0,80639.0,2017.0,9239.0,10415.0,6291.0
3103,2021-03-12,Wyoming,Teton,56039.0,3427.0,9.0,WY,23464.0,1396.0,98837.0,834.0,2577.0,4037.0,9875.0
3104,2021-03-12,Wyoming,Uinta,56041.0,2088.0,12.0,WY,20226.0,1699.0,70756.0,941.0,5383.0,4562.0,2078.0
3105,2021-03-12,Wyoming,Washakie,56043.0,888.0,26.0,WY,7805.0,845.0,55122.0,568.0,1650.0,2031.0,1297.0


In [29]:
data.drop(columns={"date", "deaths", "code", "poverty_est_all_ages", "less_than_high_school_diploma_2015-19", "high_school_diploma_only_2015-19", "some_college_or_associate_ degree_2015-19", "bachelor_degree_or_higher_2015-19"}, inplace=True)

In [32]:
data

Unnamed: 0,state,county,fips,cases,population_est,median_household_income
0,Alabama,Autauga,1001.0,6409.0,55869.0,58233.0
1,Alabama,Baldwin,1003.0,20072.0,223234.0,59871.0
2,Alabama,Barbour,1005.0,2175.0,24686.0,35972.0
3,Alabama,Bibb,1007.0,2475.0,22394.0,47918.0
4,Alabama,Blount,1009.0,6282.0,57826.0,52902.0
...,...,...,...,...,...,...
3102,Wyoming,Sweetwater,56037.0,3871.0,42343.0,80639.0
3103,Wyoming,Teton,56039.0,3427.0,23464.0,98837.0
3104,Wyoming,Uinta,56041.0,2088.0,20226.0,70756.0
3105,Wyoming,Washakie,56043.0,888.0,7805.0,55122.0


In [33]:
analysis_df = political_leaning_df.merge(data, how="inner")

In [34]:
analysis_df = analysis_df[["state", "county", "fips", "cases", "population_est", "party_republican", "candidatevotes", "median_household_income"]]

In [35]:
analysis_df.head()

Unnamed: 0,state,county,fips,cases,population_est,party_republican,candidatevotes,median_household_income
0,Alabama,Autauga,1001.0,6409.0,55869.0,1,18172.0,58233.0
1,Alabama,Baldwin,1003.0,20072.0,223234.0,1,72883.0,59871.0
2,Alabama,Barbour,1005.0,2175.0,24686.0,1,5454.0,35972.0
3,Alabama,Bibb,1007.0,2475.0,22394.0,1,6738.0,47918.0
4,Alabama,Blount,1009.0,6282.0,57826.0,1,22859.0,52902.0


In [36]:
X = analysis_df.iloc[:,4:]
X.shape

(3039, 4)

In [37]:
y = analysis_df["cases"].values.reshape(-1, 1)
y.shape

(3039, 1)

# Training  / Testing Model

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [39]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X_train, y_train)
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.8825158845254277
Testing Score: 0.796487655446795


## Find Missing Data

In [40]:
# missing = analysis_df[~analysis_df[["state", "county"]].isin(counties_df[["state", "county"]])]
# missing = missing[~(missing['county'].isin(state_keys.state))]
# missing = pd.get_dummies(missing)

In [41]:
analysis_df.head()

Unnamed: 0,state,county,fips,cases,population_est,party_republican,candidatevotes,median_household_income
0,Alabama,Autauga,1001.0,6409.0,55869.0,1,18172.0,58233.0
1,Alabama,Baldwin,1003.0,20072.0,223234.0,1,72883.0,59871.0
2,Alabama,Barbour,1005.0,2175.0,24686.0,1,5454.0,35972.0
3,Alabama,Bibb,1007.0,2475.0,22394.0,1,6738.0,47918.0
4,Alabama,Blount,1009.0,6282.0,57826.0,1,22859.0,52902.0


In [42]:
missing = political_leaning_df[~political_leaning_df.county.isin(analysis_df.county)]
missing.head()

Unnamed: 0,state,county,fips,candidatevotes,party_republican
67,Alaska,District 1,2001.0,3180.0,1
68,Alaska,District 10,2010.0,6255.0,1
69,Alaska,District 11,2011.0,6444.0,1
70,Alaska,District 12,2012.0,6629.0,1
71,Alaska,District 13,2013.0,4028.0,1


In [43]:
fips = pd.read_csv("../data/fips-codes.csv")
fips = fips[["fips", "county_name", "state_name"]]
fips.rename(columns={"county_name": "county"}, inplace=True)

In [44]:
population_data = pd.read_csv("../data/census-bureau-population-by-county.csv")

In [45]:
fips_final = population_data.merge(fips).drop_duplicates(subset=["state", "county"])
fips_final.drop(columns={"state_name"}, inplace=True)

In [46]:
missing["fips"] = missing["fips"].astype("int64")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [47]:
missing_df = pd.merge(missing, fips, how="inner", on="fips")

In [48]:
missing_df

Unnamed: 0,state,county_x,fips,candidatevotes,party_republican,county_y,state_name
0,Alaska,District 13,2013,4028.0,1,Aleutians East Borough,Alaska
1,Alaska,District 16,2016,3294.0,0,Aleutians West Census Area,Alaska
2,Alaska,District 20,2020,4151.0,0,Anchorage Municipality,Alaska
3,Florida,Desoto,12027,6778.0,1,DeSoto,Florida
4,Illinois,LaSalle,17099,26689.0,1,LaSalle,Illinois
...,...,...,...,...,...,...,...
77,Virginia,Radford,51750,2925.0,0,Radford city,Virginia
78,Virginia,Staunton,51790,5333.0,0,Staunton city,Virginia
79,Virginia,Virginia Beach,51810,98224.0,1,Virginia Beach city,Virginia
80,Virginia,Waynesboro,51820,4801.0,1,Waynesboro city,Virginia


In [49]:
fips.rename(columns={"state_name": "state"}, inplace=True)

In [50]:
population_df = fips.merge(population_data, how="inner", left_on=["state", "county"], right_on=["state", "county"])

In [51]:
merged_df = population_df.merge(missing_df, how="inner", on="fips")

In [55]:
merged_df = merged_df[["fips", "county", "state", "population_est", "candidatevotes", "party_republican"]]

In [56]:
merged_df.rename(columns={"state_x": "state"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [57]:
merged_df

Unnamed: 0,fips,county,state,population_est,candidatevotes,party_republican
0,2013,Aleutians East Borough,Alaska,3337,4028.0,1
1,2016,Aleutians West Census Area,Alaska,5634,3294.0,0
2,2020,Anchorage Municipality,Alaska,288000,4151.0,0
3,12027,DeSoto,Florida,38001,6778.0,1
4,17099,LaSalle,Illinois,108669,26689.0,1
...,...,...,...,...,...,...
75,51750,Radford city,Virginia,18249,2925.0,0
76,51790,Staunton city,Virginia,24932,5333.0,0
77,51810,Virginia Beach city,Virginia,449974,98224.0,1
78,51820,Waynesboro city,Virginia,22630,4801.0,1


In [58]:
income_data = pd.read_csv("../data/poverty-and-median-household-income-data-by-us-county-2019.csv")
income_data["county"] = income_data["county"].str.rstrip()
# income_data.rename(columns={"Name": "state"}, inplace=True)

In [59]:
income_data = income_data[["Postal Code", "county", "Median Household Income"]]

In [60]:
income_data.rename(columns={"Postal Code": "code"}, inplace=True)

In [61]:
state_keys = pd.read_csv("../data/state-names-codes.csv")
state_keys.rename(columns={"State": "state", "Code": "code"}, inplace=True)

In [62]:
income_data = state_keys.merge(income_data, how="inner", left_on=["code"], right_on=["code"])
income_data

Unnamed: 0,state,Abbrev,code,county,Median Household Income
0,Alabama,Ala.,AL,Alabama,51771
1,Alabama,Ala.,AL,Autauga,58233
2,Alabama,Ala.,AL,Baldwin,59871
3,Alabama,Ala.,AL,Barbour,35972
4,Alabama,Ala.,AL,Bibb,47918
...,...,...,...,...,...
3188,Wyoming,Wyo.,WY,Sweetwater,80639
3189,Wyoming,Wyo.,WY,Teton,98837
3190,Wyoming,Wyo.,WY,Uinta,70756
3191,Wyoming,Wyo.,WY,Washakie,55122


In [63]:
predict_df = merged_df.merge(income_data)

In [64]:
predict_df.drop(columns={"Abbrev", "code"}, inplace=True)

In [65]:
predict_df.rename(columns={"Median Household Income": "median_household_income"}, inplace=True)

In [85]:
predict_df = predict_df[["fips", "county", "state", "population_est", "party_republican", "candidatevotes", "median_household_income"]]

In [86]:
predict_df.iloc[:, 3:]

Unnamed: 0,population_est,party_republican,candidatevotes,median_household_income
0,3337,1,4028.0,66923
1,5634,0,3294.0,84726
2,62045,1,21162.0,44728
3,126604,1,36143.0,83072
4,21891,1,6714.0,51763
...,...,...,...,...
61,18249,0,2925.0,41530
62,24932,0,5333.0,54296
63,449974,1,98224.0,78491
64,22630,1,4801.0,44619


In [87]:
# model.fit(X, y)

In [88]:
# model.score(X, y)

In [89]:
missing_x = predict_df.iloc[:, 3:]

In [90]:
predictions = model.predict(missing_x)

In [91]:
for x in predictions:
    print(x[0])

-1822.986153556727
-11081.498179805065
9276.144911450592
7955.561762810956
2399.0771566686044
6848.6355025435005
3806.684448815502
4114.547719120947
13423.086412950204
21261.260136567573
4025.7367259509692
4976.060722425616
3816.7315466739083
35681.83702313639
-803.4839201336681
2309.139255927033
7067.982196041443
8768.156421526106
-3117.1732012860466
11660.07098686433
5689.270388109704
3902.751029460838
718.7556057374313
3154.1958600726216
14770.239995101221
5232.425560275829
-2875.443494444382
-4372.656565700407
-1452.6254435412347
10961.820075937496
7219.682619317754
7562.1810057354
30680.501702416695
14313.817917239207
-1311.8127747770122
12385.87770903481
-94.30975535100697
3073.009547123129
-1453.7935761503713
3235.1348753200377
72367.24183945202
35166.640416415205
6040.995849189372
-408.72434994178
17324.9498904168
811.6258451651474
1665.5920531291104
-2634.9061741993546
-20475.877014112317
-5066.302621827015
3269.762751397935
-706.7400596554744
-1178.6559711332275
7521.24722545

In [92]:
predictions_list = []

for x in predictions:
    if x[0] < 0:
        x = 0
        predictions_list.append(x)
    else:
        predictions_list.append(round(x[0]))
predict_df["predicted_cases"] = predictions_list

In [93]:
predict_df

Unnamed: 0,fips,county,state,population_est,party_republican,candidatevotes,median_household_income,predicted_cases
0,2013,Aleutians East Borough,Alaska,3337,1,4028.0,66923,0
1,2016,Aleutians West Census Area,Alaska,5634,0,3294.0,84726,0
2,22001,Acadia Parish,Louisiana,62045,1,21162.0,44728,9276
3,22005,Ascension Parish,Louisiana,126604,1,36143.0,83072,7956
4,22007,Assumption Parish,Louisiana,21891,1,6714.0,51763,2399
...,...,...,...,...,...,...,...,...
61,51750,Radford city,Virginia,18249,0,2925.0,41530,0
62,51790,Staunton city,Virginia,24932,0,5333.0,54296,0
63,51810,Virginia Beach city,Virginia,449974,1,98224.0,78491,35649
64,51820,Waynesboro city,Virginia,22630,1,4801.0,44619,3196
