In [1]:
import pandas as pd
import numpy as np
import csv
import os
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("../data/county-cases-latest-master.csv")
data.drop(columns={"Unnamed: 0"}, inplace=True)

# Training  / Testing Model

In [3]:
data

Unnamed: 0,date,state,county,fips,cases,deaths,code,2019_population_est,poverty_est_all_ages,median_household_income,less_than_high_school_diploma_2015-19,high_school_diploma_only_2015-19,some_college_or_associate_ degree_2015-19,bachelor_degree_or_higher_2015-19
0,2021-03-06,Alabama,Autauga,1001.0,6344.0,92.0,AL,55869.0,6723.0,58233.0,4291.0,12551.0,10596.0,9929.0
1,2021-03-06,Alabama,Baldwin,1003.0,19915.0,289.0,AL,223234.0,22360.0,59871.0,13893.0,41797.0,47274.0,48148.0
2,2021-03-06,Alabama,Barbour,1005.0,2138.0,51.0,AL,24686.0,5909.0,35972.0,4812.0,6396.0,4676.0,2080.0
3,2021-03-06,Alabama,Bibb,1007.0,2460.0,58.0,AL,22394.0,4101.0,47918.0,3386.0,7256.0,3848.0,1678.0
4,2021-03-06,Alabama,Blount,1009.0,6252.0,128.0,AL,57826.0,9324.0,52902.0,7763.0,13299.0,13519.0,5210.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3102,2021-03-06,Wyoming,Sweetwater,56037.0,3829.0,36.0,WY,42343.0,3453.0,80639.0,2017.0,9239.0,10415.0,6291.0
3103,2021-03-06,Wyoming,Teton,56039.0,3375.0,9.0,WY,23464.0,1396.0,98837.0,834.0,2577.0,4037.0,9875.0
3104,2021-03-06,Wyoming,Uinta,56041.0,2070.0,12.0,WY,20226.0,1699.0,70756.0,941.0,5383.0,4562.0,2078.0
3105,2021-03-06,Wyoming,Washakie,56043.0,886.0,26.0,WY,7805.0,845.0,55122.0,568.0,1650.0,2031.0,1297.0


In [4]:
X = data[["2019_population_est", "poverty_est_all_ages", "median_household_income", "less_than_high_school_diploma_2015-19", "high_school_diploma_only_2015-19", "some_college_or_associate_ degree_2015-19", "bachelor_degree_or_higher_2015-19"]]
X.shape

(3107, 7)

In [5]:
y = data["cases"].values.reshape(-1, 1)
y.shape

(3107, 1)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X_train, y_train)
training_score = model.score(X_train, y_train)
testing_score = model.score(X_test, y_test)

print(f"Training Score: {training_score}")
print(f"Testing Score: {testing_score}")

Training Score: 0.8170231791341349
Testing Score: 0.921612632269395


In [8]:
model.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': False,
 'positive': False}

In [9]:
census_county_data = pd.read_csv("../data/census-bureau-population-by-county.csv")
state_keys = pd.read_csv("../data/state-names-codes.csv")
state_keys.rename(columns={"State": "state"}, inplace=True)

In [10]:
census_county_data["county"] = census_county_data["county"].str.rstrip()

In [11]:
census_county_data = census_county_data.merge(state_keys, how="inner", on="state")
census_county_data.rename(columns={"Code": "code"}, inplace=True)

In [12]:
census_county_data

Unnamed: 0,state,county,2019_population_est,Abbrev,code
0,Alabama,Alabama,4903185,Ala.,AL
1,Alabama,Autauga,55869,Ala.,AL
2,Alabama,Baldwin,223234,Ala.,AL
3,Alabama,Barbour,24686,Ala.,AL
4,Alabama,Bibb,22394,Ala.,AL
...,...,...,...,...,...
3188,Wyoming,Sweetwater,42343,Wyo.,WY
3189,Wyoming,Teton,23464,Wyo.,WY
3190,Wyoming,Uinta,20226,Wyo.,WY
3191,Wyoming,Washakie,7805,Wyo.,WY


In [13]:
poverty_income_data = pd.read_csv("../data/poverty-and-median-household-income-data-by-us-county-2019.csv")

In [14]:
poverty_income_data

Unnamed: 0,State FIPS Code,FIPS Code,Postal Code,Name,county,"Poverty Estimate, All Ages",90% CI Lower Bound,90% CI Upper Bound,"Poverty Percent, All Ages",90% CI Lower Bound.1,...,90% CI Upper Bound.5,Median Household Income,90% CI Lower Bound.6,90% CI Upper Bound.6,"Poverty Estimate, Age 0-4",90% CI Lower Bound.7,90% CI Upper Bound.7,"Poverty Percent, Age 0-4",90% CI Lower Bound.8,90% CI Upper Bound.8
0,0,0,US,United States,United States,39490096,39248096,39732096,12.3,12.2,...,16,65712,65594,65830,3457689,3405854,3509524,18.2,17.9,18.5
1,1,0,AL,Alabama,Alabama,747478,730491,764465,15.6,15.2,...,21.6,51771,51179,52363,69236,65296,73176,24.2,22.8,25.6
2,1,1,AL,Autauga,Autauga,6723,5517,7929,12.1,9.9,...,19.4,58233,52517,63949,.,.,.,.,.,.
3,1,3,AL,Baldwin,Baldwin,22360,18541,26179,10.1,8.4,...,17.2,59871,54593,65149,.,.,.,.,.,.
4,1,5,AL,Barbour,Barbour,5909,4787,7031,27.1,22,...,49,35972,31822,40122,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3189,56,37,WY,Sweetwater,Sweetwater,3453,2743,4163,8.3,6.6,...,11.1,80639,73437,87841,.,.,.,.,.,.
3190,56,39,WY,Teton,Teton,1396,1073,1719,6,4.6,...,6.7,98837,86531,111143,.,.,.,.,.,.
3191,56,41,WY,Uinta,Uinta,1699,1264,2134,8.5,6.3,...,11.1,70756,63191,78321,.,.,.,.,.,.
3192,56,43,WY,Washakie,Washakie,845,626,1064,11.1,8.2,...,17.4,55122,50050,60194,.,.,.,.,.,.


In [15]:
poverty_income_data = poverty_income_data.rename(columns={"Postal Code": "code"})

In [16]:
poverty_income_data = poverty_income_data[["code", "county", "Poverty Estimate, All Ages", "Median Household Income"]]

In [17]:
poverty_income_data

Unnamed: 0,code,county,"Poverty Estimate, All Ages",Median Household Income
0,US,United States,39490096,65712
1,AL,Alabama,747478,51771
2,AL,Autauga,6723,58233
3,AL,Baldwin,22360,59871
4,AL,Barbour,5909,35972
...,...,...,...,...
3189,WY,Sweetwater,3453,80639
3190,WY,Teton,1396,98837
3191,WY,Uinta,1699,70756
3192,WY,Washakie,845,55122


In [18]:
master = poverty_income_data.merge(census_county_data, how="inner", left_on=["code", "county"], right_on=["code", "county"])

In [19]:
master = master[["code", "county", "Poverty Estimate, All Ages", "Median Household Income", "state", "2019_population_est"]]

In [20]:
master

Unnamed: 0,code,county,"Poverty Estimate, All Ages",Median Household Income,state,2019_population_est
0,AL,Alabama,747478,51771,Alabama,4903185
1,AL,Autauga,6723,58233,Alabama,55869
2,AL,Baldwin,22360,59871,Alabama,223234
3,AL,Barbour,5909,35972,Alabama,24686
4,AL,Bibb,4101,47918,Alabama,22394
...,...,...,...,...,...,...
3192,WY,Sweetwater,3453,80639,Wyoming,42343
3193,WY,Teton,1396,98837,Wyoming,23464
3194,WY,Uinta,1699,70756,Wyoming,20226
3195,WY,Washakie,845,55122,Wyoming,7805


In [21]:
latest_state_data = pd.read_csv("../data/county-cases-latest-master.csv")

In [22]:
latest_state_data.drop("Unnamed: 0", axis=1, inplace=True)

In [23]:
latest_state_data

Unnamed: 0,date,state,county,fips,cases,deaths,code,2019_population_est,poverty_est_all_ages,median_household_income,less_than_high_school_diploma_2015-19,high_school_diploma_only_2015-19,some_college_or_associate_ degree_2015-19,bachelor_degree_or_higher_2015-19
0,2021-03-06,Alabama,Autauga,1001.0,6344.0,92.0,AL,55869.0,6723.0,58233.0,4291.0,12551.0,10596.0,9929.0
1,2021-03-06,Alabama,Baldwin,1003.0,19915.0,289.0,AL,223234.0,22360.0,59871.0,13893.0,41797.0,47274.0,48148.0
2,2021-03-06,Alabama,Barbour,1005.0,2138.0,51.0,AL,24686.0,5909.0,35972.0,4812.0,6396.0,4676.0,2080.0
3,2021-03-06,Alabama,Bibb,1007.0,2460.0,58.0,AL,22394.0,4101.0,47918.0,3386.0,7256.0,3848.0,1678.0
4,2021-03-06,Alabama,Blount,1009.0,6252.0,128.0,AL,57826.0,9324.0,52902.0,7763.0,13299.0,13519.0,5210.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3102,2021-03-06,Wyoming,Sweetwater,56037.0,3829.0,36.0,WY,42343.0,3453.0,80639.0,2017.0,9239.0,10415.0,6291.0
3103,2021-03-06,Wyoming,Teton,56039.0,3375.0,9.0,WY,23464.0,1396.0,98837.0,834.0,2577.0,4037.0,9875.0
3104,2021-03-06,Wyoming,Uinta,56041.0,2070.0,12.0,WY,20226.0,1699.0,70756.0,941.0,5383.0,4562.0,2078.0
3105,2021-03-06,Wyoming,Washakie,56043.0,886.0,26.0,WY,7805.0,845.0,55122.0,568.0,1650.0,2031.0,1297.0


In [24]:
education = pd.read_csv("../data/county-level-education-stats-2015-2019.csv")

In [25]:
education.rename(columns={"state": "code"}, inplace=True)

In [26]:
education["county"] = education["county"].str.rstrip()

In [27]:
master = master.merge(education, how="inner", left_on=["code", "county"], right_on=["code", "county"])

In [28]:
master.rename(columns={"Poverty Estimate, All Ages": "poverty_est_all_ages", "Median Household Income": "median_household_income"}, inplace=True)

In [29]:
master[["2019_population_est", "poverty_est_all_ages", "median_household_income", "less_than_high_school_diploma_2015-19", "high_school_diploma_only_2015-19", "some_college_or_associate_ degree_2015-19", "bachelor_degree_or_higher_2015-19"]]

Unnamed: 0,2019_population_est,poverty_est_all_ages,median_household_income,less_than_high_school_diploma_2015-19,high_school_diploma_only_2015-19,some_college_or_associate_ degree_2015-19,bachelor_degree_or_higher_2015-19
0,4903185,747478,51771,458922.0,1022839.0,993344.0,845772.0
1,55869,6723,58233,4291.0,12551.0,10596.0,9929.0
2,223234,22360,59871,13893.0,41797.0,47274.0,48148.0
3,24686,5909,35972,4812.0,6396.0,4676.0,2080.0
4,22394,4101,47918,3386.0,7256.0,3848.0,1678.0
...,...,...,...,...,...,...,...
3222,42343,3453,80639,2017.0,9239.0,10415.0,6291.0
3223,23464,1396,98837,834.0,2577.0,4037.0,9875.0
3224,20226,1699,70756,941.0,5383.0,4562.0,2078.0
3225,7805,845,55122,568.0,1650.0,2031.0,1297.0


In [30]:
master

Unnamed: 0,code,county,poverty_est_all_ages,median_household_income,state,2019_population_est,less_than_high_school_diploma_2015-19,high_school_diploma_only_2015-19,some_college_or_associate_ degree_2015-19,bachelor_degree_or_higher_2015-19
0,AL,Alabama,747478,51771,Alabama,4903185,458922.0,1022839.0,993344.0,845772.0
1,AL,Autauga,6723,58233,Alabama,55869,4291.0,12551.0,10596.0,9929.0
2,AL,Baldwin,22360,59871,Alabama,223234,13893.0,41797.0,47274.0,48148.0
3,AL,Barbour,5909,35972,Alabama,24686,4812.0,6396.0,4676.0,2080.0
4,AL,Bibb,4101,47918,Alabama,22394,3386.0,7256.0,3848.0,1678.0
...,...,...,...,...,...,...,...,...,...,...
3222,WY,Sweetwater,3453,80639,Wyoming,42343,2017.0,9239.0,10415.0,6291.0
3223,WY,Teton,1396,98837,Wyoming,23464,834.0,2577.0,4037.0,9875.0
3224,WY,Uinta,1699,70756,Wyoming,20226,941.0,5383.0,4562.0,2078.0
3225,WY,Washakie,845,55122,Wyoming,7805,568.0,1650.0,2031.0,1297.0


In [31]:
missing = master[~master.county.isin(latest_state_data.county)]

In [32]:
missing

Unnamed: 0,code,county,poverty_est_all_ages,median_household_income,state,2019_population_est,less_than_high_school_diploma_2015-19,high_school_diploma_only_2015-19,some_college_or_associate_ degree_2015-19,bachelor_degree_or_higher_2015-19
0,AL,Alabama,747478,51771,Alabama,4903185,458922.0,1022839.0,993344.0,845772.0
68,AK,Alaska,73033,77203,Alaska,731545,34376.0,134582.0,169609.0,142019.0
72,AK,Bristol Bay Borough,80,87950,Alaska,836,38.0,235.0,219.0,143.0
77,AK,Hoonah-Angoon Census Area,336,53141,Alaska,2148,107.0,659.0,580.0,365.0
82,AK,Lake and Peninsula Borough,275,51693,Alaska,1592,110.0,406.0,254.0,148.0
...,...,...,...,...,...,...,...,...,...,...
2498,TN,Tennessee,919850,56047,Tennessee,6829174,575128.0,1472003.0,1286117.0,1254145.0
2885,VT,Vermont,60624,63293,Vermont,623989,32276.0,126832.0,113869.0,167483.0
2900,VA,Virginia,822944,76471,Virginia,8535519,595348.0,1383769.0,1557409.0,2240360.0
3074,WV,West Virginia,281175,48659,West Virginia,1792147,168624.0,519091.0,334314.0,265398.0


In [33]:
missing = missing[~(missing['county'].isin(state_keys.state))]

In [34]:
missing.reset_index(inplace=True, drop=True)

In [35]:
predict_missing = missing[["2019_population_est", "poverty_est_all_ages", "median_household_income", "less_than_high_school_diploma_2015-19", "high_school_diploma_only_2015-19", "some_college_or_associate_ degree_2015-19", "bachelor_degree_or_higher_2015-19"]]

In [36]:
predict_missing

Unnamed: 0,2019_population_est,poverty_est_all_ages,median_household_income,less_than_high_school_diploma_2015-19,high_school_diploma_only_2015-19,some_college_or_associate_ degree_2015-19,bachelor_degree_or_higher_2015-19
0,836,80,87950,38.0,235.0,219.0,143.0
1,2148,336,53141,107.0,659.0,580.0,365.0
2,1592,275,51693,110.0,406.0,254.0,148.0
3,62045,12365,44728,8673.0,17035.0,10019.0,5478.0
4,25627,4610,47111,3633.0,7928.0,4314.0,2020.0
...,...,...,...,...,...,...,...
63,10830,2177,42614,1629.0,3518.0,1642.0,901.0
64,15568,2261,65296,2070.0,3680.0,3014.0,2792.0
65,13904,2852,43166,1849.0,4660.0,2253.0,1420.0
66,1418207,361834,41470,253249.0,259572.0,230175.0,186550.0


In [37]:
model.predict(predict_missing)
# predict_missing

array([[  4584.34162571],
       [  2799.55150035],
       [  2767.34540135],
       [  4421.01100386],
       [  2878.94065719],
       [  7386.04789693],
       [  3528.7714636 ],
       [  4577.94004014],
       [  3534.26433983],
       [  2003.49040614],
       [  7664.35093677],
       [ 11285.74099329],
       [  9207.30677921],
       [  2758.42090938],
       [  3266.18992846],
       [  2679.54228576],
       [  2398.29225376],
       [  2669.91880875],
       [  2803.97526005],
       [ 23322.17034545],
       [  2295.53536801],
       [  2955.36007714],
       [  3661.7975482 ],
       [  2984.35105359],
       [  2338.43815159],
       [  3827.8255085 ],
       [  3819.56384886],
       [  2629.99620528],
       [ 24719.6135256 ],
       [  3406.90632613],
       [ 12776.75352158],
       [  6514.39029492],
       [  5096.57421378],
       [  5347.34206935],
       [  2798.06994595],
       [  2705.97221458],
       [  3442.28709714],
       [ 39269.15994744],
       [  79

In [38]:
predict_missing.shape

(68, 7)