# tables_us_data.ipynb

Build tables of the latest COVID-19 statistics for U.S. counties.

Inputs:
* `outputs/us_counties_clean.csv`: The contents of `data/us_counties.csv` after data cleaning by [clean_us_data.ipynb](./clean_us_data.ipynb)
* `outputs/us_counties_clean_meta.json`: Column type metadata for reading `outputs/us_counties_clean.csv` with `pd.read_csv()`

**Note:** You can redirect these input files by setting the environment variable `COVID_OUTPUTS_DIR` to a replacement for the prefix `outputs` in the above paths.

In [1]:
# Initialization boilerplate
import os
import json
import pandas as pd
import numpy as np
from typing import *

import text_extensions_for_pandas as tp

# Local file of utility functions
import util

# The COVID_OUTPUTS_DIR environment variable, when set, replaces the
# default "outputs" directory that the data files are read from.
_OUTPUTS_DIR = os.environ.get("COVID_OUTPUTS_DIR", "outputs")

# Create the directory if necessary.
util.ensure_dir_exists(_OUTPUTS_DIR)

In [2]:
# Load the cleaned county-level time series, applying the saved column types.
csv_file = os.path.join(_OUTPUTS_DIR, "us_counties_clean.csv")
meta_file = os.path.join(_OUTPUTS_DIR, "us_counties_clean_meta.json")

# The metadata file holds the column type information passed to read_csv().
with open(meta_file) as f:
    cases_meta = json.load(f)

# The "Date" column cannot be read directly as datetime64 through the
# dtype argument, so it is declared as "object" here and converted to
# timestamps by the parse_dates argument below.
cases_meta["Date"] = "object"

# One row per (FIPS, Date) pair; verify_integrity rejects duplicate keys.
cases_vertical = pd.read_csv(
    csv_file,
    dtype=cases_meta,
    parse_dates=["Date"],
).set_index(["FIPS", "Date"], verify_integrity=True)
cases_vertical

Unnamed: 0_level_0,Unnamed: 1_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier
FIPS,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1001,2020-01-22,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-23,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-24,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-25,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-26,Alabama,Autauga,55869,0,0,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...
56045,2020-09-10,Wyoming,Weston,6927,21,0,0,False,False,False
56045,2020-09-11,Wyoming,Weston,6927,21,0,0,False,False,False
56045,2020-09-12,Wyoming,Weston,6927,23,0,0,False,False,False
56045,2020-09-13,Wyoming,Weston,6927,23,0,0,False,False,False


In [3]:
# Express the Confirmed and Deaths counts per 100 residents of each county.
# assign() returns a new DataFrame, so cases_vertical is left unmodified.
cases = cases_vertical.assign(
    Confirmed_per_100=lambda df: df["Confirmed"] / df["Population"] * 100,
    Deaths_per_100=lambda df: df["Deaths"] / df["Population"] * 100,
)
cases

Unnamed: 0_level_0,Unnamed: 1_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1001,2020-01-22,Alabama,Autauga,55869,0,0,0,False,False,False,0.000000,0.0
1001,2020-01-23,Alabama,Autauga,55869,0,0,0,False,False,False,0.000000,0.0
1001,2020-01-24,Alabama,Autauga,55869,0,0,0,False,False,False,0.000000,0.0
1001,2020-01-25,Alabama,Autauga,55869,0,0,0,False,False,False,0.000000,0.0
1001,2020-01-26,Alabama,Autauga,55869,0,0,0,False,False,False,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
56045,2020-09-10,Wyoming,Weston,6927,21,0,0,False,False,False,0.303162,0.0
56045,2020-09-11,Wyoming,Weston,6927,21,0,0,False,False,False,0.303162,0.0
56045,2020-09-12,Wyoming,Weston,6927,23,0,0,False,False,False,0.332034,0.0
56045,2020-09-13,Wyoming,Weston,6927,23,0,0,False,False,False,0.332034,0.0


In [4]:
# Keep only the rows recorded on the most recent date found anywhere in
# the data, indexed by county FIPS code. The outlier flag columns are
# dropped from the resulting table.
cases_without_index = cases.reset_index()
last_date = cases_without_index["Date"].max()
cases_by_county = (
    cases_without_index
    .loc[cases_without_index["Date"].eq(last_date)]
    .drop(columns=["Confirmed_Outlier", "Deaths_Outlier", "Recovered_Outlier"])
    .set_index("FIPS")
)
cases_by_county

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1001,2020-09-14,Alabama,Autauga,55869,1447,23,0,2.589987,0.041168
1003,2020-09-14,Alabama,Baldwin,223234,4800,43,0,2.150210,0.019262
1005,2020-09-14,Alabama,Barbour,24686,626,7,0,2.535850,0.028356
1007,2020-09-14,Alabama,Bibb,22394,581,6,0,2.594445,0.026793
1009,2020-09-14,Alabama,Blount,57826,1128,13,0,1.950680,0.022481
...,...,...,...,...,...,...,...,...,...
56037,2020-09-14,Wyoming,Sweetwater,42343,317,2,0,0.748648,0.004723
56039,2020-09-14,Wyoming,Teton,23464,476,1,0,2.028640,0.004262
56041,2020-09-14,Wyoming,Uinta,20226,312,2,0,1.542569,0.009888
56043,2020-09-14,Wyoming,Washakie,7805,111,6,0,1.422165,0.076874


In [5]:
# Roll the county table up to one row per state by summing the raw counts,
# then recompute the per-100-resident rates from the state-level totals.
cases_by_state = (
    cases_by_county
    .groupby("State")
    .agg({
        "Population": "sum",
        "Confirmed": "sum",
        "Deaths": "sum",
    })
    .assign(
        Confirmed_per_100=lambda df: df["Confirmed"] / df["Population"] * 100,
        Deaths_per_100=lambda df: df["Deaths"] / df["Population"] * 100,
    )
)
cases_by_state

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,4903185,126299,2221,2.575856,0.045297
Alaska,731545,6341,44,0.866796,0.006015
Arizona,7278717,208725,5321,2.867607,0.073104
Arkansas,3017804,68999,992,2.286398,0.032872
California,39512223,765134,14439,1.936449,0.036543
Colorado,5758736,61667,1990,1.070843,0.034556
Connecticut,3565287,54769,4485,1.536174,0.125796
Delaware,973764,18467,617,1.896455,0.063362
District of Columbia,705749,14622,616,2.071841,0.087283
Florida,21477737,664692,12642,3.094795,0.058861


In [6]:
# Data preparation is complete; the analysis starts here.

# Nationwide totals for the latest date, summed over the state-level table
cases_by_state.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    6456904
Deaths        197891
dtype: int64

In [7]:
# The same nationwide totals for the latest date, summed over the
# county-level table instead of the state-level one
cases_by_county.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    6456904
Deaths        197891
dtype: int64

In [8]:
# The 10 states with the most confirmed cases, largest first
(
    cases_by_state
    .sort_values(by="Confirmed", ascending=False)
    .head(n=10)
)

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California,39512223,765134,14439,1.936449,0.036543
Texas,28995881,685748,14451,2.364984,0.049838
Florida,21477737,664692,12642,3.094795,0.058861
New York,19453561,455859,38230,2.343319,0.196519
Georgia,10617423,276285,6208,2.602185,0.05847
Illinois,12671821,262692,8314,2.073041,0.06561
Arizona,7278717,208725,5321,2.867607,0.073104
New Jersey,8882190,196655,16034,2.214037,0.180519
North Carolina,10488084,185780,3060,1.771344,0.029176
Tennessee,6829174,168800,2066,2.471748,0.030253


In [9]:
# The 10 states with the most confirmed cases per 100 residents, largest first
(
    cases_by_state
    .sort_values(by="Confirmed_per_100", ascending=False)
    .head(n=10)
)

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Louisiana,4648794,157751,5082,3.393375,0.109319
Florida,21477737,664692,12642,3.094795,0.058861
Mississippi,2976149,90018,2706,3.024647,0.090923
Arizona,7278717,208725,5321,2.867607,0.073104
Georgia,10617423,276285,6208,2.602185,0.05847
South Carolina,5148714,132680,3077,2.576954,0.059762
Alabama,4903185,126299,2221,2.575856,0.045297
Tennessee,6829174,168800,2066,2.471748,0.030253
Nevada,3080156,73814,1456,2.396437,0.04727
Iowa,3155070,75051,1224,2.378743,0.038795


In [10]:
# The 10 states with the most deaths per 100 residents, largest first
(
    cases_by_state
    .sort_values(by="Deaths_per_100", ascending=False)
    .head(n=10)
)

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,455859,38230,2.343319,0.196519
New Jersey,8882190,196655,16034,2.214037,0.180519
Massachusetts,6892503,122667,9213,1.779716,0.133667
Connecticut,3565287,54769,4485,1.536174,0.125796
Louisiana,4648794,157751,5082,3.393375,0.109319
Rhode Island,1059361,20824,1049,1.965713,0.099022
Mississippi,2976149,90018,2706,3.024647,0.090923
District of Columbia,705749,14622,616,2.071841,0.087283
Arizona,7278717,208725,5321,2.867607,0.073104
Michigan,9986857,118322,6843,1.184777,0.06852


In [11]:
# The 20 counties with the most confirmed cases in total, largest first
(
    cases_by_county
    .sort_values(by="Confirmed", ascending=False)
    .head(n=20)
)

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
6037,2020-09-14,California,Los Angeles,10039107,254656,6231,0,2.53664,0.062067
12086,2020-09-14,Florida,Miami-Dade,2716940,164299,2894,0,6.047208,0.106517
4013,2020-09-14,Arizona,Maricopa,4485414,137589,3154,0,3.067476,0.070317
17031,2020-09-14,Illinois,Cook,5150233,135274,5134,0,2.626561,0.099685
48201,2020-09-14,Texas,Harris,4713325,117568,2423,0,2.494375,0.051407
36081,2020-09-14,New York,Queens,2253858,76673,8521,0,3.401856,0.378063
48113,2020-09-14,Texas,Dallas,2635516,75648,1046,0,2.87033,0.039689
12011,2020-09-14,Florida,Broward,1952778,74525,1284,0,3.816358,0.065752
36047,2020-09-14,New York,Kings,2559903,67965,9157,0,2.654983,0.357709
32003,2020-09-14,Nevada,Clark,2266715,62812,1262,0,2.771059,0.055675


In [12]:
# The 20 counties with the most confirmed cases per 100 residents, largest first
(
    cases_by_county
    .sort_values(by="Confirmed_per_100", ascending=False)
    .head(n=20)
)

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
47169,2020-09-14,Tennessee,Trousdale,11284,1641,7,0,14.542715,0.062035
13053,2020-09-14,Georgia,Chattahoochee,10907,1584,2,0,14.522784,0.018337
12067,2020-09-14,Florida,Lafayette,8422,1220,10,0,14.48587,0.118737
5079,2020-09-14,Arkansas,Lincoln,13024,1785,13,0,13.705467,0.099816
47095,2020-09-14,Tennessee,Lake,7016,863,2,0,12.300456,0.028506
5077,2020-09-14,Arkansas,Lee,8857,1024,15,0,11.561477,0.169358
31043,2020-09-14,Nebraska,Dakota,20026,2085,43,0,10.411465,0.214721
19021,2020-09-14,Iowa,Buena Vista,19620,1901,12,0,9.689093,0.061162
5017,2020-09-14,Arkansas,Chicot,10118,976,16,0,9.646175,0.158134
47181,2020-09-14,Tennessee,Wayne,16673,1477,4,0,8.858634,0.023991


In [13]:
# The 20 counties with the most *deaths* per 100 residents, largest first
(
    cases_by_county
    .sort_values(by="Deaths_per_100", ascending=False)
    .head(n=20)
)

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
13141,2020-09-14,Georgia,Hancock,8457,379,41,0,4.481495,0.484805
51640,2020-09-14,Virginia,Galax,6347,406,28,0,6.396723,0.441153
36005,2020-09-14,New York,Bronx,1418207,56593,6074,0,3.990461,0.428287
13243,2020-09-14,Georgia,Randolph,6778,327,27,0,4.824432,0.398348
36081,2020-09-14,New York,Queens,2253858,76673,8521,0,3.401856,0.378063
13273,2020-09-14,Georgia,Terrell,8531,325,31,0,3.809635,0.363381
36047,2020-09-14,New York,Kings,2559903,67965,9157,0,2.654983,0.357709
51595,2020-09-14,Virginia,Emporia,5346,243,19,0,4.545455,0.355406
35031,2020-09-14,New Mexico,McKinley,71367,4266,251,0,5.977553,0.351703
28099,2020-09-14,Mississippi,Neshoba,29118,1497,102,0,5.14115,0.350299
