# tables_us_data.ipynb

Build tables of the latest COVID-19 statistics for U.S. counties.

Inputs:
* `outputs/us_counties_clean.feather`: The contents of `data/us_counties.csv` after data cleaning by [clean_us_data.ipynb](./clean_us_data.ipynb), stored in Feather format
* `outputs/dates.feather`: A table with a `date` column, read alongside the time series data in `outputs/us_counties_clean.feather`

**Note:** You can redirect these input files by setting the environment variable `COVID_OUTPUTS_DIR` to a replacement for the prefix `outputs` in the above paths.

In [1]:
# Initialization boilerplate
import os
import pandas as pd

# Local file of utility functions
import util

# The COVID_OUTPUTS_DIR environment variable, when set, replaces the default
# "outputs" directory used for data files.
_OUTPUTS_DIR = os.environ.get("COVID_OUTPUTS_DIR", "outputs")
util.ensure_dir_exists(_OUTPUTS_DIR)  # create if necessary

# Read and Reformat Input Data

In [2]:
# Load the Feather files that clean_us_data.ipynb produces: the per-county
# table (re-indexed on its "FIPS" column) and the "date" column of the dates
# file as a NumPy array.
cases_file, dates_file = (
    os.path.join(_OUTPUTS_DIR, file_name)
    for file_name in ("us_counties_clean.feather", "dates.feather")
)
cases = pd.read_feather(cases_file).set_index("FIPS")
dates = pd.read_feather(dates_file).loc[:, "date"].to_numpy()
cases.head()

Unnamed: 0_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_7_Days,Deaths_7_Days
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1001,Alabama,Autauga,55869,"[ 0, 0, 0, 0, 0, 0, ...","[ 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",124,8
1003,Alabama,Baldwin,223234,"[ 0, 0, 0, 0, 0, 0, ...","[ 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",319,9
1005,Alabama,Barbour,24686,"[ 0, 0, 0, 0, 0, 0, ...","[ 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20,3
1007,Alabama,Bibb,22394,"[ 0, 0, 0, 0, 0, 0, ...","[ 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",40,2
1009,Alabama,Blount,57826,"[ 0, 0, 0, 0, 0, 0, ...","[ 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",116,3


## Extract the most recent element of each time series

The tables below focus on the most recent day's data, so we generate a DataFrame with
just the last element of each time series.

In [3]:
# Build a per-county snapshot: keep the descriptive columns and the 7-day
# columns as they are, and collapse the "Confirmed" and "Deaths" time series
# down to their final element.
cases_by_county = (
    cases[[
        "State", "County", "Population",
        "Confirmed", "Deaths",
        "Confirmed_7_Days", "Deaths_7_Days",
    ]]
    .reset_index()  # Moves FIPS back into a regular column; returns a new frame
    .assign(
        Confirmed=lambda frame: frame["Confirmed"].to_numpy()[:, -1],
        Deaths=lambda frame: frame["Deaths"].to_numpy()[:, -1],
    )
)

cases_by_county

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days
0,1001,Alabama,Autauga,55869,15431,184,124,8
1,1003,Alabama,Baldwin,223234,54763,635,319,9
2,1005,Alabama,Barbour,24686,5429,92,20,3
3,1007,Alabama,Bibb,22394,6354,99,40,2
4,1009,Alabama,Blount,57826,14672,216,116,3
...,...,...,...,...,...,...,...,...
3139,56037,Wyoming,Sweetwater,42343,10937,122,120,3
3140,56039,Wyoming,Teton,23464,9692,15,114,0
3141,56041,Wyoming,Uinta,20226,5599,36,30,0
3142,56043,Wyoming,Washakie,7805,2303,42,29,0


## Normalize the Confirmed and Deaths counts by population

The populations of U.S. counties vary by several orders of magnitude, so it's useful to 
normalize the case counts for each county to the county's population. Compute confirmed cases
and deaths per 100 residents, for both the latest totals and the `Confirmed_7_Days`/`Deaths_7_Days` columns.

In [4]:
# Add a "<column>_per_100" companion for every count column: the count divided
# by the county's population, scaled to a rate per 100 residents.
for col in ("Confirmed", "Deaths", "Confirmed_7_Days", "Deaths_7_Days"):
    cases_by_county[col + "_per_100"] = (
        cases_by_county[col].div(cases_by_county["Population"]).mul(100)
    )

cases_by_county

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days,Confirmed_per_100,Deaths_per_100,Confirmed_7_Days_per_100,Deaths_7_Days_per_100
0,1001,Alabama,Autauga,55869,15431,184,124,8,27.619968,0.329342,0.221948,0.014319
1,1003,Alabama,Baldwin,223234,54763,635,319,9,24.531657,0.284455,0.142899,0.004032
2,1005,Alabama,Barbour,24686,5429,92,20,3,21.992222,0.372681,0.081018,0.012153
3,1007,Alabama,Bibb,22394,6354,99,40,2,28.373672,0.442083,0.178619,0.008931
4,1009,Alabama,Blount,57826,14672,216,116,3,25.372670,0.373534,0.200602,0.005188
...,...,...,...,...,...,...,...,...,...,...,...,...
3139,56037,Wyoming,Sweetwater,42343,10937,122,120,3,25.829535,0.288123,0.283400,0.007085
3140,56039,Wyoming,Teton,23464,9692,15,114,0,41.305830,0.063928,0.485851,0.000000
3141,56041,Wyoming,Uinta,20226,5599,36,30,0,27.682191,0.177989,0.148324,0.000000
3142,56043,Wyoming,Washakie,7805,2303,42,29,0,29.506726,0.538117,0.371557,0.000000


# Generate tables

Now that we have read and formatted the input data, we can use Pandas to generate summary tables of the 
latest COVID-19 data.

## Table: COVID-19 Cases and Deaths by State

Aggregate the most recent county-level numbers by state to build up a table of statewide totals.

In [5]:
# Sum the county-level population and counts within each state, then derive
# the statewide per-100-resident rates from those sums (rates must be
# recomputed from the totals rather than summed across counties).
cases_by_state = (
    cases_by_county
    .groupby("State")[["Population", "Confirmed", "Deaths"]]
    .sum()
    .assign(
        Confirmed_per_100=lambda totals: totals["Confirmed"] / totals["Population"] * 100,
        Deaths_per_100=lambda totals: totals["Deaths"] / totals["Population"] * 100,
    )
)
cases_by_state

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,4903185,1273092,17933,25.964592,0.365742
Alaska,740995,235131,1140,31.731793,0.153847
Arizona,7278717,1967702,27622,27.033638,0.37949
Arkansas,3017804,789039,10292,26.146131,0.341043
California,39512223,8888375,83999,22.495254,0.21259
Colorado,5758736,1301503,11681,22.600498,0.20284
Connecticut,3565287,716560,10357,20.098242,0.290496
Delaware,973764,254183,2695,26.103142,0.276761
District of Columbia,705749,133697,1314,18.943987,0.186185
Florida,21477737,5815744,67572,27.078011,0.314614


In [6]:
# Data preparation is complete; the remaining cells analyze the result.

# Latest nationwide totals: add up the statewide confirmed cases and deaths.
cases_by_state.loc[:, ["Confirmed", "Deaths"]].sum(axis=0)

Confirmed    77244176
Deaths         878871
dtype: int64

In [7]:
# The same nationwide totals, summed directly over the county-level rows
cases_by_county.loc[:, ["Confirmed", "Deaths"]].sum(axis=0)

Confirmed    77244176
Deaths         878871
dtype: int64

## Table: Top 10 states by total confirmed cases

In [8]:
# Order states from most to fewest confirmed cases and keep the first 10 rows.
cases_by_state.sort_values(by="Confirmed", ascending=False).iloc[:10]

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
California,39512223,8888375,83999,22.495254,0.21259
Texas,28995881,6503870,83698,22.430324,0.288655
Florida,21477737,5815744,67572,27.078011,0.314614
New York,19453561,4907852,26963,25.228553,0.138602
Illinois,12671821,3013578,32299,23.781728,0.254888
Pennsylvania,12801989,2744145,42789,21.435302,0.334237
Ohio,11689100,2644828,35488,22.626447,0.303599
North Carolina,10488084,2559097,22148,24.400043,0.211173
Georgia,10617423,2362567,33389,22.251793,0.314474
Michigan,9986857,2299347,33780,23.02373,0.338245


## Table: Top 10 states by confirmed cases per 100 residents

In [9]:
# Order states by confirmed cases per 100 residents, highest first; keep 10 rows.
cases_by_state.sort_values(by="Confirmed_per_100", ascending=False).iloc[:10]

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alaska,740995,235131,1140,31.731793,0.153847
North Dakota,762062,236902,2174,31.086972,0.285279
Rhode Island,1059361,324350,3381,30.617514,0.319155
Utah,3205958,914289,4286,28.518433,0.133689
South Carolina,5148714,1449637,16533,28.155322,0.321109
Kentucky,4467673,1255885,13565,28.110495,0.303626
Tennessee,6829174,1888962,22993,27.660183,0.336688
Florida,21477737,5815744,67572,27.078011,0.314614
Arizona,7278717,1967702,27622,27.033638,0.37949
West Virginia,1792147,481818,6162,26.88496,0.343833


## Table: Top 10 states by deaths per 100 residents

In [10]:
# Order states by deaths per 100 residents, highest first; keep 10 rows.
cases_by_state.sort_values(by="Deaths_per_100", ascending=False).iloc[:10]

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mississippi,2976149,785528,11836,26.394109,0.397695
Arizona,7278717,1967702,27622,27.033638,0.37949
New Jersey,8882190,2152155,32647,24.230004,0.367556
Alabama,4903185,1273092,17933,25.964592,0.365742
Louisiana,4648794,1156196,16400,24.87088,0.35278
West Virginia,1792147,481818,6162,26.88496,0.343833
Arkansas,3017804,789039,10292,26.146131,0.341043
Michigan,9986857,2299347,33780,23.02373,0.338245
Tennessee,6829174,1888962,22993,27.660183,0.336688
Massachusetts,6892503,1521141,23087,22.069501,0.334958


## Table: Top 20 counties by confirmed cases

In [11]:
# Order counties from most to fewest confirmed cases and keep the first 20 rows.
cases_by_county.sort_values(by="Confirmed", ascending=False).iloc[:20]

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days,Confirmed_per_100,Deaths_per_100,Confirmed_7_Days_per_100,Deaths_7_Days_per_100
206,6037,California,Los Angeles,10039107,2782953,30356,21083,454,27.721121,0.302377,0.210009,0.004522
105,4013,Arizona,Maricopa,4485414,1240505,15563,10002,243,27.656421,0.346969,0.222989,0.005418
364,12086,Florida,Miami-Dade,2716940,1173497,10153,4691,160,43.191863,0.373692,0.172657,0.005889
612,17031,Illinois,Cook,5150233,1105263,13884,7471,159,21.460447,0.26958,0.145061,0.003087
2625,48201,Texas,Harris,4713325,987881,10558,7900,106,20.959323,0.224003,0.16761,0.002249
224,6073,California,San Diego,3338330,777131,4947,10184,93,23.279035,0.148188,0.305063,0.002786
1853,36047,New York,Kings,2559903,683291,0,1777,0,26.692066,0.0,0.069417,0.0
1870,36081,New York,Queens,2253858,633085,0,1650,0,28.088948,0.0,0.073208,0.0
220,6065,California,Riverside,2470546,608821,6087,3479,126,24.643176,0.246383,0.140819,0.0051
327,12011,Florida,Broward,1952778,597048,5467,2664,88,30.57429,0.27996,0.136421,0.004506


## Table: Top 20 counties by confirmed cases per 100 residents

In [12]:
# Order counties by confirmed cases per 100 residents, highest first; keep 20 rows.
cases_by_county.sort_values(by="Confirmed_per_100", ascending=False).iloc[:20]

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days,Confirmed_per_100,Deaths_per_100,Confirmed_7_Days_per_100,Deaths_7_Days_per_100
2675,48301,Texas,Loving,169,157,0,6,0,92.899408,0.0,3.550296,0.0
96,2282,Alaska,Yakutat,579,471,5,6,0,81.34715,0.863558,1.036269,0.0
414,13053,Georgia,Chattahoochee,10907,6767,21,19,0,62.042725,0.192537,0.1742,0.0
2588,48127,Texas,Dimmit,10124,5625,50,86,0,55.561043,0.493876,0.849467,0.0
2383,46041,South Dakota,Dewey,5892,3110,38,690,0,52.783435,0.644942,11.710794,0.0
88,2188,Alaska,Northwest Arctic,7621,3989,12,185,0,52.342212,0.15746,2.427503,0.0
259,8025,Colorado,Crowley,6061,3094,28,26,0,51.047682,0.46197,0.428972,0.0
86,2180,Alaska,Nome,10004,5049,5,373,0,50.469812,0.04998,3.728509,0.0
84,2164,Alaska,Bristol Bay plus Lake and Peninsula,1592,788,1,5,0,49.497487,0.062814,0.31407,0.0
70,2050,Alaska,Bethel,18386,8783,38,98,1,47.770042,0.206679,0.533014,0.005439


## Table: Top 20 counties by deaths per 100 residents

In [13]:
# Order counties by deaths per 100 residents, highest first; keep 20 rows.
cases_by_county.sort_values(by="Deaths_per_100", ascending=False).iloc[:20]

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days,Confirmed_per_100,Deaths_per_100,Confirmed_7_Days_per_100,Deaths_7_Days_per_100
2680,48311,Texas,McMullen,743,166,9,2,1,22.341857,1.211306,0.269179,0.13459
2930,51640,Virginia,Galax,6347,2476,72,34,0,39.010556,1.134394,0.535686,0.0
458,13141,Georgia,Hancock,8457,1552,87,6,1,18.351661,1.028734,0.070947,0.011825
2697,48345,Texas,Motley,1200,268,12,3,0,22.333333,1.0,0.25,0.0
1095,21201,Kentucky,Robertson,2108,599,21,12,0,28.41556,0.996205,0.56926,0.0
2399,46073,South Dakota,Jerauld,2013,406,20,4,1,20.168902,0.993542,0.198708,0.049677
2602,48155,Texas,Foard,1155,198,11,0,0,17.142857,0.952381,0.0,0.0
2925,51595,Virginia,Emporia,5346,1095,50,16,1,20.482604,0.935279,0.299289,0.018706
2536,48023,Texas,Baylor,3509,804,32,8,1,22.912511,0.911941,0.227985,0.028498
2664,48279,Texas,Lamb,12893,4256,116,24,4,33.010161,0.899713,0.186148,0.031025
