# tables_us_data.ipynb

Build tables of the latest COVID-19 statistics for U.S. counties.

Inputs:
* `outputs/us_counties_clean.feather`: The contents of `data/us_counties.csv` after data cleaning by [clean_us_data.ipynb](./clean_us_data.ipynb), stored in Feather format
* `outputs/dates.feather`: A Feather file whose `date` column is loaded alongside the time series in `outputs/us_counties_clean.feather`

**Note:** You can redirect these input files by setting the environment variable `COVID_OUTPUTS_DIR` to a replacement for the prefix `outputs` in the above paths.

In [1]:
# Initialization boilerplate
import os
import json
import pandas as pd
import numpy as np
from typing import *

import text_extensions_for_pandas as tp

# Local file of utility functions
import util

# Data files are looked up under "outputs" unless the COVID_OUTPUTS_DIR
# environment variable names a different directory.
_OUTPUTS_DIR = os.environ.get("COVID_OUTPUTS_DIR", "outputs")

# Create the directory if necessary
util.ensure_dir_exists(_OUTPUTS_DIR)

# Read and Reformat Input Data

In [2]:
# Load the time series data from the binary (Feather) files that
# clean_us_data.ipynb produces. Both files live under _OUTPUTS_DIR.
cases_file = os.path.join(_OUTPUTS_DIR, "us_counties_clean.feather")
dates_file = os.path.join(_OUTPUTS_DIR, "dates.feather")

# The cases table is indexed by FIPS code.
cases = pd.read_feather(cases_file).set_index("FIPS")

# Only the "date" column of the dates file is kept, as a NumPy array.
dates = pd.read_feather(dates_file)["date"].to_numpy()

cases.head()

Unnamed: 0_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_7_Days,Deaths_7_Days
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1001,Alabama,Autauga,55869,[ 0 0 0 0 0 0 0 0 0 ...,[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,153,3
1003,Alabama,Baldwin,223234,[ 0 0 0 0 0 0 0 0 0 ...,[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,573,0
1005,Alabama,Barbour,24686,[ 0 0 0 0 0 0 0 0 0 ...,[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,31,1
1007,Alabama,Bibb,22394,[ 0 0 0 0 0 0 0 0 0 ...,[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,149,0
1009,Alabama,Blount,57826,[ 0 0 0 0 0 0 0 0 0 ...,[ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,189,2


## Extract the most recent element of each time series

The tables below focus on the most recent day's data, so we generate a DataFrame with
just the last element of each time series.

In [3]:
# Keep the identifying columns, the two time series columns, and the
# *_7_Days columns. reset_index() turns FIPS back into an ordinary column
# and returns a new DataFrame, so `cases` itself is not modified.
cases_by_county = (
    cases[[
        "State", "County", "Population",
        "Confirmed", "Deaths",
        "Confirmed_7_Days", "Deaths_7_Days",
    ]]
    .reset_index()
    .assign(
        # Each time series column converts to a 2-D array; slicing [:, -1]
        # keeps only the last (most recent) element of every row.
        Confirmed=lambda frame: frame["Confirmed"].to_numpy()[:, -1],
        Deaths=lambda frame: frame["Deaths"].to_numpy()[:, -1],
    )
)

cases_by_county

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days
0,1001,Alabama,Autauga,55869,2634,39,153,3
1,1003,Alabama,Baldwin,223234,8269,84,573,0
2,1005,Alabama,Barbour,24686,1161,10,31,1
3,1007,Alabama,Bibb,22394,1142,17,149,0
4,1009,Alabama,Blount,57826,2763,36,189,2
...,...,...,...,...,...,...,...,...
3137,56037,Wyoming,Sweetwater,42343,1548,6,438,2
3138,56039,Wyoming,Teton,23464,1433,2,241,0
3139,56041,Wyoming,Uinta,20226,950,4,190,0
3140,56043,Wyoming,Washakie,7805,325,7,98,0


## Normalize the Confirmed and Deaths counts by population

The populations of U.S. counties vary by several orders of magnitude, so it's useful to 
normalize the case count for each county to the county's population. Compute confirmed cases
and deaths per 100 residents.

In [4]:
# Add a "<column>_per_100" companion for every count column:
# the count divided by the county's population, times 100.
population = cases_by_county["Population"]
for count_col in ("Confirmed", "Deaths", "Confirmed_7_Days", "Deaths_7_Days"):
    cases_by_county[f"{count_col}_per_100"] = (
        cases_by_county[count_col].div(population).mul(100)
    )

cases_by_county

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days,Confirmed_per_100,Deaths_per_100,Confirmed_7_Days_per_100,Deaths_7_Days_per_100
0,1001,Alabama,Autauga,55869,2634,39,153,3,4.714600,0.069806,0.273855,0.005370
1,1003,Alabama,Baldwin,223234,8269,84,573,0,3.704185,0.037629,0.256681,0.000000
2,1005,Alabama,Barbour,24686,1161,10,31,1,4.703071,0.040509,0.125577,0.004051
3,1007,Alabama,Bibb,22394,1142,17,149,0,5.099580,0.075913,0.665357,0.000000
4,1009,Alabama,Blount,57826,2763,36,189,2,4.778127,0.062256,0.326843,0.003459
...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,Wyoming,Sweetwater,42343,1548,6,438,2,3.655858,0.014170,1.034409,0.004723
3138,56039,Wyoming,Teton,23464,1433,2,241,0,6.107228,0.008524,1.027105,0.000000
3139,56041,Wyoming,Uinta,20226,950,4,190,0,4.696925,0.019777,0.939385,0.000000
3140,56043,Wyoming,Washakie,7805,325,7,98,0,4.163997,0.089686,1.255605,0.000000


# Generate tables

Now that we have read and formatted the input data, we can use Pandas to generate summary tables of the 
latest COVID-19 data.

## Table: COVID-19 Cases and Deaths by State

Aggregate the most recent county-level numbers by state to build up a table of statewide totals.

In [5]:
# Sum the county-level figures within each state, then derive the per-100
# rates from those state totals (not from the county-level rates).
cases_by_state = (
    cases_by_county
    .groupby("State")[["Population", "Confirmed", "Deaths"]]
    .sum()
    .assign(
        Confirmed_per_100=lambda totals: totals["Confirmed"] / totals["Population"] * 100,
        Deaths_per_100=lambda totals: totals["Deaths"] / totals["Population"] * 100,
    )
    # Fix the column order explicitly
    [["Population", "Confirmed", "Deaths", "Confirmed_per_100", "Deaths_per_100"]]
)
cases_by_state

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,4903185,234080,3459,4.77404,0.070546
Alaska,731545,28144,103,3.8472,0.01408
Arizona,7278717,302324,6464,4.153534,0.088807
Arkansas,3017804,143832,2387,4.766115,0.079097
California,39512223,1128219,18753,2.855367,0.047461
Colorado,5758736,202262,2810,3.512264,0.048795
Connecticut,3565287,106167,4871,2.977797,0.136623
Delaware,973764,32111,752,3.297616,0.077226
District of Columbia,705749,20290,672,2.87496,0.095218
Florida,21477737,942348,18085,4.387557,0.084203


In [6]:
# Data preparation is complete; the remaining cells analyze the result.

# Latest nationwide totals: add up the state-level Confirmed and Deaths columns
cases_by_state.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    12261985
Deaths         255288
dtype: int64

In [7]:
# The same nationwide totals, summed directly from the county-level rows
cases_by_county.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    12261985
Deaths         255288
dtype: int64

## Table: Top 10 states by total confirmed cases

In [8]:
# Order states by the Confirmed column, largest first, and show the first 10
cases_by_state.sort_values(by="Confirmed", ascending=False).iloc[:10]

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Texas,28995881,1159863,21049,4.000096,0.072593
California,39512223,1128219,18753,2.855367,0.047461
Florida,21477737,942348,18085,4.387557,0.084203
Illinois,12671821,664257,12111,5.242001,0.095574
New York,19453561,602120,34192,3.095166,0.175762
Georgia,10617423,431201,8969,4.061259,0.084474
Wisconsin,5822434,379693,3158,6.521207,0.054238
Ohio,11689100,363304,6020,3.108058,0.051501
North Carolina,10488084,339190,5039,3.234051,0.048045
Tennessee,6829174,333245,4240,4.879726,0.062087


## Table: Top 10 states by confirmed cases per 100 residents

In [9]:
# Order states by Confirmed_per_100, largest first, and show the first 10
cases_by_state.sort_values(by="Confirmed_per_100", ascending=False).iloc[:10]

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
North Dakota,762062,73397,846,9.631369,0.111015
South Dakota,884659,73848,819,8.347623,0.092578
Iowa,3155070,214197,2222,6.788978,0.070426
Wisconsin,5822434,379693,3158,6.521207,0.054238
Nebraska,1934408,114960,928,5.942903,0.047973
Utah,3205958,179212,798,5.589967,0.024891
Montana,1068778,56381,614,5.275277,0.057449
Illinois,12671821,664257,12111,5.242001,0.095574
Idaho,1787065,93088,866,5.208988,0.048459
Wyoming,578759,29431,202,5.085191,0.034902


## Table: Top 10 states by deaths per 100 residents

In [10]:
# Order states by Deaths_per_100, largest first, and show the first 10
cases_by_state.sort_values(by="Deaths_per_100", ascending=False).iloc[:10]

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New Jersey,8882190,308864,16772,3.477341,0.188827
New York,19453561,602120,34192,3.095166,0.175762
Massachusetts,6892503,200764,10522,2.912788,0.152659
Connecticut,3565287,106167,4871,2.977797,0.136623
Louisiana,4648794,220785,6284,4.749296,0.135175
Mississippi,2976149,143879,3676,4.834402,0.123515
Rhode Island,1059361,39645,1256,3.74235,0.118562
North Dakota,762062,73397,846,9.631369,0.111015
Illinois,12671821,664257,12111,5.242001,0.095574
District of Columbia,705749,20290,672,2.87496,0.095218


## Table: Top 20 counties by confirmed cases

In [11]:
# Order counties by the Confirmed column, largest first, and show the first 20
cases_by_county.sort_values(by="Confirmed", ascending=False).iloc[:20]

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days,Confirmed_per_100,Deaths_per_100,Confirmed_7_Days_per_100,Deaths_7_Days_per_100
204,6037,California,Los Angeles,10039107,370636,7446,28293,171,3.691922,0.07417,0.281828,0.001703
610,17031,Illinois,Cook,5150233,284217,6339,28609,323,5.518527,0.123082,0.555489,0.006272
362,12086,Florida,Miami-Dade,2716940,216442,3782,12788,73,7.966389,0.139201,0.470677,0.002687
103,4013,Arizona,Maricopa,4485414,191503,3896,14510,98,4.269461,0.086859,0.323493,0.002185
2623,48201,Texas,Harris,4713325,183252,2965,7838,55,3.887956,0.062907,0.166294,0.001167
2579,48113,Texas,Dallas,2635516,130186,1494,9471,111,4.939678,0.056687,0.35936,0.004212
1748,32003,Nevada,Clark,2266715,106033,1691,9642,73,4.677827,0.074601,0.425373,0.003221
325,12011,Florida,Broward,1952778,101747,1629,6013,33,5.210372,0.08342,0.30792,0.00169
2742,48439,Texas,Tarrant,2102515,92977,987,10062,58,4.42218,0.046944,0.47857,0.002759
1868,36081,New York,Queens,2253858,86246,7338,3471,15,3.826594,0.325575,0.154003,0.000666


## Table: Top 20 counties by confirmed cases per 100 residents

In [12]:
# Order counties by Confirmed_per_100, largest first, and show the first 20
cases_by_county.sort_values(by="Confirmed_per_100", ascending=False).iloc[:20]

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days,Confirmed_per_100,Deaths_per_100,Confirmed_7_Days_per_100,Deaths_7_Days_per_100
956,20137,Kansas,Norton,5361,1053,21,39,0,19.641858,0.391718,0.727476,0.0
2365,46009,South Dakota,Bon Homme,6901,1320,14,91,7,19.127663,0.202869,1.318649,0.101435
150,5079,Arkansas,Lincoln,13024,2393,18,26,0,18.373771,0.138206,0.199631,0.0
2369,46017,South Dakota,Buffalo,1962,360,6,28,1,18.348624,0.30581,1.427115,0.050968
412,13053,Georgia,Chattahoochee,10907,1948,3,11,1,17.86009,0.027505,0.100853,0.009168
2512,47169,Tennessee,Trousdale,11284,1894,12,24,2,16.784828,0.106345,0.212691,0.017724
257,8025,Colorado,Crowley,6061,1009,1,321,0,16.647418,0.016499,5.296156,0.0
2475,47095,Tennessee,Lake,7016,1151,5,37,0,16.405359,0.071266,0.527366,0.0
352,12067,Florida,Lafayette,8422,1304,19,14,0,15.483258,0.2256,0.166231,0.0
1675,31043,Nebraska,Dakota,20026,3043,50,161,1,15.195246,0.249675,0.803955,0.004994


## Table: Top 20 counties by deaths per 100 residents

In [13]:
# Order counties by Deaths_per_100, largest first, and show the first 20
cases_by_county.sort_values(by="Deaths_per_100", ascending=False).iloc[:20]

Unnamed: 0,FIPS,State,County,Population,Confirmed,Deaths,Confirmed_7_Days,Deaths_7_Days,Confirmed_per_100,Deaths_per_100,Confirmed_7_Days_per_100,Deaths_7_Days_per_100
919,20063,Kansas,Gove,2636,278,19,27,0,10.546282,0.720789,1.024279,0.0
2397,46073,South Dakota,Jerauld,2013,231,13,12,0,11.47541,0.645802,0.596125,0.0
456,13141,Georgia,Hancock,8457,468,45,26,0,5.533877,0.532104,0.307438,0.0
2923,51595,Virginia,Emporia,5346,321,28,14,0,6.004489,0.523756,0.261878,0.0
2653,48261,Texas,Kenedy,404,11,2,0,0,2.722772,0.49505,0.0,0.0
2928,51640,Virginia,Galax,6347,533,30,28,0,8.397668,0.472664,0.441153,0.0
2000,38021,North Dakota,Dickey,4872,524,23,36,1,10.755337,0.472085,0.738916,0.020525
506,13243,Georgia,Randolph,6778,365,30,1,0,5.385069,0.442608,0.014754,0.0
1131,22037,Louisiana,East Feliciana,19135,2076,84,85,1,10.849229,0.438986,0.444212,0.005226
2423,46125,South Dakota,Turner,8384,757,36,45,2,9.029103,0.429389,0.536737,0.023855
