# tables_us_data.ipynb

Build tables of the latest COVID-19 statistics for U.S. counties.

Inputs:
* `outputs/us_counties_clean.csv`: The contents of `data/us_counties.csv` after data cleaning by [clean_us_data.ipynb](./clean_us_data.ipynb)
* `outputs/us_counties_clean_meta.json`: Column type metadata for reading `outputs/us_counties_clean.csv` with `pd.read_csv()`

**Note:** You can redirect these input files by setting the environment variable `COVID_OUTPUTS_DIR` to a replacement for the prefix `outputs` in the above paths.

In [1]:
# Initialization boilerplate
import os
import json
import pandas as pd
import numpy as np
from typing import *

import text_extensions_for_pandas as tp

# Local file of utility functions
import util

# Data files live under "outputs" unless the COVID_OUTPUTS_DIR environment
# variable points somewhere else.
_OUTPUTS_DIR = os.environ.get("COVID_OUTPUTS_DIR", "outputs")

# Presumably creates the directory when it is missing (helper lives in util.py).
util.ensure_dir_exists(_OUTPUTS_DIR)

In [2]:
# Load the cleaned county-level time series and apply the saved column types.
csv_file = os.path.join(_OUTPUTS_DIR, "us_counties_clean.csv")
meta_file = os.path.join(_OUTPUTS_DIR, "us_counties_clean_meta.json")

# Read the column type metadata that is passed to pd.read_csv() as `dtype`.
# The encoding is given explicitly so that reading the JSON file does not
# depend on the platform's default text encoding.
with open(meta_file, encoding="utf-8") as f:
    cases_meta = json.load(f)

# pd.read_csv() does not accept a datetime64 type in its `dtype` argument.
# As a workaround, declare the "Date" column as "object" here and let the
# `parse_dates` argument below convert it to datetime64 during the read.
cases_meta["Date"] = "object"

cases_vertical = (
    pd
    .read_csv(csv_file, dtype=cases_meta, parse_dates=["Date"])
    # Index by (county, day). verify_integrity=True raises an error if the
    # same (FIPS, Date) pair appears more than once.
    .set_index(["FIPS", "Date"], verify_integrity=True)
)
cases_vertical

Unnamed: 0_level_0,Unnamed: 1_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier
FIPS,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1001,2020-01-22,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-23,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-24,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-25,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-26,Alabama,Autauga,55869,0,0,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...
56045,2020-05-17,Wyoming,Weston,6927,0,0,0,False,False,False
56045,2020-05-18,Wyoming,Weston,6927,0,0,0,False,False,False
56045,2020-05-19,Wyoming,Weston,6927,0,0,0,False,False,False
56045,2020-05-20,Wyoming,Weston,6927,0,0,0,False,False,False


In [3]:
# Add per-capita versions of the Confirmed and Deaths counts, expressed as
# cases per 100 residents. assign() returns a new DataFrame, so
# cases_vertical itself is left unmodified.
cases = cases_vertical.assign(
    Confirmed_per_100=lambda df: df["Confirmed"] / df["Population"] * 100,
    Deaths_per_100=lambda df: df["Deaths"] / df["Population"] * 100,
)
cases

Unnamed: 0_level_0,Unnamed: 1_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1001,2020-01-22,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-23,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-24,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-25,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-26,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
56045,2020-05-17,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0
56045,2020-05-18,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0
56045,2020-05-19,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0
56045,2020-05-20,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0


In [4]:
# Build a one-row-per-county snapshot: keep only the rows that fall on the
# latest date present in the data, index them by FIPS code, and discard the
# outlier flag columns.
cases_without_index = cases.reset_index()
last_date = cases_without_index["Date"].max()
cases_by_county = (
    cases_without_index
    .loc[lambda df: df["Date"] == last_date]
    .drop(columns=["Confirmed_Outlier", "Deaths_Outlier", "Recovered_Outlier"])
    .set_index("FIPS")
)
cases_by_county

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1001,2020-05-21,Alabama,Autauga,55869,143,3,0,0.255956,0.005370
1003,2020-05-21,Alabama,Baldwin,223234,269,8,0,0.120501,0.003584
1005,2020-05-21,Alabama,Barbour,24686,99,1,0,0.401037,0.004051
1007,2020-05-21,Alabama,Bibb,22394,52,1,0,0.232205,0.004465
1009,2020-05-21,Alabama,Blount,57826,47,1,0,0.081278,0.001729
...,...,...,...,...,...,...,...,...,...
56037,2020-05-21,Wyoming,Sweetwater,42343,25,0,0,0.059042,0.000000
56039,2020-05-21,Wyoming,Teton,23464,100,0,0,0.426185,0.000000
56041,2020-05-21,Wyoming,Uinta,20226,13,0,0,0.064274,0.000000
56043,2020-05-21,Wyoming,Washakie,7805,20,0,0,0.256246,0.000000


In [5]:
# Roll the county snapshot up to one row per state: sum the raw counts,
# then recompute the per-100-residents rates from the state-level sums
# (rates cannot simply be summed across counties).
cases_by_state = (
    cases_by_county
    .groupby("State")[["Population", "Confirmed", "Deaths"]]
    .sum()
    .assign(
        Confirmed_per_100=lambda df: df["Confirmed"] / df["Population"] * 100,
        Deaths_per_100=lambda df: df["Deaths"] / df["Population"] * 100,
    )
)
cases_by_state

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,4903185,13288,529,0.271008,0.010789
Alaska,731545,401,10,0.054815,0.001367
Arizona,7278717,15348,762,0.210861,0.010469
Arkansas,3017804,5307,110,0.175856,0.003645
California,39512223,88031,3583,0.222794,0.009068
Colorado,5758736,23100,1309,0.40113,0.022731
Connecticut,3565287,39014,3581,1.094274,0.100441
Delaware,973764,8340,316,0.85647,0.032451
District of Columbia,705749,7788,412,1.103508,0.058378
Florida,21477737,48593,2144,0.226248,0.009982


In [6]:
# Data preparation is finished; the remaining cells analyze the tables.

# Latest nationwide totals, summed over the state-level table
cases_by_state.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    1560374
Deaths         93537
dtype: int64

In [7]:
# Nationwide totals for the latest date, summed directly over the county-level table
cases_by_county.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    1560374
Deaths         93537
dtype: int64

In [8]:
# The 10 states with the most confirmed cases, largest first
(
    cases_by_state
    .sort_values(by="Confirmed", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,355130,28384,1.825527,0.145906
New Jersey,8882190,151586,10846,1.706629,0.12211
Illinois,12671821,102512,4606,0.808976,0.036348
Massachusetts,6892503,89741,6144,1.302009,0.08914
California,39512223,88031,3583,0.222794,0.009068
Pennsylvania,12801989,69252,4869,0.540947,0.038033
Texas,28995881,53053,1460,0.182967,0.005035
Michigan,9986857,50083,5065,0.501489,0.050717
Florida,21477737,48593,2144,0.226248,0.009982
Maryland,6045680,43531,2094,0.720035,0.034636


In [9]:
# The 10 states with the most confirmed cases per 100 residents, largest first
(
    cases_by_state
    .sort_values(by="Confirmed_per_100", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,355130,28384,1.825527,0.145906
New Jersey,8882190,151586,10846,1.706629,0.12211
Massachusetts,6892503,89741,6144,1.302009,0.08914
Rhode Island,1059361,12018,425,1.134457,0.040119
District of Columbia,705749,7788,412,1.103508,0.058378
Connecticut,3565287,39014,3581,1.094274,0.100441
Delaware,973764,8340,316,0.85647,0.032451
Illinois,12671821,102512,4606,0.808976,0.036348
Louisiana,4648794,36426,2506,0.783558,0.053906
Maryland,6045680,43531,2094,0.720035,0.034636


In [10]:
# The 10 states with the most deaths per 100 residents, largest first
(
    cases_by_state
    .sort_values(by="Deaths_per_100", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,355130,28384,1.825527,0.145906
New Jersey,8882190,151586,10846,1.706629,0.12211
Connecticut,3565287,39014,3581,1.094274,0.100441
Massachusetts,6892503,89741,6144,1.302009,0.08914
District of Columbia,705749,7788,412,1.103508,0.058378
Louisiana,4648794,36426,2506,0.783558,0.053906
Michigan,9986857,50083,5065,0.501489,0.050717
Rhode Island,1059361,12018,425,1.134457,0.040119
Pennsylvania,12801989,69252,4869,0.540947,0.038033
Illinois,12671821,102512,4606,0.808976,0.036348


In [11]:
# The 20 counties with the most confirmed cases in total, largest first
(
    cases_by_county
    .sort_values(by="Confirmed", ascending=False)
    .head(20)
)

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
17031,2020-05-21,Illinois,Cook,5150233,67551,3114,0,1.311611,0.060463
36081,2020-05-21,New York,Queens,2253858,59816,6051,0,2.653938,0.268473
36047,2020-05-21,New York,Kings,2559903,53033,6513,0,2.07168,0.254424
36005,2020-05-21,New York,Bronx,1418207,44169,4337,0,3.114425,0.305809
6037,2020-05-21,California,Los Angeles,10039107,42037,2016,0,0.418732,0.020081
36059,2020-05-21,New York,Nassau,1356924,39487,2073,0,2.910038,0.152772
36103,2020-05-21,New York,Suffolk,1476601,38553,1802,0,2.610929,0.122037
36119,2020-05-21,New York,Westchester,967506,32673,1313,0,3.377033,0.13571
36061,2020-05-21,New York,New York,1628706,24119,2796,0,1.480869,0.17167
42101,2020-05-21,Pennsylvania,Philadelphia,1584064,20700,1178,0,1.306765,0.074366


In [12]:
# The 20 counties with the most confirmed cases per 100 residents, largest first
(
    cases_by_county
    .sort_values(by="Confirmed_per_100", ascending=False)
    .head(20)
)

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
47169,2020-05-21,Tennessee,Trousdale,11284,1385,4,0,12.274016,0.035448
31043,2020-05-21,Nebraska,Dakota,20026,1489,15,0,7.435334,0.074903
5079,2020-05-21,Arkansas,Lincoln,13024,966,8,0,7.417076,0.061425
27105,2020-05-21,Minnesota,Nobles,21629,1414,2,0,6.537519,0.009247
47095,2020-05-21,Tennessee,Lake,7016,403,0,0,5.744014,0.0
31037,2020-05-21,Nebraska,Colfax,10709,573,2,0,5.35064,0.018676
20057,2020-05-21,Kansas,Ford,33619,1437,8,0,4.274369,0.023796
18017,2020-05-21,Indiana,Cass,37689,1572,6,0,4.170978,0.01592
40139,2020-05-21,Oklahoma,Texas,19983,820,4,0,4.103488,0.020017
47007,2020-05-21,Tennessee,Bledsoe,15064,607,1,0,4.029474,0.006638


In [13]:
# The 20 counties with the most *deaths* per 100 residents, largest first
(
    cases_by_county
    .sort_values(by="Deaths_per_100", ascending=False)
    .head(20)
)

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
36005,2020-05-21,New York,Bronx,1418207,44169,4337,0,3.114425,0.305809
13243,2020-05-21,Georgia,Randolph,6778,170,20,0,2.508114,0.295072
13273,2020-05-21,Georgia,Terrell,8531,204,25,0,2.391279,0.293049
13099,2020-05-21,Georgia,Early,10190,235,29,0,2.306183,0.284593
36081,2020-05-21,New York,Queens,2253858,59816,6051,0,2.653938,0.268473
36047,2020-05-21,New York,Kings,2559903,53033,6513,0,2.07168,0.254424
36085,2020-05-21,New York,Richmond,476143,13210,947,0,2.774377,0.19889
34013,2020-05-21,New Jersey,Essex,798975,16980,1579,0,2.125223,0.197628
36087,2020-05-21,New York,Rockland,325789,12877,611,0,3.952558,0.187545
34039,2020-05-21,New Jersey,Union,556341,15176,1005,0,2.727823,0.180645
