# tables_us_data.ipynb

Build tables of the latest COVID-19 statistics for U.S. counties, along with the same statistics aggregated by state.

Inputs:
* `data/us_counties_clean.csv`: The contents of `data/us_counties.csv` after data cleaning by `clean.ipynb`
* `data/us_counties_clean_meta.json`: Column type metadata for reading `data/us_counties_clean.csv` with `pd.read_csv()`


In [1]:
# Initialization boilerplate
import os
import json
import pandas as pd
import numpy as np
from typing import *

import text_extensions_for_pandas as tp

# Local file of utility functions
import util

# Directory holding the input data files; the COVID_DATA_DIR environment
# variable overrides the default of "../data".
_DATA_DIR = os.environ.get("COVID_DATA_DIR", "../data")

In [2]:
# Load the cleaned county-level time series along with its saved column types.
meta_file = os.path.join(_DATA_DIR, "us_counties_clean_meta.json")
csv_file = os.path.join(_DATA_DIR, "us_counties_clean.csv")

# The JSON file holds the column type metadata that is handed to
# `pd.read_csv()` through its `dtype` argument.
with open(meta_file) as f:
    cases_meta = json.load(f)

# `pd.read_csv()` does not accept a datetime64 dtype for a column, so the
# "Date" column is declared as plain objects here and turned into timestamps
# by the `parse_dates` argument below.
cases_meta.update({"Date": "object"})

# Index the rows by (FIPS, Date); `verify_integrity=True` makes `set_index()`
# check the new index for duplicate keys.
cases_vertical = pd.read_csv(
    csv_file, dtype=cases_meta, parse_dates=["Date"]
).set_index(["FIPS", "Date"], verify_integrity=True)
cases_vertical

Unnamed: 0_level_0,Unnamed: 1_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier
FIPS,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1001,2020-01-22,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-23,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-24,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-25,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-26,Alabama,Autauga,55869,0,0,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...
56045,2020-05-13,Wyoming,Weston,6927,0,0,0,False,False,False
56045,2020-05-14,Wyoming,Weston,6927,0,0,0,False,False,False
56045,2020-05-15,Wyoming,Weston,6927,0,0,0,False,False,False
56045,2020-05-16,Wyoming,Weston,6927,0,0,0,False,False,False


In [3]:
# Express the Confirmed and Deaths counts as rates per 100 residents.
# `assign()` returns a new DataFrame, so `cases_vertical` itself is not modified.
cases = cases_vertical.assign(
    Confirmed_per_100=lambda df: df["Confirmed"] / df["Population"] * 100,
    Deaths_per_100=lambda df: df["Deaths"] / df["Population"] * 100,
)
cases

Unnamed: 0_level_0,Unnamed: 1_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1001,2020-01-22,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-23,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-24,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-25,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-26,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
56045,2020-05-13,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0
56045,2020-05-14,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0
56045,2020-05-15,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0
56045,2020-05-16,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0


In [4]:
# Keep only the rows for the most recent date in the data set, one per county.
# The cutoff is the single latest "Date" value across all counties, so a county
# with no row on that date does not appear in the result.
cases_without_index = cases.reset_index()
last_date = cases_without_index["Date"].max()
cases_by_county = (
    cases_without_index
    .loc[lambda df: df["Date"] == last_date]
    .set_index("FIPS")
)
cases_by_county

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1001,2020-05-17,Alabama,Autauga,55869,110,4,0,False,False,False,0.196889,0.007160
1003,2020-05-17,Alabama,Baldwin,223234,254,8,0,False,False,False,0.113782,0.003584
1005,2020-05-17,Alabama,Barbour,24686,81,1,0,False,False,False,0.328121,0.004051
1007,2020-05-17,Alabama,Bibb,22394,50,1,0,False,False,False,0.223274,0.004465
1009,2020-05-17,Alabama,Blount,57826,46,1,0,False,False,False,0.079549,0.001729
...,...,...,...,...,...,...,...,...,...,...,...,...
56037,2020-05-17,Wyoming,Sweetwater,42343,23,0,0,False,False,False,0.054318,0.000000
56039,2020-05-17,Wyoming,Teton,23464,100,0,0,False,False,False,0.426185,0.000000
56041,2020-05-17,Wyoming,Uinta,20226,10,0,0,False,False,False,0.049441,0.000000
56043,2020-05-17,Wyoming,Washakie,7805,16,0,0,False,False,False,0.204997,0.000000


In [5]:
# Roll the county-level snapshot up to one row per state.
# The raw counts are summed first; the per-100 rates are then recomputed from
# the state totals rather than aggregated from the county-level rates.
cases_by_state = (
    cases_by_county
    .groupby("State")[["Population", "Confirmed", "Deaths"]]
    .sum()
    .assign(
        Confirmed_per_100=lambda df: df["Confirmed"] / df["Population"] * 100,
        Deaths_per_100=lambda df: df["Deaths"] / df["Population"] * 100,
    )
)
cases_by_state

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,4903185,11771,488,0.240068,0.009953
Alaska,731545,387,10,0.052902,0.001367
Arizona,7278717,13945,679,0.191586,0.009329
Arkansas,3017804,4627,98,0.153323,0.003247
California,39512223,80166,3240,0.202889,0.0082
Colorado,5758736,21855,1214,0.37951,0.021081
Connecticut,3565287,37209,3407,1.043647,0.09556
Delaware,973764,7618,289,0.782325,0.029679
District of Columbia,705749,7123,383,1.009282,0.054269
Florida,21477737,45509,1973,0.211889,0.009186


In [6]:
# Data preparation is finished; the remaining cells analyze the tables.

# Nationwide totals as of the latest date, summed over the state-level table
cases_by_state.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    1472204
Deaths         88389
dtype: int64

In [7]:
# The same nationwide totals for the latest date, this time summed directly
# from the county-level table
cases_by_county.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    1472204
Deaths         88389
dtype: int64

In [8]:
# The 10 states with the most confirmed cases, largest first
(
    cases_by_state
    .sort_values(by="Confirmed", ascending=False)
    .iloc[:10]
)

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,350118,28229,1.799763,0.14511
New Jersey,8882190,146504,10363,1.649413,0.116672
Illinois,12671821,94040,4176,0.742119,0.032955
Massachusetts,6892503,85657,5793,1.242756,0.084048
California,39512223,80166,3240,0.202889,0.0082
Pennsylvania,12801989,65700,4495,0.513202,0.035112
Michigan,9986857,48396,4831,0.484597,0.048374
Texas,28995881,48396,1343,0.166906,0.004632
Florida,21477737,45509,1973,0.211889,0.009186
Maryland,6045680,38804,1911,0.641847,0.031609


In [9]:
# The 10 states with the most confirmed cases per 100 residents, largest first
(
    cases_by_state
    .sort_values(by="Confirmed_per_100", ascending=False)
    .iloc[:10]
)

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,350118,28229,1.799763,0.14511
New Jersey,8882190,146504,10363,1.649413,0.116672
Massachusetts,6892503,85657,5793,1.242756,0.084048
Connecticut,3565287,37209,3407,1.043647,0.09556
Rhode Island,1059361,10789,0,1.018444,0.0
District of Columbia,705749,7123,383,1.009282,0.054269
Delaware,973764,7618,289,0.782325,0.029679
Illinois,12671821,94040,4176,0.742119,0.032955
Louisiana,4648794,34360,2425,0.739116,0.052164
Maryland,6045680,38804,1911,0.641847,0.031609


In [10]:
# The 10 states with the most deaths per 100 residents, largest first
(
    cases_by_state
    .sort_values(by="Deaths_per_100", ascending=False)
    .iloc[:10]
)

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,350118,28229,1.799763,0.14511
New Jersey,8882190,146504,10363,1.649413,0.116672
Connecticut,3565287,37209,3407,1.043647,0.09556
Massachusetts,6892503,85657,5793,1.242756,0.084048
District of Columbia,705749,7123,383,1.009282,0.054269
Louisiana,4648794,34360,2425,0.739116,0.052164
Michigan,9986857,48396,4831,0.484597,0.048374
Pennsylvania,12801989,65700,4495,0.513202,0.035112
Illinois,12671821,94040,4176,0.742119,0.032955
Maryland,6045680,38804,1911,0.641847,0.031609


In [11]:
# The 20 counties with the most confirmed cases in total, largest first
(
    cases_by_county
    .sort_values(by="Confirmed", ascending=False)
    .iloc[:20]
)

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
17031,2020-05-17,Illinois,Cook,5150233,62218,2839,0,False,False,False,1.208062,0.055124
36047,2020-05-17,New York,Kings,2559903,59137,6362,0,False,False,False,2.310127,0.248525
36081,2020-05-17,New York,Queens,2253858,52067,5601,0,False,False,False,2.310128,0.248507
36059,2020-05-17,New York,Nassau,1356924,39136,2044,0,False,False,False,2.88417,0.150635
36103,2020-05-17,New York,Suffolk,1476601,38117,1748,0,False,False,False,2.581401,0.11838
6037,2020-05-17,California,Los Angeles,10039107,38011,1821,0,False,False,False,0.378629,0.018139
36061,2020-05-17,New York,New York,1628706,37625,4047,0,False,False,False,2.310116,0.248479
36005,2020-05-17,New York,Bronx,1418207,32762,3524,0,False,False,False,2.3101,0.248483
36119,2020-05-17,New York,Westchester,967506,32224,1289,0,False,False,False,3.330625,0.133229
42101,2020-05-17,Pennsylvania,Philadelphia,1584064,19606,1031,0,False,False,False,1.237703,0.065086


In [12]:
# The 20 counties with the most confirmed cases per 100 residents, largest first
(
    cases_by_county
    .sort_values(by="Confirmed_per_100", ascending=False)
    .iloc[:20]
)

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
47169,2020-05-17,Tennessee,Trousdale,11284,1383,4,0,False,False,False,12.256292,0.035448
5079,2020-05-17,Arkansas,Lincoln,13024,955,8,0,False,False,False,7.332617,0.061425
31043,2020-05-17,Nebraska,Dakota,20026,1458,3,0,False,False,False,7.280535,0.014981
27105,2020-05-17,Minnesota,Nobles,21629,1361,2,0,False,False,False,6.292478,0.009247
47095,2020-05-17,Tennessee,Lake,7016,404,0,0,False,False,False,5.758267,0.0
31037,2020-05-17,Nebraska,Colfax,10709,516,0,0,False,False,False,4.818377,0.0
18017,2020-05-17,Indiana,Cass,37689,1553,5,0,False,False,False,4.120566,0.013266
47007,2020-05-17,Tennessee,Bledsoe,15064,607,1,0,False,False,False,4.029474,0.006638
36087,2020-05-17,New York,Rockland,325789,12758,602,0,False,False,False,3.916032,0.184782
20057,2020-05-17,Kansas,Ford,33619,1311,8,0,False,False,False,3.899581,0.023796


In [13]:
# The 20 counties with the most *deaths* per 100 residents, largest first
(
    cases_by_county
    .sort_values(by="Deaths_per_100", ascending=False)
    .iloc[:20]
)

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
13243,2020-05-17,Georgia,Randolph,6778,169,21,0,False,False,False,2.493361,0.309826
13273,2020-05-17,Georgia,Terrell,8531,199,24,0,False,False,False,2.332669,0.281327
13099,2020-05-17,Georgia,Early,10190,233,28,0,False,False,False,2.286555,0.274779
36047,2020-05-17,New York,Kings,2559903,59137,6362,0,False,False,False,2.310127,0.248525
36081,2020-05-17,New York,Queens,2253858,52067,5601,0,False,False,False,2.310128,0.248507
36005,2020-05-17,New York,Bronx,1418207,32762,3524,0,False,False,False,2.3101,0.248483
36061,2020-05-17,New York,New York,1628706,37625,4047,0,False,False,False,2.310116,0.248479
36085,2020-05-17,New York,Richmond,476143,10999,1183,0,False,False,False,2.31002,0.248455
34013,2020-05-17,New Jersey,Essex,798975,16204,1535,0,False,False,False,2.028099,0.192121
36087,2020-05-17,New York,Rockland,325789,12758,602,0,False,False,False,3.916032,0.184782
