In [None]:
!pip install pandas==1.0.3 regex matplotlib==3.2.1 cython grpcio-tools scikit-learn pyarrow fastparquet plotly memoized-property opt-einsum==2.3.2 > /dev/null 2>&1

In [None]:
!pip install --upgrade git+https://github.com/frreiss/text-extensions-for-pandas > /dev/null 2>&1

In [None]:
# IPython magic: loads the source text of the local file util.py into this cell.
# NOTE(review): the initialization cell below also runs `import util`; confirm
# whether both mechanisms are needed.
%load util.py

# tables_us_data.ipynb

Build tables of the latest COVID-19 statistics for U.S. counties and states.

Inputs:
* `data/us_counties_clean.csv`: The contents of `data/us_counties.csv` after data cleaning by `clean.ipynb`
* `data/us_counties_clean_meta.json`: Column type metadata for reading `data/us_counties_clean.csv` with `pd.read_csv()`


In [1]:
# Initialization boilerplate

# Ensure a consistent Python environment.
#import sys
#sys.path.append("..")  # Local libraries are in the directory above "notebooks"
#import env
#env.maybe_install_libs()

import os
import json
import pandas as pd
import numpy as np

from typing import *

import text_extensions_for_pandas as tp

# Local file of utility functions
import util

# Directory holding the input data files. The COVID_DATA_DIR environment
# variable overrides the default relative location.
_DATA_DIR = os.environ.get("COVID_DATA_DIR", "../data")

In [2]:
# Read in the CSV file and apply the saved type information
csv_file = os.path.join(_DATA_DIR, "us_counties_clean.csv")
meta_file = os.path.join(_DATA_DIR, "us_counties_clean_meta.json")

# Read column type metadata. JSON text is UTF-8, so decode it explicitly
# instead of relying on the platform's default encoding.
with open(meta_file, encoding="utf-8") as f:
    cases_meta = json.load(f)

# The `dtype` argument of `pd.read_csv()` cannot request datetime64 columns.
# As a workaround, declare the "Date" column as plain objects here and let
# `parse_dates` convert it to timestamps while the file is being read.
cases_meta["Date"] = "object"

cases_vertical = (
    pd
    .read_csv(csv_file, dtype=cases_meta, parse_dates=["Date"])
    # Index by (FIPS, Date); verify_integrity rejects duplicate index entries.
    .set_index(["FIPS", "Date"], verify_integrity=True)
)
cases_vertical

Unnamed: 0_level_0,Unnamed: 1_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier
FIPS,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1001,2020-01-22,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-23,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-24,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-25,Alabama,Autauga,55869,0,0,0,False,False,False
1001,2020-01-26,Alabama,Autauga,55869,0,0,0,False,False,False
...,...,...,...,...,...,...,...,...,...,...
56045,2020-04-23,Wyoming,Weston,6927,0,0,0,False,False,False
56045,2020-04-24,Wyoming,Weston,6927,0,0,0,False,False,False
56045,2020-04-25,Wyoming,Weston,6927,0,0,0,False,False,False
56045,2020-04-26,Wyoming,Weston,6927,0,0,0,False,False,False


In [3]:
# Express the Confirmed and Deaths counts as a percentage of the population
# (i.e. per 100 residents). `assign` returns a new frame, so `cases_vertical`
# itself is left unmodified.
cases = cases_vertical.assign(
    Confirmed_per_100=lambda frame: frame["Confirmed"] / frame["Population"] * 100,
    Deaths_per_100=lambda frame: frame["Deaths"] / frame["Population"] * 100,
)
cases

Unnamed: 0_level_0,Unnamed: 1_level_0,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1001,2020-01-22,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-23,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-24,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-25,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
1001,2020-01-26,Alabama,Autauga,55869,0,0,0,False,False,False,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
56045,2020-04-23,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0
56045,2020-04-24,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0
56045,2020-04-25,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0
56045,2020-04-26,Wyoming,Weston,6927,0,0,0,False,False,False,0.0,0.0


In [4]:
# Keep only the snapshot for the latest date present in the data set.
# Note that the filter uses the single, global maximum date: a FIPS code
# without a row for that date does not appear in the result.
cases_without_index = cases.reset_index()
last_date = cases_without_index["Date"].max()
is_last_date = cases_without_index["Date"] == last_date
cases_by_county = cases_without_index.loc[is_last_date].set_index("FIPS")
cases_by_county

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1001,2020-04-27,Alabama,Autauga,55869,39,3,0,False,False,False,0.069806,0.005370
1003,2020-04-27,Alabama,Baldwin,223234,168,3,0,False,False,False,0.075257,0.001344
1005,2020-04-27,Alabama,Barbour,24686,35,0,0,False,False,False,0.141781,0.000000
1007,2020-04-27,Alabama,Bibb,22394,42,0,0,False,False,False,0.187550,0.000000
1009,2020-04-27,Alabama,Blount,57826,34,0,0,False,False,False,0.058797,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
56037,2020-04-27,Wyoming,Sweetwater,42343,16,0,0,False,False,False,0.037787,0.000000
56039,2020-04-27,Wyoming,Teton,23464,95,0,0,False,False,False,0.404876,0.000000
56041,2020-04-27,Wyoming,Uinta,20226,7,0,0,False,False,False,0.034609,0.000000
56043,2020-04-27,Wyoming,Washakie,7805,8,0,0,False,False,False,0.102498,0.000000


In [5]:
# Roll the county-level snapshot up to one row per state.
state_totals = (
    cases_by_county
    .groupby("State")
    .agg({"Population": "sum", "Confirmed": "sum", "Deaths": "sum"})
)

# The per-100 rates are recomputed from the state totals; the county-level
# rates are not carried through the aggregation.
cases_by_state = state_totals.assign(
    Confirmed_per_100=lambda totals: totals["Confirmed"] / totals["Population"] * 100,
    Deaths_per_100=lambda totals: totals["Deaths"] / totals["Population"] * 100,
)[["Population", "Confirmed", "Deaths", "Confirmed_per_100", "Deaths_per_100"]]
cases_by_state

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Alabama,4903185,6539,228,0.133362,0.00465
Alaska,731545,345,5,0.04716,0.000683
Arizona,7278717,6725,275,0.092393,0.003778
Arkansas,3017804,2866,51,0.09497,0.00169
California,39512223,44966,1777,0.113803,0.004497
Colorado,5758736,13703,702,0.237952,0.01219
Connecticut,3565287,25513,2011,0.715595,0.056405
Delaware,973764,4151,125,0.426284,0.012837
District of Columbia,705749,3892,185,0.551471,0.026213
Florida,21477737,32137,1088,0.149629,0.005066


In [6]:
# Data preparation is complete; the analysis starts here.

# Latest nationwide totals, summed over the state-level table
cases_by_state.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    978151
Deaths        55616
dtype: int64

In [7]:
# The same nationwide totals, this time summed over the county-level table
# (the figures refer to the latest date in the data set).
cases_by_county.loc[:, ["Confirmed", "Deaths"]].sum()

Confirmed    978151
Deaths        55616
dtype: int64

In [8]:
# The 10 states with the most confirmed cases, largest first
cases_by_state.sort_values(by="Confirmed", ascending=False).iloc[:10]

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,291994,22666,1.50098,0.116513
New Jersey,8882190,110370,6044,1.242599,0.068046
Massachusetts,6892503,55642,2994,0.807283,0.043439
Illinois,12671821,45545,1983,0.35942,0.015649
California,39512223,44966,1777,0.113803,0.004497
Pennsylvania,12801989,43558,1886,0.340244,0.014732
Michigan,9986857,37056,3367,0.371048,0.033714
Florida,21477737,32137,1088,0.149629,0.005066
Louisiana,4648794,27004,1697,0.580882,0.036504
Connecticut,3565287,25513,2011,0.715595,0.056405


In [9]:
# The 10 states with the most confirmed cases per 100 residents, largest first
cases_by_state.sort_values(by="Confirmed_per_100", ascending=False).iloc[:10]

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,291994,22666,1.50098,0.116513
New Jersey,8882190,110370,6044,1.242599,0.068046
Massachusetts,6892503,55642,2994,0.807283,0.043439
Connecticut,3565287,25513,2011,0.715595,0.056405
Rhode Island,1059361,6548,0,0.618108,0.0
Louisiana,4648794,27004,1697,0.580882,0.036504
District of Columbia,705749,3892,185,0.551471,0.026213
Delaware,973764,4151,125,0.426284,0.012837
Michigan,9986857,37056,3367,0.371048,0.033714
Illinois,12671821,45545,1983,0.35942,0.015649


In [10]:
# The 10 states with the most deaths per 100 residents, largest first
cases_by_state.sort_values(by="Deaths_per_100", ascending=False).iloc[:10]

Unnamed: 0_level_0,Population,Confirmed,Deaths,Confirmed_per_100,Deaths_per_100
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
New York,19453561,291994,22666,1.50098,0.116513
New Jersey,8882190,110370,6044,1.242599,0.068046
Connecticut,3565287,25513,2011,0.715595,0.056405
Massachusetts,6892503,55642,2994,0.807283,0.043439
Louisiana,4648794,27004,1697,0.580882,0.036504
Michigan,9986857,37056,3367,0.371048,0.033714
District of Columbia,705749,3892,185,0.551471,0.026213
Illinois,12671821,45545,1983,0.35942,0.015649
Pennsylvania,12801989,43558,1886,0.340244,0.014732
Maryland,6045680,19487,858,0.322329,0.014192


In [11]:
# The 20 counties with the most confirmed cases, largest first
cases_by_county.sort_values(by="Confirmed", ascending=False).iloc[:20]

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
36047,2020-04-27,New York,Kings,2559903,49279,5378,0,False,False,False,1.925034,0.210086
36081,2020-04-27,New York,Queens,2253858,43388,4735,0,False,False,False,1.925055,0.210084
36059,2020-04-27,New York,Nassau,1356924,34865,1620,0,False,False,False,2.569414,0.119388
36103,2020-04-27,New York,Suffolk,1476601,32470,1102,0,False,False,False,2.198969,0.074631
17031,2020-04-27,Illinois,Cook,5150233,31953,1347,0,False,False,False,0.620419,0.026154
36061,2020-04-27,New York,New York,1628706,31353,3421,0,False,False,False,1.925025,0.210044
36119,2020-04-27,New York,Westchester,967506,28007,962,0,False,False,False,2.894762,0.099431
36005,2020-04-27,New York,Bronx,1418207,27301,2979,0,False,False,False,1.925036,0.210054
6037,2020-04-27,California,Los Angeles,10039107,20423,944,0,False,False,False,0.203434,0.009403
26163,2020-04-27,Michigan,Wayne,1749343,15872,1622,0,False,False,False,0.907312,0.092721


In [12]:
# The 20 counties with the most confirmed cases per 100 residents, largest first
cases_by_county.sort_values(by="Confirmed_per_100", ascending=False).iloc[:20]

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5079,2020-04-27,Arkansas,Lincoln,13024,754,0,0,False,False,False,5.789312,0.0
47007,2020-04-27,Tennessee,Bledsoe,15064,588,0,0,False,False,False,3.903346,0.0
36087,2020-04-27,New York,Rockland,325789,11366,481,0,False,False,False,3.488761,0.147642
39101,2020-04-27,Ohio,Marion,65093,2188,4,0,False,False,False,3.361345,0.006145
39129,2020-04-27,Ohio,Pickaway,58457,1698,5,0,False,False,False,2.904699,0.008553
36119,2020-04-27,New York,Westchester,967506,28007,962,0,False,False,False,2.894762,0.099431
18017,2020-04-27,Indiana,Cass,37689,1025,1,0,False,False,False,2.719626,0.002653
36059,2020-04-27,New York,Nassau,1356924,34865,1620,0,False,False,False,2.569414,0.119388
19115,2020-04-27,Iowa,Louisa,11035,267,2,0,False,False,False,2.419574,0.018124
13243,2020-04-27,Georgia,Randolph,6778,156,19,0,False,False,False,2.301564,0.280319


In [13]:
# The 20 counties with the most *deaths* per 100 residents, largest first
cases_by_county.sort_values(by="Deaths_per_100", ascending=False).iloc[:20]

Unnamed: 0_level_0,Date,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_Outlier,Deaths_Outlier,Recovered_Outlier,Confirmed_per_100,Deaths_per_100
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
13243,2020-04-27,Georgia,Randolph,6778,156,19,0,False,False,False,2.301564,0.280319
13273,2020-04-27,Georgia,Terrell,8531,167,18,0,False,False,False,1.957567,0.210995
36047,2020-04-27,New York,Kings,2559903,49279,5378,0,False,False,False,1.925034,0.210086
36081,2020-04-27,New York,Queens,2253858,43388,4735,0,False,False,False,1.925055,0.210084
36005,2020-04-27,New York,Bronx,1418207,27301,2979,0,False,False,False,1.925036,0.210054
36061,2020-04-27,New York,New York,1628706,31353,3421,0,False,False,False,1.925025,0.210044
36085,2020-04-27,New York,Richmond,476143,9166,1000,0,False,False,False,1.925052,0.210021
22095,2020-04-27,Louisiana,St. John the Baptist,42837,751,64,0,False,False,False,1.753157,0.149404
36087,2020-04-27,New York,Rockland,325789,11366,481,0,False,False,False,3.488761,0.147642
13099,2020-04-27,Georgia,Early,10190,202,15,0,False,False,False,1.982336,0.147203
