# etl_us_data.ipynb

Acquire a copy of the latest COVID-19 time series data and write that
data out into local CSV files in a format amenable to analysis with 
Pandas dataframes.

Input data sources:
* Primary data source: [2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE](https://github.com/CSSEGISandData/COVID-19).
* Secondary data source (for values missing from primary source): [New York Times "Coronavirus (Covid-19) Data in the United States" repository](https://github.com/nytimes/covid-19-data).
* Secondary data source (for values missing from primary source): [USAFacts Coronavirus Stats & Data](https://usafacts.org/issues/coronavirus/)

Output files produced:
* `outputs/us_counties.csv`: County-level time series data for the United States
* `outputs/us_counties_meta.json`: Column type metadata for reading `outputs/us_counties.csv` with `pd.read_csv()`

**Note:** You can redirect these output files by setting the environment variable `COVID_OUTPUTS_DIR` to a replacement for the prefix `outputs` in the above paths.

To read these files back in, use `pd.read_csv()`:
```python
with open("outputs/us_counties_meta.json") as f:
    cases_meta = json.load(f)
cases_meta["Date"] = "object"  # Workaround for pd.read_csv() not supporting parsing datetime64
cases_raw = pd.read_csv("outputs/us_counties.csv", dtype=cases_meta, parse_dates=["Date"])
cases = cases_raw.set_index(["FIPS", "Date"], verify_integrity=True)
```


In [1]:
# Initialization boilerplate

# Import Python packages that this notebook uses.
import os
import numpy as np
import pandas as pd
from urllib.request import urlopen
import json
from datetime import datetime, date

# Local file of utility functions
import util

# Output directory: "outputs" unless COVID_OUTPUTS_DIR is set in the environment.
_OUTPUTS_DIR = os.getenv("COVID_OUTPUTS_DIR", "outputs")
util.ensure_dir_exists(_OUTPUTS_DIR)  # create if necessary

# Johns Hopkins CSSE time series, downloaded directly from GitHub.
_JH_BASE_URL = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
)
_JH_CONFIRMED_URL = f"{_JH_BASE_URL}time_series_covid19_confirmed_US.csv"
_JH_DEATHS_URL = f"{_JH_BASE_URL}time_series_covid19_deaths_US.csv"

# Currently there are no data on recovered patients for the US.
# _JH_RECOVERED_URL = _JH_BASE_URL + "time_series_covid19_recovered_US.csv"

# New York Times county-level data, also downloaded from GitHub.
_NYT_BASE_URL = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/"
_NYT_CSV_URL = f"{_NYT_BASE_URL}us-counties.csv"

# USAFacts confirmed-case and death tables.
_USAFACTS_BASE_URL = "https://usafactsstatic.blob.core.windows.net/public/data/covid-19"
_USAFACTS_CONFIRMED_URL = f"{_USAFACTS_BASE_URL}/covid_confirmed_usafacts.csv"
_USAFACTS_DEATHS_URL = f"{_USAFACTS_BASE_URL}/covid_deaths_usafacts.csv"

# Name of the first date column in the wide-format files, and the format used
# to parse those column names. Hopefully this won't change as new data are
# added.
# NOTE: One file spells the first date "01/22/20" and the other "1/22/20",
# so match column names with endswith() rather than equality.
_FIRST_DATE_SUFFIX = "1/22/20"
_DATE_FORMAT = "%m/%d/%y"

# Johns Hopkins data

We pull the latest Johns Hopkins data from Github.

In [2]:
raw_confirmed = pd.read_csv(_JH_CONFIRMED_URL)
raw_deaths = pd.read_csv(_JH_DEATHS_URL)

# There is no "recovered" time series at the moment. Generate an all-zero
# placeholder time series that has the same schema as the deaths table.
raw_recovered = raw_deaths.copy(deep=True)

# Find the first date column; every column from there on holds time series
# values. Fail with a clear message (instead of a NameError on the next
# statement) if the upstream file layout ever changes.
ts_start_index = next(
    (i for i, name in enumerate(raw_recovered.columns)
     if str(name).endswith(_FIRST_DATE_SUFFIX)),
    None)
if ts_start_index is None:
    raise ValueError(
        f"No column ending in '{_FIRST_DATE_SUFFIX}' found in the deaths data; "
        f"cannot locate the start of the time series")

# Zero out every time series column of the placeholder table.
for c in raw_recovered.columns[ts_start_index:]:
    raw_recovered[c] = 0

raw_confirmed

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,9/7/20,9/8/20,9/9/20,9/10/20,9/11/20,9/12/20,9/13/20,9/14/20,9/15/20,9/16/20
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,1383,1385,1398,1413,1420,1432,1442,1447,1585,1619
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.727750,-87.722071,...,4586,4609,4639,4678,4722,4752,4781,4800,4978,5003
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,617,617,618,618,618,620,622,626,801,809
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,557,562,564,566,574,576,578,581,610,612
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,1070,1071,1082,1099,1109,1114,1121,1128,1464,1487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3335,84056039,US,USA,840,56039.0,Teton,Wyoming,US,43.935225,-110.589080,...,447,450,451,453,462,465,470,476,478,481
3336,84056041,US,USA,840,56041.0,Uinta,Wyoming,US,41.287818,-110.547578,...,308,309,308,310,310,310,311,312,312,323
3337,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.000000,0.000000,...,0,0,0,0,0,0,2,2,2,0
3338,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,111,111,111,111,111,111,111,111,111,111


In [3]:
# Some rows have no FIPS code at all (NaN); show them:
jh_missing_fips = raw_confirmed["FIPS"].isna()
raw_confirmed.loc[jh_missing_fips]

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,9/7/20,9/8/20,9/9/20,9/10/20,9/11/20,9/12/20,9/13/20,9/14/20,9/15/20,9/16/20
1267,84070002,US,USA,840,,Dukes and Nantucket,Massachusetts,US,41.406747,-70.687635,...,101,103,104,111,117,122,129,132,134,139
1304,84070005,US,USA,840,,Federal Correctional Institution (FCI),Michigan,US,0.0,0.0,...,196,196,196,196,196,196,196,196,196,196
1336,84070004,US,USA,840,,Michigan Department of Corrections (MDOC),Michigan,US,0.0,0.0,...,5469,5468,5468,5472,5504,5504,5504,5534,5535,5535
1591,84070003,US,USA,840,,Kansas City,Missouri,US,39.0997,-94.5786,...,9808,9842,9852,9932,10026,10088,10146,10219,10250,10232
2954,84070015,US,USA,840,,Bear River,Utah,US,41.521068,-113.083282,...,2683,2698,2705,2727,2767,2802,2826,2852,2885,2923
2959,84070016,US,USA,840,,Central Utah,Utah,US,39.372319,-111.575868,...,533,536,541,543,548,551,553,559,563,567
2978,84070017,US,USA,840,,Southeast Utah,Utah,US,38.996171,-110.701396,...,160,162,162,162,163,166,166,166,166,167
2979,84070018,US,USA,840,,Southwest Utah,Utah,US,37.854472,-111.441876,...,3749,3756,3766,3774,3808,3831,3838,3852,3880,3903
2982,84070019,US,USA,840,,TriCounty,Utah,US,40.124915,-109.517442,...,217,218,219,221,225,225,226,226,227,229
2990,84070020,US,USA,840,,Weber-Morgan,Utah,US,41.27116,-111.914512,...,3564,3575,3607,3631,3662,3712,3749,3772,3805,3842


## Filter the Johns Hopkins Data down to U.S. county-level statistics only.

Filter out the locations without county FIPS codes, since they don't
align properly with the county-level metadata this data set will be
joined with, and the number of cases involved is relatively small.

In [4]:
# Keep only the rows that carry a FIPS code.
raw_confirmed = raw_confirmed[raw_confirmed["FIPS"].notna()].copy()
raw_deaths = raw_deaths[raw_deaths["FIPS"].notna()].copy()
raw_recovered = raw_recovered[raw_recovered["FIPS"].notna()].copy()

# The FIPS codes were loaded as floating point numbers (probably because of
# the NaNs that were just removed). Re-encode them as nullable integers.
for jh_frame in (raw_confirmed, raw_deaths, raw_recovered):
    jh_frame["FIPS"] = jh_frame["FIPS"].astype("Int64")

raw_confirmed

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,9/7/20,9/8/20,9/9/20,9/10/20,9/11/20,9/12/20,9/13/20,9/14/20,9/15/20,9/16/20
0,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,...,1383,1385,1398,1413,1420,1432,1442,1447,1585,1619
1,84001003,US,USA,840,1003,Baldwin,Alabama,US,30.727750,-87.722071,...,4586,4609,4639,4678,4722,4752,4781,4800,4978,5003
2,84001005,US,USA,840,1005,Barbour,Alabama,US,31.868263,-85.387129,...,617,617,618,618,618,620,622,626,801,809
3,84001007,US,USA,840,1007,Bibb,Alabama,US,32.996421,-87.125115,...,557,562,564,566,574,576,578,581,610,612
4,84001009,US,USA,840,1009,Blount,Alabama,US,33.982109,-86.567906,...,1070,1071,1082,1099,1109,1114,1121,1128,1464,1487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3335,84056039,US,USA,840,56039,Teton,Wyoming,US,43.935225,-110.589080,...,447,450,451,453,462,465,470,476,478,481
3336,84056041,US,USA,840,56041,Uinta,Wyoming,US,41.287818,-110.547578,...,308,309,308,310,310,310,311,312,312,323
3337,84090056,US,USA,840,90056,Unassigned,Wyoming,US,0.000000,0.000000,...,0,0,0,0,0,0,2,2,2,0
3338,84056043,US,USA,840,56043,Washakie,Wyoming,US,43.904516,-107.680187,...,111,111,111,111,111,111,111,111,111,111


Remove locations that have FIPS codes but are not U.S. counties.

In [5]:
def counties_df(df):
    """
    Select the rows of a Johns Hopkins table that describe U.S. counties.

    :param df: Dataframe with "FIPS" and "Admin2" columns.

    :returns: A copy of the rows of `df` whose FIPS code lies between 1000
     and 60000 (inclusive) and whose "Admin2" value is present and is not
     "Unassigned".
    """
    fips = df["FIPS"]
    admin2 = df["Admin2"]

    # Territories have FIPS codes < 1000; expatriates are coded by state
    # in values > 80k.
    in_county_range = (fips >= 1000) & (fips <= 60000)

    # Countries don't have the "Admin2" field set, and states have it set
    # to "Unassigned".
    has_county_name = admin2.notna() & (admin2 != "Unassigned")

    return df[in_county_range & has_county_name].copy()

# Apply the county filter to each of the three Johns Hopkins tables.
county_confirmed, county_deaths, county_recovered = (
    counties_df(jh_table)
    for jh_table in (raw_confirmed, raw_deaths, raw_recovered)
)

county_confirmed

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,9/7/20,9/8/20,9/9/20,9/10/20,9/11/20,9/12/20,9/13/20,9/14/20,9/15/20,9/16/20
0,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,...,1383,1385,1398,1413,1420,1432,1442,1447,1585,1619
1,84001003,US,USA,840,1003,Baldwin,Alabama,US,30.727750,-87.722071,...,4586,4609,4639,4678,4722,4752,4781,4800,4978,5003
2,84001005,US,USA,840,1005,Barbour,Alabama,US,31.868263,-85.387129,...,617,617,618,618,618,620,622,626,801,809
3,84001007,US,USA,840,1007,Bibb,Alabama,US,32.996421,-87.125115,...,557,562,564,566,574,576,578,581,610,612
4,84001009,US,USA,840,1009,Blount,Alabama,US,33.982109,-86.567906,...,1070,1071,1082,1099,1109,1114,1121,1128,1464,1487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3334,84056037,US,USA,840,56037,Sweetwater,Wyoming,US,41.659439,-108.882788,...,309,314,316,319,320,320,318,317,317,317
3335,84056039,US,USA,840,56039,Teton,Wyoming,US,43.935225,-110.589080,...,447,450,451,453,462,465,470,476,478,481
3336,84056041,US,USA,840,56041,Uinta,Wyoming,US,41.287818,-110.547578,...,308,309,308,310,310,310,311,312,312,323
3338,84056043,US,USA,840,56043,Washakie,Wyoming,US,43.904516,-107.680187,...,111,111,111,111,111,111,111,111,111,111


## Rearrange the Johns Hopkins data into vertical time series

The time series in the raw data are spread across multiple columns. Rotate them by 90 degrees so that they are spread across rows.

In [6]:
def shred_time_series(df: pd.DataFrame, colname: str,
                      first_date_suffix=None, date_format=None):
    """
    Turn a time series encoded as a range of columns into a time series
    encoded as a range of rows.

    The time series is assumed to occupy every column from the first column
    whose name ends with the first-date suffix through the last column of
    `df`; all columns before that are treated as per-location metadata and
    are repeated on every output row.

    :param df: Dataframe with one location per row and one column per date,
     with the date columns at the end.
    :param colname: Name of the new column where the time series should go
    :param first_date_suffix: Suffix that identifies the first date column.
     Defaults to the notebook-level `_FIRST_DATE_SUFFIX`.
    :param date_format: `strptime` format of the date column names.
     Defaults to the notebook-level `_DATE_FORMAT`.

    :returns: A dataframe with one time series element per row.
     The returned dataframe will have a column called "Date" with the date
     of each time series element, and a column with the name `colname` with
     the associated value for each date. Row labels of `df` are repeated,
     once per date.

    :raises ValueError: If no column name ends with the first-date suffix.
    """
    suffix = _FIRST_DATE_SUFFIX if first_date_suffix is None else first_date_suffix
    fmt = _DATE_FORMAT if date_format is None else date_format

    # Use endswith() because some files zero-pad the month ("01/22/20")
    # and others don't ("1/22/20").
    ts_start_index = next(
        (i for i, name in enumerate(df.columns) if str(name).endswith(suffix)),
        None)
    if ts_start_index is None:
        raise ValueError(
            f"No column name ends with {suffix!r}; "
            f"cannot locate the start of the time series")

    ts_columns = df.columns[ts_start_index:]
    ts_lists = df[ts_columns].values.tolist()
    date_list = [datetime.strptime(s, fmt) for s in ts_columns]

    # Create a new dataframe where the time series is a list
    nested_df = df[df.columns[:ts_start_index]].copy()
    nested_df[colname] = ts_lists

    # Expand out the list and add the dates back. explode() emits the
    # elements of each row's list in order, so repeating the date list once
    # per input row lines the dates up with the values.
    flat_df = nested_df.explode(colname)
    flat_df["Date"] = date_list * len(nested_df.index)
    return flat_df

# Convert each wide Johns Hopkins table into one row per (county, date).
shredded_confirmed, shredded_deaths, shredded_recovered = (
    shred_time_series(county_table, value_name)
    for county_table, value_name in (
        (county_confirmed, "Confirmed"),
        (county_deaths, "Deaths"),
        (county_recovered, "Recovered"),
    )
)
shredded_confirmed

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Confirmed,Date
0,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-22
0,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-23
0,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-24
0,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-25
0,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3339,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",23,2020-09-12
3339,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",23,2020-09-13
3339,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",23,2020-09-14
3339,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",23,2020-09-15


## Merge the filtered and reformatted Johns Hopkins data into a single DataFrame

In [7]:
# Sort by FIPS code and Date and clean up some columns that don't match
# perfectly.
sorted_deaths = shredded_deaths.sort_values(["FIPS", "Date"])
sorted_confirmed = shredded_confirmed.sort_values(["FIPS", "Date"])
sorted_recovered = shredded_recovered.sort_values(["FIPS", "Date"])

# NOTE(review): the column assignments below copy values between frames by
# row label; they appear to rely on the three sorted frames carrying the same
# row labels in the same order -- confirm if the upstream files diverge.

# The "confirmed" time series is missing the "population" column that is
# present in the "deaths" and "recovered" time series.
# Add it back in.
sorted_confirmed["Population"] = sorted_deaths["Population"]

# The floating point numbers in the "Lat" and "Long_" fields also have
# some discrepancies due to rounding error. Use the values in the
# "confirmed" time series as the gold standard.
sorted_deaths["Lat"] = sorted_confirmed["Lat"]
sorted_deaths["Long_"] = sorted_confirmed["Long_"]
sorted_recovered["Lat"] = sorted_confirmed["Lat"]
sorted_recovered["Long_"] = sorted_confirmed["Long_"]

# Now we can combine the three time series into a single table.
# No `on=` is given, so each merge joins on every column the two frames have
# in common; that is why the shared metadata columns were made identical
# above.
combined = (
    sorted_confirmed
    .merge(sorted_deaths, how="outer")
    .merge(sorted_recovered, how="outer"))

# Check for missing data in every value column. An outer join leaves NaNs
# wherever one input has a (county, date) row that another lacks, and the
# integer casts applied later cannot represent NaN, so report the problem
# here with the offending rows.
for value_col in ("Confirmed", "Deaths", "Recovered"):
    missing_rows = combined[combined[value_col].isna()]
    if len(missing_rows.index) > 0:
        raise ValueError(f"Missing '{value_col}' time series data for the following rows:\n{missing_rows}")

combined

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Confirmed,Date,Population,Deaths,Recovered
0,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-22,55869,0,0
1,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-23,55869,0,0
2,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-24,55869,0,0
3,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-25,55869,0,0
4,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-26,55869,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750933,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",23,2020-09-12,6927,0,0
750934,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",23,2020-09-13,6927,0,0
750935,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",23,2020-09-14,6927,0,0
750936,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",23,2020-09-15,6927,0,0


## Adjust data types and names of the columns in the merged DataFrame

In [8]:
# The time series value columns ("Confirmed", "Deaths", "Recovered") arrive
# here as generic object columns instead of integers (see the listing below).
# The next cell fixes that.
# NOTE(review): this was originally attributed to the outer joins in the
# previous cell; DataFrame.explode() is also documented to produce object
# dtype, so the conversion may already happen in shred_time_series() -- confirm.

# Data types before:
combined.dtypes

UID                        int64
iso2                      object
iso3                      object
code3                      int64
FIPS                       Int64
Admin2                    object
Province_State            object
Country_Region            object
Lat                      float64
Long_                    float64
Combined_Key              object
Confirmed                 object
Date              datetime64[ns]
Population                 int64
Deaths                    object
Recovered                 object
dtype: object

In [9]:
# Text columns: switch from generic "object" to the Pandas string type.
for text_col in ["iso2", "iso3", "Admin2", "Province_State",
                 "Country_Region", "Combined_Key"]:
    combined[text_col] = combined[text_col].astype("string")

# Count columns: convert to 64-bit integers.
for count_col in ["Confirmed", "Deaths", "Recovered"]:
    combined[count_col] = combined[count_col].astype(np.int64)

# Data types after:
combined.dtypes

UID                        int64
iso2                      string
iso3                      string
code3                      int64
FIPS                       Int64
Admin2                    string
Province_State            string
Country_Region            string
Lat                      float64
Long_                    float64
Combined_Key              string
Confirmed                  int64
Date              datetime64[ns]
Population                 int64
Deaths                     int64
Recovered                  int64
dtype: object

In [10]:
# Keep only the columns needed downstream, then give two of them
# friendlier names.
retained_columns = [
    "Date", "FIPS", "Province_State", "Admin2",
    "Population",
    "Confirmed", "Deaths", "Recovered",
]
to_retain = combined[retained_columns]
renamed = to_retain.rename(
    columns={"Province_State": "State", "Admin2": "County"})
renamed

Unnamed: 0,Date,FIPS,State,County,Population,Confirmed,Deaths,Recovered
0,2020-01-22,1001,Alabama,Autauga,55869,0,0,0
1,2020-01-23,1001,Alabama,Autauga,55869,0,0,0
2,2020-01-24,1001,Alabama,Autauga,55869,0,0,0
3,2020-01-25,1001,Alabama,Autauga,55869,0,0,0
4,2020-01-26,1001,Alabama,Autauga,55869,0,0,0
...,...,...,...,...,...,...,...,...
750933,2020-09-12,56045,Wyoming,Weston,6927,23,0,0
750934,2020-09-13,56045,Wyoming,Weston,6927,23,0,0
750935,2020-09-14,56045,Wyoming,Weston,6927,23,0,0
750936,2020-09-15,56045,Wyoming,Weston,6927,23,0,0


# New York Times data

Pull in additional data from the New York Times' data repository to use
for filling in holes in the primary JHU data set.

In [11]:
# Download the New York Times county-level CSV, parsing its "date" column
# into datetime values, then display the raw table.
raw_nyt = pd.read_csv(_NYT_CSV_URL, parse_dates=["date"])
raw_nyt

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0
...,...,...,...,...,...,...
540946,2020-09-16,Sweetwater,Wyoming,56037.0,317,2
540947,2020-09-16,Teton,Wyoming,56039.0,481,1
540948,2020-09-16,Uinta,Wyoming,56041.0,323,2
540949,2020-09-16,Washakie,Wyoming,56043.0,110,6


## Filter the New York Times data down to U.S. county-level statistics only.

In [12]:
# The NY Times data also has some locations whose FIPS code is NaN.
# List each such (county, state) pair once:
nyt_missing_fips = raw_nyt["fips"].isna()
raw_nyt.loc[nyt_missing_fips, ["county", "state"]].drop_duplicates()

Unnamed: 0,county,state
416,New York City,New York
418,Unknown,Rhode Island
1511,Unknown,New Jersey
1858,Unknown,Puerto Rico
2267,Unknown,Virgin Islands
2422,Unknown,Guam
2929,Unknown,Maine
2950,Unknown,Massachusetts
4003,Unknown,Louisiana
4680,Unknown,Kentucky


In [13]:
# For now, drop the rows with NaN FIPS codes, as was done above for the
# JHU data set, and store the remaining codes as plain integers.
raw_nyt = raw_nyt[raw_nyt["fips"].notna()].copy()
raw_nyt["fips"] = raw_nyt["fips"].astype("int64")

# Also cast the int-valued count columns to nullable int.
for nyt_count_col in ("cases", "deaths"):
    raw_nyt[nyt_count_col] = raw_nyt[nyt_count_col].astype("Int64")

# Rename the columns in preparation for joining with the primary
# data set. The value columns get an "_NYT" suffix.
nyt_column_names = {
    "date": "Date",
    "county": "County",
    "state": "State",
    "fips": "FIPS",
    "cases": "Confirmed_NYT",
    "deaths": "Deaths_NYT",
}
nyt = raw_nyt.copy().rename(columns=nyt_column_names)
nyt

Unnamed: 0,Date,County,State,FIPS,Confirmed_NYT,Deaths_NYT
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0
...,...,...,...,...,...,...
540946,2020-09-16,Sweetwater,Wyoming,56037,317,2
540947,2020-09-16,Teton,Wyoming,56039,481,1
540948,2020-09-16,Uinta,Wyoming,56041,323,2
540949,2020-09-16,Washakie,Wyoming,56043,110,6


In [14]:
# Report how many distinct FIPS codes and dates the primary (Johns Hopkins)
# and secondary (New York Times) data sets contain.
for data_set_label, data_set in (("Primary", renamed), ("Secondary", nyt)):
    num_counties = len(data_set["FIPS"].unique())
    num_dates = len(data_set["Date"].unique())
    print(f"{data_set_label} data set has {num_counties} counties "
          f"and {num_dates} dates.")

Primary data set has 3142 counties and 239 dates.
Secondary data set has 3206 counties and 240 dates.


# USAFacts data

We pull the latest data from the USAFacts CDN. 

The format of this data is very similar to that of the Johns Hopkins data, so the 
processing here is also similar.

In [15]:
# Download the USAFacts confirmed-case and death tables, then display the
# confirmed-case table.
raw_confirmed_usafacts = pd.read_csv(_USAFACTS_CONFIRMED_URL)
raw_deaths_usafacts = pd.read_csv(_USAFACTS_DEATHS_URL)

raw_confirmed_usafacts

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/7/20,9/8/20,9/9/20,9/10/20,9/11/20,9/12/20,9/13/20,9/14/20,9/15/20,9/16/20
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,1504,1508,1522,1544,1551,1565,1576,1585,1601,1619
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,4730,4757,4787,4833,4886,4922,4959,4978,4992,5003
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,778,778,778,785,786,792,794,801,806,809
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,583,589,591,594,602,604,607,610,611,612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3190,56037,Sweetwater County,WY,56,0,0,0,0,0,0,...,309,314,316,317,317,317,317,317,317,317
3191,56039,Teton County,WY,56,0,0,0,0,0,0,...,447,450,451,453,462,465,470,476,478,481
3192,56041,Uinta County,WY,56,0,0,0,0,0,0,...,308,308,308,310,310,310,311,312,312,323
3193,56043,Washakie County,WY,56,0,0,0,0,0,0,...,111,111,111,111,111,111,111,111,111,111


## Filter the USAFacts Data down to U.S. county-level statistics only.

Filter out the rows whose `countyFIPS` value is below 1000 (for example, the
"Statewide Unallocated" rows), since they don't align properly with the
county-level metadata this data set will be joined with, and the number of
cases involved is relatively small.

In [16]:
def counties_df(df):
    """
    Select the county-level rows of a USAFacts table.

    NOTE: this rebinds the name `counties_df`, which an earlier cell defines
    for the Johns Hopkins column layout. After this cell runs, the earlier
    cell's call sites need the earlier definition to be re-run first.

    :param df: Dataframe with a "countyFIPS" column.

    :returns: A copy of the rows of `df` whose "countyFIPS" is at least 1000.
    """
    # Territories and statewide remainders have FIPS codes < 1000
    is_county = df["countyFIPS"] >= 1000
    return df[is_county].copy()

# Apply the county filter to both USAFacts tables.
county_confirmed_usafacts, county_deaths_usafacts = (
    counties_df(usafacts_table)
    for usafacts_table in (raw_confirmed_usafacts, raw_deaths_usafacts)
)

county_confirmed_usafacts

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,9/7/20,9/8/20,9/9/20,9/10/20,9/11/20,9/12/20,9/13/20,9/14/20,9/15/20,9/16/20
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,1504,1508,1522,1544,1551,1565,1576,1585,1601,1619
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,4730,4757,4787,4833,4886,4922,4959,4978,4992,5003
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,778,778,778,785,786,792,794,801,806,809
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,583,589,591,594,602,604,607,610,611,612
5,1009,Blount County,AL,1,0,0,0,0,0,0,...,1384,1390,1401,1430,1441,1446,1453,1464,1475,1487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3190,56037,Sweetwater County,WY,56,0,0,0,0,0,0,...,309,314,316,317,317,317,317,317,317,317
3191,56039,Teton County,WY,56,0,0,0,0,0,0,...,447,450,451,453,462,465,470,476,478,481
3192,56041,Uinta County,WY,56,0,0,0,0,0,0,...,308,308,308,310,310,310,311,312,312,323
3193,56043,Washakie County,WY,56,0,0,0,0,0,0,...,111,111,111,111,111,111,111,111,111,111


## Rearrange the USAFacts data into vertical time series

The time series in the raw data are spread across multiple columns. Rotate them by 90 degrees so that they are spread across rows.

In [17]:
# Convert each wide USAFacts table into one row per (county, date).
shredded_confirmed_usafacts, shredded_deaths_usafacts = (
    shred_time_series(county_table, value_name)
    for county_table, value_name in (
        (county_confirmed_usafacts, "Confirmed"),
        (county_deaths_usafacts, "Deaths"),
    )
)
shredded_confirmed_usafacts

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,Confirmed,Date
1,1001,Autauga County,AL,1,0,2020-01-22
1,1001,Autauga County,AL,1,0,2020-01-23
1,1001,Autauga County,AL,1,0,2020-01-24
1,1001,Autauga County,AL,1,0,2020-01-25
1,1001,Autauga County,AL,1,0,2020-01-26
...,...,...,...,...,...,...
3194,56045,Weston County,WY,56,23,2020-09-12
3194,56045,Weston County,WY,56,23,2020-09-13
3194,56045,Weston County,WY,56,23,2020-09-14
3194,56045,Weston County,WY,56,23,2020-09-15


In [18]:
# Spot check: the deaths time series for countyFIPS 8014, which the deaths
# file labels "Broomfield County" (see output below).
shredded_deaths_usafacts[shredded_deaths_usafacts["countyFIPS"] == 8014]

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,Deaths,Date
259,8014,Broomfield County,CO,8,0,2020-01-22
259,8014,Broomfield County,CO,8,0,2020-01-23
259,8014,Broomfield County,CO,8,0,2020-01-24
259,8014,Broomfield County,CO,8,0,2020-01-25
259,8014,Broomfield County,CO,8,0,2020-01-26
...,...,...,...,...,...,...
259,8014,Broomfield County,CO,8,33,2020-09-12
259,8014,Broomfield County,CO,8,33,2020-09-13
259,8014,Broomfield County,CO,8,33,2020-09-14
259,8014,Broomfield County,CO,8,33,2020-09-15


In [19]:
# Spot check: the confirmed-case time series for countyFIPS 8014, which the
# confirmed file labels "Broomfield County and City" (see output below).
# County names are therefore not a reliable join key for these tables.
shredded_confirmed_usafacts[shredded_confirmed_usafacts["countyFIPS"] == 8014]

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,Confirmed,Date
259,8014,Broomfield County and City,CO,8,0,2020-01-22
259,8014,Broomfield County and City,CO,8,0,2020-01-23
259,8014,Broomfield County and City,CO,8,0,2020-01-24
259,8014,Broomfield County and City,CO,8,0,2020-01-25
259,8014,Broomfield County and City,CO,8,0,2020-01-26
...,...,...,...,...,...,...
259,8014,Broomfield County and City,CO,8,564,2020-09-12
259,8014,Broomfield County and City,CO,8,570,2020-09-13
259,8014,Broomfield County and City,CO,8,571,2020-09-14
259,8014,Broomfield County and City,CO,8,572,2020-09-15


## Merge the filtered and reformatted USAFacts data into a single DataFrame

In [20]:
# Both tables are sorted by, and then joined on, county FIPS code and date.
usafacts_keys = ["countyFIPS", "Date"]
sorted_deaths_usafacts = shredded_deaths_usafacts.sort_values(usafacts_keys)
sorted_confirmed_usafacts = shredded_confirmed_usafacts.sort_values(usafacts_keys)

# Combine the two time series into a single table. Only the key columns and
# the "Deaths" values are taken from the deaths table.
usafacts_deaths_values = sorted_deaths_usafacts[usafacts_keys + ["Deaths"]]
combined_usafacts = sorted_confirmed_usafacts.merge(
    usafacts_deaths_values, how="outer", on=usafacts_keys)

combined_usafacts

Unnamed: 0,countyFIPS,County Name,State,stateFIPS,Confirmed,Date,Deaths
0,1001,Autauga County,AL,1,0,2020-01-22,0
1,1001,Autauga County,AL,1,0,2020-01-23,0
2,1001,Autauga County,AL,1,0,2020-01-24,0
3,1001,Autauga County,AL,1,0,2020-01-25,0
4,1001,Autauga County,AL,1,0,2020-01-26,0
...,...,...,...,...,...,...,...
751411,56045,Weston County,WY,56,23,2020-09-12,0
751412,56045,Weston County,WY,56,23,2020-09-13,0
751413,56045,Weston County,WY,56,23,2020-09-14,0
751414,56045,Weston County,WY,56,23,2020-09-15,0


## Adjust data types and names of the columns in the merged DataFrame

In [21]:
# Data types before the conversions in the next cell. "Confirmed" and
# "Deaths" show up as generic object columns here (see the listing below):
combined_usafacts.dtypes

countyFIPS              int64
County Name            object
State                  object
stateFIPS               int64
Confirmed              object
Date           datetime64[ns]
Deaths                 object
dtype: object

In [22]:
# Encode strings with the Pandas string type
for usafacts_text_col in ("County Name", "State"):
    combined_usafacts[usafacts_text_col] = (
        combined_usafacts[usafacts_text_col].astype("string"))

# Encode integer fields containing NaNs using the Pandas nullable int type
for usafacts_count_col in ("Confirmed", "Deaths"):
    combined_usafacts[usafacts_count_col] = (
        combined_usafacts[usafacts_count_col].astype("Int64"))

# Data types after:
combined_usafacts.dtypes

countyFIPS              int64
County Name            string
State                  string
stateFIPS               int64
Confirmed               Int64
Date           datetime64[ns]
Deaths                  Int64
dtype: object

In [23]:
# Keep only the columns needed downstream, then rename them to match the
# primary data set. The value columns get a "_USAFacts" suffix.
usafacts_retained_columns = [
    "Date", "countyFIPS", "County Name",
    "State", "Confirmed", "Deaths",
]
usafacts_column_names = {
    "countyFIPS": "FIPS",
    "County Name": "County",
    "Confirmed": "Confirmed_USAFacts",
    "Deaths": "Deaths_USAFacts",
}
to_retain_usafacts = combined_usafacts[usafacts_retained_columns]
renamed_usafacts = to_retain_usafacts.rename(columns=usafacts_column_names)
renamed_usafacts

Unnamed: 0,Date,FIPS,County,State,Confirmed_USAFacts,Deaths_USAFacts
0,2020-01-22,1001,Autauga County,AL,0,0
1,2020-01-23,1001,Autauga County,AL,0,0
2,2020-01-24,1001,Autauga County,AL,0,0
3,2020-01-25,1001,Autauga County,AL,0,0
4,2020-01-26,1001,Autauga County,AL,0,0
...,...,...,...,...,...,...
751411,2020-09-12,56045,Weston County,WY,23,0
751412,2020-09-13,56045,Weston County,WY,23,0
751413,2020-09-14,56045,Weston County,WY,23,0
751414,2020-09-15,56045,Weston County,WY,23,0


# Merge the three data sets into a single DataFrame

In [24]:
# Left-join the two secondary data sets onto the primary (Johns Hopkins)
# data. Every row of the primary data set is kept; (Date, FIPS) pairs that
# exist only in a secondary source are dropped, and secondary values are
# missing wherever a source has no row for a primary (Date, FIPS) pair.
join_keys = ["Date", "FIPS"]
to_write = (
    renamed
    # Don't use the county names, because they differ across data sets
    .merge(nyt[join_keys + ["Confirmed_NYT", "Deaths_NYT"]],
           on=join_keys, how="left")
    .merge(renamed_usafacts[join_keys + ["Confirmed_USAFacts", "Deaths_USAFacts"]],
           on=join_keys, how="left")
)
to_write

Unnamed: 0,Date,FIPS,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_NYT,Deaths_NYT,Confirmed_USAFacts,Deaths_USAFacts
0,2020-01-22,1001,Alabama,Autauga,55869,0,0,0,,,0,0
1,2020-01-23,1001,Alabama,Autauga,55869,0,0,0,,,0,0
2,2020-01-24,1001,Alabama,Autauga,55869,0,0,0,,,0,0
3,2020-01-25,1001,Alabama,Autauga,55869,0,0,0,,,0,0
4,2020-01-26,1001,Alabama,Autauga,55869,0,0,0,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
750933,2020-09-12,56045,Wyoming,Weston,6927,23,0,0,23,0,23,0
750934,2020-09-13,56045,Wyoming,Weston,6927,23,0,0,23,0,23,0
750935,2020-09-14,56045,Wyoming,Weston,6927,23,0,0,23,0,23,0
750936,2020-09-15,56045,Wyoming,Weston,6927,23,0,0,23,0,23,0


# Write out the merged data to a CSV file

In [25]:
# Write the data out to a CSV file + a JSON file of type info.
output_csv_data_file = os.path.join(_OUTPUTS_DIR, "us_counties.csv")
print(f"Writing data to {output_csv_data_file}")
to_write.to_csv(output_csv_data_file, index=False)

# Map each column name to the string form of its dtype, so a reader can pass
# the mapping to pd.read_csv(dtype=...).
# Series.items() replaces Series.iteritems(), which pandas 2.0 removed.
col_type_mapping = {
    key: str(value) for key, value in to_write.dtypes.items()
}
output_json_data_file = os.path.join(_OUTPUTS_DIR, "us_counties_meta.json")
print(f"Writing metadata to {output_json_data_file}")
with open(output_json_data_file, "w") as f:
    json.dump(col_type_mapping, f)

Writing data to outputs/us_counties.csv


Writing metadata to outputs/us_counties_meta.json
