# etl_us_data.ipynb

Acquire a copy of the latest COVID-19 time series data and write that
data out into local CSV files in a format amenable to analysis with 
Pandas dataframes.

Input data sources:
* Primary data source: [2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE](https://github.com/CSSEGISandData/COVID-19).
* Secondary data source (for values missing from primary source): [New York Times "Coronavirus (Covid-19) Data in the United States" repository](https://github.com/nytimes/covid-19-data).

Output files produced:
* `outputs/us_counties.csv`: County-level time series data for the United States
* `outputs/us_counties_meta.json`: Column type metadata for reading `outputs/us_counties.csv` with `pd.read_csv()`

**Note:** You can redirect these output files by setting the environment variable `COVID_OUTPUTS_DIR` to a replacement for the prefix `outputs` in the above paths.

To read these files back in, use `pd.read_csv()`:
```python
with open("outputs/us_counties_meta.json") as f:
    cases_meta = json.load(f)
cases_meta["Date"] = "object"  # Workaround for pd.read_csv() not supporting parsing datetime64
cases_raw = pd.read_csv("outputs/us_counties.csv", dtype=cases_meta, parse_dates=["Date"])
cases = cases_raw.set_index(["FIPS", "Date"], verify_integrity=True)
```


In [1]:
# Initialization boilerplate

# Import Python packages that this notebook uses.
import os
import numpy as np
import pandas as pd
from urllib.request import urlopen
import json
from datetime import datetime, date

# Local file of utility functions
import util

# Directory that receives the generated files. The COVID_OUTPUTS_DIR
# environment variable, when set, replaces the default "outputs".
_OUTPUTS_DIR = os.environ.get("COVID_OUTPUTS_DIR", "outputs")
util.ensure_dir_exists(_OUTPUTS_DIR)  # create if necessary

# Primary source (Johns Hopkins): raw Github URLs of the US time series files
_JH_BASE_URL = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
)
_JH_CONFIRMED_URL = _JH_BASE_URL + "time_series_covid19_confirmed_US.csv"
_JH_DEATHS_URL = _JH_BASE_URL + "time_series_covid19_deaths_US.csv"

# Currently there are no data on recovered patients for the US.
# _JH_RECOVERED_URL = _JH_BASE_URL + "time_series_covid19_recovered_US.csv"

# Secondary source (New York Times): raw Github URL of the county-level file
_NYT_BASE_URL = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/"
_NYT_CSV_URL = _NYT_BASE_URL + "us-counties.csv"


# Label of the first date column in the Johns Hopkins files, and the
# strptime() format of all the date column labels.
# Hopefully neither changes as new data are added.
# NOTE: One file spells the first date "01/22/20" and the other "1/22/20",
# so match column labels with endswith() rather than with ==.
_FIRST_DATE_SUFFIX = "1/22/20"
_DATE_FORMAT = "%m/%d/%y"

In [2]:
# Read the raw data from Github
raw_confirmed = pd.read_csv(_JH_CONFIRMED_URL)
raw_deaths = pd.read_csv(_JH_DEATHS_URL)

# There is no "recovered" time series at the moment. Generate an all-zero
# one that reuses the schema (and the non-date columns) of the deaths data.
raw_recovered = raw_deaths.copy(deep=True)

# Find the position of the first date column. Failing loudly here avoids a
# NameError -- or, when the cell is re-run in a live kernel, the silent reuse
# of a stale `ts_start_index` -- if the upstream column layout ever changes.
ts_start_index = next(
    (i for i, name in enumerate(raw_recovered.columns)
     if str(name).endswith(_FIRST_DATE_SUFFIX)),
    None)
if ts_start_index is None:
    raise ValueError(
        f"No column ending in '{_FIRST_DATE_SUFFIX}' found in {_JH_DEATHS_URL}")

# Every column from the first date onwards is a time series value: zero it.
for c in raw_recovered.columns[ts_start_index:]:
    raw_recovered[c] = 0

raw_confirmed

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/8/20,5/9/20,5/10/20,5/11/20,5/12/20,5/13/20,5/14/20,5/15/20,5/16/20,5/17/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271000,-170.132000,...,0,0,0,0,0,0,0,0,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.444300,144.793700,...,151,151,151,151,152,152,152,154,154,154
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.097900,145.673900,...,15,16,16,19,19,19,19,19,21,21
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.220800,-66.590100,...,2156,2173,2198,2256,2299,2329,2427,2542,2589,2646
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.335800,-64.896300,...,68,68,69,69,69,69,69,69,69,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3256,84070016,US,USA,840,,Central Utah,Utah,US,39.372319,-111.575868,...,27,27,28,28,28,29,29,29,29,30
3257,84070017,US,USA,840,,Southeast Utah,Utah,US,38.996171,-110.701396,...,13,13,13,14,13,13,13,13,14,14
3258,84070018,US,USA,840,,Southwest Utah,Utah,US,37.854472,-111.441876,...,138,155,164,170,167,180,187,200,209,219
3259,84070019,US,USA,840,,TriCounty,Utah,US,40.124915,-109.517442,...,14,16,16,16,16,16,16,16,19,19


In [3]:
# Show the rows whose FIPS code is missing (NA):
raw_confirmed.loc[raw_confirmed["FIPS"].isna()]

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/8/20,5/9/20,5/10/20,5/11/20,5/12/20,5/13/20,5/14/20,5/15/20,5/16/20,5/17/20
3147,84070002,US,USA,840,,Dukes and Nantucket,Massachusetts,US,41.406747,-70.687635,...,33,34,35,35,35,35,36,36,37,38
3148,84070003,US,USA,840,,Kansas City,Missouri,US,39.0997,-94.5786,...,739,752,767,783,791,808,838,848,863,895
3253,84070004,US,USA,840,,Michigan Department of Corrections (MDOC),Michigan,US,0.0,0.0,...,2124,2136,2138,2139,2144,2145,2171,2227,2227,2538
3254,84070005,US,USA,840,,Federal Correctional Institution (FCI),Michigan,US,0.0,0.0,...,109,110,110,114,114,114,115,115,116,117
3255,84070015,US,USA,840,,Bear River,Utah,US,41.521068,-113.083282,...,66,71,78,81,80,83,83,84,87,88
3256,84070016,US,USA,840,,Central Utah,Utah,US,39.372319,-111.575868,...,27,27,28,28,28,29,29,29,29,30
3257,84070017,US,USA,840,,Southeast Utah,Utah,US,38.996171,-110.701396,...,13,13,13,14,13,13,13,13,14,14
3258,84070018,US,USA,840,,Southwest Utah,Utah,US,37.854472,-111.441876,...,138,155,164,170,167,180,187,200,209,219
3259,84070019,US,USA,840,,TriCounty,Utah,US,40.124915,-109.517442,...,14,16,16,16,16,16,16,16,19,19
3260,84070020,US,USA,840,,Weber-Morgan,Utah,US,41.27116,-111.914512,...,184,188,194,197,207,211,214,216,221,226


In [4]:
# For now, keep only the locations that have a FIPS code: the others don't
# align properly with the county-level metadata this data set will be
# joined with, and the number of cases involved is relatively small.
raw_confirmed = raw_confirmed[raw_confirmed["FIPS"].notna()].copy()
raw_deaths = raw_deaths[raw_deaths["FIPS"].notna()].copy()
raw_recovered = raw_recovered[raw_recovered["FIPS"].notna()].copy()


# The FIPS codes were read in as floating point numbers (probably because
# of the NaNs that were present). Convert them to nullable integers.
for frame in (raw_confirmed, raw_deaths, raw_recovered):
    frame["FIPS"] = frame["FIPS"].astype("Int64")

raw_confirmed

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/8/20,5/9/20,5/10/20,5/11/20,5/12/20,5/13/20,5/14/20,5/15/20,5/16/20,5/17/20
0,16,AS,ASM,16,60,,American Samoa,US,-14.2710,-170.1320,...,0,0,0,0,0,0,0,0,0,0
1,316,GU,GUM,316,66,,Guam,US,13.4443,144.7937,...,151,151,151,151,152,152,152,154,154,154
2,580,MP,MNP,580,69,,Northern Mariana Islands,US,15.0979,145.6739,...,15,16,16,19,19,19,19,19,21,21
3,630,PR,PRI,630,72,,Puerto Rico,US,18.2208,-66.5901,...,2156,2173,2198,2256,2299,2329,2427,2542,2589,2646
4,850,VI,VIR,850,78,,Virgin Islands,US,18.3358,-64.8963,...,68,68,69,69,69,69,69,69,69,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3248,84090053,US,USA,840,90053,Unassigned,Washington,US,0.0000,0.0000,...,69,56,40,38,57,73,110,125,112,101
3249,84090054,US,USA,840,90054,Unassigned,West Virginia,US,0.0000,0.0000,...,0,0,0,0,0,0,0,0,0,0
3250,84090055,US,USA,840,90055,Unassigned,Wisconsin,US,0.0000,0.0000,...,0,0,0,0,0,0,0,0,0,0
3251,84090056,US,USA,840,90056,Unassigned,Wyoming,US,0.0000,0.0000,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Filter down to just U.S. counties
def counties_df(df):
    """
    Select the rows of a Johns Hopkins US time series table that describe
    individual counties.

    :param df: Dataframe with (at least) the columns "FIPS" and "Admin2".

    :returns: A copy of the rows of `df` that pass all four filters below.
     The original index labels are retained.
    """
    fips = df["FIPS"]
    admin2 = df["Admin2"]

    # Territories have FIPS codes < 1000
    above_territories = fips >= 1000
    # Rows that are not counties (e.g. territories) leave "Admin2" empty
    has_admin2 = admin2.notna()
    # State-level catch-all rows have Admin2 set to "Unassigned"
    is_assigned = admin2.ne("Unassigned")
    # Per the original author's note, expatriates are coded by state in
    # values > 80k; everything above 60000 is dropped.
    below_special_codes = fips <= 60000

    is_county = above_territories & has_admin2 & is_assigned & below_special_codes
    return df[is_county].copy()

# Apply the county filter to each of the three raw time series tables.
county_confirmed, county_deaths, county_recovered = map(
    counties_df, (raw_confirmed, raw_deaths, raw_recovered))

county_confirmed

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,5/8/20,5/9/20,5/10/20,5/11/20,5/12/20,5/13/20,5/14/20,5/15/20,5/16/20,5/17/20
5,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,...,67,68,74,84,91,93,104,103,110,110
6,84001003,US,USA,840,1003,Baldwin,Alabama,US,30.727750,-87.722071,...,208,216,222,224,227,231,243,244,254,254
7,84001005,US,USA,840,1005,Barbour,Alabama,US,31.868263,-85.387129,...,53,58,59,61,67,69,74,79,79,81
8,84001007,US,USA,840,1007,Bibb,Alabama,US,32.996421,-87.125115,...,44,45,46,46,46,46,46,49,50,50
9,84001009,US,USA,840,1009,Blount,Alabama,US,33.982109,-86.567906,...,44,44,44,45,45,45,45,45,45,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3142,84056037,US,USA,840,56037,Sweetwater,Wyoming,US,41.659439,-108.882788,...,19,19,19,20,20,20,21,22,23,23
3143,84056039,US,USA,840,56039,Teton,Wyoming,US,43.935225,-110.589080,...,98,98,98,98,98,99,99,99,100,100
3144,84056041,US,USA,840,56041,Uinta,Wyoming,US,41.287818,-110.547578,...,9,9,9,9,9,9,9,9,10,10
3145,84056043,US,USA,840,56043,Washakie,Wyoming,US,43.904516,-107.680187,...,8,8,8,8,8,8,8,9,9,16


In [6]:
# The time series in the raw data are spread across multiple columns.
# Rotate them by 90 degrees so that they are spread across rows.

def shred_time_series(df: pd.DataFrame, colname: str, *,
                      first_date_suffix=None, date_format=None):
    """
    Turn a time series encoded as a range of columns into a time series
    encoded as a range of rows.

    By default this function uses the column naming conventions of the
    Johns Hopkins data (`_FIRST_DATE_SUFFIX` and `_DATE_FORMAT`); pass
    `first_date_suffix` / `date_format` to use it on other layouts.

    :param df: Dataframe with a time series across the columns of each row.
     Every column from the first date column through the last column is
     treated as a time series value, so no non-date column may follow the
     first date column.
    :param colname: Name of the new column where the time series should go
    :param first_date_suffix: Suffix of the label of the first date column.
     The first column whose label ends with this suffix starts the time
     series. Defaults to `_FIRST_DATE_SUFFIX`.
    :param date_format: `strptime()` format of the date column labels.
     Defaults to `_DATE_FORMAT`.

    :returns: A dataframe with one time series element per row.
     The returned dataframe will have a column called "Date" with the date
     of each time series element, and a column with the name `colname` with
     the associated value for each date. Index labels of `df` are repeated
     once per date, and the `colname` column has dtype `object`.

    :raises ValueError: If no column label ends with `first_date_suffix`,
     or if a date column label does not match `date_format`.
    """
    if first_date_suffix is None:
        first_date_suffix = _FIRST_DATE_SUFFIX
    if date_format is None:
        date_format = _DATE_FORMAT

    # Position of the first date column. The first match wins, so a later
    # label such as "11/22/20" that also ends with the suffix is harmless
    # as long as the date columns are in chronological order.
    ts_start_index = next(
        (i for i, name in enumerate(df.columns)
         if str(name).endswith(first_date_suffix)),
        None)
    if ts_start_index is None:
        raise ValueError(
            f"No column ending in '{first_date_suffix}' found; "
            f"columns are {list(df.columns)}")

    date_columns = df.columns[ts_start_index:]
    date_list = [datetime.strptime(str(s), date_format) for s in date_columns]

    # One Python list of values per input row
    ts_lists = df[date_columns].values.tolist()

    # Create a new dataframe where the time series is a list
    nested_df = df[df.columns[:ts_start_index]].copy()
    nested_df[colname] = ts_lists

    # Expand out the list and add the dates back. explode() emits the
    # elements of each row's list in order, so repeating the date list once
    # per input row lines the dates up with the values.
    flat_df = nested_df.explode(colname)
    flat_df["Date"] = date_list * len(nested_df.index)
    return flat_df

# Rotate each county-level table; the second argument names the value column.
shredded_confirmed = shred_time_series(df=county_confirmed, colname="Confirmed")
shredded_deaths = shred_time_series(df=county_deaths, colname="Deaths")
shredded_recovered = shred_time_series(df=county_recovered, colname="Recovered")
shredded_confirmed

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Confirmed,Date
5,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-22
5,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-23
5,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-24
5,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-25
5,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3146,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",0,2020-05-13
3146,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",0,2020-05-14
3146,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",0,2020-05-15
3146,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",0,2020-05-16


In [7]:
# Merge the confirmed / deaths / recovered time series into one table.

# Put all three tables into the same (FIPS, Date) order first, so that the
# column assignments below line up row for row.
sort_keys = ["FIPS", "Date"]
sorted_deaths = shredded_deaths.sort_values(sort_keys)
sorted_confirmed = shredded_confirmed.sort_values(sort_keys)
sorted_recovered = shredded_recovered.sort_values(sort_keys)

# Only the "deaths" and "recovered" tables carry a "Population" column.
# Give the "confirmed" table one too, so the merges below can match on it.
sorted_confirmed["Population"] = sorted_deaths["Population"]

# "Lat" and "Long_" differ slightly between the tables due to rounding
# error. Treat the "confirmed" values as the gold standard and overwrite
# the other two tables with them.
for coord in ("Lat", "Long_"):
    reference = sorted_confirmed[coord]
    sorted_deaths[coord] = reference
    sorted_recovered[coord] = reference

# With the shared columns harmonized, outer-join the three tables on all
# the columns they have in common.
combined = pd.merge(sorted_confirmed, sorted_deaths, how="outer")
combined = pd.merge(combined, sorted_recovered, how="outer")

# An outer join leaves NA in "Confirmed" for any row that was only present
# in one of the other tables; treat that as an error.
missing_rows = combined.loc[combined["Confirmed"].isna()]
if not missing_rows.empty:
    raise ValueError(f"Missing 'Confirmed' time series data for the following rows:\n{missing_rows}")

combined

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Confirmed,Date,Population,Deaths,Recovered
0,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-22,55869,0,0
1,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-23,55869,0,0
2,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-24,55869,0,0
3,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-25,55869,0,0
4,84001001,US,USA,840,1001,Autauga,Alabama,US,32.539527,-86.644082,"Autauga, Alabama, US",0,2020-01-26,55869,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367609,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",0,2020-05-13,6927,0,0
367610,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",0,2020-05-14,6927,0,0
367611,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",0,2020-05-15,6927,0,0
367612,84056045,US,USA,840,56045,Weston,Wyoming,US,43.839612,-104.567488,"Weston, Wyoming, US",0,2020-05-16,6927,0,0


In [8]:
# The "Confirmed", "Deaths" and "Recovered" columns arrive here with dtype
# `object` rather than an integer dtype (see the listing below); the next
# cell casts them back to integers.
# NOTE(review): the original comment attributed the object dtypes to the
# outer joins in the previous cell. DataFrame.explode() inside
# shred_time_series() may be the actual source -- confirm.

# Data types before:
combined.dtypes

UID                        int64
iso2                      object
iso3                      object
code3                      int64
FIPS                       Int64
Admin2                    object
Province_State            object
Country_Region            object
Lat                      float64
Long_                    float64
Combined_Key              object
Confirmed                 object
Date              datetime64[ns]
Population                 int64
Deaths                    object
Recovered                 object
dtype: object

In [9]:
# Text columns: object -> pandas "string" dtype
for text_col in ("iso2", "iso3", "Admin2", "Province_State",
                 "Country_Region", "Combined_Key"):
    combined[text_col] = combined[text_col].astype("string")

# Count columns: object -> 64-bit integer
for count_col in ("Confirmed", "Deaths", "Recovered"):
    combined[count_col] = combined[count_col].astype(np.int64)

# Data types after:
combined.dtypes

UID                        int64
iso2                      string
iso3                      string
code3                      int64
FIPS                       Int64
Admin2                    string
Province_State            string
Country_Region            string
Lat                      float64
Long_                    float64
Combined_Key              string
Confirmed                  int64
Date              datetime64[ns]
Population                 int64
Deaths                     int64
Recovered                  int64
dtype: object

In [10]:
# Keep only the columns needed downstream, then shorten two of their names.
retained_columns = ["Date", "FIPS", "Province_State", "Admin2",
                    "Population",
                    "Confirmed", "Deaths", "Recovered"]
to_retain = combined[retained_columns]

column_renames = {"Province_State": "State", "Admin2": "County"}
renamed = to_retain.rename(columns=column_renames)
renamed

Unnamed: 0,Date,FIPS,State,County,Population,Confirmed,Deaths,Recovered
0,2020-01-22,1001,Alabama,Autauga,55869,0,0,0
1,2020-01-23,1001,Alabama,Autauga,55869,0,0,0
2,2020-01-24,1001,Alabama,Autauga,55869,0,0,0
3,2020-01-25,1001,Alabama,Autauga,55869,0,0,0
4,2020-01-26,1001,Alabama,Autauga,55869,0,0,0
...,...,...,...,...,...,...,...,...
367609,2020-05-13,56045,Wyoming,Weston,6927,0,0,0
367610,2020-05-14,56045,Wyoming,Weston,6927,0,0,0
367611,2020-05-15,56045,Wyoming,Weston,6927,0,0,0
367612,2020-05-16,56045,Wyoming,Weston,6927,0,0,0


In [11]:
# Download the New York Times county-level file. It is the secondary source,
# used for filling in holes in the primary JHU data set.
raw_nyt = pd.read_csv(filepath_or_buffer=_NYT_CSV_URL, parse_dates=["date"])
raw_nyt

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0
...,...,...,...,...,...,...
153096,2020-05-17,Sublette,Wyoming,56035.0,3,0
153097,2020-05-17,Sweetwater,Wyoming,56037.0,23,0
153098,2020-05-17,Teton,Wyoming,56039.0,100,1
153099,2020-05-17,Uinta,Wyoming,56041.0,10,0


In [12]:
# The NY Times data also has locations whose FIPS code is NaN.
# List each such (county, state) pair once:
raw_nyt.loc[raw_nyt["fips"].isna(), ["county", "state"]].drop_duplicates()

Unnamed: 0,county,state
416,New York City,New York
418,Unknown,Rhode Island
1511,Unknown,New Jersey
1858,Unknown,Puerto Rico
2267,Unknown,Virgin Islands
2422,Unknown,Guam
2929,Unknown,Maine
2950,Unknown,Massachusetts
4003,Unknown,Louisiana
4680,Unknown,Kentucky


In [13]:
# For now, keep only the rows that have a FIPS code, mirroring what was
# done with the JHU data set above. With the NaNs gone the codes fit a
# plain integer column.
raw_nyt = raw_nyt[raw_nyt["fips"].notna()].copy()
raw_nyt["fips"] = raw_nyt["fips"].astype("int64")

# Also cast int-valued columns to nullable int.
for count_col in ("cases", "deaths"):
    raw_nyt[count_col] = raw_nyt[count_col].astype("Int64")

# Give the columns the names used by the primary data set (plus an "_NYT"
# suffix on the counts) in preparation for joining the two.
nyt_column_names = {
    "date": "Date",
    "county": "County",
    "state": "State",
    "fips": "FIPS",
    "cases": "Confirmed_NYT",
    "deaths": "Deaths_NYT",
}
nyt = raw_nyt.copy().rename(columns=nyt_column_names)
nyt

Unnamed: 0,Date,County,State,FIPS,Confirmed_NYT,Deaths_NYT
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0
...,...,...,...,...,...,...
153096,2020-05-17,Sublette,Wyoming,56035,3,0
153097,2020-05-17,Sweetwater,Wyoming,56037,23,0
153098,2020-05-17,Teton,Wyoming,56039,100,1
153099,2020-05-17,Uinta,Wyoming,56041,10,0


In [14]:
# Report how many distinct FIPS codes and dates the primary and the
# secondary data set each cover.
primary_counties = len(renamed["FIPS"].unique())
primary_dates = len(renamed["Date"].unique())
secondary_counties = len(nyt["FIPS"].unique())
secondary_dates = len(nyt["Date"].unique())

print(f"Primary data set has {primary_counties} counties "
      f"and {primary_dates} dates.")
print(f"Secondary data set has {secondary_counties} counties "
      f"and {secondary_dates} dates.")

Primary data set has 3142 counties and 117 dates.
Secondary data set has 2905 counties and 118 dates.


In [15]:
# Left-join the secondary (NYT) figures onto the primary (JHU) table:
# every primary row is kept, and the "_NYT" columns are NA wherever the
# NYT data has no entry for that county and date. The join keys are
# spelled out so that a column later added to both frames cannot
# silently change them.
to_write = renamed.merge(nyt[["Date", "FIPS", "Confirmed_NYT", "Deaths_NYT"]],
                         how="left", on=["Date", "FIPS"])
to_write

Unnamed: 0,Date,FIPS,State,County,Population,Confirmed,Deaths,Recovered,Confirmed_NYT,Deaths_NYT
0,2020-01-22,1001,Alabama,Autauga,55869,0,0,0,,
1,2020-01-23,1001,Alabama,Autauga,55869,0,0,0,,
2,2020-01-24,1001,Alabama,Autauga,55869,0,0,0,,
3,2020-01-25,1001,Alabama,Autauga,55869,0,0,0,,
4,2020-01-26,1001,Alabama,Autauga,55869,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...
367609,2020-05-13,56045,Wyoming,Weston,6927,0,0,0,,
367610,2020-05-14,56045,Wyoming,Weston,6927,0,0,0,,
367611,2020-05-15,56045,Wyoming,Weston,6927,0,0,0,,
367612,2020-05-16,56045,Wyoming,Weston,6927,0,0,0,,


In [16]:
# Write the data out to a CSV file + a JSON file of type info.
output_csv_data_file = os.path.join(_OUTPUTS_DIR, "us_counties.csv")
print(f"Writing data to {output_csv_data_file}")
to_write.to_csv(output_csv_data_file, index=False)

# Map each column name to the string form of its dtype, so that a reader
# can pass the mapping to pd.read_csv(dtype=...).
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
col_type_mapping = {
    key: str(value) for key, value in to_write.dtypes.items()
}
output_json_data_file = os.path.join(_OUTPUTS_DIR, "us_counties_meta.json")
print(f"Writing metadata to {output_json_data_file}")
with open(output_json_data_file, "w") as f:
    json.dump(col_type_mapping, f)

Writing data to outputs/us_counties.csv
Writing metadata to outputs/us_counties_meta.json
