# US county-level data
* Get everything in shape from NYT
* Then add in JHU for recent observations

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

### Functions to be used

In [2]:
def coerce_fips_integer(df):
    def integrify(x):
        return int(float(x)) if not pd.isna(x) else None

    cols = [
        "fips",
    ]
    
    new_cols = {c: df[c].apply(integrify, convert_dtype=False) for c in cols}
    
    return df.assign(**new_cols)

In [3]:
def correct_county_fips(row):
    if len(str(row.fips)) == 5:
        return str(row.fips)
    elif row.fips is not None:
        return "0" + str(row.fips)
    elif row.fips is None:
        return ""

## Use NYT for county-level time-series data

In [4]:
nyt_county_url = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
county = pd.read_csv(nyt_county_url)

In [5]:
def clean_nyt_county(df):
    keep_cols = ['date', 'county', 'state', 'fips', 'cases', 'deaths']
    
    df = df[keep_cols]
    
    # Coerce fips into integer, then convert to string
    df = coerce_fips_integer(df)

    df['fips'] = df.apply(correct_county_fips, axis = 1)

    df['date'] = pd.to_datetime(df.date)
    
    return df

In [6]:
county = clean_nyt_county(county)

## JHU data for 3/25 onward

In [7]:
jhu325 = gpd.read_file('../data/jhu_feature_layer_3_25_2020.geojson')
jhu326 = gpd.read_file('../data/jhu_feature_layer_3_26_2020.geojson')

In [8]:
url_327 = "https://services1.arcgis.com/0MSEUqKaxRlEPj5g/ArcGIS/rest/services/ncov_cases_US/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=OBJECTID%2C+Province_State%2C+Country_Region%2C+Last_Update%2C+Lat%2C+Long_%2C+Confirmed%2C+Recovered%2C+Deaths%2C+Active%2C+Admin2%2C+FIPS%2C+Combined_Key%2C+Incident_Rate%2C+People_Tested&returnGeometry=true&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="

jhu327 = gpd.read_file(url_327)

#jhu327.to_file(driver = 'GeoJSON', filename = '../data/jhu_feature_layer_3_27_2020.geojson')

In [9]:
jhu325['date'] = '3/25/2020'
jhu326['date'] = '3/26/2020'
jhu327['date'] = '3/27/2020'

jhu1 = jhu325.append(jhu326).append(jhu327)
jhu1['date'] = pd.to_datetime(jhu1.date)

In [10]:
# Replace the link
nyt_geog = pd.read_csv('../data/nyt_us_county.csv')
nyt_geog = nyt_geog[nyt_geog.fips.notna()][['fips', 'county', 'state']].drop_duplicates()

nyt_geog = coerce_fips_integer(nyt_geog)
nyt_geog['fips'] = nyt_geog.apply(correct_county_fips, axis = 1)

In [11]:
def clean_jhu_county(df):
    # Only keep certain columns and rename them to match NYT schema
    keep_cols = ['Province_State', 'Country_Region', 'Lat', 'Long_',
                 'Confirmed', 'Deaths', 'FIPS', 
                 'Incident_Rate', 'People_Tested', 'date']
    
    df = df[keep_cols]
    
    df.rename(columns = {'Confirmed': 'cases', 'Deaths': 'deaths', 
                         'FIPS': 'fips', 'Long_': 'Lon', 
                        'People_Tested': 'people_tested', 'Incident_Rate': 'incident_rate'}, inplace = True)
        
    # Use FIPS to merge in NYT columns for county and state names
    # There are some values with no FIPS, which were all state observations. 
    # Drop them, use an inner join for merge.
    df = pd.merge(df, nyt_geog, on = 'fips', how = 'inner', validate = 'm:1')
    
    
    # Only keep certain columns and rename them to match NYT schema
    drop_cols = ['Province_State', 'Country_Region']
    
    df = df.drop(columns = drop_cols)
    
    return df

In [12]:
jhu1 = clean_jhu_county(jhu1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


## Append old data -- do this once

In [13]:
us_county_time_series = county.append(jhu1, sort = False)

In [14]:
def fill_missing_stuff(df):
    for col in ['Lat', 'Lon']:
        df[col] = df.groupby(['fips', 'county', 'state'])[col].transform('max')
    
    # There's a FIPS that isn't caught because of a tilde for Dona Ana, New Mexico.
    df['fips'] = df.apply(lambda row: "35013" if ('Ana' in row.county) & (row.fips is "")
                          else row.fips, axis = 1)
    
    # Sort columns
    col_order = ['county', 'state', 'fips', 'date', 'Lat', 'Lon', 
             'cases', 'deaths', 'incident_rate', 'people_tested']

    df = df.reindex(columns = col_order).sort_values(['fips', 'date'])
    
    # Set data types for cases and deaths? Seems ok for now....
    for col in ['incident_rate', 'people_tested']:
        df[col] = df[col].astype(float)
    
    return df

In [15]:
us_county_time_series = fill_missing_stuff(us_county_time_series)

## JHU data that needs to be a DAG

* Read in feature layer
* Add date column
* Apply clean_jhu_county function
* Do upsert

In [18]:
# First, we need to make sure our nyt_geog crosswalk is open
nyt_geog = pd.read_csv('../data/nyt_us_county.csv')
nyt_geog = nyt_geog[nyt_geog.fips.notna()][['fips', 'county', 'state']].drop_duplicates()

nyt_geog = coerce_fips_integer(nyt_geog)
nyt_geog['fips'] = nyt_geog.apply(correct_county_fips, axis = 1)

In [19]:
# Pretend 3/27 is the current date showing for JHU
jhu_today = clean_jhu_county(jhu327)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [20]:
jhu_today = fill_missing_stuff(jhu_today)

In [21]:
# Now it's read to be upserted
# Needs a drop_duplicates() line because we and JHU are updating multiple times a day
# Also, keep Ian's localize then UTC timezone stuff