# US county-level data
* Get everything in shape from NYT
* Then add in JHU for recent observations

In [1]:
import pandas as pd
import geopandas as gpd

### Functions to be used

In [2]:
def coerce_fips_integer(df):
    def integrify(x):
        return int(float(x)) if not pd.isna(x) else None

    cols = [
        "fips",
    ]
    
    new_cols = {c: df[c].apply(integrify, convert_dtype=False) for c in cols}
    
    return df.assign(**new_cols)

In [3]:
def correct_county_fips(row):
    if len(str(row.fips)) == 5:
        return str(row.fips)
    elif row.fips is not None:
        return "0" + str(row.fips)
    elif row.fips is None:
        return ""

## Use NYT for county-level time-series data

In [None]:
bucket_name = "public-health-dashboard"
county = pd.read_csv(f"s3://{bucket_name}/jhu_covid19/county_time_series_330.csv")

## JHU data that needs to be a DAG

* Read in feature layer
* Add date column
* Apply clean_jhu_county function
* Do upsert

In [18]:
# First, we need to make sure our nyt_geog crosswalk is open
NYT_330_COMMIT = "99b30cbf4181e35bdcc814e2b29671f38d7860a7"
NYT_COUNTY_URL = (
    f"https://raw.githubusercontent.com/nytimes/covid-19-data/{NYT_330_COMMIT}/"
    "us-counties.csv"
)
county = pd.read_csv(NYT_COUNTY_URL)

nyt_geog = county[county.fips.notna()][['fips', 'county', 'state']].drop_duplicates()

nyt_geog = coerce_fips_integer(nyt_geog)
nyt_geog["fips"] = nyt_geog.apply(correct_county_fips, axis=1)

In [None]:
# Read in JHU feature layer in
jhu = 

In [None]:
def clean_jhu_county(df):
    # Only keep certain columns and rename them to match NYT schema
    keep_cols = [
        "Province_State",
        "Country_Region",
        "Lat",
        "Long_",
        "Confirmed",
        "Deaths",
        "FIPS",
        "Incident_Rate",
        "People_Tested",
        "date",
    ]

    df = df[keep_cols]

    df.rename(
        columns={
            "Confirmed": "cases",
            "Deaths": "deaths",
            "FIPS": "fips",
            "Long_": "Lon",
            "People_Tested": "people_tested",
            "Incident_Rate": "incident_rate",
        },
        inplace=True,
    )

    # Use FIPS to merge in NYT columns for county and state names
    # There are some values with no FIPS, NYT calls these county = "Unknown"
    df = pd.merge(df, nyt_geog, on="fips", how="left", validate="m:1")

    # Fix when FIPS is unknown, which wouldn't have merged in anything from nyt_geog
    df["county"] = df.apply(
        lambda row: "Unknown" if row.fips is None else row.county, axis=1
    )
    df["state"] = df.apply(
        lambda row: row.Province_State if row.fips is None else row.state, axis=1
    )
    df["fips"] = df.fips.fillna("")

    # Only keep certain columns and rename them to match NYT schema
    drop_cols = ["Province_State", "Country_Region"]

    df = df.drop(columns=drop_cols)

    return df

In [19]:
# Pretend 3/27 is the current date showing for JHU
jhu_today = clean_jhu_county(jhu)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [None]:
def sort_drop_duplicates(df):
    for col in ["Lat", "Lon"]:
        df[col] = df.groupby(["fips", "county", "state"])[col].transform("max")

    # Sort columns
    col_order = [
        "county",
        "state",
        "fips",
        "date",
        "Lat",
        "Lon",
        "cases",
        "deaths",
        "incident_rate",
        "people_tested",
    ]

    df = df.reindex(columns=col_order).sort_values(
        ["state", "county", "fips", "date", "cases"]
    )

    # Set data types for cases and deaths? Seems ok for now....
    for col in ["incident_rate", "people_tested"]:
        df[col] = df[col].astype(float)

    # Drop duplicates
    # Either: (1) values are updated throughout the day, or
    # (2) slight discrepancies between NYT and JHU.
    # Regardless, take the max value for cases and deaths for each date.
    group_cols = ["state", "county", "fips", "date"]
    for col in ["cases", "deaths"]:
        df[col] = df.groupby(group_cols).transform("max")

    df = df.drop_duplicates(subset=group_cols)

    return df

In [None]:
jhu_today = sort_drop_duplicates(jhu_today)

In [None]:
# Append everything just once -- should this step happen?
us_county = county.append(jhu_today, sort=False)

# Save as temporary file?

In [21]:
# Now it's ready to be upserted
# Also, keep Ian's localize then UTC timezone stuff