# US state-level data
* Get everything in shape from NYT
* Then add in JHU for recent observations

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

## Functions to be used

In [2]:
def coerce_fips_integer(df):
    def integrify(x):
        return int(float(x)) if not pd.isna(x) else None

    cols = [
        "fips",
    ]
    
    new_cols = {c: df[c].apply(integrify, convert_dtype=False) for c in cols}
    
    return df.assign(**new_cols)

In [3]:
def correct_state_fips(row):
    if len(str(row.fips)) == 2:
        return str(row.fips)
    elif row.fips is not None:
        return "0" + str(row.fips)
    elif row.fips is None:
        return ""

## NYT state data

In [4]:
state = pd.read_csv('../data/nyt_us_state.csv')

In [5]:
def clean_nyt_state(df):
    keep_cols = ['date', 'state', 'fips', 'cases', 'deaths']
    
    df = df[keep_cols]
    
    # Coerce fips into integer, then convert to string
    df = coerce_fips_integer(df)

    df['fips'] = df.apply(correct_state_fips, axis = 1)

    df['date'] = pd.to_datetime(df.date)
    
    return df   

In [6]:
state = clean_nyt_state(state)

## JHU state level data

In [7]:
county_df = pd.read_csv('../data/us_county_time_series.csv')

county_df['date'] = pd.to_datetime(county_df.date)
recent = county_df[(county_df.date >= '3/26/2020') & (county_df.date <= '3/27/2020')]

In [8]:
recent.head()

Unnamed: 0.1,Unnamed: 0,county,state,fips,date,Lat,Lon,cases,deaths,incident_rate,people_tested
158,157,Autauga,Alabama,1001.0,2020-03-26,32.539527,-86.644082,5,0,,
159,158,Autauga,Alabama,1001.0,2020-03-27,32.539527,-86.644082,6,0,,
173,169,Baldwin,Alabama,1003.0,2020-03-26,30.72775,-87.722071,4,0,,
174,170,Baldwin,Alabama,1003.0,2020-03-27,30.72775,-87.722071,5,0,,
177,310,Blount,Alabama,1009.0,2020-03-26,33.982109,-86.567906,2,0,,


In [9]:
# Create state totals
jhu_state_totals = recent.groupby(['state', 'Lat', 'Lon', 'date']).agg({'cases':'sum', 
                                                                        'deaths':'sum'}).reset_index()

In [10]:
df = state.append(jhu_state_totals, sort = False)

In [11]:
coordinates = jhu_state_totals[['state', 'Lat', 'Lon']].drop_duplicates()

def fill_missing_stuff(df):
    df = pd.merge(df.drop(columns = ['Lat', 'Lon']), coordinates, on = 'state', how = 'left')
    
    # Sort columns
    col_order = ['state', 'fips', 'date', 'Lat', 'Lon', 
             'cases', 'deaths', 'incident_rate', 'people_tested']

    df = df.reindex(columns = col_order).sort_values(['fips', 'date'])
    
    # Set data types for cases and deaths? Seems ok for now....
    for col in ['incident_rate', 'people_tested']:
        df[col] = df[col].astype(float)
    
    return df

In [12]:
df = fill_missing_stuff(df)

In [13]:
fix_me = df[df.Lat.isna()]
rest_of_df = df[df.Lat.notna()]

fix_latitude = {
    'Virgin Islands': 18.3358,
    'Puerto Rico': 18.2,
    'Guam': 13.4443
}

fix_longitude = {
    'Virgin Islands': -64.8963,
    'Puerto Rico': -66.5,
    'Guam': 144.7937     
}


fix_me['Lat'] = fix_me.state.map(fix_latitude)
fix_me['Lon'] = fix_me.state.map(fix_longitude)

full_df = rest_of_df.append(fix_me, sort = False)

full_df = full_df.sort_values(['fips', 'date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
""" Grab from JHU
US Virgin Islands,  18.3358, -64.8963
PR, 18.2, -66.5
Guam, 13.4443, 144.7937
name, lat, long
"""

' Grab from JHU\nUS Virgin Islands,  18.3358, -64.8963\nPR, 18.2, -66.5\nGuam, 13.4443, 144.7937\nname, lat, long\n'

In [15]:
full_df.to_csv('../data/us_state_time_series.csv')
full_df.to_parquet('../data/us_state_time_series.parquet')