In [21]:
import pandas as pd
from pathlib import Path
import numpy as np
import altair as alt
from vega_datasets import data
import datetime

from functools import reduce

In [8]:
# Confirmed-incidence signal (per the file name); only the three columns used
# below are read.
DATA_PATH = Path('data/covidcast-indicator-combination-confirmed_incidence_prop-2020-02-20-to-2021-02-20.csv')
df = pd.read_csv(DATA_PATH, sep=',', usecols=['geo_value', 'time_value', 'value'])

# Bucket the distinct dates, in sorted order, into consecutive runs of
# `avg_len` dates; each row is tagged with the id of its date's bucket.
avg_len = 30
dates = sorted(set(df['time_value']))
dates2id = dict(zip(dates, (position // avg_len for position in range(len(dates)))))
df['interval_id'] = df['time_value'].map(dates2id)
grouped_df = df.groupby(['geo_value', 'interval_id'])

In [11]:
# Total `value` per (geo_value, interval_id). The aggregation is named by the
# string 'sum' rather than np.sum: passing the NumPy callable to GroupBy.agg is
# deprecated in recent pandas and emits a FutureWarning.
agg_df = grouped_df.agg({'value': 'sum'}).reset_index()

In [46]:
# Lower-case two-letter abbreviation -> full name (50 states, DC and five territories).
abbr2state = {'al': 'Alabama', 'ak': 'Alaska', 'as': 'American Samoa', 'az': 'Arizona', 'ar': 'Arkansas', 'ca': 'California', 'co': 'Colorado', 'ct': 'Connecticut', 'de': 'Delaware', 'dc': 'District of Columbia', 'fl': 'Florida', 'ga': 'Georgia', 'gu': 'Guam', 'hi': 'Hawaii', 'id': 'Idaho', 'il': 'Illinois', 'in': 'Indiana', 'ia': 'Iowa', 'ks': 'Kansas', 'ky': 'Kentucky', 'la': 'Louisiana', 'me': 'Maine', 'md': 'Maryland', 'ma': 'Massachusetts', 'mi': 'Michigan', 'mn': 'Minnesota', 'ms': 'Mississippi', 'mo': 'Missouri', 'mt': 'Montana', 'ne': 'Nebraska', 'nv': 'Nevada', 'nh': 'New Hampshire', 'nj': 'New Jersey', 'nm': 'New Mexico', 'ny': 'New York', 'nc': 'North Carolina', 'nd': 'North Dakota', 'mp': 'Northern Mariana Islands', 'oh': 'Ohio', 'ok': 'Oklahoma', 'or': 'Oregon', 'pa': 'Pennsylvania', 'pr': 'Puerto Rico', 'ri': 'Rhode Island', 'sc': 'South Carolina', 'sd': 'South Dakota', 'tn': 'Tennessee', 'tx': 'Texas', 'ut': 'Utah', 'vt': 'Vermont', 'vi': 'Virgin Islands', 'va': 'Virginia', 'wa': 'Washington', 'wv': 'West Virginia', 'wi': 'Wisconsin', 'wy': 'Wyoming'}

# Same abbreviations -> two-digit FIPS code, kept as a zero-padded string.
# 'mp' (Northern Mariana Islands, FIPS 69) was absent here although it is
# present in abbr2state, so a lookup for it raised KeyError; both tables now
# cover the same 56 keys.
abbr2id = {'ak': '02', 'al': '01', 'ar': '05', 'as': '60', 'az': '04', 'ca': '06', 'co': '08', 'ct': '09', 'dc': '11', 'de': '10', 'fl': '12', 'ga': '13', 'gu': '66', 'hi': '15', 'ia': '19', 'id': '16', 'il': '17', 'in': '18', 'ks': '20', 'ky': '21', 'la': '22', 'ma': '25', 'md': '24', 'me': '23', 'mi': '26', 'mn': '27', 'mo': '29', 'mp': '69', 'ms': '28', 'mt': '30', 'nc': '37', 'nd': '38', 'ne': '31', 'nh': '33', 'nj': '34', 'nm': '35', 'nv': '32', 'ny': '36', 'oh': '39', 'ok': '40', 'or': '41', 'pa': '42', 'pr': '72', 'ri': '44', 'sc': '45', 'sd': '46', 'tn': '47', 'tx': '48', 'ut': '49', 'va': '51', 'vi': '78', 'vt': '50', 'wa': '53', 'wi': '55', 'wv': '54', 'wy': '56'}

56

In [51]:
# Attach the readable name and the integer id for each row's abbreviation.
# Plain dict indexing is used, so an abbreviation missing from either table
# raises KeyError instead of producing a missing value.
agg_df['state'] = agg_df['geo_value'].map(abbr2state.__getitem__)
agg_df['state_id'] = agg_df['geo_value'].map(lambda abbr: int(abbr2id[abbr]))

In [98]:
# alt.Chart(agg_df).mark_rect().encode(
#     x=alt.X('interval_id:O'),
#     y=alt.Y('state:O'),
#     color=alt.Color('value:Q', scale=alt.Scale(scheme='blues')),
# ).properties(width=550)

In [53]:
# Show the aggregated frame: one row per (geo_value, interval_id) pair.
agg_df

Unnamed: 0,geo_value,interval_id,value,state,state_id
0,ak,0,1.777061,Alaska,2
1,ak,1,41.965976,Alaska,2
2,ak,2,10.935759,Alaska,2
3,ak,3,42.102673,Alaska,2
4,ak,4,155.971266,Alaska,2
...,...,...,...,...,...
671,wy,8,2384.412165,Wyoming,56
672,wy,9,3040.816644,Wyoming,56
673,wy,10,1492.331005,Wyoming,56
674,wy,11,760.765707,Wyoming,56


In [95]:
# Natural log of the per-interval totals, used for the map's colour channel.
# Floating-point warnings are silenced around the call.
with np.errstate(all='ignore'):
    agg_df['log_value'] = np.log(agg_df['value'])

In [96]:
# State outlines from the vega-datasets US TopoJSON file; they are joined to
# the data rows below through the numeric state id.
states = alt.topo_feature(data.us_10m.url, 'states')

# agg_df holds one row per (state, interval_id). Drawn on a single map, every
# state shape is painted once per interval on top of itself, so only the
# last-drawn interval is visible. Faceting by interval gives each interval its
# own small map, and the tooltip reports which interval a shape belongs to.
alt.Chart(agg_df).mark_geoshape().encode(
    shape='geo:G',
    color=alt.Color('log_value:Q'),
    tooltip=['state:N', 'interval_id:O', 'value:Q'],
    facet=alt.Facet('interval_id:O', columns=4),
).transform_lookup(
    lookup='state_id',
    from_=alt.LookupData(data=states, key='id'),
    as_='geo'
).properties(
    # Width/height apply to each facet panel, hence the smaller size.
    width=200,
    height=125,
).project(
    type='albersUsa'
)

In [100]:
# states = alt.topo_feature(data.us_10m.url, 'states')
# source = data.income.url
## https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/income.json
# alt.Chart(source).mark_geoshape().encode(
#     shape='geo:G',
#     color='pct:Q',
#     tooltip=['name:N', 'pct:Q'],
#     facet=alt.Facet('group:N', columns=2),
# ).transform_lookup(
#     lookup='id',
#     from_=alt.LookupData(data=states, key='id'),
#     as_='geo'
# ).properties(
#     width=300,
#     height=175,
# ).project(
#     type='albersUsa'
# )

In [None]:
# percentage of people who wore a mask for most or all of the time while in public in the past 5 days; those not in public in the past 5 days are not counted.

In [147]:
# Two survey signals (mask wearing, vaccine acceptance); only the join keys
# and the measurement column are read from each file.
df1 = pd.read_csv(
    Path('data/covidcast-fb-survey-smoothed_wearing_mask-2020-12-20-to-2021-03-12.csv'),
    sep=',',
    usecols=['geo_value', 'time_value', 'value'],
)
df2 = pd.read_csv(
    Path('data/covidcast-fb-survey-smoothed_accept_covid_vaccine-2020-12-20-to-2021-03-12.csv'),
    sep=',',
    usecols=['geo_value', 'time_value', 'value'],
)

# Inner-join both surveys, then the case frame `df`, on (geo_value, time_value).
# The first join suffixes the clashing `value` columns with _x/_y; the case
# frame's own `value` arrives unsuffixed, and any extra columns `df` carries
# (such as interval_id) come along with it.
merged_df = pd.merge(df1, df2, how='inner', on=['geo_value', 'time_value'])
merged_df = pd.merge(merged_df, df, how='inner', on=['geo_value', 'time_value'])
merged_df = merged_df.rename(
    columns={'value_x': 'wear_mask', 'value_y': 'accept_vaccine', 'value': 'covid_cases'}
)


In [148]:
# Preview the first rows of the mask / vaccine / cases join.
merged_df.head()

Unnamed: 0,geo_value,time_value,wear_mask,accept_vaccine,covid_cases,interval_id
0,ak,2020-12-20,87.282878,67.307692,24.742155,10
1,al,2020-12-20,89.566684,64.39905,51.945827,10
2,ar,2020-12-20,90.62016,61.565444,50.931074,10
3,az,2020-12-20,93.271341,67.85113,73.721784,10
4,ca,2020-12-20,95.694482,76.008745,68.335816,10


In [149]:
# Show the URL of the US TopoJSON file that vega_datasets points at.
data.us_10m.url

'https://cdn.jsdelivr.net/npm/vega-datasets@v1.29.0/data/us-10m.json'

In [42]:

def str2datetime(dd):
    """Convert a dash-separated 'year-month-day' string to a ``datetime.date``.

    The string is split on '-' and the first three pieces are parsed as
    integers (year, month, day); any further pieces are ignored.
    """
    fields = dd.split('-')
    year, month, day = int(fields[0]), int(fields[1]), int(fields[2])
    return datetime.date(year=year, month=month, day=day)


In [14]:
def load_mental_data(data_dir='data', date_range='2020-12-20-to-2021-03-12'):
    """Load the three mental-health survey CSVs and join them into one frame.

    Each file ``covidcast-fb-survey-<signal>-<date_range>.csv`` under
    ``data_dir`` contributes its ``value`` column, renamed to a short label
    (``anxious``, ``depressed``, ``isolated``).

    Parameters
    ----------
    data_dir : str or Path, default 'data'
        Directory containing the CSV files.
    date_range : str, default '2020-12-20-to-2021-03-12'
        Date-range suffix used in the file names.

    Returns
    -------
    pandas.DataFrame
        Columns ``geo_value``, ``time_value``, ``anxious``, ``depressed``,
        ``isolated``. The join is an inner join, so only
        (geo_value, time_value) pairs present in all three files are kept.
    """
    # Signal name as it appears in the file name -> output column label.
    signal_to_column = {
        'smoothed_anxious_5d': 'anxious',
        'smoothed_depressed_5d': 'depressed',
        'smoothed_felt_isolated_5d': 'isolated',
    }
    join_keys = ['geo_value', 'time_value']

    frames = []
    for signal, column in signal_to_column.items():
        csv_path = Path(data_dir) / 'covidcast-fb-survey-{}-{}.csv'.format(signal, date_range)
        frame = pd.read_csv(csv_path, sep=',', usecols=join_keys + ['value'])
        # Rename before joining so the three measurement columns never clash.
        frames.append(frame.rename(columns={'value': column}))

    return reduce(
        lambda left, right: pd.merge(left, right, how='inner', on=join_keys),
        frames,
    )

In [15]:
# NOTE(review): this rebinds `merged_df`, which an earlier cell assigned to the
# mask / vaccine / cases join; from here on the name refers to the
# mental-health survey frame that preprocess_mental_data() reads.
merged_df = load_mental_data()

In [120]:
def preprocess_mental_data(mental_df=None,
                           top_states=('nd', 'sd', 'ri', 'ut', 'tn'),
                           bottom_states=('hi', 'vt', 'me', 'or', 'wa')):
    """Split the mental-health frame into two state groups and add a date column.

    Parameters
    ----------
    mental_df : pandas.DataFrame, optional
        Frame with at least ``geo_value`` and ``time_value`` (ISO
        'YYYY-MM-DD' strings) columns. When omitted, the notebook-level
        ``merged_df`` is used, as in the original no-argument call.
    top_states, bottom_states : iterable of str
        ``geo_value`` codes selecting each group. The defaults are the
        hard-coded states with the highest / lowest case counts.

    Returns
    -------
    (top_df, bottom_df) : tuple of pandas.DataFrame
        Independent copies of the matching rows, each with an extra
        ``datetime`` column holding ``datetime.date`` objects.
    """
    if mental_df is None:
        # Backward-compatible fallback to the notebook global. Passing the
        # frame explicitly is safer, because `merged_df` is bound to more than
        # one dataset in this notebook.
        mental_df = merged_df

    def _subset(states):
        # .copy() so adding the column never writes into the caller's frame.
        subset = mental_df[mental_df['geo_value'].isin(set(states))].copy()
        # Vectorised parse; .dt.date yields plain datetime.date objects so the
        # column can be compared directly with a datetime.date value.
        subset['datetime'] = pd.to_datetime(subset['time_value'], format='%Y-%m-%d').dt.date
        return subset

    return _subset(top_states), _subset(bottom_states)

In [121]:
# Rows for the hard-coded top-5 and bottom-5 states, each with a `datetime` column added.
top_df, bottom_df = preprocess_mental_data()

In [122]:
# Narrow both groups to a single-day snapshot: 1 January 2021.
d = datetime.date(2021, 1, 1)
top_df = top_df.loc[top_df['datetime'].eq(d)]
bottom_df = bottom_df.loc[bottom_df['datetime'].eq(d)]

In [123]:
# Preview the single-day rows for the top group.
top_df.head()

Unnamed: 0,geo_value,time_value,anxious,depressed,isolated,datetime
640,nd,2021-01-01,16.369766,13.86371,15.938762,2021-01-01
651,ri,2021-01-01,16.480125,12.236789,21.322181,2021-01-01
653,sd,2021-01-01,14.108589,10.602653,16.175906,2021-01-01
654,tn,2021-01-01,18.083967,12.782612,18.901344,2021-01-01
656,ut,2021-01-01,18.75061,14.510599,21.922757,2021-01-01


In [124]:
# Reshape both groups from wide to long: one row per (state, date, survey
# measure), with the measure name in `variable` and its reading in `value`.
melted_top_df, melted_bottom_df = (
    group.melt(
        id_vars=['geo_value', 'time_value'],
        value_vars=['anxious', 'depressed', 'isolated'],
    )
    for group in (top_df, bottom_df)
)

In [125]:
# Preview the long-format frame for the top group.
melted_top_df.head()

Unnamed: 0,geo_value,time_value,variable,value
0,nd,2021-01-01,anxious,16.369766
1,ri,2021-01-01,anxious,16.480125
2,sd,2021-01-01,anxious,14.108589
3,tn,2021-01-01,anxious,18.083967
4,ut,2021-01-01,anxious,18.75061


In [129]:
def _survey_bar_chart(long_df, title):
    """Bar chart of survey readings per state, coloured by survey measure.

    ``long_df`` is a melted frame with ``geo_value``, ``variable`` and
    ``value`` columns; ``title`` becomes the chart title.
    """
    bars = alt.Chart(long_df).mark_bar()
    encoded = bars.encode(x='value', y='geo_value', color='variable')
    return encoded.properties(title=title)


top_chart = _survey_bar_chart(melted_top_df, 'Top 5 States by #covid cases per 100K people')
bottom_chart = _survey_bar_chart(melted_bottom_df, 'Bottom 5 States by #covid cases per 100K people')

# Place the two groups side by side.
top_chart | bottom_chart
