In [None]:
# Notebook setup: load the project helpers and the JSON configuration, create the
# BigQuery client and register the CARTO credentials.
# Statement order matters here: the working directory is changed before the imports
# and before configFile.json is opened, and changed again at the end of the cell.

# Move one directory up so that the `src` package and configFile.json, both
# referenced by relative path below, can be resolved.
%cd ../

# NOTE(review): star imports hide where names come from. The rest of the notebook
# presumably gets json, bigquery, pd, np, plt, mdates, Line2D, gpd, Credentials,
# set_default_credentials and str_to_geom from these two modules — confirm.
from src.requirements import *
from src.utils import *
%matplotlib inline

## ******************************************************************************************************************************
## OPEN CONFIG FILE
## ******************************************************************************************************************************

# The parsed JSON is kept in `configFile`; only its 'config' section is read below.
with open('configFile.json') as json_data_file:
    configFile = json.load(json_data_file)

## ******************************************************************************************************************************
## SET BIG QUERY CREDENTIALS
## ******************************************************************************************************************************

# Path of the service-account key file, taken from the configuration (not hardcoded).
SERVICE_ACCOUNT_FILE = configFile['config']['BQ_key_file']
client_bq = bigquery.Client.from_service_account_json(SERVICE_ACCOUNT_FILE)
# Dataset id and dataset reference; a later cell reads `bq_dataset_ref.project`
# to build the fully-qualified table name.
bq_dataset_id = configFile['config']['bq_dataset']
bq_dataset_ref = client_bq.dataset(bq_dataset_id)

## ******************************************************************************************************************************
## CARTO
## ******************************************************************************************************************************

carto_username = configFile['config']['CARTO']['username']
carto_API = configFile['config']['CARTO']['API_key']

# NOTE(review): Credentials / set_default_credentials are presumably the cartoframes
# helpers brought in by the star imports above — confirm.
creds = Credentials(carto_username, carto_API)
set_default_credentials(creds)

# Work from etl/ for the rest of the notebook: the output paths used by later cells
# ('../plots/...', '../data/...') are relative to this directory.
%cd etl/

## Select phase: lockdown or recovery

In [None]:
# Analysis window as ISO dates; both bounds are applied inclusively further down.
date_min, date_max = '2020-02-20', '2020-07-17'

# Phase to analyse: select between 'lockdown' or 'recovery'.
phase = 'lockdown'

# % of retained variance in the Census variables
pc_var_thr = 0.55

# Base name (without extension) of the files written at the end of the notebook.
filename = 'mobility_google_county_' + phase

## Pre-process  Google mobility data

In [None]:
# Fully-qualified id of the source table: <project>.<dataset>.<table>.
table_name = '%s.%s.epi_mobility_census_county_full' % (bq_dataset_ref.project, bq_dataset_id)

# Read the whole table and materialise it as a pandas DataFrame.
q = """SELECT * 
        FROM  `{}`
""".format(table_name)
query_job = client_bq.query(q)
df = query_job.to_dataframe()

### Select only counties with a full series

In [None]:
# Number of rows available for each county, repeated on every row of that county.
rows_per_county = df.groupby('geoid')['geoid'].transform('count')
df['geoid_cnt'] = rows_per_county

# The longest series found is taken as the "full" length; counties with fewer rows
# than that are discarded.
cnt = df['geoid_cnt'].max()
has_full_series = df['geoid_cnt'] == cnt
df = df[has_full_series]

### Interpolate missing data if the gap length is less than 3 days

In [None]:
# Sort by county and then by date so that, within each county, the interpolation
# below runs along the time axis.
df.sort_values(['geoid','date'], inplace = True)

# Interpolation settings, applied independently to each county's series.
# NOTE(review): per the pandas documentation `order` is only used by the
# 'polynomial' and 'spline' methods, so it presumably has no effect with
# method='linear'. Also, limit=3 combined with limit_direction='both' appears to
# allow up to 3 values to be filled from EACH side of a gap, so gaps longer than
# 3 days may still end up fully filled — confirm against the intended 3-day rule.
interp_kwargs = dict(method='linear', order = 3,
                     limit = 3,
                     limit_direction='both',
                     limit_area=None)

df['workplaces_percent_change_from_baseline_interp'] = (
    df[['geoid','workplaces_percent_change_from_baseline']]
    .groupby('geoid')
    .transform(lambda x: x.interpolate(**interp_kwargs))
)

### Ignore counties with gaps larger than 3 days

In [None]:
# Flag every row of a county whose interpolated series still has at least one
# missing value.
still_missing = (
    df.groupby('geoid')['workplaces_percent_change_from_baseline_interp']
    .transform(lambda s: s.isnull().any())
)
df['workplaces_percent_change_from_baseline_interp_na'] = still_missing

# Keep only the counties without remaining gaps. The flag is compared with False
# explicitly (instead of being negated with `~`) because its dtype after the
# transform is not guaranteed to be boolean — TODO confirm.
df = df[df['workplaces_percent_change_from_baseline_interp_na'].eq(False)]

### Compute the 7-day running mean

In [None]:
# 7-row rolling mean per county. min_periods=1 means the first rows of each series
# are averaged over however many values are available instead of being left empty.
df['workplaces_percent_change_from_baseline_interp_ma'] = (
    df.groupby('geoid')['workplaces_percent_change_from_baseline_interp']
    .transform(lambda s: s.rolling(window=7, min_periods=1).mean())
)

# Restrict to the analysis window (both bounds inclusive). This happens after the
# rolling mean, so the mean of the first days in the window can use earlier rows.
in_window = (df.date >= date_min) & (df.date <= date_max)
df = df[in_window]

# Store the dates as pandas Timestamps for the date-axis plots that follow.
df.date = df.date.apply(pd.Timestamp)

### Plot a random sample of 100 counties and the US median 

In [None]:
# Median across counties of the 7-day running mean, one value per date.
# The value column is selected before aggregating, so only that column is reduced.
# The previous GroupBy.apply(np.median) handed the whole group frame (including the
# 'date' grouping column) to np.median, which depends on version-specific pandas
# behaviour. Note that .median() skips missing values, should any be present.
median_by_date = df.groupby('date')['workplaces_percent_change_from_baseline_interp_ma'].median()

## Get the date on which the % change in mobility reaches its minimum
# idxmin() returns the index label (a date); it is kept as a 'YYYY-MM-DD' string.
date_drop = median_by_date.idxmin().strftime("%Y-%m-%d")

# Back to a two-column frame (date, median) for plotting.
df_median = median_by_date.reset_index()
df_median.columns = ['date','workplaces_percent_change_from_baseline_interp_ma']
df_median.date = pd.to_datetime(df_median.date)

In [None]:
# Sample (up to) 100 distinct counties. The sample is drawn from the unique county
# ids: sampling the rows of df.geoid directly can pick the same county several
# times and therefore plot fewer than 100 counties, and it fails outright when
# fewer than 100 rows are available. A fixed random_state keeps the figure
# reproducible from one run to the next.
county_ids = df.geoid.drop_duplicates()
sampled_ids = county_ids.sample(n=min(100, len(county_ids)), random_state=0)
df_sample = df[df.geoid.isin(sampled_ids)]

fig = plt.figure(figsize=(25,12.5))
ax = fig.add_subplot(111)

# One faint line per sampled county: interpolated daily values.
df_sample.groupby('geoid').plot(x="date", y="workplaces_percent_change_from_baseline_interp", 
                  color = 'darkblue', 
                  alpha = 0.1,
                  legend = False, rot = 45, ax = ax)

# One line per sampled county: 7-day running mean.
df_sample.groupby('geoid').plot(x="date", y="workplaces_percent_change_from_baseline_interp_ma", 
                  color = 'orange', 
                  linewidth = 2,                         
                  alpha = 0.3,
                  legend = False, rot = 45,ax = ax)

# Median of the running mean across all counties, drawn on top.
df_median.plot(x="date", y="workplaces_percent_change_from_baseline_interp_ma", 
                  color = 'red', 
                  linewidth = 5,                         
                  alpha = 1,
                  legend = False, rot = 45,ax = ax)

ax.set_xlabel("",fontsize=15)
ax.set_ylabel("",fontsize=25)
ax.set_title("% change from baseline in workplaces mobility",fontsize=25)
ax.axhline(y=0.00,c="black",linewidth=2,zorder=0)
# Dashed reference dates.
# NOTE(review): the meaning of these three dates is not stated in the notebook —
# consider adding it to the figure or to the markdown above.
ax.axvline(x=pd.Timestamp('2020-03-13'),c="black",linewidth=2,linestyle = '--')
ax.axvline(x=pd.Timestamp('2020-05-25'),c="black",linewidth=2,linestyle = '--')
ax.axvline(x=pd.Timestamp('2020-07-04'),c="black",linewidth=2,linestyle = '--')
ax.set_ylim(-60, 10)
ax.xaxis.set_major_locator(mdates.WeekdayLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
ax.tick_params(axis='both', which='minor', pad = 10, size = 0, labelsize=1)
ax.tick_params(axis='both', which='major', pad = 10, size = 20, labelsize=25)
ax.tick_params(axis = 'x', labelsize = 25)

# Legend proxies: exactly one handle per label, in the order the series were drawn.
# (The previous lists carried two extra 'coral' entries that had no label.)
colors = ['darkblue', 'orange', 'red']
linestyles =  ['-', '-', '-']
lines = [Line2D([0], [0], color=c, linewidth=3, linestyle = s) for c,s in zip(colors, linestyles)]
labels = ['daily', 
          'daily, 7-days running mean',
          'daily, 7-days running mean (US median)']
plt.legend(lines, labels, fontsize = 25,frameon=False)

plt.suptitle("Change in mobility by County, source: Google", fontsize=30)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
fig.savefig('../plots/mobility_county.pdf')

### Plot the median by median household income percentile group

In [None]:
def _median_by_date(frame):
    """Return the per-date median of the 7-day running mean for the rows of `frame`.

    The result is a DataFrame with two columns: 'date' (converted with
    pd.to_datetime) and 'workplaces_percent_change_from_baseline_interp_ma'.
    The value column is selected before aggregating, so the 'date' grouping
    column is never passed to the median computation.
    """
    value_col = 'workplaces_percent_change_from_baseline_interp_ma'
    medians = frame.groupby('date')[value_col].median().reset_index()
    medians.columns = ['date', value_col]
    medians.date = pd.to_datetime(medians.date)
    return medians


# Cut points of INCCYMEDHH (the median household income column), computed once
# over the rows of df and reused for all five groups.
income = df.INCCYMEDHH
q20, q40, q60, q80 = (income.quantile(q) for q in (.20, .40, .60, .80))

# Groups are half-open intervals [lower, upper), so every row with a non-missing
# income falls into exactly one of them.
df_q1 = _median_by_date(df[income < q20])
df_q2 = _median_by_date(df[(income >= q20) & (income < q40)])
df_q3 = _median_by_date(df[(income >= q40) & (income < q60)])
df_q4 = _median_by_date(df[(income >= q60) & (income < q80)])
df_q5 = _median_by_date(df[income >= q80])

In [None]:
# Figure: median running mean by income group.
# (The county sample that used to be redrawn at the top of this cell was never
# used by the figure, so it has been removed.)
fig = plt.figure(figsize=(25,12.5))
ax = fig.add_subplot(111)

# Single source of truth for each income group, lowest to highest:
# (data, colour, line style, legend label). The same table drives both the plotted
# lines and the legend so the two cannot drift apart.
quintile_styles = [
    (df_q1, 'darkblue', ':',  'Counties with median household income < 20th quantile'),
    (df_q2, 'darkblue', '--', 'Counties with median household income < 40th quantile and > 20th quantile'),
    (df_q3, 'red',      '-',  'Counties with median household income < 60th quantile and > 40th'),
    (df_q4, 'coral',    ':',  'Counties with median household income < 80th quantile and > 60th'),
    (df_q5, 'coral',    '--', 'Counties with median household income > 80th quantile'),
]

for group_frame, group_color, group_linestyle, group_label in quintile_styles:
    group_frame.plot(x="date", y="workplaces_percent_change_from_baseline_interp_ma",
                     color = group_color,
                     linewidth = 5,
                     alpha = 1,
                     linestyle = group_linestyle,
                     legend = False, rot = 45, ax = ax)

ax.set_xlabel("",fontsize=15)
ax.set_ylabel("",fontsize=25)
ax.set_title("% change from baseline in workplaces mobility",fontsize=25)
ax.axhline(y=0.00,c="black",linewidth=2,zorder=0)
# Dashed reference dates.
# NOTE(review): the meaning of these three dates is not stated in the notebook.
ax.axvline(x=pd.Timestamp('2020-03-13'),c="black",linewidth=2,linestyle = '--')
ax.axvline(x=pd.Timestamp('2020-05-25'),c="black",linewidth=2,linestyle = '--')
ax.axvline(x=pd.Timestamp('2020-07-04'),c="black",linewidth=2,linestyle = '--')
ax.set_ylim(-60, 10)
ax.xaxis.set_major_locator(mdates.WeekdayLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %d'))
ax.tick_params(axis='both', which='minor', pad = 10, size = 0, labelsize=1)
ax.tick_params(axis='both', which='major', pad = 10, size = 20, labelsize=25)
ax.tick_params(axis = 'x', labelsize = 25)

# Legend proxies built from the same table as the plotted lines.
colors = [style[1] for style in quintile_styles]
linestyles = [style[2] for style in quintile_styles]
lines = [Line2D([0], [0], color=c, linewidth=3, linestyle = s) for c,s in zip(colors, linestyles)]
labels = [style[3] for style in quintile_styles]
plt.legend(lines, labels, fontsize = 25,frameon=False)

plt.suptitle("Change in mobility by County, source: Google", fontsize=30)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
fig.savefig('../plots/mobility_county_income.pdf')

### Get the minimum and maximum % change in mobility

In [None]:
# Put the last available population value of each county on all of its rows.
df['population'] = df.groupby('geoid')['population'].transform('last')

# Keep the part of the series that belongs to the selected phase:
#   lockdown -> from the start of the window up to (and including) date_drop
#   recovery -> from date_drop (included) to the end of the window
# An unknown phase is rejected instead of being silently treated as 'recovery'.
# .copy() gives an independent frame so the column assignments below do not write
# into a slice of the previous one.
if phase == 'lockdown':
    df = df[df.date <= date_drop].copy()
elif phase == 'recovery':
    df = df[df.date >= date_drop].copy()
else:
    raise ValueError("phase must be 'lockdown' or 'recovery', got {!r}".format(phase))

# Per-county extremes of the 7-day running mean within the selected phase,
# repeated on every row of the county.
df['workplaces_percent_change_from_baseline_interp_ma_min'] = df.groupby('geoid')['workplaces_percent_change_from_baseline_interp_ma'].transform('min')
df['workplaces_percent_change_from_baseline_interp_ma_max'] = df.groupby('geoid')['workplaces_percent_change_from_baseline_interp_ma'].transform('max')

# Range of the running mean in both sign conventions: min - max (<= 0) and max - min (>= 0).
df['workplaces_percent_change_from_baseline_interp_ma_diff_minmax'] = df['workplaces_percent_change_from_baseline_interp_ma_min'] - df['workplaces_percent_change_from_baseline_interp_ma_max']
df['workplaces_percent_change_from_baseline_interp_ma_diff_maxmin'] = -df['workplaces_percent_change_from_baseline_interp_ma_diff_minmax']

### Get the cumulative number of cases and deaths by county

In [None]:
# Last value of each county's `cases` and `deaths` series, repeated on all of the
# county's rows.
# NOTE(review): treating the last value as the cumulative count assumes these two
# columns already hold running totals and that rows are in date order — confirm.
last_by_county = {
    measure: df.groupby('geoid')[measure].transform('last')
    for measure in ('cases', 'deaths')
}
df['cases_cum'] = last_by_county['cases']
df['deaths_cum'] = last_by_county['deaths']

# Same quantities divided by the county population.
df['cases_cum_dens'] = df['cases_cum'] / df['population']
df['deaths_cum_dens'] = df['deaths_cum'] / df['population']

## Save files

In [None]:
# One row per distinct (geom, ID, geoid) combination; the first occurrence is kept.
gdf = df[['geom','ID','geoid']].drop_duplicates(keep='first')

# Turn the stored geometry text into geometry objects with the project helper
# str_to_geom, then discard the text column.
gdf['geometry'] = gdf['geom'].apply(str_to_geom)
gdf = gdf.drop(columns=['geom'])

# NOTE(review): no CRS is set on the GeoDataFrame before writing — confirm whether
# the shapefile is expected to carry one.
gdf = gpd.GeoDataFrame(gdf, geometry = gdf.geometry)
shp_path = "../data/{}.shp".format(filename)
gdf.to_file(filename= shp_path, driver = 'ESRI Shapefile')

In [None]:
# Write the table as CSV with every column except 'geom' (the geometry text is
# exported separately as a shapefile); the index is not written.
keep_column = df.columns != 'geom'
ts_path = "../data/{}_ts.csv".format(filename)
df.loc[:, keep_column].to_csv(ts_path, index = False)