### Covidcast -- Google search data

In [1]:
# One-time setup: uncomment to install the covidcast client if it is not already available.
# !pip install covidcast

In [17]:
import os
import pickle
from datetime import date

import numpy as np
import pandas as pd

import covidcast

### data_dictionary = https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/ght.html
Google searches, provided to us by Google Health Trends. 
- estimate the volume of COVID-related searches in a given location, on a given day. 
- signal is measured in arbitrary units (its scale is meaningless); larger numbers represent higher numbers of COVID-related searches.
- overall searcher interest in a set of COVID-19 related terms about anosmia (lack of smell or taste), which emerged as a symptom of the coronavirus. The specific terms are:
>“why cant i smell or taste”  OR  “loss of smell”  OR  “loss of taste”
- information reported by the API is unitless and pre-normalized for population size; i.e., the time series obtained for New York and Wyoming states are directly comparable


Difference between `time_value` and `issue` (both are dates):
- `time_value` is the date the search was completed.
- `issue` is the date the data was collected/published by Google.  Collection started in May (May 6) and was sporadic until late July (July 15). Since July, query data has been published daily.


### Collect data from Google Covidcast.
**1.**  Data used in modeling is saved to the pickle file below.  If you wish to reload the data (not necessary), uncomment the cells immediately below.  
**2.**  If you reload the data with this command, you will see messages of the form "UserWarning: Problem obtaining data on 20200817: no results" for three dates:  20200817, 20200821, 20200919.  These are warnings, not fatal errors, and may be ignored.

In [18]:
# Download the state-level "smoothed_search" signal from the "ght" source for
# 2020-02-01 through 2020-10-26. Left commented out because the result is
# cached in ./data/google_raw.pkl; uncomment only to re-download.
# google = covidcast.signal("ght", "smoothed_search",
#                         date(2020, 2, 1), date(2020, 10, 26),
#                         "state")



In [20]:
# google.shape

(13521, 11)

In [21]:
# google.head(5)

Unnamed: 0,geo_value,signal,time_value,direction,issue,lag,value,stderr,sample_size,geo_type,data_source
0,ak,smoothed_search,2020-02-01,,2020-05-06,95,0.0,,,state,ght
1,al,smoothed_search,2020-02-01,,2020-05-06,95,2.016856,,,state,ght
2,ar,smoothed_search,2020-02-01,,2020-05-06,95,3.961135,,,state,ght
3,az,smoothed_search,2020-02-01,,2020-05-06,95,1.732458,,,state,ght
4,ca,smoothed_search,2020-02-01,,2020-05-06,95,4.639261,,,state,ght


In [38]:
# google.to_pickle('./data/google_raw.pkl')    # uncomment to re-save data

### Clean Google search data for input into models.
Final pickle file created below.

In [5]:
# Load the cached raw download rather than querying the API again.
# NOTE: unpickling can execute arbitrary code; only load pickle files produced by this project.
data = pd.read_pickle('./data/google_raw.pkl')
data.head()

Unnamed: 0,geo_value,signal,time_value,direction,issue,lag,value,stderr,sample_size,geo_type,data_source
0,ak,smoothed_search,2020-02-01,,2020-05-06,95,0.0,,,state,ght
1,al,smoothed_search,2020-02-01,,2020-05-06,95,2.016856,,,state,ght
2,ar,smoothed_search,2020-02-01,,2020-05-06,95,3.961135,,,state,ght
3,az,smoothed_search,2020-02-01,,2020-05-06,95,1.732458,,,state,ght
4,ca,smoothed_search,2020-02-01,,2020-05-06,95,4.639261,,,state,ght


In [6]:
# Display settings: lift pandas' column and row truncation limits so frames are
# shown in full. With no row limit, displaying a whole frame prints every row,
# so stick to .head()/.sample() when inspecting data.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [7]:
# Row/column count of the cached raw frame.
# NOTE(review): the commented-out fetch cell earlier displays (13521, 11), which differs from
# the shape shown here -- the pickle appears to come from a different download; confirm.
data.shape

(13317, 11)

In [8]:
# Column dtypes and non-null counts; used to spot empty or partially filled columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13317 entries, 0 to 50
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   geo_value    13317 non-null  object        
 1   signal       13317 non-null  object        
 2   time_value   13317 non-null  datetime64[ns]
 3   direction    8173 non-null   object        
 4   issue        13317 non-null  datetime64[ns]
 5   lag          13317 non-null  int64         
 6   value        13317 non-null  float64       
 7   stderr       0 non-null      object        
 8   sample_size  0 non-null      object        
 9   geo_type     13317 non-null  object        
 10  data_source  13317 non-null  object        
dtypes: datetime64[ns](2), float64(1), int64(1), object(7)
memory usage: 1.2+ MB


In [9]:
# Promote time_value to the index so rows are keyed by date. Row order is left
# as loaded (no sort is applied).
# The guard plus rebinding (instead of inplace=True) keeps this cell safe to
# re-run: a second in-place set_index would raise KeyError because the
# 'time_value' column has already been moved into the index.
if data.index.name != 'time_value':
    data = data.set_index('time_value')

In [10]:
# Preview the first rows to confirm time_value is now the index.
data.head()

Unnamed: 0_level_0,geo_value,signal,direction,issue,lag,value,stderr,sample_size,geo_type,data_source
time_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-02-01,ak,smoothed_search,,2020-05-06,95,0.0,,,state,ght
2020-02-01,al,smoothed_search,,2020-05-06,95,2.016856,,,state,ght
2020-02-01,ar,smoothed_search,,2020-05-06,95,3.961135,,,state,ght
2020-02-01,az,smoothed_search,,2020-05-06,95,1.732458,,,state,ght
2020-02-01,ca,smoothed_search,,2020-05-06,95,4.639261,,,state,ght


In [11]:
# Explore: proportion of each direction code, largest first (the default order).
# value_counts skips NaN by default, so the shares are relative to rows that
# have a direction recorded, not to all rows.
data['direction'].value_counts(normalize=True)

 0    0.698030
 1    0.160039
-1    0.141931
Name: direction, dtype: float64

In [12]:
# Optional: convert the state codes in 'geo_value' to uppercase (e.g. 'ak' -> 'AK').
# Disabled by default; uncomment the lines below if uppercase codes are needed.

# data['geo_value'] = data['geo_value'].str.upper()
# data['geo_value']

In [13]:
# Shrink the frame by discarding metadata columns; the result is a new frame
# and `data` itself is left unchanged.
df_google = data.drop(
    ['signal', 'issue', 'stderr', 'sample_size', 'geo_type', 'data_source'],
    axis='columns',
)
df_google.head()

Unnamed: 0_level_0,geo_value,direction,lag,value
time_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-02-01,ak,,95,0.0
2020-02-01,al,,95,2.016856
2020-02-01,ar,,95,3.961135
2020-02-01,az,,95,1.732458
2020-02-01,ca,,95,4.639261


In [14]:
# Export the cleaned frame (including the time_value index) as CSV.
# Create the target directory first: to_csv raises OSError when the folder is
# missing, which would break a run on a fresh checkout.
os.makedirs('./data/data_state_detail', exist_ok=True)
df_google.to_csv('./data/data_state_detail/google_clean.csv')

In [15]:
# Persist the cleaned frame as a pickle; this is the final file referred to in the section intro.
df_google.to_pickle('./data/google_clean.pkl')

In [16]:
# Read the saved pickle back and preview it to check that the file loads as expected.
google = pd.read_pickle('./data/google_clean.pkl')
google.head()

Unnamed: 0_level_0,geo_value,direction,lag,value
time_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-02-01,ak,,95,0.0
2020-02-01,al,,95,2.016856
2020-02-01,ar,,95,3.961135
2020-02-01,az,,95,1.732458
2020-02-01,ca,,95,4.639261
