# Libraries and Packages

In [2]:
import pandas as pd
import time

from googleapiclient.discovery import build #May need to run in console: sudo pip install --upgrade google-api-python-client
from pytrends.request import TrendReq

# Bring in External Data

In [3]:
# Search Terms list:
# Load the search-term brainstorm sheet from a CSV export into a DataFrame.
# NOTE(review): the path is absolute and specific to one user's machine —
# point it at a local copy before running elsewhere.
search_terms = pd.read_csv("/Users/cpollack/Documents/GST & Wiki Brainstorm - GST.csv")
# Preview the first rows (shown as the notebook cell's output).
search_terms.head()

Unnamed: 0,Suggested Query,Hypothesis for Query,Added By,Notes,URL,Unnamed: 5,Category of Query
0,"""where to get covid vaccine""",Areas with less COVID-19-specific vaccine hesi...,Maia,,https://trends.google.com/trends/explore?geo=U...,,General
1,"""vaccine exemption""",Areas with more vaccine hesitancy in general w...,Maia,,https://trends.google.com/trends/explore?date=...,,Avoidance
2,"""covid vaccine mandatory""",Areas with more vaccine hesitancy in general w...,Shagun,Given Maia's previous query I was curious! Th...,https://trends.google.com/trends/explore?geo=U...,,Avoidance
3,"""vaccine infertility""",A vaccine myth suggesting that the pfizer vacc...,Shagun,https://www.nytimes.com/2020/12/10/technology/...,https://trends.google.com/trends/explore?date=...,,Myths/Conspiracy Theories
4,"""covid vaccine safe""",Higher levels may denote more concern or initi...,Catherine,,https://trends.google.com/trends/explore?geo=U...,,General


# Set-Up

## API

Note: This is adapted from the Google Trends getting started guide

In [4]:
# Credentials and endpoint settings for the Google Trends API.
api_key = "HIDDEN"
server = 'https://trends.googleapis.com'
api_version = 'v1beta'

# The discovery document lives at the server root plus a versioned suffix.
discovery_url_suffix = f'/$discovery/rest?version={api_version}'
discovery_url = f'{server}{discovery_url_suffix}'

# API only allows 30 queries in one request.
max_queries = 30

## PyTrends

In [5]:
# Module-level pytrends client, configured for US English.
# NOTE(review): tz is presumably a timezone offset in minutes — confirm
# against the pytrends documentation.
pytrends = TrendReq(hl='en-US', tz=300)

# Number of keywords sent to pytrends in a single payload.
max_pytrends = 5

# Functions

## Related Queries

In [7]:
def relatedQueries(keywords_list, start_date, end_date):
    """
    Collect related search terms for a list of keywords (single snowball sample)
    Arguments:
        param keywords_list (list of str): List of keywords of interest
        param start_date (str): Start date for data collection (in form YYYY-MM-DD)
        param end_date (str): End date for data collection (in form YYYY-MM-DD)

    Returns:
        Dictionary combining the results of pytrends.related_queries() over every batch
        of keywords (a later batch overwrites an earlier entry with the same key)
    """
    related = {}

    # pytrends can only take max_pytrends keywords per payload, so walk the list in chunks.
    for offset in range(0, len(keywords_list), max_pytrends):
        # Slicing clamps at the end of the list, so the final chunk may be shorter.
        chunk = keywords_list[offset:offset + max_pytrends]

        pytrends.build_payload(
            kw_list=chunk,
            timeframe=f'{start_date} {end_date}',
            geo='US',
        )
        batch_result = pytrends.related_queries()
        related.update(batch_result)

    return related

## Convert Related Queries to Dataframe

In [42]:
def queryToDataframeHelper(query_dataframe, key, top_rising = "top"):
    """
    Helper function for queryToDataframe that labels one set of related queries

    Arguments:
        param query_dataframe (Pandas DataFrame): The top-or-rising related queries to the initial
            search term; a 'query' column, if present, is renamed to 'new_query'
        param key (str): The initial query
        param top_rising (str): Whether the query is from the "top" list or "rising" list. Choose from top (default) or rising

    Returns:
        A new dataframe with 'top_rising' and 'initial_query' columns added and 'query' renamed
        to 'new_query'. The input dataframe is left unmodified.

    Raises:
        TypeError if query_dataframe is not a Pandas DataFrame (for example None, when there
        are no related queries to format)
    """
    # Fail explicitly rather than relying on an incidental error from operating on None;
    # callers catch TypeError to detect "no related queries".
    if not isinstance(query_dataframe, pd.DataFrame):
        raise TypeError(
            f"query_dataframe must be a pandas DataFrame, got {type(query_dataframe).__name__}"
        )

    # assign() and rename() both return new objects, so the caller's dataframe is not mutated.
    labelled = query_dataframe.assign(top_rising=top_rising, initial_query=key)
    return labelled.rename(columns={'query': 'new_query'})

In [50]:
def queryToDataframe(query_dictionary):
    """
    Turn query dictionary into a dataframe
    Arguments:
        param query_dictionary (dict): A dictionary of related queries as returned by relatedQueries.
            Each value is indexed with "top" and "rising" to get that keyword's related queries.

    Returns:
        A formatted dataframe of all queries, with the columns new_query, top_rising, value and
        initial_query first. For each initial query the top rows come before the rising rows.
        If nothing could be formatted, an empty dataframe with those four columns is returned.

    Notes:
        A keyword with no top (or rising) related queries is not an error: a message is printed
        and that list is skipped.
    """
    column_names = ["new_query", "top_rising", "value", "initial_query"]
    frames = []

    for key, results in query_dictionary.items():
        for kind in ("top", "rising"):
            try:
                # The helper raises TypeError when there is no dataframe to format.
                formatted = queryToDataframeHelper(results[kind], key, kind)
            except TypeError:
                print(f"No {kind} related queries for {key}.")
                continue
            # Only results produced for THIS key and list are collected, so nothing is
            # carried over from a previous keyword.
            frames.append(formatted)

    if not frames:
        return pd.DataFrame(columns = column_names)

    # DataFrame.append was removed in pandas 2.0; concatenate everything once instead.
    dat = pd.concat(frames, ignore_index = True)

    # Put the documented columns first and keep any additional columns after them.
    extra_columns = [column for column in dat.columns if column not in column_names]
    return dat.reindex(columns = column_names + extra_columns)

## Google API 

Note: This is adapted from the Google Trends getting started guide

In [37]:
def getQueryVolumes(queries, start_date, end_date, geo='US', geo_level = 'country', frequency = 'week'):
    """
    Extract query volumes from Trends API
    Arguments:
        param queries (list of str): A list of all queries to use
        param start_date (str): Start date of timelines (in form YYYY-MM-DD)
        param end_date (str): End date of timelines (in form YYYY-MM-DD)
        param geo (str): The string representation of the geography of interest. This can be a country (US, default), region (US-NY) or DMA (501)
        param geo_level (str): The granularity of geography. Choose from country (default), region, or dma
        param frequency (str): The temporal granularity. Choose from day, week (default), month, year

    Returns:
        List of lists: a header row ['date', <query 1>, <query 2>, ...] followed by one row per
        date (in sorted order) holding the value for each query, or 0 where no value came back

    Raises:
        ValueError if:
            - api key not defined
            - geo_level is not one of 'country', 'region', or 'dma'
    """
    if not api_key:
        raise ValueError("api_key not set.")

    # Each geography level is passed to the API under its own keyword argument.
    # Region format is ISO-3166-2 (4-letters): en.wikipedia.org/wiki/ISO_3166-2:US
    geo_restrictions = {
        'country': 'geoRestriction_country',
        'region': 'geoRestriction_region',
        'dma': 'geoRestriction_dma',
    }
    # Validate before building the service so a bad argument fails without any API traffic.
    if geo_level not in geo_restrictions:
        raise ValueError("geo_level must be one of 'country', 'region', or 'dma'")
    geo_argument = {geo_restrictions[geo_level]: geo}

    service = build('trends',
                    api_version,
                    developerKey = api_key,
                    discoveryServiceUrl = discovery_url,
                    static_discovery=False)

    # Maps (term, date) -> value, accumulated across every batch.
    dat = {}

    # Accounting for max batch range of 30
    for batch_start in range(0, len(queries), max_queries):
        # Slicing clamps at the end of the list, so the final batch may be shorter.
        query_batch = queries[batch_start:batch_start + max_queries]

        # Make API query:
        req = service.getTimelinesForHealth(terms = query_batch,
                                            time_startDate = start_date,
                                            time_endDate = end_date,
                                            timelineResolution = frequency,
                                            **geo_argument)
        res = req.execute()

        time.sleep(1) #To avoid rate limit

        dat.update({(line['term'], point['date']): point['value']
                    for line in res['lines']
                    for point in line['points']})

    # Build the table only after ALL batches are in, so queries beyond the first batch
    # are included in the result.
    rows = [['date'] + list(queries)]
    for date in sorted({point_date for _, point_date in dat}):
        rows.append([date] + [dat.get((term, date), 0) for term in queries])

    return rows
   

# Data Collection

## Snowball sampling for search queries

In [53]:
# Seed terms: take the suggested queries and strip the double-quote characters from each one.
query_list = [term.replace("\"", "") for term in list(search_terms['Suggested Query'])]

# One snowball step: fetch the related queries for every seed term over the study window,
# then flatten the per-term results into a single dataframe.
related_queries = relatedQueries(query_list, '2020-01-01', '2021-03-24')
related_queries_df = queryToDataframe(related_queries)

No top related queries for covid vaccine autism.
No rising related queries for covid vaccine autism.
No top related queries for whats in the covid vaccine.
No rising related queries for whats in the covid vaccine.
No top related queries for anti-vax  and derivatives.
No rising related queries for anti-vax  and derivatives.
No top related queries for The Vaccine Book  Making the Right Decision for Your Child.
No rising related queries for The Vaccine Book  Making the Right Decision for Your Child.
No top related queries for allergic reaction to vaccine/vaccine allergic reaction.
No rising related queries for allergic reaction to vaccine/vaccine allergic reaction.
No top related queries for vaccine 5g.
No rising related queries for vaccine 5g.
No top related queries for vaccine and autoimmune conditions.
No rising related queries for vaccine and autoimmune conditions.


# Exports

In [54]:
# Save the related-queries dataframe as a CSV; the relative filename is resolved
# against the current working directory.
related_queries_df.to_csv('210324_top_rising_snowball_related_queries.csv')