## Getting Trends Data
This notebook requests trends data on topics, namely relevant terms like destination cities and destination countries. The topic ids have already been collected.

The functions below are how we make requests to Google Trends to retrieve trends for keywords.

It is a bit of trial and error and a bit of help from [this post](https://stackoverflow.com/a/67199394/10006534).

It requests one term at a time from a list of topics. If a request raises an error (which often happens because Google Trends is rate-limited), it sleeps for a minute and repeats the request. If 20 attempts in a row result in an error, it stops and returns the trends gathered so far for that country, so the remaining terms for that country are not requested.

In [3]:
# imports used to request and assemble the Google Trends data
import pandas as pd
from pytrends.request import TrendReq
import numpy as np
import time
from tqdm import tqdm

# Browser-like HTTP headers sent with every Google Trends request; get_trends_data
# hands them to TrendReq through requests_args={'headers': headers}.
#
# SECURITY: a commented-out 'Cookie' entry containing a logged-in Google session
# (SID / HSID / SAPISID / ...) used to sit in this dict and has been removed. Its
# second line was not commented out, so it also broke the dict literal.
# Never commit session cookies: if a cookie is ever needed, read it at runtime
# (for example from an environment variable) instead of pasting it here.
# NOTE(review): the cookie that was committed should be treated as compromised.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://trends.google.com/',
    'Alt-Used': 'trends.google.com',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    # Requests doesn't support trailers
    # 'TE': 'trailers',
}

def pytrends_request(word_list, country, pytrends, start_date, end_date):
    """Request interest-over-time for ``word_list`` from an existing pytrends session.

    Parameters
    ----------
    word_list : list
        Keywords or topic ids forwarded as ``kw_list``.
    country : str
        Value forwarded as the ``geo`` argument.
    pytrends : object
        Session exposing ``build_payload`` and ``interest_over_time``.
    start_date, end_date : str
        Joined with a single space to form the ``timeframe`` argument.

    Returns
    -------
    The frame returned by ``pytrends.interest_over_time()``, with its
    'isPartial' column removed when that column is present.
    """
    timeframe = ' '.join((start_date, end_date))
    pytrends.build_payload(kw_list=word_list, geo=country, timeframe=timeframe)

    interest = pytrends.interest_over_time()
    if 'isPartial' not in interest.columns:
        # Nothing to strip (e.g. an empty result): hand the frame back untouched.
        return interest

    # 'isPartial' is not one of the requested series, so it is dropped in place.
    interest.drop(columns=['isPartial'], inplace=True)
    return interest

def get_trends_data(country, keywords, start_date, end_date, max_errors=20, retry_wait_seconds=60):
    """Collect interest-over-time for each keyword, one request per keyword.

    A new ``TrendReq`` session is opened with the module-level ``headers``. Each
    keyword is requested on its own through ``pytrends_request``; when a request
    raises, the function waits ``retry_wait_seconds`` and retries the same keyword.

    Parameters
    ----------
    country : str
        Forwarded to ``pytrends_request`` as the country / geo value.
    keywords : iterable
        Keywords or topic ids; each one becomes a separate request.
    start_date, end_date : str
        Forwarded unchanged to ``pytrends_request``.
    max_errors : int, default 20
        Number of consecutive failed attempts for one keyword after which the
        function gives up.
    retry_wait_seconds : float, default 60
        Pause between a failed attempt and the retry.

    Returns
    -------
    pandas.DataFrame
        The per-keyword frames concatenated column-wise (``axis=1``). When
        ``max_errors`` is reached, whatever was gathered so far is returned and the
        remaining keywords are not requested. An empty frame is returned when
        nothing was gathered.
    """
    pytrends = TrendReq(hl='en-US', tz=360, requests_args={'headers': headers})
    keyword_frames = []

    def _combined():
        # Same result as growing a frame with repeated axis=1 concats, built once.
        if not keyword_frames:
            return pd.DataFrame()
        return pd.concat(keyword_frames, axis=1)

    for keyword in keywords:
        # Failures are counted per keyword: a success always ends the retry loop.
        error_count = 0
        while True:
            try:
                keyword_frames.append(
                    pytrends_request([keyword], country, pytrends, start_date, end_date)
                )
                break
            except Exception:
                # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
                # SystemExit through, so a long-running loop can still be stopped.
                error_count += 1
                if error_count >= max_errors:
                    # Give up before sleeping: there is no further retry to wait for.
                    print('Reached maximum error count. Exiting loop.')
                    return _combined()  # Return what was gathered even if not complete
                time.sleep(retry_wait_seconds)

    return _combined()

In [15]:
# Date window (YYYY-MM-DD) passed to get_trends_data by the request loops below.
start_date, end_date = '2005-01-01', '2023-01-01'

### Semantic Link Topic Trends

In [4]:
import country_converter as coco

# Topic ids, with their keyword, for the semantic search terms.
semantic_topic_ids = pd.read_csv('topic_ids/semantic_topic_ids.csv')

# One Country_o name per distinct iso_o code in the cleaned UNHCR data, converted to ISO2.
countries = (
    pd.read_csv('../../data/clean/unhcr.csv', engine='pyarrow')
    .drop_duplicates(subset='iso_o')
    ['Country_o']
)
iso2_countries = coco.convert(countries, to='iso2')

# Lookup from topic_id to keyword, used later to rename the trend columns.
semantic_dict = semantic_topic_ids.set_index('topic_id')['keyword'].to_dict()

In [5]:
# Step 1: request every semantic topic for every country and keep the wide
# per-country frames, each tagged with its ISO2 code.
country_trends_list = []
for iso2country in tqdm(iso2_countries):
    a_country_trends = get_trends_data(
        iso2country, semantic_topic_ids.topic_id, '2005-01-01', '2023-01-01'
    )
    a_country_trends['country'] = iso2country
    country_trends_list.append(a_country_trends)

# Step 2: normalise each frame and stack them into one long table.
semantic_trends_df = pd.DataFrame()
for wide_trends in country_trends_list:
    a_country = wide_trends.copy()

    # Turn the 'date' index into a regular column when it is present.
    if a_country.index.name == 'date':
        a_country = a_country.reset_index()

    # Drop a stray 'index' column if one exists.
    if 'index' in a_country.columns:
        a_country = a_country.drop(columns='index')

    # Keep the first occurrence of any duplicated column, then swap topic ids for keywords.
    a_country = a_country.loc[:, ~a_country.columns.duplicated()].rename(columns=semantic_dict)

    semantic_trends_df = pd.concat([semantic_trends_df, a_country], axis=0, ignore_index=True)

semantic_trends_df.to_csv('data/semantic_topic_trends_2005_complete.csv')

100%|██████████| 196/196 [15:43:42<00:00, 288.89s/it]    


In [11]:
# Number of countries with at least one non-missing value across the semantic keyword columns.
has_any_semantic_trend = semantic_trends_df[[
    'passport', 'Immigration', 'Travel Visa', 'Refugee', 'Conflict',
    'Violence', 'Crisis', 'Militia', 'Genocide', 'Armed Forces', 'Civilian',
    'Currency', 'Lottery', 'Economy', 'Bureau de change', 'Wage', 'Protest',
    'Coup d’état', 'Government',
]].notna().any(axis=1)
len(semantic_trends_df.loc[has_any_semantic_trend, 'country'].unique())

195

## Neighboring Countries

In [139]:
import pandas as pd
import igraph as ig
import country_converter as coco

# Convert the UNHCR data to network format. To produce the unhcr.csv file, you will need to:
#   1. drag and drop the data.csv file from geraldine into the data/raw/ folder
#   2. open the clean_data.ipynb notebook in data/
#   3. run the section that cleans the unhcr data, which outputs unhcr.csv into data/clean/
# One row per (iso_o, iso_d) pair: new arrivals are summed, the other fields take their first value.
unhcr = (
    pd.read_csv('../../data/clean/unhcr.csv', engine='pyarrow')
    .groupby(['iso_o', 'iso_d'])
    .agg({
        'newarrival': 'sum',
        'contig': 'first',
        'Country_o': 'first',
        'Country_d': 'first',
        'island_o': 'first',
        'dist': 'first',
    })
    .reset_index()
)

# Pairs flagged contig == 1 become the undirected edges of the graph.
df_network = unhcr[unhcr.contig == 1]
edge_tuples = df_network[['Country_o', 'Country_d']].itertuples(index=False)
graph = ig.Graph.TupleList(edge_tuples, directed=False)

# add island countries: origins for which no pair at all is flagged as contiguous
origin_has_border = unhcr.groupby('Country_o')['contig'].any()
islands = origin_has_border.index[~origin_has_border.values].values

for island_name in islands:
    island_vertex = graph.add_vertex()
    # Name the new vertex like the ones created from the edge list.
    island_vertex['name'] = island_name

# Vertex names are country names up to here; replace them by ISO2 codes.
graph.vs['name'] = coco.convert(graph.vs['name'], to='iso2')

In [13]:
# Country topic ids, with an ISO2 code derived from the `search` column.
country_topic_ids = pd.read_csv('topic_ids/country_topic_ids.csv').assign(
    iso2=lambda ids: coco.convert(ids['search'], to='iso2')
)

# Lookup from topic_mid to topic_title.
country_topic_dict = country_topic_ids.set_index('topic_mid')['topic_title'].to_dict()

In [19]:
# ISO2 codes of every origin country
iso2_countries = coco.convert(unhcr.Country_o.unique(), to='iso2')

country_trends_df = pd.DataFrame()
country_trends_list = []
for iso2country in tqdm(iso2_countries):
    # Order-1 neighbourhood of the country; the first entry is skipped
    # (presumably the country itself — confirm against the igraph docs).
    order1_vertices = graph.neighborhood(iso2country, order=1)
    neighboring_countries = graph.vs[order1_vertices]['name'][1:]

    # No bordering countries (islands): fall back to the 3 closest destinations by `dist`.
    if not neighboring_countries:
        iso3country = coco.convert(iso2country, to='iso3')
        closest_destinations = unhcr[unhcr.iso_o == iso3country].nsmallest(3, 'dist').iso_d.iloc[0:3]
        neighboring_countries = coco.convert(closest_destinations, to='iso2')

    order1_countries = country_topic_ids[country_topic_ids.iso2.isin(neighboring_countries)]

    a_country_trends = get_trends_data(iso2country, order1_countries.topic_mid, start_date, end_date)

    # Nothing came back for this country: move on.
    if len(a_country_trends) == 0:
        continue

    # Wide (one column per topic) -> long (date, topic_mid, value), tagged with the origin.
    a_country_trends = (
        a_country_trends
        .reset_index()
        .melt(id_vars='date', var_name='topic_mid')
        .assign(country_o=iso2country)
    )
    country_trends_df = pd.concat([country_trends_df, a_country_trends], axis=0)

100%|██████████| 196/196 [4:20:29<00:00, 79.74s/it]   


In [20]:
# Save the order-1 country trends.
# NOTE(review): unlike the later exports this call does not pass index=False, so the
# frame's index is written as an extra first column — confirm that this is intended.
country_trends_df.to_csv(path_or_buf='data/2005_topics/country_topic_trends_2005_complete.csv')

Then we can gather trends for countries of order 2, excluding order 1 

(I think we can skip this part for now. There needs to be some distance-based filter as well that omits faraway countries. Looking at Afghanistan, for example, yields too many countries/cities.)

This could be obtained by merging countries with the UNHCR distance measurements between countries, and omitting countries above a certain threshold.

In [121]:
# Order-2 neighbours of Afghanistan: in the order-2 neighbourhood but not in the order-1 one.
af_order2_names = set(graph.vs[graph.neighborhood('AF', order=2)]['name'])
af_order1_names = set(graph.vs[graph.neighborhood('AF', order=1)]['name'])
neighboring_countries_order2 = list(af_order2_names - af_order1_names)

# too many countries for order 2.
country_topic_ids[country_topic_ids.iso2.isin(neighboring_countries_order2)]

Unnamed: 0,search,topic_title,topic_type,topic_mid,iso2
6,Armenia,Armenia,Country in Asia,/m/0jgx,AM
10,Azerbaijan,Azerbaijan,Country,/m/0jhd,AZ
19,Bhutan,Bhutan,Country in South Asia,/m/07bxhl,BT
73,Hong Kong SAR,Hong Kong,Special administrative regions of China,/m/03h64,HK
76,India,India,Country in South Asia,/m/03rk0,IN
79,Iraq,Iraq,Country in the Middle East,/m/0d05q4,IQ
86,Kazakhstan,Kazakhstan,Country in Central Asia,/m/047lj,KZ
92,Kyrgyz Republic,Kyrgyzstan,Country in Central Asia,/m/0jt3tjf,KG
93,Lao P.D.R.,Laos,Country in Asia,/m/04hhv,LA
101,Macao SAR,Macao,Special administrative regions of China,/m/04thp,MO


In [33]:
# ISO2 codes of every origin country
iso2_countries = coco.convert(unhcr.Country_o.unique(), to='iso2')

country_trends_df = pd.DataFrame()
country_trends_list = []
for iso2country in tqdm(iso2_countries):
    # Order-2 neighbours: in the order-2 neighbourhood but not in the order-1 one.
    order2_names = set(graph.vs[graph.neighborhood(iso2country, order=2)]['name'])
    order1_names = set(graph.vs[graph.neighborhood(iso2country, order=1)]['name'])
    neighboring_countries_order2 = list(order2_names - order1_names)

    # No order-2 neighbours (e.g. islands): take the next 3 closest countries after the
    # 3 closest. nsmallest must return 6 rows for positions 3..5 to exist; the previous
    # nsmallest(3, ...).iloc[3:6] always produced an empty selection, so this fallback
    # never requested anything.
    if len(neighboring_countries_order2) == 0:
        iso3country = coco.convert(iso2country, to='iso3')
        next_closest = unhcr[unhcr.iso_o == iso3country].nsmallest(6, 'dist').iso_d.iloc[3:6]
        neighboring_countries_order2 = coco.convert(next_closest, to='iso2')

    order2_countries = country_topic_ids[country_topic_ids.iso2.isin(neighboring_countries_order2)]

    a_country_trends = get_trends_data(iso2country, order2_countries.topic_mid, start_date, end_date)

    # Only countries that returned data are reshaped and appended.
    if len(a_country_trends) != 0:
        # Wide (one column per topic) -> long (date, topic_mid, value), tagged with the origin.
        a_country_trends = a_country_trends.reset_index().melt(id_vars='date', var_name='topic_mid')
        a_country_trends['country_o'] = iso2country
        country_trends_df = pd.concat([country_trends_df, a_country_trends], axis=0)

100%|██████████| 196/196 [22:56:49<00:00, 421.48s/it]   


In [54]:
# Save the order-2 country trends without the frame index.
country_trends_df.to_csv(
    path_or_buf='data/2005_topics/country_topic_trends_2005_order2.csv',
    index=False,
)

### Remaining Countries

I calculate the degrees of separation for all the possible pairs.

Then I take the top 500 pairs, ranked by total new arrivals, whose degree of separation is neither 1 nor 2.

In [86]:
# Attach the pre-computed degrees of separation to the pair table. No join keys are
# given, so pandas joins on the columns the two frames have in common.
deg_of_sep = pd.read_csv('degrees_of_separation.csv')
unhcr = pd.merge(unhcr, deg_of_sep)

In [97]:
# The 500 pairs with the most new arrivals among those that are neither order 1 nor order 2.
top_remaining_pairs = (
    unhcr
    .loc[lambda pairs: ~pairs.degrees_of_separation.isin([1, 2])]
    .sort_values(by='newarrival', ascending=False)
    .head(500)
)

In [112]:
# Country name -> ISO2 lookup, built from the origin-country names.
origin_country_names = unhcr.Country_o.unique()
country_iso2_dict = dict(zip(origin_country_names, coco.convert(origin_country_names, to='iso2')))

# Tag both ends of every pair with its ISO2 code.
top_remaining_pairs = top_remaining_pairs.assign(
    iso2_o=top_remaining_pairs.Country_o.map(country_iso2_dict),
    iso2_d=top_remaining_pairs.Country_d.map(country_iso2_dict),
)

# One row per origin country (the first occurrence is kept).
remaining_iso2 = top_remaining_pairs.drop_duplicates(subset='iso2_o')

In [122]:
# For each origin in the top remaining pairs, request the country topics of its destinations.
country_trends_df = pd.DataFrame()
country_trends_list = []
for iso2country in tqdm(remaining_iso2.iso2_o):
    is_this_origin = top_remaining_pairs.iso2_o == iso2country
    top_d_countries = top_remaining_pairs[is_this_origin].iso2_d
    top_d_countries_topic_ids = country_topic_ids[country_topic_ids.iso2.isin(top_d_countries)].topic_mid

    a_country_trends = get_trends_data(iso2country, top_d_countries_topic_ids, start_date, end_date)

    # Nothing came back for this origin: move on.
    if len(a_country_trends) == 0:
        continue

    # Wide (one column per topic) -> long (date, topic_mid, value), tagged with the origin.
    a_country_trends = (
        a_country_trends
        .reset_index()
        .melt(id_vars='date', var_name='topic_mid')
        .assign(country_o=iso2country)
    )
    country_trends_df = pd.concat([country_trends_df, a_country_trends], axis=0)

100%|██████████| 89/89 [5:51:24<00:00, 236.90s/it]   


In [124]:
# Save the country trends of the top remaining pairs without the frame index.
country_trends_df.to_csv(
    path_or_buf='data/2005_topics/remaining_500_country_trends.csv',
    index=False,
)

## Neighboring Cities

In [43]:
# City topic ids, with an ISO2 code derived from the `search_country` column.
city_topic_ids = pd.read_csv('topic_ids/city_topic_id.csv').assign(
    iso2=lambda ids: coco.convert(ids['search_country'], to='iso2')
)

First we can gather all the cities of neighboring countries of order 1

In [55]:
# ISO2 codes of every origin country
iso2_countries = coco.convert(unhcr.Country_o.unique(), to='iso2')

city_trends_list = []
city_trends_df = pd.DataFrame()
for iso2country in tqdm(iso2_countries):
    # Order-1 neighbourhood of the country; the first entry is skipped
    # (presumably the country itself — confirm against the igraph docs).
    neighboring_countries = graph.vs[graph.neighborhood(iso2country, order=1)]['name'][1:]

    # No bordering countries (islands): fall back to the 3 closest destinations by `dist`.
    if len(neighboring_countries) == 0:
        iso3country = coco.convert(iso2country, to='iso3')
        closest_destinations = unhcr[unhcr.iso_o == iso3country].nsmallest(3, 'dist').iso_d.iloc[0:3]
        neighboring_countries = coco.convert(closest_destinations, to='iso2')

    order1_cities = city_topic_ids[city_topic_ids.iso2.isin(neighboring_countries)]
    a_country_trends = get_trends_data(iso2country, order1_cities.topic_mid, start_date, end_date)

    # Keep the wide frame, tagged with its origin, in the list. The tag goes on a copy:
    # adding 'country_o' to the frame itself made melt() below treat it as one more
    # topic column, producing rows with topic_mid == 'country_o' and the ISO2 string as value.
    city_trends_list.append(a_country_trends.assign(country_o=iso2country))

    # Only countries that returned data are reshaped and appended.
    if len(a_country_trends) != 0:
        # Wide (one column per topic) -> long (date, topic_mid, value), tagged with the origin.
        a_country_trends = a_country_trends.reset_index().melt(id_vars='date', var_name='topic_mid')
        a_country_trends['country_o'] = iso2country
        city_trends_df = pd.concat([city_trends_df, a_country_trends], axis=0)

In [58]:
# Save the order-1 city trends without the frame index.
city_trends_df.to_csv(
    path_or_buf='data/2005_topics/cities_topic_trends_2005_order1.csv',
    index=False,
)

Then we can gather trends for the cities of order-2 neighboring countries, excluding order 1.

(I think we can skip this part for now. There needs to be some distance-based filter as well that omits faraway countries. Looking at Afghanistan, for example, yields too many countries/cities.)

This could be obtained by using the location coordinates from geonames-all-cities-with-a-population-1000.csv.

In [157]:
# ISO2 codes of every origin country
iso2_countries = coco.convert(unhcr.Country_o.unique(), to='iso2')

city_trends_df = pd.DataFrame()
country_trends_list = []
for iso2country in tqdm(iso2_countries):
    # Order-2 neighbours: in the order-2 neighbourhood but not in the order-1 one.
    order2_names = set(graph.vs[graph.neighborhood(iso2country, order=2)]['name'])
    order1_names = set(graph.vs[graph.neighborhood(iso2country, order=1)]['name'])
    neighboring_countries_order2 = list(order2_names - order1_names)

    # No order-2 neighbours (e.g. islands): take the 4th to 6th closest destinations by `dist`.
    if not neighboring_countries_order2:
        iso3country = coco.convert(iso2country, to='iso3')
        six_closest = unhcr[unhcr.iso_o == iso3country].nsmallest(6, 'dist')
        neighboring_countries_order2 = coco.convert(six_closest.iso_d.iloc[3:6], to='iso2')

    order2_cities = city_topic_ids[city_topic_ids.iso2.isin(neighboring_countries_order2)]

    a_country_trends = get_trends_data(iso2country, order2_cities.topic_mid, start_date, end_date)

    # Nothing came back for this country: move on.
    if len(a_country_trends) == 0:
        continue

    # Wide (one column per topic) -> long (date, topic_mid, value), tagged with the origin.
    a_country_trends = (
        a_country_trends
        .reset_index()
        .melt(id_vars='date', var_name='topic_mid')
        .assign(country_o=iso2country)
    )
    city_trends_df = pd.concat([city_trends_df, a_country_trends], axis=0)

city_trends_df.to_csv('data/2005_topics/cities_topic_trends_2005_order2.csv', index=False)

100%|██████████| 196/196 [17:54:31<00:00, 328.94s/it]   


### Top remaining countries' destination cities

In [128]:
# For each origin in the top remaining pairs, request the city topics of its destinations.
city_trends_df = pd.DataFrame()
city_trends_list = []
for iso2country in tqdm(remaining_iso2.iso2_o):
    top_d_countries = top_remaining_pairs[top_remaining_pairs.iso2_o == iso2country].iso2_d
    top_d_countries_topic_ids = city_topic_ids[city_topic_ids.iso2.isin(top_d_countries)].topic_mid

    a_country_trends = get_trends_data(iso2country, top_d_countries_topic_ids, start_date, end_date)

    # Only origins that returned data are reshaped and appended.
    if len(a_country_trends) != 0:
        # Wide (one column per topic) -> long (date, topic_mid, value), tagged with the origin.
        a_country_trends = a_country_trends.reset_index().melt(id_vars='date', var_name='topic_mid')
        a_country_trends['country_o'] = iso2country
        city_trends_df = pd.concat([city_trends_df, a_country_trends], axis=0)

# Write the frame built by this loop. The previous version wrote country_trends_df (left
# over from the country section) into the cities file.
# NOTE(review): a cities_topic_trends_2005_top_remaining.csv produced by the old cell
# holds country trends and should be regenerated.
city_trends_df.to_csv('data/2005_topics/cities_topic_trends_2005_top_remaining.csv', index=False)

100%|██████████| 89/89 [18:09<00:00, 12.25s/it] 


## Neighboring Border Cities

In [59]:
import pandas as pd
import json
import country_converter as coco

# Topic ids of the border cities, with an ISO2 code derived from `search_country`.
neighboring_city_ids = pd.read_csv('topic_ids/neighboring_city_topic_id.csv').assign(
    iso2=lambda ids: coco.convert(ids['search_country'], to='iso2')
)

# Mapping loaded from JSON; its top-level keys are converted to ISO2 below.
with open('bordering_countries_bordering_cities.json') as json_file:
    bordering_country_cities_dict = json.load(json_file)

# Convert keys to a series
keys_series = pd.Series(list(bordering_country_cities_dict.keys()))
converted_keys = coco.convert(keys_series, to='iso2')
# Same values, re-keyed by the converted codes (keys and values pair up by position).
iso2_border_cities = dict(zip(converted_keys, bordering_country_cities_dict.values()))

# Re-read the pair table (this aggregation does not carry the 'dist' column).
unhcr = (
    pd.read_csv('../../data/clean/unhcr.csv', engine='pyarrow')
    .groupby(['iso_o', 'iso_d'])
    .agg({
        'newarrival': 'sum',
        'contig': 'first',
        'Country_o': 'first',
        'Country_d': 'first',
        'island_o': 'first',
    })
    .reset_index()
)
iso2_countries = coco.convert(unhcr.Country_o.unique(), to='iso2')

# City topic ids, with an ISO2 code derived from `search_country`.
city_topic_ids = pd.read_csv('topic_ids/city_topic_id.csv').assign(
    iso2=lambda ids: coco.convert(ids['search_country'], to='iso2')
)

In [70]:
# Border cities whose search keyword already appears in city_topic_ids are excluded.
# The mask does not depend on the loop variables, so it is computed once.
keyword_already_in_city_ids = neighboring_city_ids.search_keyword.isin(city_topic_ids.search_keyword)

border_city_trends = pd.DataFrame()
for o_country in tqdm(iso2_countries):
    o_dict = iso2_border_cities[o_country]
    for d_country, d_border_cities in o_dict.items():
        is_border_city = neighboring_city_ids.search_keyword.isin(d_border_cities)
        topic_mids = neighboring_city_ids[is_border_city & ~keyword_already_in_city_ids].topic_mid

        # Wide frame for this (origin, destination) pair, tagged with both countries.
        o_trend_d_cities = get_trends_data(o_country, topic_mids, start_date, end_date)
        o_trend_d_cities['country_d'] = d_country
        o_trend_d_cities['country_o'] = o_country

        # Keep the first occurrence of any duplicated column before stacking.
        first_occurrence = ~o_trend_d_cities.columns.duplicated()
        border_city_trends = pd.concat(
            [border_city_trends, o_trend_d_cities.loc[:, first_occurrence]],
            axis=0,
        )

100%|██████████| 196/196 [10:48:29<00:00, 198.52s/it]  


In [79]:
# Save the border-city trends; the frame index is written as well (no index=False here).
border_city_trends.to_csv(path_or_buf='data/2005_topics/bordering_cities_trends_2005.csv')