This notebook is used for collecting google trends data.
- It takes the countries from the UNHCR refugees dataset
- Gets the languages associated  to each country
- Gets the two letter language codes associated with each language
- Translates words from english to those languages

#### Read in relevant dictionaries, dataframes, modules

In [21]:
import pytrends
from country_abbrev import *
from country_language import *
from pytrends.request import TrendReq
import pandas as pd
import itertools
from googletrans import LANGCODES
import swifter

# get the list of all unique countries:
countries = pd.Series(pd.read_csv('../../data/data.csv', engine="pyarrow").Country_o.unique()).to_frame(name='country')

# list of all unique languages:
unique_languages = pd.Series(list(set(list(itertools.chain(*country_language_dict.values())))), name='language')

# list of language codes from googletrans
langcodes = pd.DataFrame.from_dict(LANGCODES, orient='index', columns=['code'])
langcodes.index = langcodes.index.str.capitalize()

Merge list of languages 

In [22]:
refugee_lang = unique_languages.to_frame().merge(langcodes, left_on='language', right_index=True, how='left')

Out of the approximately 190 languages, there are about 110 left that don't have codes associated with the specific names we provide. This could be due to not data cleaning, because appear to be less commonly used languages we will skip this for now.

In [23]:
refugee_lang[refugee_lang['code'].isna()].sample(10)

Unnamed: 0,language,code
112,Balochi,
36,Moldovan,
69,Tuvaluan,
67,Kinyarwanda,
99,Tamazight,
33,Swati,
90,Setswana,
85,Kikuyu,
20,Bemba,
17,Lomwe,


In [24]:
refugee_lang.dropna(inplace=True)

Set up translator(s)

In [25]:
# from deep_translator import GoogleTranslator
# translator = GoogleTranslator(source='en', target='en') # output -> Weiter so, du bist großartig

# def translate_keywords_slow(translator, series, lang):
#     translator.target = lang
#     series = series.str.split('+').explode()
#     series_translated = translator.translate_batch(series.values.tolist())
#     series_translated = pd.Series(index=series.index.tolist(), data=series_translated, name = series.name).to_frame().groupby(series.index)[series.name].agg(list).apply(lambda x: '+'.join(x))
#     return series_translated

In [26]:
import requests

def translate_keywords(series, lang):

    series = series.str.split('+').explode()
    url = "https://translate.googleapis.com/translate_a/single"
    params = {
        "client": "gtx",
        "sl": "auto",
        "tl": lang,
        "dt": "t",
        "q": "\n".join(series.tolist())
    }
    response = requests.get(url, params=params)
    series_translated = [r[0].strip('\n').lower() for r in response.json()[0]]
    series_translated = pd.Series(index=series.index.tolist(), data=series_translated, name = series.name).to_frame().groupby(series.index)[series.name].agg(list).apply(lambda x: '+'.join(x))
    return series_translated

Read in list of words from the paper:

In [27]:
boss_words = pd.read_csv('boss_words.csv')['list']

In [14]:
# removing en from list
refugee_lang_not_en = refugee_lang[refugee_lang['code'] != 'en']

# for each language in set, translate list of words to that language
translated_keyword = refugee_lang_not_en['code'].swifter.apply(lambda x: translate_keywords(series = boss_words, lang= x))

Pandas Apply:   0%|          | 0/83 [00:00<?, ?it/s]

In [65]:
# display results of df.t
terms_df = pd.concat([boss_words.rename('en',), translated_keyword.T.rename(refugee_lang['code'], axis='columns')], axis=1)

In [33]:
python -m spacy download es_core_news_sm

SyntaxError: invalid syntax (3137984990.py, line 1)

In [34]:
import spacy


def get_gendered_words(adjective, language_code):
    nlp = spacy.load(f'{language_code}_core_news_sm')
    adjective_doc = nlp(adjective)
    gendered_words = {}
    for token in adjective_doc:
        if token.pos_ == 'ADJ':
            if token.tag_ == 'JJM':
                gendered_words['male'] = token.text
            elif token.tag_ == 'JJF':
                gendered_words['female'] = token.text
    return gendered_words


In [35]:
get_gendered_words('amigo', 'es')

OSError: [E050] Can't find model 'es_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

#### Pytrends

In [36]:
import numpy as np

pytrends = TrendReq(hl='en-US', tz=360)

In [91]:
np.array([1,2,3]).max()

3

In [115]:
def groups_of_5(an_index):
    index_len = len(an_index)
    remainder_len = index_len % 5
    main_array = np.repeat(np.arange(index_len//5),5)
    end_array = np.repeat(main_array.max() + 1, remainder_len)
    entire_grouping = np.append(main_array, end_array)
    return entire_grouping

def pytrends_request(word_list, country):
    pytrends.build_payload(kw_list=word_list, geo=country_to_abbrev[country], timeframe='all')
    return pytrends.interest_over_time().drop('isPartial', axis=1)


def get_trends_data(country, trends_df):
    langs = country_language_dict[country]
    for lang in langs:
        code = langcodes.to_dict()['code'][lang]
        lang_terms = trends_df[code].to_frame()
        
        lang_terms['group'] = groups_of_5(lang_terms)
        

        trends = lang_terms.groupby('group').apply(lambda x: pytrends_request(x, country))
    return trends

test1= get_trends_data('Colombia', terms_df[0:10])
test1
        

ReadTimeout: HTTPSConnectionPool(host='trends.google.com', port=443): Read timed out. (read timeout=2)

MultiIndex([(0, '2004-01-01'),
            (0, '2004-02-01'),
            (0, '2004-03-01'),
            (0, '2004-04-01'),
            (0, '2004-05-01'),
            (0, '2004-06-01'),
            (0, '2004-07-01'),
            (0, '2004-08-01'),
            (0, '2004-09-01'),
            (0, '2004-10-01'),
            ...
            (1, '2022-07-01'),
            (1, '2022-08-01'),
            (1, '2022-09-01'),
            (1, '2022-10-01'),
            (1, '2022-11-01'),
            (1, '2022-12-01'),
            (1, '2023-01-01'),
            (1, '2023-02-01'),
            (1, '2023-03-01'),
            (1, '2023-04-01')],
           names=['index', 'date'], length=464)

In [50]:
pytrends.build_payload(kw_list=['amigo','salsa','test','cats','wowee'], geo=country_to_abbrev['Colombia'], timeframe='all')
pytrends.interest_over_time().drop('isPartial')

Unnamed: 0_level_0,amigo,salsa,test,cats,wowee,isPartial
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-01-01,25,53,68,7,0,False
2004-02-01,10,35,95,5,0,False
2004-03-01,16,32,87,0,0,False
2004-04-01,17,47,88,2,0,False
2004-05-01,13,46,86,0,0,False
...,...,...,...,...,...,...
2022-12-01,39,45,58,2,0,False
2023-01-01,36,37,69,2,0,False
2023-02-01,36,34,71,2,0,False
2023-03-01,36,37,66,2,0,False


ValueError: Length mismatch: Expected axis has 192 elements, new values have 190 elements

2

In [73]:
import numpy as np

In [89]:

np.append(np.repeat(np.arange(192//5),5), [38])

array([ 0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,
        3,  3,  3,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  6,
        6,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9, 10,
       10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 13,
       13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16,
       17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 20, 20,
       20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23,
       23, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 27,
       27, 27, 27, 27, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 30, 30, 30,
       30, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
       34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 37,
       37, 37, 37, 38])

In [None]:
192//5