In [1]:
# Dependencies and Setup
import pandas as pd
import json
from tinydb import TinyDB, Query

In [2]:
# File to Load - Google_trends.csv created by Jing Duan
google_trends_to_load = "google_trends.csv"
google_trends_df = pd.read_csv(google_trends_to_load)

google_trends_df

Unnamed: 0,Month,Angola,Benin,Canada,Mozambique,Jersey,Kyrgyzstan,Laos,Jordan,Myanmar,...,Caribbean Netherlands,Antigua and Barbuda,Australia,Macao,Monaco,Kosovo,Kuwait,Mongolia,Madagascar,Aruba
0,2011-01,0,0,<1,0,0,0,0,0,0,...,0,0,<1,0,0.0,0,0,0,0,0
1,2011-02,0,0,<1,0,0,0,0,0,0,...,0,0,<1,0,0.0,0,0,0,0,0
2,2011-03,0,26,<1,0,0,0,0,0,0,...,0,0,<1,0,0.0,0,0,0,0,0
3,2011-04,0,0,<1,0,0,0,0,0,0,...,0,0,<1,0,0.0,0,0,0,0,0
4,2011-05,0,0,1,0,0,0,0,3,0,...,0,0,1,11,0.0,0,1,0,0,0
5,2011-06,12,0,2,0,4,45,0,1,0,...,0,31,2,6,0.0,0,1,0,27,0
6,2011-07,12,0,1,0,0,49,0,1,0,...,0,0,1,0,0.0,0,1,0,0,0
7,2011-08,0,0,1,0,3,0,20,1,0,...,0,0,1,0,0.0,0,1,0,0,5
8,2011-09,0,0,1,0,0,0,19,1,0,...,0,0,1,0,0.0,0,1,0,0,6
9,2011-10,10,0,<1,0,0,0,0,2,0,...,0,0,<1,0,0.0,0,0,2,0,0


In [3]:
google_country_list = sorted(list(google_trends_df.columns))
print(f"Excluding the Month column, Google Trends \
gave us {len(google_country_list) - 1} countries.")
list_google_countries_df = pd.DataFrame(google_country_list, columns = ["Country"])
google_country_list = list(google_trends_df.columns)
print(f"The first 5 countries are {google_country_list[0:5]}.")

Excluding the Month column, Google Trends gave us 220 countries.
The first 5 countries are ['Month', 'Angola', 'Benin', 'Canada', 'Mozambique'].


In [4]:
#pd.DataFrame.isnull(google_trends_df)
# NaN's were found; replace those with 0; replace "<1"'s with 0.5
google_trends_df.fillna(0, inplace = True)
google_trends_df.replace(to_replace ="<1", value = 0.5, inplace = True)
google_trends_df

Unnamed: 0,Month,Angola,Benin,Canada,Mozambique,Jersey,Kyrgyzstan,Laos,Jordan,Myanmar,...,Caribbean Netherlands,Antigua and Barbuda,Australia,Macao,Monaco,Kosovo,Kuwait,Mongolia,Madagascar,Aruba
0,2011-01,0,0,0.5,0,0,0,0,0,0,...,0,0,0.5,0,0.0,0,0,0,0,0
1,2011-02,0,0,0.5,0,0,0,0,0,0,...,0,0,0.5,0,0.0,0,0,0,0,0
2,2011-03,0,26,0.5,0,0,0,0,0,0,...,0,0,0.5,0,0.0,0,0,0,0,0
3,2011-04,0,0,0.5,0,0,0,0,0,0,...,0,0,0.5,0,0.0,0,0,0,0,0
4,2011-05,0,0,1,0,0,0,0,3,0,...,0,0,1,11,0.0,0,1,0,0,0
5,2011-06,12,0,2,0,4,45,0,1,0,...,0,31,2,6,0.0,0,1,0,27,0
6,2011-07,12,0,1,0,0,49,0,1,0,...,0,0,1,0,0.0,0,1,0,0,0
7,2011-08,0,0,1,0,3,0,20,1,0,...,0,0,1,0,0.0,0,1,0,0,5
8,2011-09,0,0,1,0,0,0,19,1,0,...,0,0,1,0,0.0,0,1,0,0,6
9,2011-10,10,0,0.5,0,0,0,0,2,0,...,0,0,0.5,0,0.0,0,0,2,0,0


In [5]:
#centroids_df = pd.read_json('country_centroids_az8.json', lines=False)
latlngs_to_load = "google_latlngs.csv" #"country_latlngs.csv"
latlngs_df = pd.read_csv(latlngs_to_load)[["Country","Latitude","Longitude"]]

print(f"The Lat-Lng reference file has {len(latlngs_df)} countries.")
latlngs_df.head()

The Lat-Lng reference file has 252 countries.


Unnamed: 0,Country,Latitude,Longitude
0,Afghanistan,33.93911,67.709953
1,Aland Islands,60.214887,19.953288
2,Albania,41.153332,20.168331
3,Algeria,28.033886,1.659626
4,American Samoa,-14.270972,-170.132217


In [6]:
latlngs_country_list = sorted(list(latlngs_df['Country']))
culled_countries_df = list_google_countries_df[list_google_countries_df["Country"]. \
                                             isin(latlngs_country_list)]
print(f"{len(culled_countries_df)} countries from \
Google Trend were found in the latlng reference file.")

220 countries from Google Trend were found in the latlng reference file.


In [7]:
# Get difference of two lists 
# Using set() 
def Diff(google_country_list, latlngs_country_list): 
    return (list(set(google_country_list) - set(latlngs_country_list))) 
  
print(f"Here's a list of 'country' column titles differing between between \
the two csv files: {Diff(google_country_list, latlngs_country_list)}.")

Here's a list of 'country' column titles differing between between the two csv files: ['Month'].


In [8]:
# example of using country name to extract lat/lng from reference df
latlngs_row = latlngs_df.loc[latlngs_df["Country"] == 'Aruba'].values.tolist()
print(latlngs_row[0][1])
latlngs_df.loc[latlngs_df["Country"] == 'Aruba'].values[0][1]

12.52111


12.52111

In [9]:
# Assemble a list of dictionaries, each composed of 2 dicts: 1) a YYYY-MM value 2) a dict of 
# countries with their name, latlng coords, and google trend values:
# {
#     "date": "2011-01",
#     "countries": [
#       {
#         "name": "Angola",
#         "latlng": [-12.29336054, 17.53736768],
#         "trend_value": "0"
#       },
#       {
#         ...
#       }]
# },...

data_dict_list = []
for row in range(0,len(google_trends_df)):
    # print(f"Processing row {row}")
    i = 1
    # create a dict; 
    month_data_dict = {"date": google_trends_df['Month'][row]}
    
    # create a list of country dicts
    countries = []
    while i <= (len(google_country_list) - 1):
        
#         countries_dict = {"name": google_country_list[i],
#             "lat": latlngs_df.loc[latlngs_df["Country"] == google_country_list[i]].values[0][1],
#             "lng": latlngs_df.loc[latlngs_df["Country"] == google_country_list[i]].values[0][2],
#             "trend_value": str(google_trends_df[google_country_list[i]][row])
#                  }

        countries_dict = {"name": google_country_list[i],
            "latlng": [latlngs_df.loc[latlngs_df["Country"] == google_country_list[i]].values[0][1],
                       latlngs_df.loc[latlngs_df["Country"] == google_country_list[i]].values[0][2]],
            "trendValue": str(google_trends_df[google_country_list[i]][row])
                 }




        # add each 'countries' dict to the countries list
        countries.append(dict(countries_dict))
        i+=1

    # add second dictionary 'countries' to our month_data_dict
    month_data_dict["countries"] = countries
    
    # add this two-dictionary dictionaries to overall list
    data_dict_list.append(month_data_dict)

In [17]:
data_dict_list[0]

{'date': '2011-01',
 'countries': [{'name': 'Angola',
   'latlng': [-11.202691999999999, 17.873887],
   'trendValue': '0'},
  {'name': 'Benin', 'latlng': [9.30769, 2.315834], 'trendValue': '0'},
  {'name': 'Canada',
   'latlng': [56.130366, -106.34677099999999],
   'trendValue': '0.5'},
  {'name': 'Mozambique', 'latlng': [-18.665695, 35.529562], 'trendValue': '0'},
  {'name': 'Jersey', 'latlng': [49.214439, -2.13125], 'trendValue': '0'},
  {'name': 'Kyrgyzstan', 'latlng': [41.20438, 74.766098], 'trendValue': '0'},
  {'name': 'Laos',
   'latlng': [19.856270000000002, 102.495496],
   'trendValue': '0'},
  {'name': 'Jordan',
   'latlng': [30.585164000000002, 36.238414],
   'trendValue': '0'},
  {'name': 'Myanmar', 'latlng': [21.913965, 95.956223], 'trendValue': '0'},
  {'name': 'Cameroon', 'latlng': [7.369722, 12.354722], 'trendValue': '0'},
  {'name': 'Belize', 'latlng': [17.189877, -88.49765], 'trendValue': '0'},
  {'name': 'Andorra',
   'latlng': [42.546245, 1.6015540000000001],
   'tr

In [20]:
db = TinyDB('google-trend-data.db')
db.purge_tables()

In [21]:
for i in range(len(data_dict_list)):
    db.insert(data_dict_list[i])

In [13]:
# Not needed, but these are other ways to save the data:

# dump the whole data list to a json file
with open('google-trend-data.json', 'w') as fout:
    json.dump(data_dict_list, fout)

# create a javascript data file with the data as a variable
trendDataJs = "var trendData = " + str(data_dict_list)
with open("trendData.js", "w") as text_file:
    text_file.write(trendDataJs)