In [1]:
import pandas as pd
import pyspark
import os
import folium

from pyspark.sql import *
from pyspark.sql.functions import to_timestamp, when
from pyspark.sql.types import *

In [2]:
# Obtain the notebook's Spark session through the builder's getOrCreate().
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
# Keep a handle on the SparkContext; the RDD-based file loading below needs it.
sc = spark.sparkContext

In [57]:
# Location of the raw domains-by-country text file, relative to the notebook.
GDELT_DOMAINS_PATH = '../data/GdeltDomainsByCountry-May18.txt'
# Load the file as an RDD with one element per line.
txt_file = sc.textFile(GDELT_DOMAINS_PATH)

In [97]:
def _split_fields(line):
    """Split one raw input line into its tab-separated fields."""
    return line.split("\t")


# Each line is expected to hold three tab-separated fields, named below.
rdd = txt_file.map(_split_fields)
url_sources = rdd.toDF(['web', 'code', 'country'])
# Pull the whole table to the driver as a pandas frame for the steps below.
url_sources_pd = url_sources.toPandas()

In [169]:
# so far for the first 100 top countries
# Pairwise mapping: wrong_codes[i] is a code as found in the source file and
# right_codes[i] is the code it should become (presumably FIPS-style -> ISO
# alpha-2; verify against the data source).
# NOTE: the lists are written as an ordered rename chain. Lower-case entries
# ('nic', 'sig', 'moc') are temporary placeholders used to break rename cycles
# (e.g. NU -> nic ... nic -> NI), they are not real country codes.
wrong_codes = ['GM','GA','GB','UK','RS','AU','AS','CN','CH','SV','ES','SP','TU','KN','SC','SE','SW','ZA','SF','UP','JA','SZ','IS','EZ','EI','VM','PO','CI','KR','KS','RP','DA','NU','NE','NG','NI','nic','SN','SG','sig','LO','IC','TK','TL','BM','BD','BG','BU','EN','LI','LS','LT','LH','BY','BO','RI','LG','MC','MN','MG','MA','MO','moc','CE','MU','PA','PM','LE','AJ','ZI','CJ','IZ','BH','BA','BK','BL','AG','YM','DO','DR','TO','TN','TS','GG','HA','WA','CB','BF','SU','KU','GJ','MP','GV','IV','ST','TP','TT','TD','NS','AV','AC','HO','CD','BC','TI','UV','CG','CF','RQ','AN','BN','MJ','KV','MI','FP','VI','VT','BX','MH','GQ','EK','PP','OD','WZ','VQ','GK','NH','SX','CK','TX','KT','PS','CT','AQ','AA','BP','CW','PC','MB','AY','CQ','PU','MF','RM','NT','FS','WE','FG']
right_codes = ['DE','GM','GA','GB','RU','AT','AU','KM','CN','SJ','SV','ES','TR','KP','KN','SC','SE','ZM','ZA','UA','JP','CH','IL','CZ','IE','VN','PT','CL','KI','KR','PH','DK','nic','NU','NE','NG','NI','sig','SN','SG','SK','IS','TC','TK','MM','BM','BD','BG','EE','LR','LI','LS','LT','BI','BY','RS','LV','moc','MC','MN','MG','MA','MO','LK','OM','PY','PA','LB','AZ','ZW','KY','IQ','BZ','BH','BA','BO','DZ','YE','DM','DO','TG','TO','TN','GE','HT','NA','KH','BS','SD','KW','GD','MU','GN','CI','LC','ST','TP','TT','SR','AI','AG','HN','TD','BW','TJ','BF','CD','CG','PR','AD','BJ','ME','XK','MW','PF','VG','VA','BN','MS','GU','GQ','PG','SS','SZ','VI','GG','VU','GS','CC','TM','CX','PW','CF','AS','AW','SB','CK','PN','MQ','AQ','MP','GW','YT','MH','AN','FQ','PS','GF']

# zip() would silently truncate, so keep the length check the DataFrame
# constructor used to provide.
assert len(wrong_codes) == len(right_codes), 'code lists must pair up one-to-one'

# The correction is applied in one shot (a merge), not as sequential renames,
# so placeholder targets must be resolved to their final code first. Otherwise
# NU, SN and MC would end up as 'nic', 'sig' and 'moc' and later fail to match
# any real alpha-2 code.
code_map = dict(zip(wrong_codes, right_codes))
placeholders = {code for code in code_map if not (len(code) == 2 and code.isupper())}
resolved_map = {}
for wrong, right in code_map.items():
    if wrong in placeholders:
        continue  # placeholders never occur in the data; they only link chain steps
    while right in placeholders:
        right = code_map[right]  # follow the chain to the real target code
    resolved_map[wrong] = right

corr_codes = pd.DataFrame(
    {'wrong': list(resolved_map.keys()), 'right': list(resolved_map.values())},
    columns=['wrong', 'right'],
)

# Rows carrying these codes are discarded before the correction is applied.
invalid_codes = ['CV','RB','OS','TT']
url_sources_pd = url_sources_pd[~url_sources_pd['code'].isin(invalid_codes)]

# Left merge keeps every source row; 'right' is NaN where no correction exists,
# in which case the original code is already the right one.
url_sources_corr = pd.merge(url_sources_pd, corr_codes, how='left', left_on='code', right_on='wrong')
url_sources_corr['right'] = url_sources_corr['right'].fillna(url_sources_corr['code'])
url_sources_corr.head()

Unnamed: 0,web,code,country,wrong,right
0,0-100.it,IT,Italy,,IT
1,0-50.ru,RS,Russia,RS,RU
2,0-60mag.com,US,United States,,US
3,0-debt.com,US,United States,,US
4,000fff.org,US,United States,,US


In [231]:
# Count the rows per (country, corrected code) pair, keep a single count
# column, and order the result from the most to the fewest domains.
url_sum = (
    url_sources_corr
    .groupby(['country', 'right'], as_index=False)
    .count()
    .drop(columns=['code', 'wrong'])
    .rename(index=str, columns={"right": "code", "web": "count"})
    .sort_values(by='count', ascending=False)
)
url_sum.head()

Unnamed: 0,country,code,count
225,United States,US,56160
224,United Kingdom,GB,14892
177,Russia,RU,8216
103,Italy,IT,7672
38,Canada,CA,7331


In [172]:
# Load the country reference table.
# pandas' default NA parsing converts the literal string 'NA' to NaN, which
# would wipe an alpha-2 code of 'NA' (Namibia, assuming the CSV lists it) and
# make that country unmatchable in later merges. Only empty fields are treated
# as missing here.
country_info = pd.read_csv(
    '../data/countries-info.csv',
    keep_default_na=False,
    na_values=[''],
)
# Keep only the columns used for naming, joining and regional grouping.
country_info = country_info[['name', 'alpha-2', 'alpha-3', 'region', 'sub-region']]
country_info.head(5)

Unnamed: 0,name,alpha-2,alpha-3,region,sub-region
0,Afghanistan,AF,AFG,Asia,Southern Asia
1,Åland Islands,AX,ALA,Europe,Northern Europe
2,Albania,AL,ALB,Europe,Southern Europe
3,Algeria,DZ,DZA,Africa,Northern Africa
4,American Samoa,AS,ASM,Oceania,Polynesia


In [232]:
# Attach the per-code domain counts to the country reference table (inner
# join on the alpha-2 code), then drop the join columns duplicated from url_sum.
df_m = (
    country_info
    .merge(url_sum, left_on='alpha-2', right_on='code')
    .drop(columns=['country', 'code'])
)
df_m.sort_values(by='count', ascending=False).head(5)

Unnamed: 0,name,alpha-2,alpha-3,region,sub-region,count
218,United States of America,US,USA,Americas,Northern America,56160
217,United Kingdom of Great Britain and Northern I...,GB,GBR,Europe,Northern Europe,14892
173,Russian Federation,RU,RUS,Europe,Eastern Europe,8216
104,Italy,IT,ITA,Europe,Southern Europe,7672
37,Canada,CA,CAN,Americas,Northern America,7331


In [234]:
import branca.colormap as cm


# GeoJSON file with the country outlines drawn on the map.
world_map = '../data/countries-land-10km.geo.json'

# 40-step yellow-orange-red scale spanning the observed range of counts.
count_min = df_m['count'].min()
count_max = df_m['count'].max()
color_scale = cm.linear.YlOrRd_04.to_step(40).scale(count_min, count_max)


def get_color(code, counts=None, scale=None):
    """Return the fill colour for the country identified by alpha-3 `code`.

    Parameters
    ----------
    code : str
        Alpha-3 country code to look up in the 'alpha-3' column.
    counts : pandas.DataFrame, optional
        Frame with 'alpha-3' and 'count' columns. Defaults to the
        notebook-level `df_m`.
    scale : callable, optional
        Maps an integer count to a colour. Defaults to the notebook-level
        `color_scale`.

    Returns
    -------
    The gray '#8c8c8c' when no row matches `code`, otherwise the value of
    `scale` for that country's count. If several rows share the code, their
    counts are summed.
    """
    counts = df_m if counts is None else counts
    scale = color_scale if scale is None else scale

    # Filter once instead of once for the existence check and again for the value.
    matching = counts.loc[counts['alpha-3'] == code, 'count']
    if matching.empty:
        # print(code)
        return '#8c8c8c' # MISSING -> gray

    # int(Series) is deprecated in recent pandas and fails outright when more
    # than one row matches; reduce to a scalar explicitly before converting.
    return scale(int(matching.sum()))

    
def style_country(feature):
    """Build the folium style dict for one GeoJSON country feature.

    The fill colour is looked up from the feature's 'A3' property.
    """
    alpha3 = feature['properties']['A3']
    return {
        'fillColor': get_color(alpha3),
        'fillOpacity': 0.7,
        'color' : 'black',
        'weight' : 1,
    }


# Base map, with the styled country layer and the colour legend added on top.
m = folium.Map(location=[46.8, 8.3], tiles='cartodbpositron', zoom_start=2)
country_layer = folium.GeoJson(data=world_map, style_function=style_country)
country_layer.add_to(m)
m.add_child(color_scale)
m