In [1]:
import pandas as pd
import datetime
import numpy as np

In [2]:
from fetch_data import fetch_df

In [3]:
start = datetime.date(2017, 3, 1) # <--- change this to start of interval (YYY, MM, DD)
end = datetime.date(2017, 3, 10) # <--- change this to end of interval (YYY, MM, DD)

In [4]:
df = fetch_df(start)

In [5]:
df.columns

Index(['Day', 'MonthYear', 'Year', 'FractionDate', 'Actor1Code', 'Actor1Name',
       'Actor1CountryCode', 'Actor1KnownGroupCode', 'Actor1EthnicCode',
       'Actor1Religion1Code', 'Actor1Religion2Code', 'Actor1Type1Code',
       'Actor1Type2Code', 'Actor1Type3Code', 'Actor2Code', 'Actor2Name',
       'Actor2CountryCode', 'Actor2KnownGroupCode', 'Actor2EthnicCode',
       'Actor2Religion1Code', 'Actor2Religion2Code', 'Actor2Type1Code',
       'Actor2Type2Code', 'Actor2Type3Code', 'IsRootEvent', 'EventCode',
       'EventBaseCode', 'EventRootCode', 'QuadClass', 'GoldsteinScale',
       'NumMentions', 'NumSources', 'NumArticles', 'AvgTone', 'Actor1Geo_Type',
       'Actor1Geo_Fullname', 'Actor1Geo_CountryCode', 'Actor1Geo_ADM1Code',
       'Actor1Geo_Lat', 'Actor1Geo_Long', 'Actor1Geo_FeatureID',
       'Actor2Geo_Type', 'Actor2Geo_Fullname', 'Actor2Geo_CountryCode',
       'Actor2Geo_ADM1Code', 'Actor2Geo_Lat', 'Actor2Geo_Long',
       'Actor2Geo_FeatureID', 'ActionGeo_Type', 'ActionGeo

In [6]:
data_kept = df[['IsRootEvent', 'QuadClass', 'EventCode', 'EventRootCode', 'ActionGeo_Lat', 'ActionGeo_Long', 'ActionGeo_CountryCode', 'Sources']]

In [7]:
data_kept = data_kept.dropna(axis=0)

In [8]:
from bs4 import BeautifulSoup
import requests

html_doc = requests.get('https://www.cia.gov/library/publications/the-world-factbook/appendix/appendix-d.html')

soup = BeautifulSoup(html_doc.content, 'html.parser')

mapping = []

for elem in soup.find_all('div'):
    if elem.get('class') != None and elem.get('class')[0] == 'category_data':
        mapping.append(np.array([elem.find_all('td')[0].a.string, elem.find_all('td')[1].string, elem.find_all('td')[4].string, elem.find_all('td')[7].string]))

mapping = np.array(mapping)

In [9]:
fips_to_alpha = {}
alpha_to_name = {}
tld_to_name = {}


for x in mapping:
    fips_to_alpha[x[1]] = x[2]
    alpha_to_name[x[2]] = x[0]
    tld_to_name[x[3]] = x[0]
    
fips_to_alpha['RB'] = 'SRB'


In [10]:
data_kept['country_code_alpha'] = data_kept['ActionGeo_CountryCode'].apply(lambda x: fips_to_alpha[x] if x in fips_to_alpha else 'None')

In [11]:
data_kept = data_kept[data_kept['country_code_alpha'] != 'None']

In [12]:
data_kept['country_name'] = data_kept['country_code_alpha'].apply(lambda x: alpha_to_name[x] if x in alpha_to_name else 'None')

In [13]:
import re

In [14]:
def get_tld(url, tld=False):
            
    url_pair = re.findall(r'\b(?!www\.)([a-zA-Z0-9-]+(\.[a-z]+)+)', url.lower())

    if(url_pair == []):
        return url
    else:
        if tld:
            return url_pair[0][1]
        else:
            return url_pair[0][0]

In [15]:
data_kept['source_country_name'] = data_kept['Sources'].apply(lambda x: get_tld(x, tld=True)).apply(lambda x: tld_to_name[x] if x in tld_to_name else 'Unknown')

In [16]:
data_kept['source_country_name'].value_counts()/len(data_kept['source_country_name'].values) * 100

Unknown                  79.671801
United Kingdom            4.379800
Australia                 3.058759
India                     1.100664
Pakistan                  0.980272
Canada                    0.940686
South Africa              0.859064
Nigeria                   0.637463
Ireland                   0.588082
New Zealand               0.517071
China                     0.464833
Philippines               0.390150
Tuvalu                    0.333423
Kenya                     0.321588
Azerbaijan                0.315875
Malaysia                  0.274656
Vietnam                   0.271391
Germany                   0.244048
Armenia                   0.239967
Ghana                     0.183240
Switzerland               0.173037
Colombia                  0.164875
Korea, South              0.146102
Zimbabwe                  0.145286
Iran                      0.143654
Saudi Arabia              0.138756
Bangladesh                0.136716
Turkey                    0.129370
Sri Lanka           

In [17]:
source_country = pd.read_csv('../data/clean_url_to_country.csv')

In [18]:
mapping = source_country[['Country name', 'Clean URL']].drop_duplicates().set_index('Clean URL')['Country name']

In [19]:
dic = mapping.to_dict()

In [20]:
dic['4-traders.com'] = 'France'
dic['news.xinhuanet.com'] = 'China'
dic['sputniknews.com'] = 'Russia'
dic['yahoo.com'] = 'United States'
dic['globalsecurity.org'] = 'United States'
dic['india.com'] = 'India'
dic['malaysiandigest.com'] = 'Malaysia'
dic['freerepublic.com'] = 'United States'

In [21]:
data_kept.loc[data_kept['source_country_name'] == 'Unknown', 'source_country_name'] = data_kept.loc[data_kept['source_country_name'] == 'Unknown', 'Sources'].apply(lambda x: get_tld(x)).apply(lambda x: dic[x] if x in dic else 'Unknown')

In [22]:
data_kept['source_country_name'].value_counts()/len(data_kept['source_country_name'].values) * 100

United States                    30.440265
Unknown                          30.091334
United Kingdom                    4.776480
India                             4.687513
Australia                         3.184048
Canada                            2.264176
Nigeria                           2.017679
Pakistan                          1.337774
South Africa                      1.038631
China                             0.922321
Malaysia                          0.897426
International                     0.866410
Philippines                       0.838251
Africa Regional                   0.827232
Ireland                           0.817029
France                            0.663581
Ghana                             0.651338
Israel                            0.590122
New Zealand                       0.523601
Russia                            0.443612
Bangladesh                        0.426471
Near and Middle East Regional     0.408923
Kenya                             0.373826
Europe Regi

In [23]:
data_kept.loc[data_kept['source_country_name'] == 'Unknown', 'Sources'].apply(lambda x: get_tld(x)).value_counts()

english.sina.com              453
godlikeproductions.com        414
sott.net                      365
usa.onlinenigeria.com         363
bostonglobe.com               354
article.wn.com                339
news.trust.org                331
whio.com                      325
beijingbulletin.com           307
openpr.com                    297
irishsun.com                  281
indiagazette.com              261
arabherald.com                260
hosted2.ap.org                245
wsws.org                      243
blogs.worldbank.org           241
worldbulletin.net             237
refworld.org                  233
theiranproject.com            232
m.themalaymailonline.com      226
nasdaq.com                    222
juancole.com                  212
mypalmbeachpost.com           208
benarnews.org                 203
m.philstar.com                200
pressreleasepoint.com         194
criticalthreats.org           188
iraqsun.com                   188
targetednews.com              188
oneindia.com  

In [24]:
data_kept = data_kept[['IsRootEvent', 'QuadClass', 'EventCode', 'EventRootCode', 'ActionGeo_Lat', 'ActionGeo_Long', 'country_name', 'source_country_name']]

In [25]:
data_kept.to_csv('../data/data_cleaned.csv')

In [28]:
alpha_to_name

{'-': 'Zaire',
 'ABW': 'Aruba',
 'AFG': 'Afghanistan',
 'AGO': 'Angola',
 'AIA': 'Anguilla',
 'ALB': 'Albania',
 'AND': 'Andorra',
 'ARE': 'United Arab Emirates',
 'ARG': 'Argentina',
 'ARM': 'Armenia',
 'ASM': 'American Samoa',
 'ATA': 'Antarctica',
 'ATF': 'French Southern and Antarctic Lands',
 'ATG': 'Antigua and Barbuda',
 'AUS': 'Australia',
 'AUT': 'Austria',
 'AZE': 'Azerbaijan',
 'BDI': 'Burundi',
 'BEL': 'Belgium',
 'BEN': 'Benin',
 'BFA': 'Burkina Faso',
 'BGD': 'Bangladesh',
 'BGR': 'Bulgaria',
 'BHR': 'Bahrain',
 'BHS': 'Bahamas, The',
 'BIH': 'Bosnia and Herzegovina',
 'BLM': 'Saint Barthelemy',
 'BLR': 'Belarus',
 'BLZ': 'Belize',
 'BMU': 'Bermuda',
 'BOL': 'Bolivia',
 'BRA': 'Brazil',
 'BRB': 'Barbados',
 'BRN': 'Brunei',
 'BTN': 'Bhutan',
 'BVT': 'Bouvet Island',
 'BWA': 'Botswana',
 'CAF': 'Central African Republic',
 'CAN': 'Canada',
 'CCK': 'Cocos (Keeling) Islands',
 'CHE': 'Switzerland',
 'CHL': 'Chile',
 'CHN': 'China',
 'CIV': "Cote d'Ivoire",
 'CMR': 'Cameroon'

In [41]:
name_to_alpha = {alpha_to_name[x].strip(): x for x in alpha_to_name}

In [65]:
data_kept['source_country_name'] = data_kept['source_country_name'].apply(lambda x: name_to_alpha[x.strip()] if x.strip() in name_to_alpha else 'INT')

In [61]:
data_kept.loc[data_kept['source_country_name'] == 'South Korea', 'source_country_name'] = 'Korea, South'
data_kept.loc[data_kept['source_country_name'] == 'North Korea', 'source_country_name'] = 'Korea, North'
data_kept.loc[data_kept['source_country_name'] == 'Bahamas', 'source_country_name'] = 'Bahamas, The'
data_kept.loc[data_kept['source_country_name'] == 'Czech Republic', 'source_country_name'] = 'Czechia'
data_kept.loc[data_kept['source_country_name'] == 'Western Samoa', 'source_country_name'] = 'Samoa'
data_kept.loc[data_kept['source_country_name'] == 'Myanmar', 'source_country_name'] = 'Burma'
data_kept.loc[data_kept['source_country_name'] == 'United States Virgin Islands', 'source_country_name'] = 'Virgin Islands'
data_kept.loc[data_kept['source_country_name'] == 'Gambia', 'source_country_name'] = 'Gambia, The'
data_kept.loc[data_kept['source_country_name'] == 'Falkland Islands', 'source_country_name'] = 'Falkland Islands (Islas Malvinas)'
data_kept.loc[data_kept['source_country_name'] == 'Brunei Darussalam', 'source_country_name'] = 'Brunei'