In [1]:
import pandas as pd
import numpy as np
import requests
import io
import re
import datetime
import math
import itertools 

In [2]:
# Remote resources used by this notebook, keyed by a short name.
# 'master' is the file list that fetch_master() reads as a space-separated table.
urls = {
    'master': 'http://data.gdeltproject.org/gdeltv2/masterfilelist.txt'
}

In [3]:
def clean_master(master):
    """Clean the raw master file list and derive helper columns.

    Parameters
    ----------
    master : pandas.DataFrame
        Frame with at least a ``url`` column whose values look like
        ``.../<YYYYmmddHHMMSS>.<type>.<ext>.<ext>``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without rows that contain a
        missing value, extended with two columns:

        * ``date``     - timestamp parsed from the digits that follow the last
          ``/`` of the URL,
        * ``col_type`` - third-from-last dot-separated URL component
          (e.g. ``export``, ``mentions``, ``gkg``).

    Raises
    ------
    IndexError
        If a URL contains no ``/<digits>.`` segment.
    ValueError
        If the extracted digits do not match ``%Y%m%d%H%M%S``.
    """
    # Drop every *row* that has a missing value in any column.
    master = master.dropna(how='any')

    # create date col
    def parse_date(string):
        # Raw string literal: '\.' inside a normal literal is an invalid
        # escape sequence and triggers a warning on modern Python versions.
        stamp = re.findall(r'/([0-9]*)\.', string)[0]
        return datetime.datetime.strptime(str(stamp), "%Y%m%d%H%M%S")

    master = master.assign(date=master.url.apply(parse_date))

    # create col_type
    master = master.assign(col_type=master.url.apply(lambda url: url.split('.')[-3]))

    return master


def fetch_master():
    """Download the master file list and return it cleaned.

    Reads ``urls['master']`` as a space-separated table without a header row,
    labels the columns ``size``, ``code`` and ``url``, and hands the result to
    ``clean_master``.
    """
    column_names = ['size', 'code', 'url']
    raw_listing = pd.read_csv(urls['master'], sep=" ", header=None, names=column_names)
    return clean_master(raw_listing)


# Download and clean the master file list once; later cells reuse `master`.
master = fetch_master()

# Preview the first rows of the cleaned listing.
master.head(7)

Unnamed: 0,size,code,url,date,col_type
0,150383,297a16b493de7cf6ca809a7cc31d0b93,http://data.gdeltproject.org/gdeltv2/201502182...,2015-02-18 23:00:00,export
1,318084,bb27f78ba45f69a17ea6ed7755e9f8ff,http://data.gdeltproject.org/gdeltv2/201502182...,2015-02-18 23:00:00,mentions
2,10768507,ea8dde0beb0ba98810a92db068c0ce99,http://data.gdeltproject.org/gdeltv2/201502182...,2015-02-18 23:00:00,gkg
3,149211,2a91041d7e72b0fc6a629e2ff867b240,http://data.gdeltproject.org/gdeltv2/201502182...,2015-02-18 23:15:00,export
4,339037,dec3f427076b716a8112b9086c342523,http://data.gdeltproject.org/gdeltv2/201502182...,2015-02-18 23:15:00,mentions
5,10269336,2f1a504a3c4558694ade0442e9a5ae6f,http://data.gdeltproject.org/gdeltv2/201502182...,2015-02-18 23:15:00,gkg
6,149723,12268e821823aae2da90882621feda18,http://data.gdeltproject.org/gdeltv2/201502182...,2015-02-18 23:30:00,export


In [4]:
def split_master(master):
    """Partition the master listing by file type.

    Returns a dict with the keys ``events``, ``mentions`` and ``gkg`` (in that
    order); each value holds the rows of ``master`` whose ``col_type`` is
    ``export``, ``mentions`` or ``gkg`` respectively.
    """
    type_to_key = (('export', 'events'), ('mentions', 'mentions'), ('gkg', 'gkg'))
    return {
        key: master[master.col_type == file_type]
        for file_type, key in type_to_key
    }

def sample_master(master, date_start, date_end):
    """Return the rows of ``master`` whose ``date`` lies in [date_start, date_end].

    Both bounds are inclusive.
    """
    not_before_start = master.date >= date_start
    not_after_end = master.date <= date_end
    return master.loc[not_before_start & not_after_end]

# Smoke test: list the event ("export") files inside a 15-minute window.
d1, d2 = (
    datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
    for stamp in ("2015-04-18 22:00:00", "2015-04-18 22:15:00")
)
split_master(
    sample_master(master, d1, d2)
)['events']

Unnamed: 0,size,code,url,date,col_type
16953,50933,f4d2e1df34f4051ebb4d67b6038c3d28,http://data.gdeltproject.org/gdeltv2/201504182...,2015-04-18 22:00:00,export
16956,52319,d132eaaa0769f837036d727ecffc5c25,http://data.gdeltproject.org/gdeltv2/201504182...,2015-04-18 22:15:00,export


In [5]:
def clean_events(events):
    """Return a copy of ``events`` without the rows that contain a missing value."""
    return events.dropna()


def fetch_events(url):
    """Load one tab-separated events file and return its cleaned subset.

    ``url`` is handed straight to ``pandas.read_csv``. The file is read without
    a header row: the full column list below names every field, only the
    columns in ``wanted_columns`` are kept, and the result is passed through
    ``clean_events``.
    """
    # @TODO refactor this
    all_columns = "GlobalEventID Day MonthYear Year FractionDate Actor1Code Actor1Name Actor1CountryCode Actor1KnownGroupCode Actor1EthnicCode Actor1Religioni1Code Actor1Religion2Code Actor1Type1Code Actor1Type2Code Actor1Type3Code Actor2Code Actor2Name Actor2CountryCode Actor2KnownGroupCode Actor2EthnicCode Actor2Religioni1Code Actor2Religion2Code Actor2Type1Code Actor2Type2Code Actor2Type3Code IsRootEvent EventCode EventBaseCode EventRootCode QuadClass GoldsteinScale NumMentions NumSources NumArticles AvgTone Actor1Geo_Type Actor1Geo_Fullname Actor1Geo_CountryCode Actor1Geo_ADM1Code Actor1Geo_ADM2Code Actor1Geo_Lat Actor1Geo_Long Actor1Geo_FeatureID Actor2Geo_Type Actor2Geo_Fullname Actor2Geo_CountryCode Actor2Geo_ADM1Code Actor2Geo_ADM2Code Actor2Geo_Lat Actor2Geo_Long Actor2Geo_FeatureID ActionGeo_Type ActionGeo_Fullname ActionGeo_CountryCode ActionGeo_ADM1Code ActionGeo_ADM2Code ActionGeo_Lat ActionGeo_Long ActionGeo_FeatureID DATEADDED SOURCEURL".split(" ")
    wanted_columns = [
        "GlobalEventID", "Day",
        "Actor1Name", "Actor1Geo_Lat", "Actor1Geo_Long",
        "Actor2Name", "Actor2Geo_Lat", "Actor2Geo_Long",
        'NumMentions', 'NumSources', 'NumArticles', 'AvgTone',
        'ActionGeo_Lat', 'ActionGeo_Long',
        'DATEADDED', 'SOURCEURL',
    ]

    raw_events = pd.read_csv(url, sep='\t', names=all_columns, usecols=wanted_columns)
    return clean_events(raw_events)


# Preview: load the file at index label 0 of `master` (an "export" file in the
# listing shown above) and display its first cleaned rows.
fetch_events(master.url[0]).head(4)

Unnamed: 0,GlobalEventID,Day,Actor1Name,Actor2Name,NumMentions,NumSources,NumArticles,AvgTone,Actor1Geo_Lat,Actor1Geo_Long,Actor2Geo_Lat,Actor2Geo_Long,ActionGeo_Lat,ActionGeo_Long,DATEADDED,SOURCEURL
5,410412352,20140218,AUSTRALIA,INVESTOR,2,1,2,-1.263823,-27.5,153.017,-27.5,153.017,-27.5,153.017,20150218230000,http://www.businessspectator.com.au/article/20...
6,410412353,20140218,AUSTRALIAN,NEW ZEALAND,1,1,1,7.517084,-36.0667,146.483,-41.0,174.0,-36.0667,146.483,20150218230000,http://www.voxy.co.nz/entertainment/coast-new-...
7,410412354,20140218,AUSTRALIA,KIWI,1,1,1,7.517084,-36.0667,146.483,-41.0,174.0,-41.0,174.0,20150218230000,http://www.voxy.co.nz/entertainment/coast-new-...
8,410412355,20140218,AUSTRALIA,KIWI,1,1,1,7.517084,-41.0,174.0,-36.0667,146.483,-36.0667,146.483,20150218230000,http://www.voxy.co.nz/entertainment/coast-new-...


In [6]:
def fetch_all(master, date_start=None, date_end=None, verbose=True):
    """Download and concatenate every events file listed in a date window.

    Parameters
    ----------
    master : pandas.DataFrame
        Cleaned master listing (needs the ``date``, ``col_type`` and ``url``
        columns used by ``sample_master`` and ``split_master``).
    date_start : datetime.datetime, optional
        Inclusive lower bound; defaults to 2000-01-01.
    date_end : datetime.datetime, optional
        Inclusive upper bound; defaults to the current local time.
    verbose : bool
        When true, print each URL before it is fetched and report failures.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the frames returned by ``fetch_events``. Files that
        fail to load are skipped (best effort). If nothing could be loaded an
        empty frame is returned instead of letting ``pd.concat`` raise.
    """
    if date_start is None:
        date_start = datetime.datetime(2000, 1, 1)
    if date_end is None:
        date_end = datetime.datetime.now()
    sampled = sample_master(master, date_start, date_end)
    splitted = split_master(sampled)
    all_events = []
    for events_url in splitted['events']['url']:
        if verbose:
            print(events_url)
        try:
            all_events.append(fetch_events(events_url))
        except Exception as error:
            # Only ordinary errors are skipped; a bare `except:` would also
            # swallow KeyboardInterrupt and make the loop impossible to stop.
            if verbose:
                print("Error downloading url %s (%r)" % (events_url, error))
    if not all_events:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame()
    return pd.concat(all_events)


# Smoke test: fetch every events file published inside a 75-minute window.
d1, d2 = (
    datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
    for stamp in ("2015-04-18 22:00:00", "2015-04-18 23:15:00")
)

# Duplicate source url check
#fetch_all(master, d1, d2, True) \
#    .groupby('SOURCEURL') \
#    .agg({'Day': 'count'}) \
#    .sort_values('Day', ascending=False) \
#    .head(5)

# Same source url but different actors etc etc
#d = fetch_all(master, d1, d2, True)
#d = d[d.SOURCEURL == 'http://www.theguardian.com/uk-news/2015/apr/18/abubaker-deghayes-brighton-jihadi-syria-amer']
#d

fetch_all(
    master, d1, d2, True
).head(6)

http://data.gdeltproject.org/gdeltv2/20150418220000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150418221500.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150418223000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150418224500.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150418230000.export.CSV.zip
http://data.gdeltproject.org/gdeltv2/20150418231500.export.CSV.zip


Unnamed: 0,GlobalEventID,Day,Actor1Name,Actor2Name,NumMentions,NumSources,NumArticles,AvgTone,Actor1Geo_Lat,Actor1Geo_Long,Actor2Geo_Lat,Actor2Geo_Long,ActionGeo_Lat,ActionGeo_Long,DATEADDED,SOURCEURL
10,426533227,20150319,MINISTRY,SCHOOL,5,1,5,-0.619195,21.15,79.1,21.15,79.1,21.15,79.1,20150418220000,http://timesofindia.indiatimes.com/city/nagpur...
13,426533230,20150411,COMPANY,WORKER,10,1,10,0.724638,21.1098,-157.531,21.1098,-157.531,21.1098,-157.531,20150418220000,http://khon2.com/2014/02/07/mass-layoff-at-kai...
14,426533231,20150411,MINIST OF FINANCE,AMMAN,10,1,10,-0.531915,31.95,35.9333,31.95,35.9333,31.95,35.9333,20150418220000,http://jordantimes.com/jordan-awards-icd-sukuk...
17,426533234,20150411,AMMAN,MINIST OF FINANCE,10,1,10,-0.531915,31.95,35.9333,31.95,35.9333,31.95,35.9333,20150418220000,http://jordantimes.com/jordan-awards-icd-sukuk...
18,426533235,20150411,AMMAN,MINIST OF FINANCE,10,1,10,-0.531915,31.95,35.9333,31.95,35.9333,31.95,35.9333,20150418220000,http://jordantimes.com/jordan-awards-icd-sukuk...
19,426533236,20150411,MILITANT,IRAQI,10,1,10,-8.978328,33.4206,43.3078,33.4206,43.3078,33.4206,43.3078,20150418220000,http://jordantimes.com/iraqi-forces-retake-mos...


In [None]:
def sample_data(data, step_years, step_months, step_hours, step_minutes, step_days=1):
    """Return the ``Date`` values of ``data`` that fall on a regular sampling grid.

    A timestamp is kept when each of its components is on the grid built from
    the corresponding step, and its seconds are zero:

    * year   in ``2015, 2015 + step_years, ...``  (up to and including 2017)
    * month  in ``1, 1 + step_months, ...``       (up to 12)
    * day    in ``1, 1 + step_days, ...``         (up to 31)
    * hour   in ``0, step_hours, ...``            (up to 23)
    * minute in ``0, step_minutes, ...``          (up to 59)

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime-typed ``Date`` column.
        NOTE(review): the master listing built earlier in this notebook names
        its column ``date`` (lower case) - confirm which frame is meant.
    step_years, step_months, step_hours, step_minutes : int
        Grid spacing for the respective component.
    step_days : int, optional
        Grid spacing for the day of month. The original body referenced an
        undefined ``step_days``; it is now a keyword parameter defaulting to 1
        so existing positional calls keep working.

    Returns
    -------
    pandas.Series
        The matching entries of ``data.Date`` with their original index.
    """
    dates = data.Date

    # np.arange excludes its stop value, so every stop is one past the largest
    # value the component can take (December, day 31, hour 23, minute 59).
    years = np.arange(2015, 2018, step_years)
    months = np.arange(1, 13, step_months)
    days = np.arange(1, 32, step_days)
    hours = np.arange(0, 24, step_hours)
    minutes = np.arange(0, 60, step_minutes)

    # Match on the timestamp *values*; Series.filter(items=...) would select
    # by index label and therefore never match a date.
    on_grid = (
        dates.dt.year.isin(years)
        & dates.dt.month.isin(months)
        & dates.dt.day.isin(days)
        & dates.dt.hour.isin(hours)
        & dates.dt.minute.isin(minutes)
        & (dates.dt.second == 0)
    )
    return dates[on_grid]

In [None]:
def in_list(x,permutations):
    """Return ``x`` when it is contained in ``permutations``, otherwise ``None``."""
    return x if x in permutations else None

# Sampling grid. np.arange excludes its stop value, so each stop is one past
# the largest value the component can take (December, day 31, hour 23, minute 59).
years = np.arange(2015,2018,1)
months = np.arange(1,13,1)
days = np.arange(1,32,5)
hours = np.arange(0,24,5)
minutes = np.arange(0,60,15)

# Every grid point as [day, month, year, hour, minute, second].
permutations = [ [d,mo,y,h,mi,0] for y in years for mo in months for d in days for h in hours for mi in minutes]

# The master listing stores its timestamps in the `date` column (lower case).
# A timestamp never compares equal to a list, so each one is broken into the
# same component layout as the grid entries before the membership test.
# Timestamps on the grid are kept; all others become missing.
master.date.apply(
    lambda x: x
    if in_list([x.day, x.month, x.year, x.hour, x.minute, x.second], permutations) is not None
    else None
)

In [None]:
# Column names of an events ("export") file, in file order.
export_names = "GlobalEventID Day MonthYear Year FractionDate Actor1Code Actor1Name Actor1CountryCode Actor1KnownGroupCode Actor1EthnicCode Actor1Religioni1Code Actor1Religion2Code Actor1Type1Code Actor1Type2Code Actor1Type3Code Actor2Code Actor2Name Actor2CountryCode Actor2KnownGroupCode Actor2EthnicCode Actor2Religioni1Code Actor2Religion2Code Actor2Type1Code Actor2Type2Code Actor2Type3Code IsRootEvent EventCode EventBaseCode EventRootCode QuadClass GoldsteinScale NumMentions NumSources NumArticles AvgTone Actor1Geo_Type Actor1Geo_Fullname Actor1Geo_CountryCode Actor1Geo_ADM1Code Actor1Geo_ADM2Code Actor1Geo_Lat Actor1Geo_Long Actor1Geo_FeatureID Actor2Geo_Type Actor2Geo_Fullname Actor2Geo_CountryCode Actor2Geo_ADM1Code Actor2Geo_ADM2Code Actor2Geo_Lat Actor2Geo_Long Actor2Geo_FeatureID ActionGeo_Type ActionGeo_Fullname ActionGeo_CountryCode ActionGeo_ADM1Code ActionGeo_ADM2Code ActionGeo_Lat ActionGeo_Long ActionGeo_FeatureID DATEADDED SOURCEURL"
export_columns = export_names.split(" ")

# Read the file at index label 6 of the master listing, keeping only the
# columns needed here. (The empty placeholder frame that used to be created
# first was overwritten immediately and has been removed.)
export = pd.read_csv(master['url'][6], sep = '\t', names = export_columns,
                     usecols = ["GlobalEventID", "Day", "Actor1Geo_Lat", "Actor1Geo_Long", "Actor2Geo_Lat", "Actor2Geo_Long",
                                'NumMentions','NumSources','NumArticles','AvgTone', 'ActionGeo_Lat', 'ActionGeo_Long','DATEADDED','SOURCEURL'])
# dropna() returns a new frame; without the assignment the rows with missing
# values would still be present in `export`.
export = export.dropna()

export.head(4)

In [None]:
# Column names of a "mentions" file, in file order.
mention_names = "GlobalEventId EventTimeDate MentionTimeDate MentionType MentionSourceName MentionIdentifier SentenceID ActorCharOffset Actor2CharOffset ActionCharOffset InRawTest Confidence MentionDocLen MentionDocTone MentionDocTranslationinfo Extras"
mention_columns = mention_names.split(" ")

# The master listing names its column 'url' (lower case); 'URL' raises KeyError.
# Index label 1 is a "mentions" file in the listing previewed earlier.
# (The empty placeholder frame that used to be created first was overwritten
# immediately and has been removed.)
mention = pd.read_csv(master['url'][1], sep = '\t', names = mention_columns,
                      usecols = ["GlobalEventId", "EventTimeDate", "MentionTimeDate"])
# dropna() returns a new frame; keep it so `mention` itself is cleaned.
mention = mention.dropna()

mention.head(4)

In [None]:
# Append to list3 every value of list1 that selects exactly one element of list2.
# NOTE(review): list1, list2 and list3 are not defined anywhere in the visible
# notebook, so this cell fails on a fresh kernel - confirm where they come from.
# NOTE(review): `list2[list2 == value]` presumably relies on list2 being a
# numpy array or pandas Series (boolean-mask indexing); verify.
for value in list1:
    if len(list2[list2 == value]) == 1:
        list3.append(value)