In [None]:
import pandas as pd
import numpy as np
import requests
import io
import re
import datetime
import math
import itertools 

In [None]:
# Remote resources used by this notebook, keyed by a short name.
urls = dict(
    master='http://data.gdeltproject.org/gdeltv2/masterfilelist.txt',
)

In [None]:
def clean_master(master):
    """Clean the raw master file list and derive helper columns from its URLs.

    Parameters
    ----------
    master : pandas.DataFrame
        Frame with a ``url`` column of strings. Each URL is expected to
        contain a ``/<digits>.`` segment holding a ``%Y%m%d%H%M%S`` timestamp
        and to end in three dot-separated parts, e.g.
        ``.../20150218230000.export.CSV.zip``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) without the rows that had any
        missing value, and with two extra columns:

        * ``date`` -- the timestamp parsed from the URL;
        * ``col_type`` -- the third dot-separated part from the end of the
          URL (``export`` in the example above).

    Raises
    ------
    IndexError
        If a URL has no ``/<digits>.`` segment.
    ValueError
        If the digits do not form a ``%Y%m%d%H%M%S`` timestamp.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence.
    # "+" (rather than "*") skips path segments such as "/.hidden" that would
    # otherwise yield an empty, unparseable timestamp.
    timestamp_pattern = re.compile(r'/([0-9]+)\.')

    def parse_date(string):
        return datetime.datetime.strptime(
            timestamp_pattern.findall(string)[0],
            "%Y%m%d%H%M%S",
        )

    # drop rows (not columns) that contain any missing value
    master = master.dropna(how='any')

    # create date col
    master = master.assign(date=master.url.apply(parse_date))

    # create col_type
    master = master.assign(col_type=master.url.apply(lambda url: url.split('.')[-3]))

    return master


def fetch_master():
    """Download the master file list and return it cleaned.

    The resource at ``urls['master']`` is read as a space-separated table
    with no header row; its three fields are named ``size``, ``code`` and
    ``url`` before the frame is handed to ``clean_master``.
    """
    column_names = ['size', 'code', 'url']
    raw_listing = pd.read_csv(urls['master'], sep=" ", header=None, names=column_names)
    return clean_master(raw_listing)


# Download and clean the master file list once; later cells read `master`.
master = fetch_master()

# Preview the first seven rows.
master.head(7)

In [None]:
def sample_interval(master, date_min, date_max):
    """Return the rows of ``master`` dated within ``[date_min, date_max]``.

    Parameters
    ----------
    master : pandas.DataFrame
        Frame with a ``date`` column.
    date_min, date_max
        Inclusive lower and upper bounds, comparable with the values of
        ``master.date`` (for example ``datetime.datetime`` objects).

    Returns
    -------
    pandas.DataFrame
        The rows whose ``date`` satisfies ``date_min <= date <= date_max``,
        with their original index labels.
    """
    # Compare against the arguments, not notebook-level variables, so the
    # function gives the right answer for any interval and in any cell order.
    in_interval = (master.date >= date_min) & (master.date <= date_max)
    return master[in_interval]

# test: parse both interval bounds with one shared format, then query `master`
d1, d2 = (
    datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
    for stamp in ("2015-04-18 22:00:00", "2015-04-18 22:15:00")
)
sample_interval(master, d1, d2)

In [None]:
def clean_events(events):
    """Return ``events`` without the rows that contain a missing value."""
    return events.dropna()


def fetch_events(url):
    """Read one tab-separated events file and return its cleaned columns.

    ``url`` is handed straight to ``pandas.read_csv``. The full list of
    field names is assigned positionally, only the subset in ``kept_names``
    is loaded, and the result is passed through ``clean_events``.
    """
    # @TODO refactor this
    all_names = "GlobalEventID Day MonthYear Year FractionDate Actor1Code Actor1Name Actor1CountryCode Actor1KnownGroupCode Actor1EthnicCode Actor1Religioni1Code Actor1Religion2Code Actor1Type1Code Actor1Type2Code Actor1Type3Code Actor2Code Actor2Name Actor2CountryCode Actor2KnownGroupCode Actor2EthnicCode Actor2Religioni1Code Actor2Religion2Code Actor2Type1Code Actor2Type2Code Actor2Type3Code IsRootEvent EventCode EventBaseCode EventRootCode QuadClass GoldsteinScale NumMentions NumSources NumArticles AvgTone Actor1Geo_Type Actor1Geo_Fullname Actor1Geo_CountryCode Actor1Geo_ADM1Code Actor1Geo_ADM2Code Actor1Geo_Lat Actor1Geo_Long Actor1Geo_FeatureID Actor2Geo_Type Actor2Geo_Fullname Actor2Geo_CountryCode Actor2Geo_ADM1Code Actor2Geo_ADM2Code Actor2Geo_Lat Actor2Geo_Long Actor2Geo_FeatureID ActionGeo_Type ActionGeo_Fullname ActionGeo_CountryCode ActionGeo_ADM1Code ActionGeo_ADM2Code ActionGeo_Lat ActionGeo_Long ActionGeo_FeatureID DATEADDED SOURCEURL".split(" ")
    kept_names = [
        "GlobalEventID", "Day",
        "Actor1Geo_Lat", "Actor1Geo_Long",
        "Actor2Geo_Lat", "Actor2Geo_Long",
        'NumMentions', 'NumSources', 'NumArticles', 'AvgTone',
        'ActionGeo_Lat', 'ActionGeo_Long',
        'DATEADDED', 'SOURCEURL',
    ]

    raw_events = pd.read_csv(url, sep='\t', names=all_names, usecols=kept_names)
    return clean_events(raw_events)


# Smoke test: load the file whose URL sits under label 0 of `master` and
# preview the first four cleaned rows.
fetch_events(master.url[0]).head(4)

In [None]:
def sample_data(data, step_years, step_months, step_hours, step_minutes, step_days=1):
    """Return the rows of ``data`` whose ``date`` lies on a regular sampling grid.

    The grid is every ``step_years``-th year starting at 2015 (2018 excluded),
    every ``step_months``-th month from January, every ``step_days``-th day
    from the 1st, every ``step_hours``-th hour from 0 and every
    ``step_minutes``-th minute from 0, always at second 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``date`` column (as created by ``clean_master``).
    step_years, step_months, step_hours, step_minutes : int
        Positive strides of the grid along each date component.
    step_days : int, optional
        Positive stride along the day of month; defaults to 1 (every day).
        It is a trailing keyword so existing positional calls keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that fall on the grid, with their original index.

    Raises
    ------
    AttributeError
        If ``data['date']`` is not a datetime column.
    """
    # np.arange excludes its stop value, so each stop is one past the last
    # valid component (month 12, day 31, hour 23, minute 59).
    years = np.arange(2015, 2018, step_years)
    months = np.arange(1, 13, step_months)
    days = np.arange(1, 32, step_days)
    hours = np.arange(0, 24, step_hours)
    minutes = np.arange(0, 60, step_minutes)

    # Testing each component separately is equivalent to testing membership
    # in the full cartesian product, without materialising it; it also
    # filters on the date *values* rather than on index labels.
    parts = data['date'].dt
    on_grid = (
        parts.year.isin(years)
        & parts.month.isin(months)
        & parts.day.isin(days)
        & parts.hour.isin(hours)
        & parts.minute.isin(minutes)
        & (parts.second == 0)
    )
    return data[on_grid]

In [None]:
def in_list(x, permutations):
    """Return ``x`` when it is contained in ``permutations``, otherwise ``None``."""
    return x if x in permutations else None

# Sampling grid. np.arange excludes its stop value, so each stop sits one
# past the last valid component (month 12, day 31, hour 23, minute 59).
years = np.arange(2015, 2018, 1)
months = np.arange(1, 13, 1)
days = np.arange(1, 32, 5)
hours = np.arange(0, 24, 5)
minutes = np.arange(0, 60, 15)

# A set of plain-int tuples makes every membership test a constant-time
# lookup instead of a scan over the whole grid.
permutations = {
    (int(d), int(mo), int(y), int(h), int(mi), 0)
    for y in years
    for mo in months
    for d in days
    for h in hours
    for mi in minutes
}

# `master` stores timestamps in the lower-case `date` column, so each one is
# first broken down into the (day, month, year, hour, minute, second) layout
# used by the grid; only then can it match an entry of `permutations`.
date_keys = master.date.apply(
    lambda ts: (ts.day, ts.month, ts.year, ts.hour, ts.minute, ts.second)
)
date_keys.apply(lambda x: in_list(x, permutations))

In [None]:
# Field names of the events ("export") file, in file order.
export_names = "GlobalEventID Day MonthYear Year FractionDate Actor1Code Actor1Name Actor1CountryCode Actor1KnownGroupCode Actor1EthnicCode Actor1Religioni1Code Actor1Religion2Code Actor1Type1Code Actor1Type2Code Actor1Type3Code Actor2Code Actor2Name Actor2CountryCode Actor2KnownGroupCode Actor2EthnicCode Actor2Religioni1Code Actor2Religion2Code Actor2Type1Code Actor2Type2Code Actor2Type3Code IsRootEvent EventCode EventBaseCode EventRootCode QuadClass GoldsteinScale NumMentions NumSources NumArticles AvgTone Actor1Geo_Type Actor1Geo_Fullname Actor1Geo_CountryCode Actor1Geo_ADM1Code Actor1Geo_ADM2Code Actor1Geo_Lat Actor1Geo_Long Actor1Geo_FeatureID Actor2Geo_Type Actor2Geo_Fullname Actor2Geo_CountryCode Actor2Geo_ADM1Code Actor2Geo_ADM2Code Actor2Geo_Lat Actor2Geo_Long Actor2Geo_FeatureID ActionGeo_Type ActionGeo_Fullname ActionGeo_CountryCode ActionGeo_ADM1Code ActionGeo_ADM2Code ActionGeo_Lat ActionGeo_Long ActionGeo_FeatureID DATEADDED SOURCEURL"
export_columns = export_names.split(" ")

# Load only the columns of interest from the file under label 6 of `master`.
export = pd.read_csv(master['url'][6], sep = '\t', names = export_columns,
                     usecols = ["GlobalEventID", "Day", "Actor1Geo_Lat", "Actor1Geo_Long", "Actor2Geo_Lat", "Actor2Geo_Long",
                                'NumMentions','NumSources','NumArticles','AvgTone', 'ActionGeo_Lat', 'ActionGeo_Long','DATEADDED','SOURCEURL'])
# dropna returns a new frame; keep it, otherwise the incomplete rows remain.
export = export.dropna()

export.head(4)

In [None]:
# Field names of the mentions file, in file order.
mention_names = "GlobalEventId EventTimeDate MentionTimeDate MentionType MentionSourceName MentionIdentifier SentenceID ActorCharOffset Actor2CharOffset ActionCharOffset InRawTest Confidence MentionDocLen MentionDocTone MentionDocTranslationinfo Extras"
mention_columns = mention_names.split(" ")

# The URL column of `master` is named `url` (lower case); 'URL' raises KeyError.
mention = pd.read_csv(master['url'][1], sep = '\t', names = mention_columns,
                      usecols = ["GlobalEventId", "EventTimeDate", "MentionTimeDate"])
# dropna returns a new frame; keep it, otherwise the incomplete rows remain.
mention = mention.dropna()

mention.head(4)

In [None]:
# Append to list3 every value of list1 for which the selection
# list2[list2 == value] has exactly one element.
# NOTE(review): list1, list2 and list3 are not defined in any cell shown
# above -- confirm where they come from before relying on this cell.
# NOTE(review): `list2 == value` is presumably an element-wise comparison
# (NumPy array or pandas Series) -- TODO confirm.
for value in list1:
    if len(list2[list2 == value]) == 1:
        list3.append(value)