In [1]:
import os
import pandas as pd
from os.path import exists as path_exists

# Working directory for downloaded files, and the local path of the file list read below.
TMP_FOLDER = './tmp/'
DATA_FILE = TMP_FOLDER + 'masterfilelist-translation.txt'

# exist_ok=True makes this safe to re-run and avoids the check-then-create race;
# it still raises if the path exists but is not a directory.
os.makedirs(TMP_FOLDER, exist_ok=True)

In [2]:
# fetch the data requiered for the analysis
if not path_exists(DATA_FILE):
    !wget --directory-prefix tmp http://data.gdeltproject.org/gdeltv2/masterfilelist-translation.txt

In [3]:
# Read the space-separated file list into three columns.
raw_files = pd.read_csv(
    DATA_FILE,
    sep=' ',
    header=None,
    names=['Size', 'Hash', 'Url'],
    dtype={'Size': str},  # read as text so bad rows can be detected before casting
)

# Parsing fails sometimes: such rows carry 'http:' in the Size column.
# na=True also flags rows whose Size is missing, which would otherwise break the negation below.
is_malformed = raw_files.Size.str.contains('http:', na=True)

# Keep the well-formed rows and cast Size to int; assign() builds a new frame, so nothing
# is written back into a filtered slice.
data = raw_files.loc[~is_malformed].assign(Size=lambda frame: frame.Size.astype(int))

In [4]:
# Split the file list by a substring of the URL; rows with a missing URL match neither.
is_mentions_url = data['Url'].str.contains('mentions', na=False)
is_export_url = data['Url'].str.contains('export', na=False)

mentions = data.loc[is_mentions_url]
events = data.loc[is_export_url]

In [5]:
# Quantile levels used for the Size summaries of the mentions and events files
q = [
    0.5,   # median
    0.9,
    0.99,
    1,     # maximum
]

In [6]:
# Size quantiles of the mentions files at the levels listed in q
mentions['Size'].quantile(q)

0.50     135748.50
0.90     238908.00
0.99     288842.18
1.00    1195815.00
Name: Size, dtype: float64

In [7]:
# Size quantiles of the events (export) files at the levels listed in q
events['Size'].quantile(q)

0.50     79185.00
0.90    133599.80
0.99    167524.85
1.00    858663.00
Name: Size, dtype: float64

In [8]:
# Pull the run of digits between "/gdeltv2/" and ".translation" out of each mentions URL.
# Raw string: "\d" and "\." are regex escapes, not Python string escapes.
timestamp_pattern = r'/gdeltv2/(\d+)\.translation'

# Take the first (only) capture-group column as a Series
mentions_time = mentions.Url.str.extract(timestamp_pattern).iloc[:, 0]
mentions_time.head()

1     20150218224500
4     20150218230000
7     20150218231500
10    20150218233000
13    20150218234500
Name: 0, dtype: object

In [9]:
# Break each timestamp string into named components, one column per named group.
# Raw strings keep "\d" and "\." as regex escapes rather than Python string escapes.
# NOTE(review): the trailing `else` group only matches literal dots (`\.*`), so it comes back
# empty for all-digit timestamps; `.*` may have been intended - confirm before relying on it.
timestamp_fields = (
    r'(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})'
    r'(?P<hour>\d{2})(?P<minute>\d{2})(?P<else>\.*)'
)
mentions_time.str.extract(timestamp_fields).head()

Unnamed: 0,year,month,day,hour,minute,else
1,2015,2,18,22,45,
4,2015,2,18,23,0,
7,2015,2,18,23,15,
10,2015,2,18,23,30,
13,2015,2,18,23,45,
