In [1]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# Fetch the GDELT file index page and collect every link it lists.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
page = requests.get(gdelt_base_url+'index.html', timeout=60)
page.raise_for_status()
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href")

# Keep only the links whose first four characters are all digits.
# The explicit length check rejects links shorter than four characters, which a
# bare slice + isdigit() would otherwise let through.
file_list = [x for x in link_list if len(x) >= 4 and x[:4].isdigit()]

In [23]:

outfilecounter = 0
import os.path
import urllib
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded
import zipfile
import glob
import operator

fips_country_code = 'US'
local_path = os.getcwd() + "/data/GDELT/"

# give up on an archive after this many failed download attempts
max_download_attempts = 3

# Zero-based positions of the three fields compared against fips_country_code.
# Counting GLOBALEVENTID as field 0, these line up with ActionGeo_CountryCode,
# Actor1Geo_CountryCode and Actor2Geo_CountryCode in the column order printed
# by the later cells (assuming the header helper file matches the raw files).
country_code_columns = (51, 37, 44)
select_country_fields = operator.itemgetter(*country_code_columns)
min_field_count = max(country_code_columns) + 1

# make sure every directory this cell writes to exists
for required_dir in (local_path, local_path+'tmp/', local_path+'country/'):
    os.makedirs(required_dir, exist_ok=True)

for compressed_file in file_list[:10]:
    print(compressed_file, end=' ')
    archive_path = local_path+compressed_file

    # If we don't have the compressed file stored locally, go get it.
    # The download goes to a '.part' file first and is renamed only on success,
    # so an interrupted transfer is never mistaken for a complete archive.
    if not os.path.isfile(archive_path):
        partial_path = archive_path+'.part'
        for attempt in range(1, max_download_attempts+1):
            print('downloading,', end='')
            try:
                urllib.request.urlretrieve(url=gdelt_base_url+compressed_file,
                                           filename=partial_path)
            except OSError:
                # urllib's URLError (and ContentTooShortError) derive from OSError;
                # retry until the attempt budget is used up, then surface the error
                if attempt == max_download_attempts:
                    raise
            else:
                os.replace(partial_path, archive_path)
                break

    # extract the contents of the compressed file to a temporary directory
    print('extracting,', end='')
    with zipfile.ZipFile(file=archive_path, mode='r') as z:
        z.extractall(path=local_path+'tmp/')

    # parse each of the extracted files in the temporary directory
    print('parsing,', end='')
    for infile_name in sorted(glob.glob(local_path+'tmp/*')):
        outfile_name = local_path+'country/'+fips_country_code+'%04i.tsv'%outfilecounter

        # copy only the lines that mention our country code in one of the watched fields
        with open(infile_name, mode='r') as infile, open(outfile_name, mode='w') as outfile:
            for line in infile:
                fields = line.split('\t')
                # blank or truncated lines do not have the watched fields; skip them
                # instead of letting itemgetter raise IndexError
                if len(fields) < min_field_count:
                    continue
                if fips_country_code in select_country_fields(fields):
                    outfile.write(line)
        outfilecounter += 1

        # delete the temporary file
        os.remove(infile_name)

    print('done')

20180823.export.CSV.zip downloading,extracting,parsing,done
20180822.export.CSV.zip extracting,parsing,done
20180821.export.CSV.zip extracting,parsing,done
20180820.export.CSV.zip extracting,parsing,done
20180819.export.CSV.zip extracting,parsing,done
20180818.export.CSV.zip extracting,parsing,done
20180817.export.CSV.zip extracting,parsing,done
20180816.export.CSV.zip extracting,parsing,done
20180815.export.CSV.zip extracting,parsing,done
20180814.export.CSV.zip extracting,parsing,done


In [24]:
import glob
import os  # used for the cleanup below; do not rely on an earlier cell having imported it
import pandas as pd

# Get the GDELT field names from a helper file.
# `sheet_name` and `usecols` are the current spellings of the deprecated
# `sheetname` / `parse_cols` keywords; usecols=[0, 1] selects the first two
# columns, which is what parse_cols=1 ("up to and including column 1") meant.
colnames = pd.read_excel('data/GDELT/CSV.header.fieldids.xlsx', sheet_name='Sheet1',
                         index_col='Column ID', usecols=[0, 1])['Field Name']

# Build DataFrames from each of the intermediary files (sorted for a reproducible order)
files = sorted(glob.glob(local_path+'country/'+fips_country_code+'*'))
DFlist = []
for active_file in files:
    print(active_file)
    # a zero-byte file holds no events; skip it rather than handing pandas nothing to parse
    if os.path.getsize(active_file) == 0:
        continue
    DFlist.append(pd.read_csv(active_file, sep='\t', header=None, dtype=str,
                              names=colnames, index_col=['GLOBALEVENTID']))

# pd.concat raises an opaque ValueError on an empty list; fail with a clearer message
if not DFlist:
    raise ValueError('no non-empty intermediary files found under '
                     + local_path+'country/'+fips_country_code+'*')

# Merge the file-based dataframes and save a pickle
DF = pd.concat(DFlist)
DF.to_pickle(local_path+'backup'+fips_country_code+'.pickle')

# once everything is safely stored away, remove the intermediary files
for active_file in files:
    os.remove(active_file)

  
  **kwds)


/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0000.tsv
/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0001.tsv
/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0002.tsv
/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0003.tsv
/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0004.tsv
/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0005.tsv
/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0006.tsv
/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0007.tsv
/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0008.tsv
/Users/claasmeiners/PycharmProjects/MasterThesis/data/GDELT/country/US0009.tsv


In [38]:
# Preview the first five events of the merged frame.
DF.head(n=5)

Unnamed: 0_level_0,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,Actor1Religion1Code,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
GLOBALEVENTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
781299981,20170823,201708,2017,2017.6384,,,,,,,...,MA,2,"Massachusetts, United States",US,USMA,42.2373,-71.5314,MA,20180823,https://www.pressofatlanticcity.com/news/breaking/offshore-wind-fishing-industries-work-to-co-exist/article_10008015-77c7-5587-8fbb-77f1a8c8d74b.html
781299984,20170823,201708,2017,2017.6384,BUS,EMPLOYER,,,,,...,661511,3,"Hoyt Lakes, Minnesota, United States",US,USMN,47.5196,-92.1385,661511,20180823,"http://www.timberjay.com/stories/economic-boom-or-workforce-woes,14305"
781299985,20170823,201708,2017,2017.6384,CHN,CHINA,CHN,,,,...,,2,"Alaska, United States",US,USAK,61.385,-152.268,AK,20180823,http://kmxt.org/2018/08/asmi-says-fish-meal-included-tariff-changes-calls-comments/
781300001,20170823,201708,2017,2017.6384,USA,UNITED STATES,USA,,,,...,,3,"Raleigh, North Carolina, United States",US,USNC,35.7721,-78.6386,1024242,20180823,https://www.michigansthumb.com/news/crime/article/Mentally-ill-man-pleads-guilty-to-helping-13174947.php
781300002,20170823,201708,2017,2017.6384,USA,MASSACHUSETTS,USA,,,,...,,3,"Atlantic City, New Jersey, United States",US,USNJ,39.3643,-74.4229,874413,20180823,https://www.pressofatlanticcity.com/news/breaking/offshore-wind-fishing-industries-work-to-co-exist/article_10008015-77c7-5587-8fbb-77f1a8c8d74b.html


In [28]:
# List the column labels of the merged frame (same Index that DF.keys() returns).
DF.columns

Index(['SQLDATE', 'MonthYear', 'Year', 'FractionDate', 'Actor1Code',
       'Actor1Name', 'Actor1CountryCode', 'Actor1KnownGroupCode',
       'Actor1EthnicCode', 'Actor1Religion1Code', 'Actor1Religion2Code',
       'Actor1Type1Code', 'Actor1Type2Code', 'Actor1Type3Code', 'Actor2Code',
       'Actor2Name', 'Actor2CountryCode', 'Actor2KnownGroupCode',
       'Actor2EthnicCode', 'Actor2Religion1Code', 'Actor2Religion2Code',
       'Actor2Type1Code', 'Actor2Type2Code', 'Actor2Type3Code', 'IsRootEvent',
       'EventCode', 'EventBaseCode', 'EventRootCode', 'QuadClass',
       'GoldsteinScale', 'NumMentions', 'NumSources', 'NumArticles', 'AvgTone',
       'Actor1Geo_Type', 'Actor1Geo_FullName', 'Actor1Geo_CountryCode',
       'Actor1Geo_ADM1Code', 'Actor1Geo_Lat', 'Actor1Geo_Long',
       'Actor1Geo_FeatureID', 'Actor2Geo_Type', 'Actor2Geo_FullName',
       'Actor2Geo_CountryCode', 'Actor2Geo_ADM1Code', 'Actor2Geo_Lat',
       'Actor2Geo_Long', 'Actor2Geo_FeatureID', 'ActionGeo_Type',
      

In [37]:
# Show cell contents in full so long URLs are not cut off with an ellipsis.
# None is the supported way to disable truncation; the old -1 sentinel is deprecated.
pd.set_option('display.max_colwidth', None)
DF["SOURCEURL"]

GLOBALEVENTID
781299981    https://www.pressofatlanticcity.com/news/breaking/offshore-wind-fishing-industries-work-to-co-exist/article_10008015-77c7-5587-8fbb-77f1a8c8d74b.html                                                             
781299984    http://www.timberjay.com/stories/economic-boom-or-workforce-woes,14305                                                                                                                                            
781299985    http://kmxt.org/2018/08/asmi-says-fish-meal-included-tariff-changes-calls-comments/                                                                                                                               
781300001    https://www.michigansthumb.com/news/crime/article/Mentally-ill-man-pleads-guilty-to-helping-13174947.php                                                                                                          
781300002    https://www.pressofatlanticcity.com/news/breaking/offshore-wind-fishing-indus

In [20]:
# Print the summary statistics as plain text so that every column is shown
# instead of being elided by the notebook's rich display.
print(
    DF
    .describe()
    .to_string()
)

         SQLDATE MonthYear    Year FractionDate Actor1Code     Actor1Name Actor1CountryCode Actor1KnownGroupCode Actor1EthnicCode Actor1Religion1Code Actor1Religion2Code Actor1Type1Code Actor1Type2Code Actor1Type3Code Actor2Code     Actor2Name Actor2CountryCode Actor2KnownGroupCode Actor2EthnicCode Actor2Religion1Code Actor2Religion2Code Actor2Type1Code Actor2Type2Code Actor2Type3Code IsRootEvent EventCode EventBaseCode EventRootCode QuadClass GoldsteinScale NumMentions NumSources NumArticles AvgTone Actor1Geo_Type                               Actor1Geo_FullName Actor1Geo_CountryCode Actor1Geo_ADM1Code Actor1Geo_Lat Actor1Geo_Long Actor1Geo_FeatureID Actor2Geo_Type                               Actor2Geo_FullName Actor2Geo_CountryCode Actor2Geo_ADM1Code Actor2Geo_Lat Actor2Geo_Long Actor2Geo_FeatureID ActionGeo_Type                               ActionGeo_FullName ActionGeo_CountryCode ActionGeo_ADM1Code ActionGeo_Lat ActionGeo_Long ActionGeo_FeatureID DATEADDED                       