# Using the GDELT Python Package to Build a URL List

I had a large gap in the data from my initial GDELT BigQuery export. Because I had exhausted my BigQuery credits, I turned to the GDELT Python package instead. This script attempts to approximate the query I passed to BigQuery. The output dataset is then passed to my article scraper.

In [1]:
#install gdelt python package
!pip install gdelt

Collecting gdelt
  Downloading gdelt-0.1.10.6.1-py2.py3-none-any.whl (773 kB)
[K     |████████████████████████████████| 773 kB 4.4 MB/s eta 0:00:01
Installing collected packages: gdelt
Successfully installed gdelt-0.1.10.6
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
#imports
import gdelt
import pandas as pd

In [3]:
# Load the themes that were used as filters in the original BigQuery query.
# The file is read without a header row, every value as a string, and the
# column is named 'Themes'.
themes = pd.read_csv(
    '/floyd/home/Capstone/cap_notebooks/data/Gdelt/gdelt_themes_oct_13.csv',
    header=None,
    names=['Themes'],
    dtype='str',
)

In [4]:
# Display the loaded themes table as a quick visual check.
themes

Unnamed: 0,Themes
0,WB_567
1,DEMOCRACY
2,CONSTITUTIONAL
3,CORRUPTION
4,ECON_COST_OF_LIVING
...,...
66,TAX_RELIGION
67,TRANSPARENCY
68,UNEMPLOYMENT
69,UNREST_POLICEBRUTALITY


In [5]:
# Pull the 'Themes' column out into a plain Python list; the filtering loop
# later tests each of these strings as a substring of a record's themes.
theme_list = themes['Themes'].tolist()


In [6]:
# Inspect the theme list to confirm it loaded as expected.
theme_list

['WB_567',
 'DEMOCRACY',
 'CONSTITUTIONAL',
 'CORRUPTION',
 'ECON_COST_OF_LIVING',
 'ECON_MONOPOLY',
 'ECON_STOCKMARKET',
 'ECON_TAXATION',
 'EDUCATION',
 'ELECTION',
 'ELECTION_FRAUD',
 'ENV_BIOFUEL',
 'ENV_CARBONCAPTURE',
 'ENV_CLIMATECHANGE',
 'ENV_COAL',
 'ENV_DEFESTATION',
 'ENV_FISHERY',
 'ENV_FESTRY',
 'ENV_GEOTHERMAL',
 'ENV_HYDRO',
 'ENV_METALS',
 'ENV_MINING',
 'ENV_NATURALGAS',
 'ENV_NUCLEARPOWER',
 'ENV_OIL',
 'ENV_POACHING',
 'ENV_SOLAR',
 'ENV_SPECIESENDANGERED',
 'ENV_SPECIESEXTINCT',
 'ENV_WATERWAYS',
 'ENV_WINDPOWER',
 'ETH_INDIGINOUS',
 'EXTREMISM',
 'FIREARM_OWNERSHIP',
 'FOOD_SECURITY',
 'FREESPEECH',
 'GENDER_VIOLENCE',
 'GENERAL_GOVERNMENT',
 'GENERAL_HEALTH',
 'GOV_DISSOLVEGOV',
 'GOV_DIVISIONOFPOWER',
 'GOV_REFM',
 'GRIEVANCES',
 'HATE_SPEECH',
 'HEALTH_PANDEMIC',
 'INFO_HOAX',
 'INFO_RUM',
 'INTERNET_CENSSHIP',
 'LEADER',
 'LEGISLATION',
 'LGBT',
 'MEDIA_CENSSHIP',
 'MOVEMENT_ENVIRONMENTAL',
 'MOVEMENT_GENERAL',
 'MOVEMENT_OTHER',
 'MOVEMENT_SOCIAL',
 'MOVEMENT

In [7]:
# Load the news-source domains that were used as filters in the original
# BigQuery query. The file is read without a header row, every value as a
# string, and the column is named 'Sources'.
sources = pd.read_csv(
    '/floyd/home/Capstone/cap_notebooks/data/Gdelt/gdelt_sources.csv',
    header=None,
    names=['Sources'],
    dtype='str',
)

In [8]:
# Pull the 'Sources' column out into a plain Python list; the filtering loop
# later tests each of these strings as a substring of a record's source name.
source_list = sources['Sources'].tolist()

In [9]:
# Inspect the source list to confirm it loaded as expected.
# Note: the output below lists 'breitbart.com' twice.
source_list

['foxnews.com',
 'nytimes.com',
 'buzzfeed.com',
 'usatoday.com',
 'cbsnews.com',
 'huffpost.com',
 'nbcnews.com',
 'cnn.com',
 'abcnews.com',
 'msnbc.com',
 'npr.org',
 'cnbc.com',
 'msn.com',
 'reuters.com',
 'latimes.com',
 'vox.com',
 'breitbart.com',
 'breitbart.com',
 'drudgereport.com',
 'nationalreview.com']

In [7]:
# Create the gdelt client object that the download loop calls `Search` on.
# NOTE(review): version=2 presumably selects the GDELT 2.0 data — confirm
# against the gdelt package documentation.
gd = gdelt.gdelt(version=2)

The following code:
- Defines the date range I need to collect data for
- Sets up a while loop that stops once it has processed the final date in the range
- Passes each day in the range to the gdelt package, extracts the data, and filters it by theme, source, and location
- Appends the filtered data to a CSV file

In [9]:
%%time

#create location list
locations = ['USA', 'United States', 'US', 'U.S.']

import datetime
#sets start_date
start_date = datetime.date(2019, 9, 1)
#sets end_date
end_date = datetime.date(2019, 10, 1)
#sets the number of days to increment through
delta = datetime.timedelta(days=1)

while start_date <= end_date:
    #extracts date in the loop and creates date string to pass the gdelt package
    date_str = str(start_date.year) + ' ' + str(start_date.month) + ' ' + str(start_date.day)
    
    #loads gdelt object for the date_str
    results = gd.Search([date_str], table='gkg', coverage=True, output='df')
    print('day_loaded')

    #dropping unused columns
    results = results.drop(['SharingImage', 'RelatedImages', 'SocialImageEmbeds', 'SocialVideoEmbeds', 'TranslationInfo', 'Extras'], axis=1)
    print('cols_dropped')
    
    #removing key na columns
    results.dropna(subset=['Themes', 'Locations'], inplace=True)
    print('null themes dropped')
    

    #checks for us locations
    results['location_bool'] = results['Locations'].apply(lambda x: any(location in x for location in locations))
    print('locations checked')


    #filters for US locations
    results = results[results['location_bool']==True]
    print('locations filtered')

    #checks for political themese
    results['theme_bool'] = results['Themes'].apply(lambda x: any(theme in x for theme in theme_list))
    print('themes checked')

    #filters for relevant themes
    results = results[results['theme_bool']==True]
    print('themes filtered')
    
    #creates boolean mask to filter for source
    results['source_bool'] = results['SourceCommonName'].apply(lambda x: any(source in str(x) for source in source_list))
    print('sources checked')

    #filters source
    results = results[results['source_bool']==True]
    print('sources filtered')
    
    #appends days worth of URLs to new file
    results.to_csv('gdelt_gkg_2019-06-1_2019-10-1.csv', mode='a', header=False)
    print(f'{start_date} written to file')
    
    #advances loop by one day
    start_date += delta

print('job done')



day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-01 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-02 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-03 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-04 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-05 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-06 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-07 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-08 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-09 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-10 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-11 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-12 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-13 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-14 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-15 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-16 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-17 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-18 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-19 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-20 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-21 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-22 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-23 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-24 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-25 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-26 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-27 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-28 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-29 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-09-30 written to file




day_loaded
cols_dropped
null themes dropped
locations checked
locations filtered
themes checked
themes filtered
sources checked
sources filtered
2019-10-01 written to file
job done
CPU times: user 2min 4s, sys: 1min 34s, total: 3min 38s
Wall time: 7min 56s


In [127]:
# Load a CSV to sanity-check the exported columns.
# NOTE(review): this reads test_csv.csv, not the
# gdelt_gkg_2019-06-1_2019-10-1.csv file that the download loop appends to —
# confirm this is the intended file.
check = pd.read_csv('/floyd/home/Capstone/cap_notebooks/notebooks/Scappers/test_csv.csv')

In [129]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
check.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11315 entries, 0 to 11314
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  11315 non-null  int64  
 1   GKGRECORDID                 11315 non-null  object 
 2   DATE                        11315 non-null  float64
 3   SourceCollectionIdentifier  11315 non-null  float64
 4   SourceCommonName            11315 non-null  object 
 5   DocumentIdentifier          11315 non-null  object 
 6   Counts                      3221 non-null   object 
 7   V2Counts                    3221 non-null   object 
 8   Themes                      11315 non-null  object 
 9   V2Themes                    11315 non-null  object 
 10  Locations                   11315 non-null  object 
 11  V2Locations                 11312 non-null  object 
 12  Persons                     10653 non-null  object 
 13  V2Persons                   106