### Import Modules and Python Functions

In [1]:
import os
import numpy as np
import pandas as pd
import yaml
with open('../python/variables.yaml') as info:
    VARS = yaml.load(info, Loader=yaml.FullLoader)

#os.path.join( os.path.dirname('__file__' ), '../python' )
#from from_csv_to_df import get_select_data

### Import Variables

In [2]:
# Events
events_csv = VARS['EVENTS_CSV']
events_cols_all = VARS['EVENTS_COLS_ALL']
events_cols_select = VARS['EVENTS_COLS_SELECT']

# Mentions
mentions_csv = VARS['MENTIONS_CSV']
mentions_cols_all = VARS['MENTIONS_COLS_ALL']
mentions_cols_select = VARS['MENTIONS_COLS_SELECT']

# Pilot
cameo_codes = VARS['PILOT_CAMEO_VERB_CODES']
select_countries_60 = VARS['PILOT_COUNTRIES_IOS2']

### Import Data

In [3]:
def get_select_data(filepath: str, all_cols: list, select_cols: list) -> object:
    """Get Select Columns of Data from GDELT Latest Update CSV
    :param filepath: full filename and path to CSV file to be imported
    :param all_cols: column names of the CSV imported
    :param select_cols: derivative list of columns from all_cols
    :rtype: dataframe
    :return: dataframe
    """

    # Import entire CSV
    latest_update_df = pd.DataFrame(pd.read_csv(filepath,
                                                names=all_cols,
                                                delimiter="\t"))

    # Select specific columns
    return latest_update_df[select_cols]

#### Events Data

In [4]:
# get data
gdelt_se_df = get_select_data(events_csv,
                              events_cols_all,
                              events_cols_select)
print(gdelt_se_df.shape)
print(gdelt_se_df.info())
gdelt_se_df.head(1)

(1212, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1212 entries, 0 to 1211
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   GLOBALEVENTID          1212 non-null   int64  
 1   EventCode              1212 non-null   int64  
 2   EventRootCode          1212 non-null   int64  
 3   QuadClass              1212 non-null   int64  
 4   GoldsteinScale         1212 non-null   float64
 5   ActionGeo_Type         1212 non-null   int64  
 6   ActionGeo_FullName     1177 non-null   object 
 7   ActionGeo_CountryCode  1179 non-null   object 
 8   ActionGeo_Lat          1177 non-null   float64
 9   ActionGeo_Long         1177 non-null   float64
 10  SOURCEURL              1212 non-null   object 
dtypes: float64(3), int64(5), object(3)
memory usage: 104.3+ KB
None


Unnamed: 0,GLOBALEVENTID,EventCode,EventRootCode,QuadClass,GoldsteinScale,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL
0,972238884,20,2,1,3.0,4,"Canberra, Australian Capital Territory, Australia",AS,-35.2833,149.217,https://www.aninews.in/news/world/asia/china-a...


In [5]:
print('Number of Global Event Ids: ', len(gdelt_se_df['GLOBALEVENTID'].unique()))

Number of Global Event Ids:  1212


### Mentions Data

In [6]:
# get data
gdelt_sm_df = get_select_data(mentions_csv,
                              mentions_cols_all,
                              mentions_cols_select)
print(gdelt_sm_df.shape)
print(gdelt_sm_df.info())
gdelt_sm_df.head(1)

(4325, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4325 entries, 0 to 4324
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GLOBALEVENTID    4325 non-null   int64  
 1   EventTimeDate    4325 non-null   int64  
 2   MentionTimeDate  4325 non-null   int64  
 3   Confidence       4325 non-null   int64  
 4   MentionDocTone   4325 non-null   float64
dtypes: float64(1), int64(4)
memory usage: 169.1 KB
None


Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone
0,972238884,20210301204500,20210301204500,20,-6.52921


In [7]:
print('Number of Global Event Ids: ', len(gdelt_sm_df['GLOBALEVENTID'].unique()))

Number of Global Event Ids:  3229


#### Select Mentions within first 60 Days of an Event

In [8]:
# Calculate days between
gdelt_sm_df['DaysBetween'] = gdelt_sm_df['MentionTimeDate'] - gdelt_sm_df['EventTimeDate']
#print(gdelt_sm_df['DaysBetween'][0])

# Drop rows where days_between <= 60 days
gdelt_sm_60d_df = gdelt_sm_df[gdelt_sm_df['DaysBetween'] <= 60].reset_index(drop=True)
print(gdelt_sm_60d_df.shape)
print('Number of Global Event Ids: ', len(gdelt_sm_60d_df['GLOBALEVENTID'].unique()))
gdelt_sm_60d_df.head()

(1324, 6)
Number of Global Event Ids:  1212


Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone,DaysBetween
0,972238884,20210301204500,20210301204500,20,-6.52921,0
1,972238885,20210301204500,20210301204500,20,-6.52921,0
2,972238886,20210301204500,20210301204500,60,-6.52921,0
3,972238887,20210301204500,20210301204500,40,-4.385965,0
4,972238888,20210301204500,20210301204500,40,-4.385965,0


In [9]:
# Verify output
max(gdelt_sm_60d_df['DaysBetween'])

0

#### Change int64 dates to datetimes

In [10]:
date_format = '%Y%m%d%H%M%S'
gdelt_sm_60d_df['EventTimeDate'] = pd.to_datetime(gdelt_sm_60d_df['EventTimeDate'].astype(str), format=date_format)
gdelt_sm_60d_df['MentionTimeDate'] = pd.to_datetime(gdelt_sm_60d_df['MentionTimeDate'].astype(str), format=date_format)
gdelt_sm_60d_df.head()

Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone,DaysBetween
0,972238884,2021-03-01 20:45:00,2021-03-01 20:45:00,20,-6.52921,0
1,972238885,2021-03-01 20:45:00,2021-03-01 20:45:00,20,-6.52921,0
2,972238886,2021-03-01 20:45:00,2021-03-01 20:45:00,60,-6.52921,0
3,972238887,2021-03-01 20:45:00,2021-03-01 20:45:00,40,-4.385965,0
4,972238888,2021-03-01 20:45:00,2021-03-01 20:45:00,40,-4.385965,0


### Join Events and Mentions Data on GlobalEventId

In [11]:
print('Events Data: ', gdelt_se_df.shape)
print('Mentions Data: ', gdelt_sm_60d_df.shape)

# Merge dataframes
merged_df = gdelt_se_df.merge(gdelt_sm_60d_df, how='left', on='GLOBALEVENTID')
print('Merged Data w/ Duplicates: ', merged_df.shape)

# Drop duplicates
merged_df = merged_df.drop_duplicates()
print('Merged Data w/o Duplicates: ',merged_df.shape)
print(merged_df.info())
merged_df.head()

Events Data:  (1212, 11)
Mentions Data:  (1324, 6)
Merged Data w/ Duplicates:  (1324, 16)
Merged Data w/o Duplicates:  (1301, 16)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1301 entries, 0 to 1323
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   GLOBALEVENTID          1301 non-null   int64         
 1   EventCode              1301 non-null   int64         
 2   EventRootCode          1301 non-null   int64         
 3   QuadClass              1301 non-null   int64         
 4   GoldsteinScale         1301 non-null   float64       
 5   ActionGeo_Type         1301 non-null   int64         
 6   ActionGeo_FullName     1259 non-null   object        
 7   ActionGeo_CountryCode  1267 non-null   object        
 8   ActionGeo_Lat          1259 non-null   float64       
 9   ActionGeo_Long         1259 non-null   float64       
 10  SOURCEURL              1301 non-null   object     

Unnamed: 0,GLOBALEVENTID,EventCode,EventRootCode,QuadClass,GoldsteinScale,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone,DaysBetween
0,972238884,20,2,1,3.0,4,"Canberra, Australian Capital Territory, Australia",AS,-35.2833,149.217,https://www.aninews.in/news/world/asia/china-a...,2021-03-01 20:45:00,2021-03-01 20:45:00,20,-6.52921,0
1,972238885,20,2,1,3.0,4,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...,2021-03-01 20:45:00,2021-03-01 20:45:00,20,-6.52921,0
2,972238886,20,2,1,3.0,4,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...,2021-03-01 20:45:00,2021-03-01 20:45:00,60,-6.52921,0
3,972238887,111,11,3,-2.0,4,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...,2021-03-01 20:45:00,2021-03-01 20:45:00,40,-4.385965,0
4,972238888,110,11,3,-2.0,4,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...,2021-03-01 20:45:00,2021-03-01 20:45:00,40,-4.385965,0


#### Group Mentions Data by GlobalEventId for Average Confidence and Mention Tone

In [39]:
agg_cols = ['GLOBALEVENTID','Confidence', 'MentionDocTone']

gdelt_sm_60d_agg_df = pd.DataFrame()

for gId in gdelt_sm_60d_df['GLOBALEVENTID'].unique():
    
    ge_df = gdelt_sm_60d_df.loc[ gdelt_sm_60d_df['GLOBALEVENTID'] == gId ][agg_cols]
    
    # 3 day average
    ge_grouped_df = ge_df.groupby(['GLOBALEVENTID', 'EventTimeDate']).mean() # get mean Confidence, Tone, DaysBetween
    

KeyError: 'EventTimeDate'