### Import Modules and Python Functions

In [1]:
import os
import numpy as np
import pandas as pd
import yaml
with open('../python/variables.yaml') as info:
    VARS = yaml.load(info, Loader=yaml.FullLoader)

#os.path.join( os.path.dirname('__file__' ), '../python' )
#from from_csv_to_df import get_select_data

### Import Variables

In [2]:
# Events
events_csv = VARS['EVENTS_CSV']
events_cols_all = VARS['EVENTS_COLS_ALL']
events_cols_select = VARS['EVENTS_COLS_SELECT']

# Mentions
mentions_csv = VARS['MENTIONS_CSV']
mentions_cols_all = VARS['MENTIONS_COLS_ALL']
mentions_cols_select = VARS['MENTIONS_COLS_SELECT']

# Pilot
cameo_codes = VARS['PILOT_CAMEO_VERB_CODES']
select_countries_60 = VARS['PILOT_COUNTRIES_IOS2']

### Import Data

In [3]:
def get_select_data(filepath: str, all_cols: list, select_cols: list) -> object:
    """Get Select Columns of Data from GDELT Latest Update CSV
    :param filepath: full filename and path to CSV file to be imported
    :param all_cols: column names of the CSV imported
    :param select_cols: derivative list of columns from all_cols
    :rtype: dataframe
    :return: dataframe
    """

    # Import entire CSV
    latest_update_df = pd.DataFrame(pd.read_csv(filepath,
                                                names=all_cols,
                                                delimiter="\t"))

    # Select specific columns
    return latest_update_df[select_cols]

#### Events Data

In [4]:
# get data
gdelt_se_df = get_select_data(events_csv,
                              events_cols_all,
                              events_cols_select)
print(gdelt_se_df.shape)
print(gdelt_se_df.info())
gdelt_se_df.head()

(1212, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1212 entries, 0 to 1211
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   GLOBALEVENTID          1212 non-null   int64  
 1   SQLDATE                1212 non-null   int64  
 2   EventCode              1212 non-null   int64  
 3   EventRootCode          1212 non-null   int64  
 4   QuadClass              1212 non-null   int64  
 5   GoldsteinScale         1212 non-null   float64
 6   NumMentions            1212 non-null   int64  
 7   NumSources             1212 non-null   int64  
 8   NumArticles            1212 non-null   int64  
 9   AvgTone                1212 non-null   float64
 10  ActionGeo_Type         1212 non-null   int64  
 11  ActionGeo_FullName     1177 non-null   object 
 12  ActionGeo_CountryCode  1179 non-null   object 
 13  ActionGeo_Lat          1177 non-null   float64
 14  ActionGeo_Long         1177 non-null   float6

Unnamed: 0,GLOBALEVENTID,SQLDATE,EventCode,EventRootCode,QuadClass,GoldsteinScale,NumMentions,NumSources,NumArticles,AvgTone,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL
0,972238884,20200301,20,2,1,3.0,2,1,2,-6.52921,4,"Canberra, Australian Capital Territory, Australia",AS,-35.2833,149.217,https://www.aninews.in/news/world/asia/china-a...
1,972238885,20200301,20,2,1,3.0,2,1,2,-6.52921,4,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...
2,972238886,20200301,20,2,1,3.0,6,1,6,-6.52921,4,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...
3,972238887,20200301,111,11,3,-2.0,4,1,4,-4.385965,4,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...
4,972238888,20200301,110,11,3,-2.0,4,1,4,-4.385965,4,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...


In [10]:
print('Number of Global Event Ids: ', len(gdelt_se_df['GLOBALEVENTID'].unique()))

Number of Global Event Ids:  1212


### Mentions Data

In [30]:
# get data
gdelt_sm_df = get_select_data(mentions_csv,
                              mentions_cols_all,
                              mentions_cols_select)
print(gdelt_sm_df.shape)
print(gdelt_sm_df.info())
gdelt_sm_df.head()

(4325, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4325 entries, 0 to 4324
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   GLOBALEVENTID      4325 non-null   int64  
 1   EventTimeDate      4325 non-null   int64  
 2   MentionIdentifier  4325 non-null   object 
 3   MentionTimeDate    4325 non-null   int64  
 4   Confidence         4325 non-null   int64  
 5   MentionDocLen      4325 non-null   int64  
 6   MentionDocTone     4325 non-null   float64
dtypes: float64(1), int64(5), object(1)
memory usage: 236.6+ KB
None


Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionIdentifier,MentionTimeDate,Confidence,MentionDocLen,MentionDocTone
0,972238884,20210301204500,https://www.aninews.in/news/world/asia/china-a...,20210301204500,20,1886,-6.52921
1,972238885,20210301204500,https://www.aninews.in/news/world/asia/china-a...,20210301204500,20,1886,-6.52921
2,972238886,20210301204500,https://www.aninews.in/news/world/asia/china-a...,20210301204500,60,1886,-6.52921
3,909428780,20200301123000,https://www.dailyexaminer.com.au/news/australi...,20210301204500,50,3931,0.4329
4,972238887,20210301204500,https://www.techshout.com/security/2021/01/chi...,20210301204500,40,4931,-4.385965


In [31]:
print('Number of Global Event Ids: ', len(gdelt_sm_df['GLOBALEVENTID'].unique()))

Number of Global Event Ids:  3229


#### Select Mentions within first 60 Days of an Event

In [36]:
# Calculate days between
gdelt_sm_df['DaysBetween'] = gdelt_sm_df['MentionTimeDate'] - gdelt_sm_df['EventTimeDate']
#print(gdelt_sm_df['DaysBetween'][0])

# Drop rows where days_between <= 60 days
gdelt_sm_60d_df = gdelt_sm_df[gdelt_sm_df['DaysBetween'] <= 60].reset_index(drop=True)
print(gdelt_sm_60d_df.shape)
print('Number of Global Event Ids: ', len(gdelt_sm_60d_df['GLOBALEVENTID'].unique()))
gdelt_sm_60d_df.head()

(1324, 8)
Number of Global Event Ids:  1212


Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionIdentifier,MentionTimeDate,Confidence,MentionDocLen,MentionDocTone,DaysBetween
0,972238884,20210301204500,https://www.aninews.in/news/world/asia/china-a...,20210301204500,20,1886,-6.52921,0
1,972238885,20210301204500,https://www.aninews.in/news/world/asia/china-a...,20210301204500,20,1886,-6.52921,0
2,972238886,20210301204500,https://www.aninews.in/news/world/asia/china-a...,20210301204500,60,1886,-6.52921,0
3,972238887,20210301204500,https://www.techshout.com/security/2021/01/chi...,20210301204500,40,4931,-4.385965,0
4,972238888,20210301204500,https://www.techshout.com/security/2021/01/chi...,20210301204500,40,4931,-4.385965,0


In [37]:
# Verify output
max(gdelt_sm_60d_df['DaysBetween'])

0

#### Group Mentions Data by GlobalEventId for Average Confidence and Mention Tone

In [38]:
# Change date int64 columns to datetimes
date_format = '%Y%m%d%H%M%S'
gdelt_sm_60d_df['EventTimeDate'] = pd.to_datetime(gdelt_sm_60d_df['EventTimeDate'].astype(str), format=date_format)
gdelt_sm_60d_df['MentionTimeDate'] = pd.to_datetime(gdelt_sm_60d_df['MentionTimeDate'].astype(str), format=date_format)
gdelt_sm_60d_df.head()

Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionIdentifier,MentionTimeDate,Confidence,MentionDocLen,MentionDocTone,DaysBetween
0,972238884,2021-03-01 20:45:00,https://www.aninews.in/news/world/asia/china-a...,2021-03-01 20:45:00,20,1886,-6.52921,0
1,972238885,2021-03-01 20:45:00,https://www.aninews.in/news/world/asia/china-a...,2021-03-01 20:45:00,20,1886,-6.52921,0
2,972238886,2021-03-01 20:45:00,https://www.aninews.in/news/world/asia/china-a...,2021-03-01 20:45:00,60,1886,-6.52921,0
3,972238887,2021-03-01 20:45:00,https://www.techshout.com/security/2021/01/chi...,2021-03-01 20:45:00,40,4931,-4.385965,0
4,972238888,2021-03-01 20:45:00,https://www.techshout.com/security/2021/01/chi...,2021-03-01 20:45:00,40,4931,-4.385965,0


In [35]:
gdelt_sm_3d_agg_df = pd.DataFrame()
gdelt_sm_14d_agg_df = pd.DataFrame()
gdelt_sm_60d_agg_df = pd.DataFrame()

for gId in gdelt_sm_60d_df['GLOBALEVENTID'].unique():
    ge_df = gdelt_sm_60d_df.loc[ gdelt_sm_60d_df['GLOBALEVENTID'] == gId ]
    ge_grouped_df = ge_df.groupby(['GLOBALEVENTID', 'EventTimeDate']).mean() # get mean Confidence, Tone, DaysBetween
    
    
    
    ge_grouped_df.columns = ['GLOBALEVENTID', 'EventTimeDate', 'MeanConfidence', 'MeanDocTone', ]
    
    #print(ge_grouped_df.head(1))

                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972238884     2021-03-01 20:45:00          20           1886        -6.52921   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972238884     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972238885     2021-03-01 20:45:00          20           1886        -6.52921   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972238885     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972238886     2021-03-01 20:45:00          60           1886    

972238933     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972238934     2021-03-01 20:45:00          40           1963       -5.135952   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972238934     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972238935     2021-03-01 20:45:00          40           4368       -9.183673   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972238935     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972238936     2

972238981     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972238982     2021-03-01 20:45:00          60           3000        2.868852   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972238982     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972238983     2021-03-01 20:45:00         100           4354        0.564972   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972238983     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972238984     2

972239033     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239034     2021-03-01 20:45:00          20            718         6.10687   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239034     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239035     2021-03-01 20:45:00          20           7901       -6.855792   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239035     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239036     2

972239090     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239091     2021-03-01 20:45:00         100           3179       -4.848485   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239091     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239092     2021-03-01 20:45:00         100          11096       -3.581746   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239092     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239093     2

972239133     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239134     2021-03-01 20:45:00          30           1877        0.649351   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239134     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239135     2021-03-01 20:45:00          60           3507       -1.441441   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239135     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239136     2

972239172     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239173     2021-03-01 20:45:00          40          17457       -3.116046   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239173     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239174     2021-03-01 20:45:00          10          17457       -3.116046   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239174     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239175     2

972239215     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239216     2021-03-01 20:45:00          60           2325       -6.527415   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239216     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239217     2021-03-01 20:45:00         100           8339        1.572552   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239217     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239218     2

                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239258     2021-03-01 20:45:00          50           7565       -1.338912   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239258     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239259     2021-03-01 20:45:00          10           1773        5.902778   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239259     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239260     2021-03-01 20:45:00          10           1773    

972239305     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239306     2021-03-01 20:45:00          20           1655       -1.324503   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239306     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239307     2021-03-01 20:45:00          80           1655       -1.324503   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239307     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239308     2

972239367     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239368     2021-03-01 20:45:00          50           1504       -1.265823   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239368     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239369     2021-03-01 20:45:00          20           2105       -0.311526   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239369     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239370     2

972239424     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239425     2021-03-01 20:45:00          20           7259       -4.952215   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239425     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239426     2021-03-01 20:45:00          80           7259       -4.952215   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239426     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239427     2

972239478     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239479     2021-03-01 20:45:00          10          21011       -3.190996   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239479     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239480     2021-03-01 20:45:00          20          21011       -3.190996   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239480     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239481     2

972239535     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239536     2021-03-01 20:45:00          16           2400        1.080691   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239536     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239537     2021-03-01 20:45:00          10           2631        1.448642   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239537     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239538     2

972239592     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239593     2021-03-01 20:45:00          60           1770       -3.040541   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239593     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239594     2021-03-01 20:45:00         100           2870        0.869565   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239594     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239595     2

                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239647     2021-03-01 20:45:00          40           7817       -3.725166   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239647     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239648     2021-03-01 20:45:00          20           2568       -4.535147   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239648     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239649     2021-03-01 20:45:00         100          17457    

972239704     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239705     2021-03-01 20:45:00          10            968             0.0   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239705     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239706     2021-03-01 20:45:00          20           2405       -4.883721   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239706     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239707     2

972239761     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239762     2021-03-01 20:45:00          20           6084       -2.529183   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239762     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239763     2021-03-01 20:45:00          20           6084       -2.529183   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239763     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239764     2

972239818     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239819     2021-03-01 20:45:00          70           7259       -4.952215   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239819     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239820     2021-03-01 20:45:00          10           7259       -4.952215   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239820     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239821     2

972239865     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239866     2021-03-01 20:45:00         100          11936       -2.465166   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239866     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239867     2021-03-01 20:45:00          50          11936       -2.465166   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239867     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239868     2

972239908     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239909     2021-03-01 20:45:00          20           2781             0.0   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239909     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239910     2021-03-01 20:45:00          10           2165        3.264095   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239910     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239911     2

972239955     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239956     2021-03-01 20:45:00          40           9218        1.909477   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239956     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239957     2021-03-01 20:45:00          20           4734        2.297297   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239957     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239958     2

972239998     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972239999     2021-03-01 20:45:00          30           4797        1.612903   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972239999     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972240000     2021-03-01 20:45:00         100           3810       -4.269294   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972240000     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972240001     2

972240050     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972240051     2021-03-01 20:45:00          60           6420        0.958773   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972240051     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972240052     2021-03-01 20:45:00          40           6420        0.958773   

                                   DaysBetween  
GLOBALEVENTID EventTimeDate                     
972240052     2021-03-01 20:45:00            0  
                                   Confidence  MentionDocLen  MentionDocTone  \
GLOBALEVENTID EventTimeDate                                                    
972240053     2

### Join Events and Mentions Data on GlobalEventId

In [9]:
print('Events Data: ', gdelt_se_df.shape)
print('Mentions Data: ', gdelt_sm_df.shape)

# Merge dataframes
merged_df = gdelt_se_df.merge(gdelt_sm_df, how='left', on='GLOBALEVENTID')
print('Merged Data w/ Duplicates: ', merged_df.shape)

# Drop duplicates
merged_df = merged_df.drop_duplicates()
print('Merged Data w/o Duplicates: ',merged_df.shape)
print(merged_df.info())
merged_df.head()

Events Data:  (1212, 16)
Mentions Data:  (4325, 6)
Merged Data w/ Duplicates:  (1324, 21)
Merged Data w/o Duplicates:  (1324, 21)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1324 entries, 0 to 1323
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   GLOBALEVENTID          1324 non-null   int64  
 1   SQLDATE                1324 non-null   int64  
 2   EventCode              1324 non-null   int64  
 3   EventRootCode          1324 non-null   int64  
 4   QuadClass              1324 non-null   int64  
 5   GoldsteinScale         1324 non-null   float64
 6   NumMentions            1324 non-null   int64  
 7   NumSources             1324 non-null   int64  
 8   NumArticles            1324 non-null   int64  
 9   AvgTone                1324 non-null   float64
 10  ActionGeo_Type         1324 non-null   int64  
 11  ActionGeo_FullName     1279 non-null   object 
 12  ActionGeo_CountryCode  1289 no

Unnamed: 0,GLOBALEVENTID,SQLDATE,EventCode,EventRootCode,QuadClass,GoldsteinScale,NumMentions,NumSources,NumArticles,AvgTone,...,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL,MentionIdentifier,MentionTimeDate,Confidence,MentionDocLen,MentionDocTone
0,972238884,20200301,20,2,1,3.0,2,1,2,-6.52921,...,"Canberra, Australian Capital Territory, Australia",AS,-35.2833,149.217,https://www.aninews.in/news/world/asia/china-a...,https://www.aninews.in/news/world/asia/china-a...,20210301204500,20,1886,-6.52921
1,972238885,20200301,20,2,1,3.0,2,1,2,-6.52921,...,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...,https://www.aninews.in/news/world/asia/china-a...,20210301204500,20,1886,-6.52921
2,972238886,20200301,20,2,1,3.0,6,1,6,-6.52921,...,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...,https://www.aninews.in/news/world/asia/china-a...,20210301204500,60,1886,-6.52921
3,972238887,20200301,111,11,3,-2.0,4,1,4,-4.385965,...,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...,https://www.techshout.com/security/2021/01/chi...,20210301204500,40,4931,-4.385965
4,972238888,20200301,110,11,3,-2.0,4,1,4,-4.385965,...,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...,https://www.techshout.com/security/2021/01/chi...,20210301204500,40,4931,-4.385965
