### Import Modules and Python Functions

In [1]:
import os
import numpy as np
import pandas as pd
import yaml
with open('../python/variables.yaml') as info:
    VARS = yaml.load(info, Loader=yaml.FullLoader)

### Import Variables

In [2]:
# Events
events_csv = VARS['EVENTS_CSV']
events_cols_all = VARS['EVENTS_COLS_ALL']
events_cols_select = VARS['EVENTS_COLS_SELECT']

# Mentions
mentions_csv = VARS['MENTIONS_CSV']
mentions_cols_all = VARS['MENTIONS_COLS_ALL']
mentions_cols_select = VARS['MENTIONS_COLS_SELECT']

# CAMEO
cameo_verbs = VARS['CAMEO_VERBS']
cameo_quadclass = VARS['CAMEO_QUADCLASS']

# Output
desired_columns = VARS['DESIRED_COLUMNS']

### Define Reusable Python Functions

In [3]:
def get_select_data(filepath: str, all_cols: list, select_cols: list) -> pd:
    """Get Select Columns of Data from GDELT Latest Update CSV
    :param filepath: full filename and path to CSV file to be imported
    :param all_cols: column names of the CSV imported
    :param select_cols: derivative list of columns from all_cols
    :rtype: dataframe
    :return: dataframe
    """

    # Import entire CSV
    latest_update_df = pd.DataFrame(pd.read_csv(filepath,
                                                names=all_cols,
                                                delimiter="\t"))

    # Select specific columns
    return latest_update_df[select_cols]

### Import Data

#### Events Data

In [4]:
# get data
e1_df = get_select_data(events_csv,
                        events_cols_all,
                        events_cols_select)

e2_df = get_select_data('../select_data/data/latest_gdelt_events_2.csv',
                        events_cols_all,
                        events_cols_select)

e3_df = get_select_data('../select_data/data/latest_gdelt_events_3.csv',
                        events_cols_all,
                        events_cols_select)

merge_e_df = pd.merge(e1_df, e2_df, on=events_cols_select, how='outer')
gdelt_se_df = pd.merge(merge_e_df, e3_df, on=events_cols_select, how='outer')
print(gdelt_se_df.shape)
print(gdelt_se_df.info())
gdelt_se_df.head(1)

(3896, 12)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3896 entries, 0 to 3895
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   GLOBALEVENTID          3896 non-null   int64  
 1   SQLDATE                3896 non-null   int64  
 2   EventCode              3896 non-null   int64  
 3   EventRootCode          3896 non-null   int64  
 4   QuadClass              3896 non-null   int64  
 5   GoldsteinScale         3896 non-null   float64
 6   ActionGeo_Type         3896 non-null   int64  
 7   ActionGeo_FullName     3790 non-null   object 
 8   ActionGeo_CountryCode  3791 non-null   object 
 9   ActionGeo_Lat          3790 non-null   float64
 10  ActionGeo_Long         3790 non-null   float64
 11  SOURCEURL              3896 non-null   object 
dtypes: float64(3), int64(6), object(3)
memory usage: 395.7+ KB
None


Unnamed: 0,GLOBALEVENTID,SQLDATE,EventCode,EventRootCode,QuadClass,GoldsteinScale,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL
0,972833876,20200304,51,5,1,3.4,3,"Baltimore, Maryland, United States",US,39.2904,-76.6122,https://patch.com/maryland/fallston/boyle-buic...


In [5]:
print('Number of Global Event Ids: ', len(gdelt_se_df['GLOBALEVENTID'].unique()))

Number of Global Event Ids:  3896


In [6]:
# Assess date range
print('Min Event Date: ', gdelt_se_df['SQLDATE'].min())
print('Max Event Date: ', gdelt_se_df['SQLDATE'].max())

Min Event Date:  20200302
Max Event Date:  20210304


### Mentions Data

In [7]:
# get data
m1_df = get_select_data(mentions_csv,
                              mentions_cols_all,
                              mentions_cols_select)

m2_df = get_select_data('../select_data/data/latest_gdelt_mentions_2.csv',
                              mentions_cols_all,
                              mentions_cols_select)

m3_df = get_select_data('../select_data/data/latest_gdelt_mentions_3.csv',
                              mentions_cols_all,
                              mentions_cols_select)

merge_m_df = pd.merge(m1_df, m2_df, on=mentions_cols_select, how='outer')
gdelt_sm_df = pd.merge(merge_m_df, m2_df, on=mentions_cols_select, how='outer')
print(gdelt_sm_df.shape)
print(gdelt_sm_df.info())
gdelt_sm_df.head(1)

(4304, 5)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4304 entries, 0 to 4303
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   GLOBALEVENTID    4304 non-null   int64  
 1   EventTimeDate    4304 non-null   int64  
 2   MentionTimeDate  4304 non-null   int64  
 3   Confidence       4304 non-null   int64  
 4   MentionDocTone   4304 non-null   float64
dtypes: float64(1), int64(4)
memory usage: 201.8 KB
None


Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone
0,910234371,20200304184500,20210304184500,60,0.223714


In [8]:
print('Number of Global Event Ids: ', len(gdelt_sm_df['GLOBALEVENTID'].unique()))

Number of Global Event Ids:  2911


In [9]:
# Assess date range
print('Min Mentions Date: ', gdelt_sm_df['MentionTimeDate'].min())
print('Max Mentions Date: ', gdelt_sm_df['MentionTimeDate'].max())

Min Mentions Date:  20210304184500
Max Mentions Date:  20210304184500


#### Select Mentions within first 60 Days of an Event

In [10]:
# Calculate days between
gdelt_sm_df['DaysBetween'] = gdelt_sm_df['MentionTimeDate'] - gdelt_sm_df['EventTimeDate']
#print(gdelt_sm_df['DaysBetween'][0])

# Drop rows where days_between <= 60 days
gdelt_sm_60d_df = gdelt_sm_df[gdelt_sm_df['DaysBetween'] <= 60].reset_index(drop=True)
print(gdelt_sm_60d_df.shape)
print('Number of Global Event Ids: ', len(gdelt_sm_60d_df['GLOBALEVENTID'].unique()))
gdelt_sm_60d_df.head()

(1385, 6)
Number of Global Event Ids:  1286


Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone,DaysBetween
0,972833876,20210304184500,20210304184500,10,5.288462,0
1,972833877,20210304184500,20210304184500,10,5.288462,0
2,972833878,20210304184500,20210304184500,30,5.084746,0
3,972833879,20210304184500,20210304184500,40,0.223714,0
4,972833880,20210304184500,20210304184500,50,0.641849,0


In [11]:
# Verify output
max(gdelt_sm_60d_df['DaysBetween'])

0

#### Change int64 dates to datetimes

In [12]:
date_format = '%Y%m%d%H%M%S'
gdelt_sm_60d_df['EventTimeDate'] = pd.to_datetime(gdelt_sm_60d_df['EventTimeDate'].astype(str), format=date_format)
gdelt_sm_60d_df['MentionTimeDate'] = pd.to_datetime(gdelt_sm_60d_df['MentionTimeDate'].astype(str), format=date_format)
gdelt_sm_60d_df.head()

Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone,DaysBetween
0,972833876,2021-03-04 18:45:00,2021-03-04 18:45:00,10,5.288462,0
1,972833877,2021-03-04 18:45:00,2021-03-04 18:45:00,10,5.288462,0
2,972833878,2021-03-04 18:45:00,2021-03-04 18:45:00,30,5.084746,0
3,972833879,2021-03-04 18:45:00,2021-03-04 18:45:00,40,0.223714,0
4,972833880,2021-03-04 18:45:00,2021-03-04 18:45:00,50,0.641849,0


#### Group Mentions Data by GlobalEventId for Average Confidence and Mention Tone

agg_cols = ['GLOBALEVENTID','MeanConfidence', 'MeanMentionDocTone']

gdelt_sm_60d_agg_df = gdelt_sm_60d_df.groupby(['GLOBALEVENTID', 'EventTimeDate'], as_index=False,
                                             ).mean() # get mean Confidence, Tone, DaysBetween
gdelt_sm_60d_agg_df.head()

### Join Events and Mentions Data on GlobalEventId

In [13]:
print('Events Data: ', gdelt_se_df.shape)
print('Mentions Data: ', gdelt_sm_60d_df.shape)

# Merge dataframes
merged_df = gdelt_se_df.merge(gdelt_sm_60d_df, how='left', on='GLOBALEVENTID')
print('Merged Data w/ Duplicates: ', merged_df.shape)

# Drop duplicates
merged_df = merged_df.drop_duplicates()
print('Merged Data w/o Duplicates: ',merged_df.shape)
merged_df.head()

Events Data:  (3896, 12)
Mentions Data:  (1385, 6)
Merged Data w/ Duplicates:  (3995, 17)
Merged Data w/o Duplicates:  (3918, 17)


Unnamed: 0,GLOBALEVENTID,SQLDATE,EventCode,EventRootCode,QuadClass,GoldsteinScale,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone,DaysBetween
0,972833876,20200304,51,5,1,3.4,3,"Baltimore, Maryland, United States",US,39.2904,-76.6122,https://patch.com/maryland/fallston/boyle-buic...,2021-03-04 18:45:00,2021-03-04 18:45:00,10.0,5.288462,0.0
1,972833877,20200304,51,5,1,3.4,2,"Maryland, United States",US,39.0724,-76.7902,https://patch.com/maryland/fallston/boyle-buic...,2021-03-04 18:45:00,2021-03-04 18:45:00,10.0,5.288462,0.0
2,972833878,20200304,10,1,1,0.0,3,"Long Island, California, United States",US,38.1669,-121.625,http://www.nydailynews.com/snyde/ny-party-down...,2021-03-04 18:45:00,2021-03-04 18:45:00,30.0,5.084746,0.0
3,972833879,20200304,50,5,1,3.5,1,Russia,RS,60.0,100.0,http://www.tribtown.com/2021/03/04/ap-eu-virus...,2021-03-04 18:45:00,2021-03-04 18:45:00,40.0,0.223714,0.0
4,972833880,20200304,120,12,3,-4.0,4,"Ahmedabad, Gujarat, India",IN,23.0333,72.6167,https://www.business-standard.com/article/opin...,2021-03-04 18:45:00,2021-03-04 18:45:00,50.0,0.641849,0.0


In [14]:
print(merged_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3918 entries, 0 to 3994
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   GLOBALEVENTID          3918 non-null   int64         
 1   SQLDATE                3918 non-null   int64         
 2   EventCode              3918 non-null   int64         
 3   EventRootCode          3918 non-null   int64         
 4   QuadClass              3918 non-null   int64         
 5   GoldsteinScale         3918 non-null   float64       
 6   ActionGeo_Type         3918 non-null   int64         
 7   ActionGeo_FullName     3812 non-null   object        
 8   ActionGeo_CountryCode  3813 non-null   object        
 9   ActionGeo_Lat          3812 non-null   float64       
 10  ActionGeo_Long         3812 non-null   float64       
 11  SOURCEURL              3918 non-null   object        
 12  EventTimeDate          1308 non-null   datetime64[ns]
 13  Men

### Replace Cameo Code Root Integer Values with Associated String

In [15]:
cameo_root_code = list(merged_df['EventRootCode'].sort_values(ascending=True).unique())
cameo_root_code

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [16]:
print(cameo_verbs)

['MAKE PUBLIC STATEMENT', 'APPEAL', 'EXPRESS INTENT TO COOPERATE', 'CONSULT', 'ENGAGE IN DIPLOMATIC COOPERATION', 'ENGAGE IN MATERIAL COOPERATION', 'PROVIDE AID', 'YIELD', 'INVESTIGATE', 'DEMAND', 'DISAPPROVE', 'REJECT', 'THREATEN', 'PROTEST', 'EXHIBIT MILITARY POSTURE', 'REDUCE RELATIONS', 'COERCE', 'ASSAULT', 'FIGHT', 'ENGAGE IN UNCONVENTIONAL MASS VIOLENCE']


In [17]:
# Convert lists to dictionary 
cameo_code_dict = {cameo_root_code[i]: cameo_verbs[i] for i in range(len(cameo_root_code))}
cameo_code_dict

{1: 'MAKE PUBLIC STATEMENT',
 2: 'APPEAL',
 3: 'EXPRESS INTENT TO COOPERATE',
 4: 'CONSULT',
 5: 'ENGAGE IN DIPLOMATIC COOPERATION',
 6: 'ENGAGE IN MATERIAL COOPERATION',
 7: 'PROVIDE AID',
 8: 'YIELD',
 9: 'INVESTIGATE',
 10: 'DEMAND',
 11: 'DISAPPROVE',
 12: 'REJECT',
 13: 'THREATEN',
 14: 'PROTEST',
 15: 'EXHIBIT MILITARY POSTURE',
 16: 'REDUCE RELATIONS',
 17: 'COERCE',
 18: 'ASSAULT',
 19: 'FIGHT',
 20: 'ENGAGE IN UNCONVENTIONAL MASS VIOLENCE'}

In [18]:
# Add column for cameo code root strings (verbs)
merged_df['EventRootCodeString'] = merged_df['EventRootCode'].map(cameo_code_dict)

# verify output
cameo_code_df = merged_df[['EventRootCode', 'EventRootCodeString']].sort_values(by='EventRootCode',
                                                                                ascending=True).drop_duplicates()
cameo_code_df

Unnamed: 0,EventRootCode,EventRootCodeString
3994,1,MAKE PUBLIC STATEMENT
2991,2,APPEAL
1481,3,EXPRESS INTENT TO COOPERATE
2976,4,CONSULT
2394,5,ENGAGE IN DIPLOMATIC COOPERATION
3745,6,ENGAGE IN MATERIAL COOPERATION
3514,7,PROVIDE AID
397,8,YIELD
745,9,INVESTIGATE
2478,10,DEMAND


### Replace Cameo Code Root Integer Values with Associated String

In [19]:
cameo_quadclass_code = list(merged_df['QuadClass'].sort_values(ascending=True).unique())
cameo_quadclass_code

[1, 2, 3, 4]

In [20]:
print(cameo_quadclass)

['Verbal Cooperation', 'Material Cooperation', 'Verbal Conflict', 'Material Conflict']


In [21]:
# Convert lists to dictionary 
cameo_quadclass_dict = {cameo_quadclass_code[i]: cameo_quadclass[i] for i in range(len(cameo_quadclass_code))}
cameo_quadclass_dict

{1: 'Verbal Cooperation',
 2: 'Material Cooperation',
 3: 'Verbal Conflict',
 4: 'Material Conflict'}

In [22]:
# Add column for cameo code root strings (verbs)
merged_df['QuadClassString'] = merged_df['QuadClass'].map(cameo_quadclass_dict)

# verify output
cameo_quadclass_df = merged_df[['QuadClass', 'QuadClassString']].sort_values(by='QuadClass',
                                                                                ascending=True).drop_duplicates()
cameo_quadclass_df

Unnamed: 0,QuadClass,QuadClassString
0,1,Verbal Cooperation
3931,2,Material Cooperation
174,3,Verbal Conflict
1508,4,Material Conflict


### Assess Null Values

In [23]:
null_df = merged_df[pd.isnull(merged_df).any(axis=1)]
print(null_df.shape)
print(null_df.info())
null_df.head()

(2645, 19)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2645 entries, 38 to 3994
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   GLOBALEVENTID          2645 non-null   int64         
 1   SQLDATE                2645 non-null   int64         
 2   EventCode              2645 non-null   int64         
 3   EventRootCode          2645 non-null   int64         
 4   QuadClass              2645 non-null   int64         
 5   GoldsteinScale         2645 non-null   float64       
 6   ActionGeo_Type         2645 non-null   int64         
 7   ActionGeo_FullName     2539 non-null   object        
 8   ActionGeo_CountryCode  2540 non-null   object        
 9   ActionGeo_Lat          2539 non-null   float64       
 10  ActionGeo_Long         2539 non-null   float64       
 11  SOURCEURL              2645 non-null   object        
 12  EventTimeDate          35 non-null     datetime64[

Unnamed: 0,GLOBALEVENTID,SQLDATE,EventCode,EventRootCode,QuadClass,GoldsteinScale,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone,DaysBetween,EventRootCodeString,QuadClassString
38,972833914,20210304,40,4,1,1.0,0,,,,,https://www.readingchronicle.co.uk/news/191368...,2021-03-04 18:45:00,2021-03-04 18:45:00,100.0,0.0,0.0,CONSULT,Verbal Cooperation
74,972833950,20210304,45,4,1,5.0,0,,,,,https://www.ghanaweb.com/GhanaHomePage/regiona...,2021-03-04 18:45:00,2021-03-04 18:45:00,100.0,1.324503,0.0,CONSULT,Verbal Cooperation
110,972833984,20210304,842,8,2,7.0,0,,,,,https://www.ign.com/articles/star-trek-picard-...,2021-03-04 18:45:00,2021-03-04 18:45:00,100.0,3.667954,0.0,YIELD,Material Cooperation
167,972834034,20210304,40,4,1,1.0,0,,,,,https://www.readingchronicle.co.uk/news/191368...,2021-03-04 18:45:00,2021-03-04 18:45:00,100.0,0.0,0.0,CONSULT,Verbal Cooperation
173,972834040,20210304,22,2,1,3.2,0,,,,,https://www.chathamdailynews.ca/news/local-new...,2021-03-04 18:45:00,2021-03-04 18:45:00,100.0,1.578947,0.0,APPEAL,Verbal Cooperation


#### Based on project requirements, the data source for visualization presence of *non-null* values in the following columns:
- GlobalEventId
- EventTimeDate
- ActionGeo_CountryCode
- EventCode
- GoldsteinScale
- MentionDocTone

In [24]:
# Drop all rows in merged_df with nulls in the specified columns
required_value_columns = ['GLOBALEVENTID', 'EventTimeDate', 'ActionGeo_CountryCode', 
                          'EventCode', 'GoldsteinScale', 'MentionDocTone']

cleaned_merged_df = merged_df[~pd.isnull(merged_df[required_value_columns]).any(axis=1)].reset_index(drop=True)
print(cleaned_merged_df.shape)
cleaned_merged_df.head(1)

(1273, 19)


Unnamed: 0,GLOBALEVENTID,SQLDATE,EventCode,EventRootCode,QuadClass,GoldsteinScale,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL,EventTimeDate,MentionTimeDate,Confidence,MentionDocTone,DaysBetween,EventRootCodeString,QuadClassString
0,972833876,20200304,51,5,1,3.4,3,"Baltimore, Maryland, United States",US,39.2904,-76.6122,https://patch.com/maryland/fallston/boyle-buic...,2021-03-04 18:45:00,2021-03-04 18:45:00,10.0,5.288462,0.0,ENGAGE IN DIPLOMATIC COOPERATION,Verbal Cooperation


In [25]:
# Select Desired Columns for Data Factory Output
cleaned_merged_df = cleaned_merged_df[desired_columns]
print('Cleaned Data with Desired Columns: ',cleaned_merged_df.shape)
cleaned_merged_df.head()

Cleaned Data with Desired Columns:  (1273, 16)


Unnamed: 0,GLOBALEVENTID,EventTimeDate,MentionTimeDate,DaysBetween,Confidence,MentionDocTone,ActionGeo_CountryCode,ActionGeo_FullName,EventCode,EventRootCodeString,QuadClass,QuadClassString,GoldsteinScale,ActionGeo_Lat,ActionGeo_Long,SOURCEURL
0,972833876,2021-03-04 18:45:00,2021-03-04 18:45:00,0.0,10.0,5.288462,US,"Baltimore, Maryland, United States",51,ENGAGE IN DIPLOMATIC COOPERATION,1,Verbal Cooperation,3.4,39.2904,-76.6122,https://patch.com/maryland/fallston/boyle-buic...
1,972833877,2021-03-04 18:45:00,2021-03-04 18:45:00,0.0,10.0,5.288462,US,"Maryland, United States",51,ENGAGE IN DIPLOMATIC COOPERATION,1,Verbal Cooperation,3.4,39.0724,-76.7902,https://patch.com/maryland/fallston/boyle-buic...
2,972833878,2021-03-04 18:45:00,2021-03-04 18:45:00,0.0,30.0,5.084746,US,"Long Island, California, United States",10,MAKE PUBLIC STATEMENT,1,Verbal Cooperation,0.0,38.1669,-121.625,http://www.nydailynews.com/snyde/ny-party-down...
3,972833879,2021-03-04 18:45:00,2021-03-04 18:45:00,0.0,40.0,0.223714,RS,Russia,50,ENGAGE IN DIPLOMATIC COOPERATION,1,Verbal Cooperation,3.5,60.0,100.0,http://www.tribtown.com/2021/03/04/ap-eu-virus...
4,972833880,2021-03-04 18:45:00,2021-03-04 18:45:00,0.0,50.0,0.641849,IN,"Ahmedabad, Gujarat, India",120,REJECT,3,Verbal Conflict,-4.0,23.0333,72.6167,https://www.business-standard.com/article/opin...


In [26]:
# Store dataframe globally
%store cleaned_merged_df

Stored 'cleaned_merged_df' (DataFrame)
