In [1]:
import numpy as np
import pandas as pd

### Import Data

In [7]:
def get_select_data(filepath: str, all_cols: list, select_cols: list) -> pd.DataFrame:
    """Load selected columns from a headerless, tab-delimited GDELT update file.

    Only the requested columns are parsed, so wide exports are not fully
    materialised just to be sliced down afterwards.

    :param filepath: full filename and path to the tab-delimited file to import
    :param all_cols: names assigned, in order, to the columns of the file
    :param select_cols: subset of all_cols to keep, in the desired output order
    :raises KeyError: if select_cols contains a name that is not in all_cols
    :rtype: pandas.DataFrame
    :return: dataframe holding only select_cols, ordered as in select_cols
    """

    # Fail fast on a bad selection, before any file I/O happens.
    missing_cols = [col for col in select_cols if col not in all_cols]
    if missing_cols:
        raise KeyError(f"{missing_cols} not found in all_cols")

    # The file carries no header row: the names come from all_cols, and
    # usecols restricts parsing to the requested columns only.
    selected_df = pd.read_csv(filepath, sep="\t", header=None,
                              names=all_cols, usecols=select_cols)

    # read_csv returns columns in file order; restore the caller's order.
    return selected_df[select_cols]

#### Events Data

In [8]:
# Path to the latest GDELT events export (tab-delimited, no header row)
events_csv = '../data/latest_gdelt_events.csv'

# Names for every column of the events export, in file order
event_cols = ['GLOBALEVENTID','SQLDATE','MonthYear','Year','FractionDate','Actor1Code',
              'Actor1Name','Actor1CountryCode','Actor1KnownGroupCode','Actor1EthnicCode',
              'Actor1Religion1Code','Actor1Religion2Code','Actor1Type1Code','Actor1Type2Code',
              'Actor1Type3Code','Actor2Code','Actor2Name','Actor2CountryCode','Actor2KnownGroupCode',
              'Actor2EthnicCode','Actor2Religion1Code','Actor2Religion2Code','Actor2Type1Code',
              'Actor2Type2Code','Actor2Type3Code','IsRootEvent','EventCode','EventBaseCode',
              'EventRootCode','QuadClass','GoldsteinScale','NumMentions','NumSources',
              'NumArticles','AvgTone','Actor1Geo_Type','Actor1Geo_FullName','Actor1Geo_CountryCode',
              'Actor1Geo_ADM1Code','Actor1Geo_ADM2Code','Actor1Geo_Lat','Actor1Geo_Long',
              'Actor1Geo_FeatureID','Actor2Geo_Type','Actor2Geo_FullName','Actor2Geo_CountryCode',
              'Actor2Geo_ADM1Code','Actor2Geo_ADM2Code','Actor2Geo_Lat','Actor2Geo_Long',
              'Actor2Geo_FeatureID','ActionGeo_Type','ActionGeo_FullName','ActionGeo_CountryCode',
              'ActionGeo_ADM1Code','ActionGeo_ADM2Code','ActionGeo_Lat','ActionGeo_Long',
              'ActionGeo_FeatureID','DATEADDED','SOURCEURL']

# Subset of event_cols kept for analysis
select_event_cols = ['GLOBALEVENTID','SQLDATE','EventCode','EventRootCode','QuadClass',
                     'GoldsteinScale','NumMentions','NumSources','NumArticles','AvgTone',
                     'ActionGeo_Type','ActionGeo_FullName','ActionGeo_CountryCode',
                     'ActionGeo_Lat','ActionGeo_Long','SOURCEURL']

# Load the selected event columns
gdelt_se_df = get_select_data(events_csv, event_cols, select_event_cols)
print(gdelt_se_df.shape)
# info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appends a stray "None" to the output).
gdelt_se_df.info()
gdelt_se_df.head()

(1212, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1212 entries, 0 to 1211
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   GLOBALEVENTID          1212 non-null   int64  
 1   SQLDATE                1212 non-null   int64  
 2   EventCode              1212 non-null   int64  
 3   EventRootCode          1212 non-null   int64  
 4   QuadClass              1212 non-null   int64  
 5   GoldsteinScale         1212 non-null   float64
 6   NumMentions            1212 non-null   int64  
 7   NumSources             1212 non-null   int64  
 8   NumArticles            1212 non-null   int64  
 9   AvgTone                1212 non-null   float64
 10  ActionGeo_Type         1212 non-null   int64  
 11  ActionGeo_FullName     1177 non-null   object 
 12  ActionGeo_CountryCode  1179 non-null   object 
 13  ActionGeo_Lat          1177 non-null   float64
 14  ActionGeo_Long         1177 non-null   float64

Unnamed: 0,GLOBALEVENTID,SQLDATE,EventCode,EventRootCode,QuadClass,GoldsteinScale,NumMentions,NumSources,NumArticles,AvgTone,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL
0,972238884,20200301,20,2,1,3.0,2,1,2,-6.52921,4,"Canberra, Australian Capital Territory, Australia",AS,-35.2833,149.217,https://www.aninews.in/news/world/asia/china-a...
1,972238885,20200301,20,2,1,3.0,2,1,2,-6.52921,4,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...
2,972238886,20200301,20,2,1,3.0,6,1,6,-6.52921,4,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...
3,972238887,20200301,111,11,3,-2.0,4,1,4,-4.385965,4,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...
4,972238888,20200301,110,11,3,-2.0,4,1,4,-4.385965,4,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...


#### Mentions Data

In [8]:
# NOTE(review): this cell is labelled "Mentions Data" but reads the events
# export ('latest_gdelt_events.csv') with the events column layout, so
# gdelt_sm_df holds the same rows and columns as the events frame.
# TODO: point mentions_csv, mention_cols and select_mention_cols at the
# GDELT mentions export once its local path is confirmed.
mentions_csv = '../data/latest_gdelt_events.csv'

# Names for every column of the file being read, in file order
mention_cols = ['GLOBALEVENTID','SQLDATE','MonthYear','Year','FractionDate','Actor1Code',
                'Actor1Name','Actor1CountryCode','Actor1KnownGroupCode','Actor1EthnicCode',
                'Actor1Religion1Code','Actor1Religion2Code','Actor1Type1Code','Actor1Type2Code',
                'Actor1Type3Code','Actor2Code','Actor2Name','Actor2CountryCode','Actor2KnownGroupCode',
                'Actor2EthnicCode','Actor2Religion1Code','Actor2Religion2Code','Actor2Type1Code',
                'Actor2Type2Code','Actor2Type3Code','IsRootEvent','EventCode','EventBaseCode',
                'EventRootCode','QuadClass','GoldsteinScale','NumMentions','NumSources',
                'NumArticles','AvgTone','Actor1Geo_Type','Actor1Geo_FullName','Actor1Geo_CountryCode',
                'Actor1Geo_ADM1Code','Actor1Geo_ADM2Code','Actor1Geo_Lat','Actor1Geo_Long',
                'Actor1Geo_FeatureID','Actor2Geo_Type','Actor2Geo_FullName','Actor2Geo_CountryCode',
                'Actor2Geo_ADM1Code','Actor2Geo_ADM2Code','Actor2Geo_Lat','Actor2Geo_Long',
                'Actor2Geo_FeatureID','ActionGeo_Type','ActionGeo_FullName','ActionGeo_CountryCode',
                'ActionGeo_ADM1Code','ActionGeo_ADM2Code','ActionGeo_Lat','ActionGeo_Long',
                'ActionGeo_FeatureID','DATEADDED','SOURCEURL']

# Subset of mention_cols kept for analysis
select_mention_cols = ['GLOBALEVENTID','SQLDATE','EventCode','EventRootCode','QuadClass',
                       'GoldsteinScale','NumMentions','NumSources','NumArticles','AvgTone',
                       'ActionGeo_Type','ActionGeo_FullName','ActionGeo_CountryCode',
                       'ActionGeo_Lat','ActionGeo_Long','SOURCEURL']

# Load the selected columns
gdelt_sm_df = get_select_data(mentions_csv, mention_cols, select_mention_cols)
print(gdelt_sm_df.shape)
# info() writes its report itself and returns None, so it must not be
# wrapped in print() (that appends a stray "None" to the output).
gdelt_sm_df.info()
gdelt_sm_df.head()

(1212, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1212 entries, 0 to 1211
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   GLOBALEVENTID          1212 non-null   int64  
 1   SQLDATE                1212 non-null   int64  
 2   EventCode              1212 non-null   int64  
 3   EventRootCode          1212 non-null   int64  
 4   QuadClass              1212 non-null   int64  
 5   GoldsteinScale         1212 non-null   float64
 6   NumMentions            1212 non-null   int64  
 7   NumSources             1212 non-null   int64  
 8   NumArticles            1212 non-null   int64  
 9   AvgTone                1212 non-null   float64
 10  ActionGeo_Type         1212 non-null   int64  
 11  ActionGeo_FullName     1177 non-null   object 
 12  ActionGeo_CountryCode  1179 non-null   object 
 13  ActionGeo_Lat          1177 non-null   float64
 14  ActionGeo_Long         1177 non-null   float64

Unnamed: 0,GLOBALEVENTID,SQLDATE,EventCode,EventRootCode,QuadClass,GoldsteinScale,NumMentions,NumSources,NumArticles,AvgTone,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,SOURCEURL
0,972238884,20200301,20,2,1,3.0,2,1,2,-6.52921,4,"Canberra, Australian Capital Territory, Australia",AS,-35.2833,149.217,https://www.aninews.in/news/world/asia/china-a...
1,972238885,20200301,20,2,1,3.0,2,1,2,-6.52921,4,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...
2,972238886,20200301,20,2,1,3.0,6,1,6,-6.52921,4,"Beijing, Beijing, China",CH,39.9289,116.388,https://www.aninews.in/news/world/asia/china-a...
3,972238887,20200301,111,11,3,-2.0,4,1,4,-4.385965,4,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...
4,972238888,20200301,110,11,3,-2.0,4,1,4,-4.385965,4,"Mumbai, Maharashtra, India",IN,18.975,72.8258,https://www.techshout.com/security/2021/01/chi...
