# Data

In [1]:
# Install necessary packages
!pip install pandas requests bs4 lxml gdelt -q

In [2]:
# Let's load the GPI data
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO

# data download
# The URL embeds a commit hash, so it points at one fixed revision of the file.
URL = "https://github.com/Dani3113R0se/SoDA/raw/d1f409f211b698cf8aec715304efb3b87c0cee0b/replication_voukelatou/data_peace.dta"
# Without a timeout, requests waits indefinitely if the server stalls;
# 60 seconds bounds how long this cell can hang.
response = requests.get(URL, timeout=60)
response.raise_for_status()  # Verify successful download

# load to pandas (read the Stata file straight from the in-memory bytes)
with BytesIO(response.content) as f:
    gpi_df = pd.read_stata(f)

# do we have all data
print(gpi_df)

      year      country  peace     id
0     2008  Afghanistan  3.153    1.0
1     2008      Albania  1.247    2.0
2     2008      Algeria  2.111    3.0
3     2008       Angola  1.655    4.0
4     2008    Argentina  1.403    5.0
...    ...          ...    ...    ...
2603  2023    Venezuela  2.086  159.0
2604  2023      Vietnam  1.403  160.0
2605  2023        Yemen  3.486  161.0
2606  2023       Zambia  1.577  162.0
2607  2023     Zimbabwe  2.006  163.0

[2608 rows x 4 columns]


## GDELT data

In [7]:
# GDELT analysis by event type, country, and time
import pandas as pd
import time
import gdelt

def analyze_gdelt_3d(events=None):
    """Aggregate GDELT events by month, event base code, and action country.

    Parameters
    ----------
    events : pandas.DataFrame, optional
        Pre-fetched events table with normalized (lower-case) column names.
        It must contain the columns ``globaleventid``, ``eventbasecode``,
        ``actiongeocountrycode``, ``monthyear``, ``avgtone`` and
        ``goldsteinscale``. When omitted, the events for 2020-01-01 through
        2023-12-31 are fetched through the ``gdelt`` package.

    Returns
    -------
    pandas.DataFrame
        One row per (``monthyear``, ``eventbasecode``,
        ``actiongeocountrycode``) combination with the columns
        ``total_events``, ``avg_tone`` and ``avg_goldstein``, sorted by the
        three grouping keys. If no events are available, an empty frame with
        these same six columns is returned so that downstream column
        selection keeps working.
    """
    group_keys = ['monthyear', 'eventbasecode', 'actiongeocountrycode']
    result_columns = group_keys + ['total_events', 'avg_tone', 'avg_goldstein']

    start_time = time.time()

    if events is None:
        # Initialize API with version 2
        gd = gdelt.gdelt(version=2)

        # Fetch data with valid parameters
        # NOTE(review): with coverage=False the v2 client presumably does not
        # pull every 15-minute file of each day, so this is likely a sample
        # rather than the full event stream; confirm against the gdelt docs.
        print("🕒 Fetching 2020-2023 data...")
        events = gd.Search(
            ['2020-01-01', '2023-12-31'],
            table='events',
            output='pd',
            normcols=True,
            coverage=False
        )

    # Validate data
    if events.empty:
        print("❌ No data retrieved")
        # Keep the result schema even when empty; a column-less frame would
        # make later column selection raise KeyError.
        return pd.DataFrame(columns=result_columns)

    # Filter and prepare: drop rows lacking a grouping key and keep only
    # months inside the 2020-01 .. 2023-12 window.
    print("\n🔍 Processing data...")
    valid_data = events[
        (events['eventbasecode'].notna()) &
        (events['actiongeocountrycode'].notna()) &
        (events['monthyear'].between(202001, 202312))
    ].copy()

    # Format monthyear as a 6-character YYYYMM string; the slice also strips
    # a trailing ".0" if the column was loaded as float.
    valid_data['monthyear'] = valid_data['monthyear'].astype(str).str[:6]

    # Triple grouping
    print("📊 Grouping by month, event type, and country...")
    agg_data = valid_data.groupby(group_keys).agg(
        total_events=('globaleventid', 'count'),
        avg_tone=('avgtone', 'mean'),
        avg_goldstein=('goldsteinscale', 'mean')
    ).reset_index()

    # Sort results (makes the row order explicit for readers of the output)
    final_df = agg_data.sort_values(group_keys)

    print(f"\n✅ Completed in {time.time()-start_time:.1f} seconds")
    print(f"📦 Final shape: {final_df.shape}")
    return final_df

# Run analysis and preview the first rows of the aggregated frame.
# `result` is reused by the save cell further down.
result = analyze_gdelt_3d()
result.head()


🕒 Fetching 2020-2023 data...





🔍 Processing data...
📊 Grouping by month, event type, and country...

✅ Completed in 21.7 seconds
📦 Final shape: (151587, 6)


Unnamed: 0,monthyear,eventbasecode,actiongeocountrycode,total_events,avg_tone,avg_goldstein
0,202001,10,AE,9,-1.269132,0.0
1,202001,10,AF,9,-4.436126,0.0
2,202001,10,AJ,3,-2.294024,0.0
3,202001,10,AL,1,-4.371585,0.0
4,202001,10,AM,2,-5.507403,0.0


Now we rename the columns and save the aggregated GDELT data to `gdelt_data_2020_2023.csv`.

In [14]:
# Save file
# Rename the aggregated columns to the names used in the exported CSV and fix
# the column order. `rename` returns a new frame here (no inplace=True), so the
# frame produced by the analysis cell is not mutated behind the reader's back.
# NOTE: 'tonecount' holds the mean tone (avg_tone), not a count.
result = result.rename(columns={
    'actiongeocountrycode': 'ActionGeo_CountryCode',
    'monthyear': 'MonthYear',
    'eventbasecode': 'EventBaseCode',
    'total_events': 'eventcount',
    'avg_tone': 'tonecount',
    'avg_goldstein': 'goldstein',
})[['ActionGeo_CountryCode', 'MonthYear', 'EventBaseCode',
    'eventcount', 'tonecount', 'goldstein']]
print(result.head())
# index=False keeps the positional index out of the CSV
result.to_csv('gdelt_data_2020_2023.csv', index=False)

  ActionGeo_CountryCode MonthYear EventBaseCode  eventcount  tonecount  \
0                    AE    202001           010           9  -1.269132   
1                    AF    202001           010           9  -4.436126   
2                    AJ    202001           010           3  -2.294024   
3                    AL    202001           010           1  -4.371585   
4                    AM    202001           010           2  -5.507403   

   goldstein  
0        0.0  
1        0.0  
2        0.0  
3        0.0  
4        0.0  
