# Explanation of the paper

## Data

In [1]:
# Install necessary packages
!pip install pandas requests bs4 lxml gdelt -q

In [2]:
# Let's load the GPI data
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO

# Data download: the URL is pinned to a specific commit so the notebook always
# reads the same version of the file.
URL = "https://github.com/Dani3113R0se/SoDA/raw/d1f409f211b698cf8aec715304efb3b87c0cee0b/replication_voukelatou/data_peace.dta"
# Without a timeout, requests waits forever on a stalled connection and the
# cell never finishes; fail after 60 seconds instead.
response = requests.get(URL, timeout=60)
response.raise_for_status()  # Verify successful download

# Load the downloaded Stata file into pandas straight from memory
with BytesIO(response.content) as f:
    gpi_df = pd.read_stata(f)

# Do we have all data?
print(gpi_df)

      year      country  peace     id
0     2008  Afghanistan  3.153    1.0
1     2008      Albania  1.247    2.0
2     2008      Algeria  2.111    3.0
3     2008       Angola  1.655    4.0
4     2008    Argentina  1.403    5.0
...    ...          ...    ...    ...
2603  2023    Venezuela  2.086  159.0
2604  2023      Vietnam  1.403  160.0
2605  2023        Yemen  3.486  161.0
2606  2023       Zambia  1.577  162.0
2607  2023     Zimbabwe  2.006  163.0

[2608 rows x 4 columns]


## GDELT data

In [7]:
import pandas as pd
import time
from datetime import datetime

# Install first if needed: !pip install gdelt
import gdelt  

def analyze_gdelt(events=None):
    """Aggregate GDELT events into monthly per-country counts and mean tone.

    Parameters
    ----------
    events : pandas.DataFrame, optional
        Pre-fetched GDELT events table containing the columns
        ``globaleventid``, ``actiongeocountrycode``, ``monthyear`` and
        ``avgtone``. When omitted (the default), events for 2020-01-01 to
        2023-12-31 are downloaded through the ``gdelt`` package.

    Returns
    -------
    pandas.DataFrame
        One row per (month, country) with the columns
        ``actiongeocountrycode``, ``events`` (count of ``globaleventid``),
        ``avg_tone`` (mean of ``avgtone``) and ``monthyear`` (``'YYYYMM'``
        string), sorted by month then country with a fresh 0..n-1 index.
        An empty frame with the same columns is returned when no events were
        retrieved or none survive filtering, so callers can always treat the
        result as a DataFrame.
    """
    output_columns = ['actiongeocountrycode', 'events', 'avg_tone', 'monthyear']

    # Track total runtime (includes the download when one is performed)
    start_time = time.time()

    if events is None:
        gd = gdelt.gdelt(version=2)

        # Fetch data
        # NOTE(review): coverage=False presumably pulls only part of each
        # day's files, which would explain low per-country counts - confirm
        # against the gdeltPyR documentation.
        print("🕒 Fetching 2020-2023 data...")
        events = gd.Search(
            ['2020-01-01', '2023-12-31'],
            table='events',
            output='pd',
            normcols=True,
            coverage=False
        )

    # Check data
    if events.empty:
        print("❌ No data retrieved")
        return pd.DataFrame(columns=output_columns)

    # Filter and prepare: keep rows that have a country code and fall inside
    # the 2020-01 .. 2023-12 window.
    print("\n🔍 Processing data...")
    country = events['actiongeocountrycode']
    keep = (
        country.notna() &
        (country != '') &
        events['monthyear'].between(202001, 202312)
    )
    valid_data = events[keep].copy()

    # Convert monthyear to a 'YYYYMM' string (slicing drops any trailing
    # characters such as '.0' from a float representation)
    valid_data['monthyear'] = valid_data['monthyear'].astype(str).str[:6]

    # Partition once by month (groupby iterates keys in sorted order) instead
    # of re-scanning the whole frame with a boolean mask for every month.
    results = []
    by_month = valid_data.groupby('monthyear')
    total_months = by_month.ngroups

    for i, (ym, monthly) in enumerate(by_month, 1):
        iter_start = time.time()

        # Aggregate this month's events per country
        agg_data = monthly.groupby('actiongeocountrycode').agg(
            events=('globaleventid', 'count'),
            avg_tone=('avgtone', 'mean')
        ).reset_index()

        # Add month info
        agg_data['monthyear'] = ym
        results.append(agg_data)

        # Print progress
        print(f"📅 {ym} | {len(agg_data)} countries | {time.time()-iter_start:.1f}s | "
              f"Total: {i}/{total_months} months")

    # pd.concat raises ValueError on an empty list, so handle the case where
    # no rows survived the filter explicitly.
    if not results:
        print("❌ No events left after filtering")
        return pd.DataFrame(columns=output_columns)

    # Combine results; reset the index so it is unique across months
    final_df = (
        pd.concat(results, ignore_index=True)
        .sort_values(['monthyear', 'actiongeocountrycode'])
        .reset_index(drop=True)
    )

    # Runtime summary
    print(f"\n✅ Completed in {time.time()-start_time:.1f} seconds")
    return final_df

# Run analysis
result = analyze_gdelt()

# Stop here with a clear message when nothing came back, rather than failing
# with an opaque AttributeError on `.head()` or saving an empty file later.
if result is None or result.empty:
    raise RuntimeError("GDELT analysis produced no data; check the query and network access.")

result.head()

🕒 Fetching 2020-2023 data...





🔍 Processing data...
📅 202001 | 204 countries | 0.1s | Total: 1/48 months
📅 202002 | 209 countries | 0.1s | Total: 2/48 months
📅 202003 | 206 countries | 0.1s | Total: 3/48 months
📅 202004 | 196 countries | 0.1s | Total: 4/48 months
📅 202005 | 201 countries | 0.1s | Total: 5/48 months
📅 202006 | 203 countries | 0.1s | Total: 6/48 months
📅 202007 | 196 countries | 0.1s | Total: 7/48 months
📅 202008 | 201 countries | 0.1s | Total: 8/48 months
📅 202009 | 197 countries | 0.1s | Total: 9/48 months
📅 202010 | 159 countries | 0.1s | Total: 10/48 months
📅 202011 | 168 countries | 0.1s | Total: 11/48 months
📅 202012 | 191 countries | 0.1s | Total: 12/48 months
📅 202101 | 188 countries | 0.1s | Total: 13/48 months
📅 202102 | 188 countries | 0.1s | Total: 14/48 months
📅 202103 | 198 countries | 0.1s | Total: 15/48 months
📅 202104 | 195 countries | 0.1s | Total: 16/48 months
📅 202105 | 194 countries | 0.1s | Total: 17/48 months
📅 202106 | 193 countries | 0.1s | Total: 18/48 months
📅 202107 | 188 

Unnamed: 0,actiongeocountrycode,events,avg_tone,monthyear
0,AA,4,1.807802,202001
1,AE,60,-0.161035,202001
2,AF,121,-3.437488,202001
3,AG,25,-4.523524,202001
4,AJ,29,-0.913064,202001


Next, we save the aggregated monthly GDELT results to a CSV file.

In [9]:
# Save file
# `result` is the DataFrame returned by analyze_gdelt() in the cell above; the
# previously used name `final_result` is not defined anywhere in this notebook
# and raised NameError on a fresh kernel.
result.to_csv('gdelt_data_2020_2023.csv', index=False)