# Explanation of the paper

## Data

In [1]:
# Install necessary packages
!pip install pandas requests bs4 lxml gdelt -q

In [2]:
# Let's load the GPI data
import pandas as pd
import requests
from bs4 import BeautifulSoup
from io import BytesIO

# data download
URL = "https://github.com/Dani3113R0se/SoDA/raw/d1f409f211b698cf8aec715304efb3b87c0cee0b/replication_voukelatou/data_peace.dta"
response = requests.get(URL)
response.raise_for_status()  # Verify successful download

# load to pandas
with BytesIO(response.content) as f:
    gpi_df = pd.read_stata(f)

# do we have all data
print(gpi_df)

      year      country  peace     id
0     2008  Afghanistan  3.153    1.0
1     2008      Albania  1.247    2.0
2     2008      Algeria  2.111    3.0
3     2008       Angola  1.655    4.0
4     2008    Argentina  1.403    5.0
...    ...          ...    ...    ...
2603  2023    Venezuela  2.086  159.0
2604  2023      Vietnam  1.403  160.0
2605  2023        Yemen  3.486  161.0
2606  2023       Zambia  1.577  162.0
2607  2023     Zimbabwe  2.006  163.0

[2608 rows x 4 columns]


## GDELT data

In [3]:
# import GDELT data
import gdelt
import pandas as pd
from datetime import datetime, timedelta

# initialize GDELT API
gd = gdelt.gdelt(version=2)

# date range 2020-2023
start_date = datetime.strptime('2020 Jan 01', '%Y %b %d')
end_date = datetime.strptime('2023 Dec 31', '%Y %b %d')

# chunk size (30 days to reduce memory usage)
chunk_size = timedelta(days=30)

# empty df for results
aggregated_data = pd.DataFrame()

# data in chunks
current_start_date = start_date
while current_start_date < end_date:
    # end date for current chunk
    current_end_date = min(current_start_date + chunk_size, end_date)
    
    # format dates for GDELT
    formatted_start_date = current_start_date.strftime('%Y %b %d')
    formatted_end_date = current_end_date.strftime('%Y %b %d')
    
    print(f"Processing data from {formatted_start_date} to {formatted_end_date}")
    
    # GDELT data for current chunk
    events = gd.Search(
        [formatted_start_date, formatted_end_date],
        table='events',
        output='pd',
        normcols=True,
        coverage=True
    )
    
    # check and filter data
    if not events.empty:
        filtered_events = events[events['actiongeocountrycode'] != '']
        
        # aggregation
        grouped = filtered_events.groupby(['actiongeocountrycode', 'monthyear', 'eventbasecode'])
        chunk_aggregated = pd.DataFrame({
            'No_events': grouped.size(),
            'Tone': grouped['avgtone'].sum(),
            'Goldstein': grouped['goldsteinscale'].sum()
        }).reset_index()
        
        # append to aggregated data
        aggregated_data = pd.concat([aggregated_data, chunk_aggregated], ignore_index=True)
    
    # next chunk
    current_start_date = current_end_date

# aggregation
final_result = aggregated_data.groupby(
    ['actiongeocountrycode', 'monthyear', 'eventbasecode']
).agg({
    'No_events': 'sum',
    'Tone': 'sum',
    'Goldstein': 'sum'
}).reset_index()

# Sort
final_result = final_result.sort_values(['actiongeocountrycode', 'eventbasecode', 'monthyear'])
print(final_result.head())

here
Processing data from 2020 Jan 01 to 2020 Jan 31




Processing data from 2020 Jan 31 to 2020 Mar 01




Processing data from 2020 Mar 01 to 2020 Mar 31




Processing data from 2020 Mar 31 to 2020 Apr 30




Processing data from 2020 Apr 30 to 2020 May 30




Processing data from 2020 May 30 to 2020 Jun 29




Processing data from 2020 Jun 29 to 2020 Jul 29




Processing data from 2020 Jul 29 to 2020 Aug 28




Processing data from 2020 Aug 28 to 2020 Sep 27








Processing data from 2020 Sep 27 to 2020 Oct 27
































































































Processing data from 2020 Oct 27 to 2020 Nov 26






























































































































Processing data from 2020 Nov 26 to 2020 Dec 26




Processing data from 2020 Dec 26 to 2021 Jan 25




Processing data from 2021 Jan 25 to 2021 Feb 24




Processing data from 2021 Feb 24 to 2021 Mar 26




Processing data from 2021 Mar 26 to 2021 Apr 25




Processing data from 2021 Apr 25 to 2021 May 25




Processing data from 2021 May 25 to 2021 Jun 24




Processing data from 2021 Jun 24 to 2021 Jul 24




Processing data from 2021 Jul 24 to 2021 Aug 23






















































Processing data from 2021 Aug 23 to 2021 Sep 22




Processing data from 2021 Sep 22 to 2021 Oct 22




Processing data from 2021 Oct 22 to 2021 Nov 21




Processing data from 2021 Nov 21 to 2021 Dec 21




Processing data from 2021 Dec 21 to 2022 Jan 20




Processing data from 2022 Jan 20 to 2022 Feb 19




Processing data from 2022 Feb 19 to 2022 Mar 21




Processing data from 2022 Mar 21 to 2022 Apr 20




Processing data from 2022 Apr 20 to 2022 May 20




Processing data from 2022 May 20 to 2022 Jun 19




Processing data from 2022 Jun 19 to 2022 Jul 19




Processing data from 2022 Jul 19 to 2022 Aug 18




Processing data from 2022 Aug 18 to 2022 Sep 17




Processing data from 2022 Sep 17 to 2022 Oct 17




Processing data from 2022 Oct 17 to 2022 Nov 16










Processing data from 2022 Nov 16 to 2022 Dec 16




Processing data from 2022 Dec 16 to 2023 Jan 15




Processing data from 2023 Jan 15 to 2023 Feb 14




Processing data from 2023 Feb 14 to 2023 Mar 16




Processing data from 2023 Mar 16 to 2023 Apr 15
















Processing data from 2023 Apr 15 to 2023 May 15




Processing data from 2023 May 15 to 2023 Jun 14




Processing data from 2023 Jun 14 to 2023 Jul 14




Processing data from 2023 Jul 14 to 2023 Aug 13




Processing data from 2023 Aug 13 to 2023 Sep 12




Processing data from 2023 Sep 12 to 2023 Oct 12




Processing data from 2023 Oct 12 to 2023 Nov 11




Processing data from 2023 Nov 11 to 2023 Dec 11




Processing data from 2023 Dec 11 to 2023 Dec 31




   actiongeocountrycode  monthyear eventbasecode  No_events       Tone  \
0                    AA     192001           010          1   0.312012   
9                    AA     201907           010          3   7.154731   
14                   AA     202001           010          9  14.196024   
38                   AA     202002           010          9  14.193118   
64                   AA     202003           010         23 -19.816198   

    Goldstein  
0         0.0  
9         0.0  
14        0.0  
38        0.0  
64        0.0  


Now we will save the file.

In [5]:
# Save file
final_result.to_csv('gdelt_data_2020_2023.csv', index=False)