# Setting up the environment

In [None]:
import requests
import zipfile
import io
import pandas as pd
from google.colab import files
from glob import glob
import calendar
import os

# Exploring the columns of the GDELT event files

In [None]:
def inspect_monthly_gdelt(year, month, nrows=5):
    """Download one monthly GDELT archive and print a quick look at its raw layout.

    Prints up to ``nrows`` lines as stored in the archive (tab-separated, no
    header row, surrounding whitespace stripped), then parses the same rows
    with pandas and prints the detected column count and the parsed frame.

    Args:
        year: Four-digit year of the archive.
        month: Month number; zero-padded to two digits when building the URL.
        nrows: Maximum number of rows to print and to parse.

    Returns:
        None. Everything is printed. On a non-200 response only an error line
        is printed.
    """
    month_str = f"{month:02d}"
    url = f"http://data.gdeltproject.org/events/{year}{month_str}.zip"
    print(f"⬇️ Downloading: {url}")

    # A timeout makes a stalled connection raise instead of hanging the kernel.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        print(f"❌ Error downloading {url}")
        return

    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        # Only the first member of the archive is inspected.
        filename = z.namelist()[0]
        with z.open(filename) as f:
            print(f"\n📄 Showing {nrows} raw rows (without header):\n")
            for _ in range(nrows):
                raw_line = f.readline()
                if not raw_line:
                    # End of file reached before nrows lines: stop instead of
                    # printing empty lines.
                    break
                # errors="replace" keeps the preview going if a row holds
                # bytes that are not valid UTF-8.
                print(raw_line.decode("utf-8", errors="replace").strip())

        # Re-open the member so pandas parses from the first row again.
        with z.open(filename) as f:
            df = pd.read_csv(f, sep='\t', header=None, nrows=nrows)
            print(f"\n📊 Number of columns detected: {df.shape[1]}")
            print(f"\n📋 Partial DataFrame:\n{df.head()}")

In [None]:
# Peek at the January 2006 monthly archive with the default number of rows.
inspect_monthly_gdelt(year=2006, month=1)

⬇️ Descargando: http://data.gdeltproject.org/events/200601.zip

📄 Mostrando 5 filas crudas (sin encabezado):

175632684	20060101	200601	2006	2006.0027											AFG	AFGHANISTAN	AFG								0	040	040	04	1	1	2	1	1	3.70125578321216					0	0	0	4	Kandahar, Kandahar, Afghanistan	AF	AF23	31.6133	65.7101	-3379064	4	Kandahar, Kandahar, Afghanistan	AF	AF23	31.6133	65.7101	6281982	20130203
175632685	20060101	200601	2006	2006.0027											AFG	KANDAHAR	AFG								0	040	040	04	1	1	3	1	1	3.70125578321216					0	0	0	4	Kandahar, Kandahar, Afghanistan	AF	AF23	31.6133	65.7101	-3379064	4	Panjwai, Kandahar, Afghanistan	AF	AF23	31.5253	65.4585	9088665	20130203
175632686	20060101	200601	2006	2006.0027											AFG	AFGHANISTAN	AFG								0	040	040	04	1	1	4	1	1	3.70125578321216					0	0	0	4	Panjwai, Kandahar, Afghanistan	AF	AF23	31.5253	65.4585	9088665	4	Panjwai, Kandahar, Afghanistan	AF	AF23	31.5253	65.4585	9088665	20130203
175632687	20060101	200601	2006	2006.0027											AFG	AFGHAN	AFG								1	040	040	04	1

In [None]:
def inspect_monthly_gdelt_with_header(year, month, nrows=2):
    """Download one monthly GDELT archive and print its first rows with column names.

    The archive has no header row, so names are assigned by position from the
    GDELT 1.0 event field list. Files without the trailing ``SOURCEURL`` field
    (57 columns) are labelled with the first 57 names; files that include it
    get all 58.

    Args:
        year: Four-digit year of the archive.
        month: Month number; zero-padded to two digits when building the URL.
        nrows: Number of rows to read and print.

    Returns:
        None. Everything is printed. On a non-200 response only an error line
        is printed.
    """
    month_str = f"{month:02d}"
    url = f"http://data.gdeltproject.org/events/{year}{month_str}.zip"
    print(f"⬇️ Downloading: {url}")

    # A timeout makes a stalled connection raise instead of hanging the kernel.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        print(f"❌ Download error {url}")
        return

    # Full positional field list. The ActorNType{1,2,3}Code fields must be
    # present: without them EventCode no longer lands on position 26 (the
    # position the counting cells read) and every later label is shifted.
    columns = [
        "GLOBALEVENTID", "SQLDATE", "MonthYear", "Year", "FractionDate",
        "Actor1Code", "Actor1Name", "Actor1CountryCode", "Actor1KnownGroupCode",
        "Actor1EthnicCode", "Actor1Religion1Code", "Actor1Religion2Code",
        "Actor1Type1Code", "Actor1Type2Code", "Actor1Type3Code",
        "Actor2Code", "Actor2Name", "Actor2CountryCode", "Actor2KnownGroupCode",
        "Actor2EthnicCode", "Actor2Religion1Code", "Actor2Religion2Code",
        "Actor2Type1Code", "Actor2Type2Code", "Actor2Type3Code",
        "IsRootEvent", "EventCode", "EventBaseCode", "EventRootCode",
        "QuadClass", "GoldsteinScale", "NumMentions", "NumSources",
        "NumArticles", "AvgTone", "Actor1Geo_Type", "Actor1Geo_FullName",
        "Actor1Geo_CountryCode", "Actor1Geo_ADM1Code", "Actor1Geo_Lat",
        "Actor1Geo_Long", "Actor1Geo_FeatureID", "Actor2Geo_Type",
        "Actor2Geo_FullName", "Actor2Geo_CountryCode", "Actor2Geo_ADM1Code",
        "Actor2Geo_Lat", "Actor2Geo_Long", "Actor2Geo_FeatureID",
        "ActionGeo_Type", "ActionGeo_FullName", "ActionGeo_CountryCode",
        "ActionGeo_ADM1Code", "ActionGeo_Lat", "ActionGeo_Long",
        "ActionGeo_FeatureID", "DATEADDED", "SOURCEURL"
    ]

    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        # Only the first member of the archive is inspected.
        filename = z.namelist()[0]
        with z.open(filename) as f:
            # Read positionally first. Passing fewer names than the file has
            # columns makes pandas turn the surplus leading columns into the
            # index, which mislabels every field.
            df = pd.read_csv(f, sep='\t', header=None, nrows=nrows)

    # Label by position; any column beyond the known list gets a placeholder.
    width = df.shape[1]
    df.columns = columns[:width] + [f"Unnamed_{i}" for i in range(len(columns), width)]

    print(f"\n📋 First {nrows} rows of the file {year}-{month_str}:\n")
    print(df.head(nrows))
    print(f"\n🧾 Total detected columns: {df.shape[1]}")

In [None]:
# Labelled preview of the January 2006 monthly archive (default row count).
inspect_monthly_gdelt_with_header(year=2006, month=1)

⬇️ Descargando: http://data.gdeltproject.org/events/200601.zip

📋 Primeras 2 filas del archivo 2006-01:

                                          GLOBALEVENTID  SQLDATE  MonthYear  \
175632684 20060101 200601 2006 2006.0027            NaN      NaN        NaN   
175632685 20060101 200601 2006 2006.0027            NaN      NaN        NaN   

                                          Year  FractionDate  Actor1Code  \
175632684 20060101 200601 2006 2006.0027   NaN           NaN         NaN   
175632685 20060101 200601 2006 2006.0027   NaN           NaN         NaN   

                                          Actor1Name  Actor1CountryCode  \
175632684 20060101 200601 2006 2006.0027         NaN                NaN   
175632685 20060101 200601 2006 2006.0027         NaN                NaN   

                                          Actor1KnownGroupCode  \
175632684 20060101 200601 2006 2006.0027                   NaN   
175632685 20060101 200601 2006 2006.0027                   NaN   

   

In [None]:
# Same labelled preview for a later month (June 2008).
inspect_monthly_gdelt_with_header(year=2008, month=6)

⬇️ Descargando: http://data.gdeltproject.org/events/200806.zip

📋 Primeras 5 filas del archivo 2008-06:

                                          GLOBALEVENTID  SQLDATE  MonthYear  \
174542847 20080601 200806 2008 2008.4137            NaN      NaN        NaN   
174542848 20080601 200806 2008 2008.4137            NaN      NaN        NaN   
174542849 20080601 200806 2008 2008.4137            NaN      NaN        NaN   
174542850 20080601 200806 2008 2008.4137            NaN      NaN        NaN   
174542851 20080601 200806 2008 2008.4137            NaN      NaN        NaN   

                                          Year  FractionDate  Actor1Code  \
174542847 20080601 200806 2008 2008.4137   NaN           NaN         NaN   
174542848 20080601 200806 2008 2008.4137   NaN           NaN         NaN   
174542849 20080601 200806 2008 2008.4137   NaN           NaN         NaN   
174542850 20080601 200806 2008 2008.4137   NaN           NaN         NaN   
174542851 20080601 200806 2008 2008.4137

In [None]:
# GDELT EventCode values to keep; every other event is dropped before counting.
event_codes = [
    '1413', '1414', '1431', '1432', '1433', '1434',
    '1451', '1452', '1453', '1454',
    '161', '163', '174', '175', '176',
    '190', '191', '192', '193', '194', '196',
    '200', '201', '202', '203', '2041', '2042'
]

def process_year_file(year,month):
    """Count the selected event codes in one monthly archive and download the result.

    Downloads ``{year}{month:02d}.zip``, keeps the rows whose EventCode is in
    ``event_codes``, counts them per (Year, Month, EventCode), writes the counts
    to a CSV under ``/content`` and triggers a browser download of that CSV.

    Note: a later cell defines ``process_year_file(year)`` for the yearly
    archives; once that cell runs it replaces this two-argument version.

    Args:
        year: Four-digit year of the archive.
        month: Month number; zero-padded to two digits when building the URL.

    Returns:
        None. On a non-200 response only an error line is printed.
    """
    month_str = f"{month:02d}"
    url = f"http://data.gdeltproject.org/events/{year}{month_str}.zip"
    print(f" Downloading: {url}")

    # A timeout makes a stalled connection raise instead of hanging the kernel.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        print(f" Could not download. Error in {url}")
        return

    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        filename = z.namelist()[0]
        with z.open(filename) as f:
            # Columns: MonthYear (2) and EventCode (26).
            # dtype=str keeps the codes as text so they compare against event_codes.
            df = pd.read_csv(f, sep='\t', header=None, usecols=[2, 26], dtype=str)

    df = df[df[26].isin(event_codes)].copy()
    # MonthYear is YYYYMM: split it into numeric year and month.
    df['Year'] = df[2].str[:4].astype(int)
    df['Month'] = df[2].str[4:6].astype(int)
    df = df.rename(columns={26: 'EventCode'})

    result = df.groupby(['Year', 'Month', 'EventCode']).size().reset_index(name='count')

    # Include the month in the file name: with the year alone, every month of
    # the same year overwrote the previous month's CSV.
    file_path = f"/content/gdelt_event_counts_{year}{month_str}.csv"
    result.to_csv(file_path, index=False)
    print(f" File saved in: {file_path}")
    files.download(file_path)

In [None]:
def view_gdelt_dataframe(year, month, nrows=5):
    """Return the first rows of one monthly GDELT archive as a labelled DataFrame.

    The archive has no header row, so names are assigned by position from the
    GDELT 1.0 event field list. Files without the trailing ``SOURCEURL`` field
    (57 columns) are labelled with the first 57 names.

    Args:
        year: Four-digit year of the archive.
        month: Month number; zero-padded to two digits when building the URL.
        nrows: Number of rows to read.

    Returns:
        A DataFrame with up to ``nrows`` rows, or None when the download does
        not answer with status 200. The frame is returned rather than printed,
        so a notebook renders it as the cell result.
    """
    month_str = f"{month:02d}"
    url = f"http://data.gdeltproject.org/events/{year}{month_str}.zip"
    print(f"⬇️ Downloading file from {year}...")

    # A timeout makes a stalled connection raise instead of hanging the kernel.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        print(f"❌ Error downloading {url}")
        return

    # Official list of GDELT 1.0 columns, in file order. The
    # ActorNType{1,2,3}Code fields must be present: without them EventCode no
    # longer lands on position 26 and every later label is shifted.
    columns = [
        "GLOBALEVENTID", "SQLDATE", "MonthYear", "Year", "FractionDate",
        "Actor1Code", "Actor1Name", "Actor1CountryCode", "Actor1KnownGroupCode", "Actor1EthnicCode", "Actor1Religion1Code", "Actor1Religion2Code",
        "Actor1Type1Code", "Actor1Type2Code", "Actor1Type3Code",
        "Actor2Code", "Actor2Name", "Actor2CountryCode", "Actor2KnownGroupCode", "Actor2EthnicCode", "Actor2Religion1Code", "Actor2Religion2Code",
        "Actor2Type1Code", "Actor2Type2Code", "Actor2Type3Code",
        "IsRootEvent", "EventCode", "EventBaseCode", "EventRootCode",
        "QuadClass", "GoldsteinScale", "NumMentions", "NumSources", "NumArticles",
        "AvgTone", "Actor1Geo_Type", "Actor1Geo_FullName", "Actor1Geo_CountryCode", "Actor1Geo_ADM1Code", "Actor1Geo_Lat", "Actor1Geo_Long", "Actor1Geo_FeatureID",
        "Actor2Geo_Type", "Actor2Geo_FullName", "Actor2Geo_CountryCode", "Actor2Geo_ADM1Code", "Actor2Geo_Lat", "Actor2Geo_Long", "Actor2Geo_FeatureID",
        "ActionGeo_Type", "ActionGeo_FullName", "ActionGeo_CountryCode", "ActionGeo_ADM1Code", "ActionGeo_Lat", "ActionGeo_Long", "ActionGeo_FeatureID",
        "DATEADDED", "SOURCEURL"
    ]

    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        filename = z.namelist()[0]
        with z.open(filename) as f:
            # Read positionally first. Passing fewer names than the file has
            # columns makes pandas turn the surplus leading columns into the
            # index, which mislabels every field.
            df_1 = pd.read_csv(f, sep='\t', header=None, nrows=nrows)

    # Label by position; any column beyond the known list gets a placeholder.
    width = df_1.shape[1]
    df_1.columns = columns[:width] + [f"Unnamed_{i}" for i in range(len(columns), width)]

    # The summary used to sit after the return statement and never ran.
    print(f"\n🧾 Available columns: {len(df_1.columns)} columns")
    return df_1


In [None]:
# The returned frame is the cell's last expression, so the notebook renders it.
view_gdelt_dataframe(year=2006, month=8)

⬇️ Descargando archivo de 2006...


Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,GLOBALEVENTID,SQLDATE,MonthYear,Year,FractionDate,Actor1Code,Actor1Name,Actor1CountryCode,Actor1KnownGroupCode,Actor1EthnicCode,...,Actor2Geo_FeatureID,ActionGeo_Type,ActionGeo_FullName,ActionGeo_CountryCode,ActionGeo_ADM1Code,ActionGeo_Lat,ActionGeo_Long,ActionGeo_FeatureID,DATEADDED,SOURCEURL
178886138,20060801,200608,2006,2006.5781,,,,,,,,,,,...,62.8669,-3393222,4,"Shirzad, Herat, Afghanistan",AF,AF11,33.6018,62.8669,-3393222.0,20130203
178886139,20060801,200608,2006,2006.5781,,,,,,,,,,,...,65.4343,-3386490,0,"Pashmul, Kandahar, Afghanistan",AF,AF23,31.5572,65.4343,,20130203
178886140,20060801,200608,2006,2006.5781,,,,,,,,,,,...,-75.7,-570760,4,"Ottawa, Ontario, Canada",CA,CA08,45.4167,-75.7,-570760.0,20130203
178886141,20060801,200608,2006,2006.5781,,,,,,,,,,,...,64.0,-3377342,5,"Helmand, Helmand, Afghanistan",AF,AF10,31.0,64.0,,20130203
178886142,20060801,200608,2006,2006.5781,,,,,,,,,,,...,69.1833,-3378435,4,"Kabul, Kabol, Afghanistan",AF,AF13,34.5167,69.1833,-3378435.0,20130203




# Running GDELT for all months and years

There are three structures for the GDELT files:
- From 1990 to 2005, the files are organized by year (one archive per year).
- From January 2006 to March 2013, they are organized by month (one archive per month).
- From April 2013 onwards, they are organized by day (one archive per day).


First, we will download all files from 1990 to 2005:

In [None]:
# GDELT EventCode values to keep; every other event is dropped before counting.
event_codes = [
    '1413', '1414', '1431', '1432', '1433', '1434',
    '1451', '1452', '1453', '1454',
    '161', '163', '174', '175', '176',
    '190', '191', '192', '193', '194', '196',
    '200', '201', '202', '203', '2041', '2042'
]

def process_year_file(year):
    """Count the selected event codes in one yearly archive and download the result.

    Downloads ``{year}.zip``, keeps the rows whose EventCode is in
    ``event_codes``, counts them per (Year, Month, EventCode), writes the counts
    to ``/content/gdelt_event_counts_{year}.csv`` and triggers a browser
    download of that CSV.

    Args:
        year: Four-digit year of the archive.

    Returns:
        None. On a non-200 response only an error line is printed.
    """
    url = f"http://data.gdeltproject.org/events/{year}.zip"
    print(f" Downloading: {url}")

    # A timeout makes a stalled connection raise instead of hanging the kernel.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        print(f" Could not download. Error in {url}")
        return

    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        filename = z.namelist()[0]
        with z.open(filename) as f:
            # Columns: MonthYear (2) and EventCode (26).
            # dtype=str keeps the codes as text so they compare against event_codes.
            df = pd.read_csv(f, sep='\t', header=None, usecols=[2, 26], dtype=str)

    df = df[df[26].isin(event_codes)].copy()
    # MonthYear is YYYYMM: split it into numeric year and month.
    df['Year'] = df[2].str[:4].astype(int)
    df['Month'] = df[2].str[4:6].astype(int)
    df = df.rename(columns={26: 'EventCode'})

    result = df.groupby(['Year', 'Month', 'EventCode']).size().reset_index(name='count')

    file_path = f"/content/gdelt_event_counts_{year}.csv"
    result.to_csv(file_path, index=False)
    print(f" File saved in: {file_path}")
    files.download(file_path)

In [None]:
# Walk through every yearly archive, 1990 through 2005. A failure in one year
# is reported and the loop moves on to the next year.
for year in range(1990, 2005 + 1):
    print(f"\n Processing {year}...")
    try:
        process_year_file(year)
    except Exception as e:
        print(f" Error in {year}: {e}")


 Processing 2000...
 Downloading: http://data.gdeltproject.org/events/2000.zip
 File saved in: /content/gdelt_event_counts_2000.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2001...
 Downloading: http://data.gdeltproject.org/events/2001.zip
 File saved in: /content/gdelt_event_counts_2001.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2002...
 Downloading: http://data.gdeltproject.org/events/2002.zip
 File saved in: /content/gdelt_event_counts_2002.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2003...
 Downloading: http://data.gdeltproject.org/events/2003.zip
 File saved in: /content/gdelt_event_counts_2003.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2004...
 Downloading: http://data.gdeltproject.org/events/2004.zip
 File saved in: /content/gdelt_event_counts_2004.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2005...
 Downloading: http://data.gdeltproject.org/events/2005.zip
 File saved in: /content/gdelt_event_counts_2005.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Now, we download the monthly files from January 2006 to March 2013:

In [None]:
# GDELT EventCode values to keep; every other event is dropped before counting.
event_codes = [
    '1413', '1414', '1431', '1432', '1433', '1434',
    '1451', '1452', '1453', '1454',
    '161', '163', '174', '175', '176',
    '190', '191', '192', '193', '194', '196',
    '200', '201', '202', '203', '2041', '2042'
]

def process_month_year_file(year, month):
    """Count the selected event codes in one monthly archive and download the result.

    Downloads ``{year}{month:02d}.zip``, keeps the rows whose EventCode is in
    ``event_codes``, counts them per (Year, Month, EventCode), writes the counts
    to ``/content/gdelt_event_counts_{year}{month:02d}.csv`` and triggers a
    browser download of that CSV.

    Args:
        year: Four-digit year of the archive.
        month: Month number; zero-padded to two digits in the URL and file name.

    Returns:
        None. On a non-200 response only an error line is printed.
    """
    month_str = f"{month:02d}"
    url = f"http://data.gdeltproject.org/events/{year}{month_str}.zip"
    print(f" Downloading: {url}")

    # A timeout makes a stalled connection raise instead of hanging the kernel.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        print(f" Could not download. Error in {url}")
        return

    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        filename = z.namelist()[0]
        with z.open(filename) as f:
            # Columns: MonthYear (2) and EventCode (26).
            # dtype=str keeps the codes as text so they compare against event_codes.
            df = pd.read_csv(f, sep='\t', header=None, usecols=[2, 26], dtype=str)

    df = df[df[26].isin(event_codes)].copy()
    # MonthYear is YYYYMM: split it into numeric year and month.
    df['Year'] = df[2].str[:4].astype(int)
    df['Month'] = df[2].str[4:6].astype(int)
    df = df.rename(columns={26: 'EventCode'})

    result = df.groupby(['Year', 'Month', 'EventCode']).size().reset_index(name='count')

    file_path = f"/content/gdelt_event_counts_{year}{month_str}.csv"
    result.to_csv(file_path, index=False)
    print(f" File saved in: {file_path}")
    files.download(file_path)

In [None]:
# Single-month run (December 2010).
process_month_year_file(year=2010, month=12)

 Downloading: http://data.gdeltproject.org/events/201012.zip
 File saved in: /content/gdelt_event_counts_201012.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Monthly archives end with 2013-03; from 2013-04 on the data is published as
# daily files, so asking for 2013-04..06 here could only fail with a download
# error. Stop at March.
for year in range(2013, 2014):
    for month in range(1, 4):
        try:
            print(f"\n Processing {year}-{month:02d}...")
            process_month_year_file(year, month)
        except Exception as e:
            # Keep going: one bad month should not stop the remaining ones.
            print(f" Error in {year}-{month:02d}: {e}")


 Processing 2013-01...
 Downloading: http://data.gdeltproject.org/events/201301.zip
 File saved in: /content/gdelt_event_counts_201301.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2013-02...
 Downloading: http://data.gdeltproject.org/events/201302.zip
 File saved in: /content/gdelt_event_counts_201302.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2013-03...
 Downloading: http://data.gdeltproject.org/events/201303.zip
 File saved in: /content/gdelt_event_counts_201303.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2013-04...
 Downloading: http://data.gdeltproject.org/events/201304.zip
 Could not download. Error in http://data.gdeltproject.org/events/201304.zip

 Processing 2013-05...
 Downloading: http://data.gdeltproject.org/events/201305.zip
 Could not download. Error in http://data.gdeltproject.org/events/201305.zip

 Processing 2013-06...
 Downloading: http://data.gdeltproject.org/events/201306.zip
 Could not download. Error in http://data.gdeltproject.org/events/201306.zip


In [None]:
# Second half of 2012, one monthly archive per iteration. A failing month is
# reported and the loop continues with the next one.
year = 2012
for month in range(7, 12 + 1):
    print(f"\n Processing {year}-{month:02d}...")
    try:
        process_month_year_file(year, month)
    except Exception as e:
        print(f" Error in {year}-{month:02d}: {e}")


 Processing 2012-07...
 Downloading: http://data.gdeltproject.org/events/201207.zip
 File saved in: /content/gdelt_event_counts_201207.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2012-08...
 Downloading: http://data.gdeltproject.org/events/201208.zip
 File saved in: /content/gdelt_event_counts_201208.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2012-09...
 Downloading: http://data.gdeltproject.org/events/201209.zip
 File saved in: /content/gdelt_event_counts_201209.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2012-10...
 Downloading: http://data.gdeltproject.org/events/201210.zip
 File saved in: /content/gdelt_event_counts_201210.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2012-11...
 Downloading: http://data.gdeltproject.org/events/201211.zip
 File saved in: /content/gdelt_event_counts_201211.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


 Processing 2012-12...
 Downloading: http://data.gdeltproject.org/events/201212.zip
 File saved in: /content/gdelt_event_counts_201212.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Now, we download the files from April 2013 to December 2024:

In [None]:
# GDELT EventCode values to keep; every other event is dropped before counting.
event_codes = [
    '1413', '1414', '1431', '1432', '1433', '1434',
    '1451', '1452', '1453', '1454',
    '161', '163', '174', '175', '176',
    '190', '191', '192', '193', '194', '196',
    '200', '201', '202', '203', '2041', '2042'
]

# Folder that collects this half-year's monthly CSVs before they are zipped.
start_year = 2024
end_year = start_year + 1
output_dir = f"/content/gdelt_{start_year}_1"
os.makedirs(output_dir, exist_ok=True)

def get_days_in_month(year, month):
    """Return the number of days in ``month`` of ``year``."""
    return calendar.monthrange(year, month)[1]

# One daily archive per calendar day; the daily counts of a month are stacked
# into a single CSV for that month.
for year in range(start_year,end_year):
    for month in range(1, 7):  # First semester
        print(f"\n--- Processing {year}-{month:02d} ---")
        monthly_results = []
        for day in range(1, get_days_in_month(year, month) + 1):
            date_str = f"{year}{month:02d}{day:02d}"
            url = f"http://data.gdeltproject.org/events/{date_str}.export.CSV.zip"
            try:
                r = requests.get(url, timeout=30)
                if r.status_code != 200:
                    # A missing day is reported and skipped; the month's file
                    # then simply has no rows for that day.
                    print(f" Failed to download {url}")
                    continue
                with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                    filename = z.namelist()[0]
                    with z.open(filename) as f:
                        # Column 26 is EventCode; column 0 is loaded alongside
                        # it but not used afterwards.
                        df = pd.read_csv(f, sep='\t', header=None, usecols=[0, 26], dtype=str)

                df = df[df[26].isin(event_codes)].copy()
                # The date comes from the file name, not from a column in the file.
                df['Year'] = year
                df['Month'] = month
                df['Day'] = day
                df = df.rename(columns={26: 'EventCode'})

                daily_counts = df.groupby(['Year', 'Month', 'Day', 'EventCode']).size().reset_index(name='count')
                monthly_results.append(daily_counts)

            except Exception as e:
                print(f"Error processing {date_str}: {e}")

        # Join all the days of the month
        if monthly_results:
            monthly_df = pd.concat(monthly_results, ignore_index=True)
            csv_path = os.path.join(output_dir, f"gdelt_event_counts_{year}{month:02d}.csv")
            monthly_df.to_csv(csv_path, index=False)
            print(f"Saved monthly CSV: {csv_path}")
        else:
            print(f"No data for {year}-{month:02d}")

# Compress all CSVs into one ZIP. ZipFile stores members uncompressed unless a
# compression method is given, so ask for DEFLATE explicitly; sorting keeps the
# member order stable between runs.
zip_path = f"/content/gdelt_{start_year}_1.zip"
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for file_name in sorted(os.listdir(output_dir)):
        full_path = os.path.join(output_dir, file_name)
        zipf.write(full_path, arcname=file_name)

# Downloading final zip
files.download(zip_path)


--- Processing 2024-01 ---
Saved monthly CSV: /content/gdelt_2024_1/gdelt_event_counts_202401.csv

--- Processing 2024-02 ---
Saved monthly CSV: /content/gdelt_2024_1/gdelt_event_counts_202402.csv

--- Processing 2024-03 ---
Saved monthly CSV: /content/gdelt_2024_1/gdelt_event_counts_202403.csv

--- Processing 2024-04 ---
Saved monthly CSV: /content/gdelt_2024_1/gdelt_event_counts_202404.csv

--- Processing 2024-05 ---
Saved monthly CSV: /content/gdelt_2024_1/gdelt_event_counts_202405.csv

--- Processing 2024-06 ---
Saved monthly CSV: /content/gdelt_2024_1/gdelt_event_counts_202406.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# GDELT EventCode values to keep; every other event is dropped before counting.
event_codes = [
    '1413', '1414', '1431', '1432', '1433', '1434',
    '1451', '1452', '1453', '1454',
    '161', '163', '174', '175', '176',
    '190', '191', '192', '193', '194', '196',
    '200', '201', '202', '203', '2041', '2042'
]

# Folder that collects this half-year's monthly CSVs before they are zipped.
start_year = 2024
end_year = start_year + 1
output_dir = f"/content/gdelt_{start_year}_2"
os.makedirs(output_dir, exist_ok=True)

def get_days_in_month(year, month):
    """Return the number of days in ``month`` of ``year``."""
    return calendar.monthrange(year, month)[1]

# One daily archive per calendar day; the daily counts of a month are stacked
# into a single CSV for that month.
for year in range(start_year,end_year):
    for month in range(7, 13):  # Second semester
        print(f"\n--- Processing {year}-{month:02d} ---")
        monthly_results = []
        for day in range(1, get_days_in_month(year, month) + 1):
            date_str = f"{year}{month:02d}{day:02d}"
            url = f"http://data.gdeltproject.org/events/{date_str}.export.CSV.zip"
            try:
                r = requests.get(url, timeout=30)
                if r.status_code != 200:
                    # A missing day is reported and skipped; the month's file
                    # then simply has no rows for that day.
                    print(f" Failed to download {url}")
                    continue
                with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                    filename = z.namelist()[0]
                    with z.open(filename) as f:
                        # Column 26 is EventCode; column 0 is loaded alongside
                        # it but not used afterwards.
                        df = pd.read_csv(f, sep='\t', header=None, usecols=[0, 26], dtype=str)

                df = df[df[26].isin(event_codes)].copy()
                # The date comes from the file name, not from a column in the file.
                df['Year'] = year
                df['Month'] = month
                df['Day'] = day
                df = df.rename(columns={26: 'EventCode'})

                daily_counts = df.groupby(['Year', 'Month', 'Day', 'EventCode']).size().reset_index(name='count')
                monthly_results.append(daily_counts)

            except Exception as e:
                print(f"Error processing {date_str}: {e}")

        # Join all the days of the month
        if monthly_results:
            monthly_df = pd.concat(monthly_results, ignore_index=True)
            csv_path = os.path.join(output_dir, f"gdelt_event_counts_{year}{month:02d}.csv")
            monthly_df.to_csv(csv_path, index=False)
            print(f"Saved monthly CSV: {csv_path}")
        else:
            print(f"No data for {year}-{month:02d}")

# Compress all CSVs into one ZIP. ZipFile stores members uncompressed unless a
# compression method is given, so ask for DEFLATE explicitly; sorting keeps the
# member order stable between runs.
zip_path = f"/content/gdelt_{start_year}_2.zip"
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for file_name in sorted(os.listdir(output_dir)):
        full_path = os.path.join(output_dir, file_name)
        zipf.write(full_path, arcname=file_name)

# Downloading final zip
files.download(zip_path)


--- Processing 2024-07 ---
Saved monthly CSV: /content/gdelt_2024_2/gdelt_event_counts_202407.csv

--- Processing 2024-08 ---
Saved monthly CSV: /content/gdelt_2024_2/gdelt_event_counts_202408.csv

--- Processing 2024-09 ---
Saved monthly CSV: /content/gdelt_2024_2/gdelt_event_counts_202409.csv

--- Processing 2024-10 ---
Saved monthly CSV: /content/gdelt_2024_2/gdelt_event_counts_202410.csv

--- Processing 2024-11 ---
Saved monthly CSV: /content/gdelt_2024_2/gdelt_event_counts_202411.csv

--- Processing 2024-12 ---
Saved monthly CSV: /content/gdelt_2024_2/gdelt_event_counts_202412.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Consolidating everything in one single file

In [None]:
# Upload the zip of per-period count CSVs; `uploaded` keeps whatever
# files.upload() returns for the selected file(s).
uploaded = files.upload()

Saving GDELT.zip to GDELT (1).zip


In [None]:
# Colab renames a repeated upload ("GDELT (1).zip", "GDELT (2).zip", ...), so
# extract whichever GDELT archive was written most recently instead of always
# re-reading the first upload. Falls back to "GDELT.zip" when nothing matches.
uploaded_archives = glob("GDELT.zip") + glob("GDELT (*).zip")
archive_path = max(uploaded_archives, key=os.path.getmtime) if uploaded_archives else "GDELT.zip"
with zipfile.ZipFile(archive_path, "r") as zip_ref:
    zip_ref.extractall("/content/gdelt_files")

In [None]:
# Collect the extracted per-period count CSVs and preview a few of the paths.
csv_files = glob('/content/gdelt_files/GDELT-csv/*.csv')
found_total = len(csv_files)
print(f"Found files: {found_total}")
print(csv_files[0:5])

Archivos encontrados: 244
['/content/gdelt_files/GDELT-csv/gdelt_event_counts_1991.csv', '/content/gdelt_files/GDELT-csv/gdelt_event_counts_202306.csv', '/content/gdelt_files/GDELT-csv/gdelt_event_counts_201309.csv', '/content/gdelt_files/GDELT-csv/gdelt_event_counts_201201.csv', '/content/gdelt_files/GDELT-csv/gdelt_event_counts_202001.csv']


In [None]:
# We use the files extracted from /content/gdelt_files/
csv_files = glob('/content/gdelt_files/GDELT-csv/*.csv')

# Per-file frames, each already reduced to Year / Month / EventCode / count
df_list = []
# Files that could not be read, reported again at the end so an incomplete
# consolidation does not go unnoticed.
failed_files = []

print(f"Found files: {len(csv_files)}")
print(csv_files[:5])  # shows the first 5 to verify

for file in csv_files:
    try:
        df = pd.read_csv(file)

        # Check whether the file has a 'Day' column (daily-derived counts)
        if 'Day' in df.columns:
            # Group by Year, Month and EventCode
            df_grouped = df.groupby(['Year', 'Month', 'EventCode'], as_index=False)['count'].sum()
            df_list.append(df_grouped)
        else:
            # It is already grouped by month
            df_list.append(df)
    except Exception as e:
        print(f"Error reading {file}: {e}")
        failed_files.append(file)

# pd.concat cannot work on an empty list; fail here with a message that says
# what is actually wrong (same exception type as before).
if not df_list:
    raise ValueError("No count CSVs could be loaded from /content/gdelt_files/GDELT-csv/")

if failed_files:
    print(f"Warning: {len(failed_files)} file(s) were skipped and are missing from the result")

# Concatenate all dataframes
df_all = pd.concat(df_list, ignore_index=True)

# Sort and leave only desired columns
df_all = df_all[['Year', 'Month', 'EventCode', 'count']]
df_all = df_all.sort_values(by=['Year', 'Month', 'EventCode']).reset_index(drop=True)

# See a sample
print(df_all.head())

# Save final CSV in Colab
output_path = '/content/gdelt_event_counts_1990_2024.csv'
df_all.to_csv(output_path, index=False)

# Download final file
from google.colab import files
files.download(output_path)

Archivos encontrados: 244
['/content/gdelt_files/GDELT-csv/gdelt_event_counts_1991.csv', '/content/gdelt_files/GDELT-csv/gdelt_event_counts_202306.csv', '/content/gdelt_files/GDELT-csv/gdelt_event_counts_201309.csv', '/content/gdelt_files/GDELT-csv/gdelt_event_counts_201201.csv', '/content/gdelt_files/GDELT-csv/gdelt_event_counts_202001.csv']
   Year  Month  EventCode  count
0  1990      1        161    387
1  1990      1        163    120
2  1990      1        174    214
3  1990      1        175     63
4  1990      1        190   4622


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>