In [1]:
import pandas as pd
import networkx as nx

import os
import glob
import gc

In [2]:
def merge_data(file_paths, output_path):
    """Merge multiple text files into a single DataFrame and save as CSV.

    Args:
        file_paths (list): List of file paths to be merged.
        output_path (str): Path to the output CSV file.
    """
    all_files = []
    for path in file_paths:
        all_files.extend(glob.glob(path))
        
    if not all_files:
        print("No files found.")
        return pd.DataFrame()
    
    df_list = []
    for file in all_files:
        try:
            try:
                df = pd.read_csv(file, sep="\t", low_memory=False)
            except pd.errors.ParserError:
                print(f"Error reading {file}. Trying with different separator.")
                df = pd.read_csv(file, sep=",", low_memory=False)
            except Exception as e:
                print(f"Error reading {file}: {e}")
                continue
            df_list.append(df)
        except Exception as e:
            print(f"Error processing {file}: {e}")
            
    if not df_list:
        print("No valid files to merge.")
        return pd.DataFrame()
    
    merged_df = pd.concat(df_list, ignore_index=True)
    merged_df.drop_duplicates(inplace=True)
    merged_df.reset_index(drop=True, inplace=True)
    print(f"Merged data shape: {merged_df.shape}")
    
    merged_df.to_csv(output_path, index=False)
    


In [3]:
file_paths = ["datasets/ngecEvents.DV.2024.txt", "datasets/ngecEvents.DV.2023.txt", "datasets/ngecEvents.DV.2022.txt"]
output_path = "datasets/merged_events.csv"

merge_data(file_paths, output_path)

No files found.


In [4]:
data_org_df = pd.read_csv("/Users/adityasampath/Documents/Drive/College/MS/Sem_3/ITCS-6991_Thesis/Thesis/GraphRAG/data/data_original.csv")

#print each unique value
print("Unique Event Types:")
for event_type in data_org_df['Event Type'].unique():
    print(event_type)

Unique Event Types:
ACCUSE
ASSAULT
AID
REQUEST
PROTEST
COERCE
THREATEN
RETREAT
MOBILIZE
SANCTION
CONCEDE
COOPERATE
CONSULT
REJECT


In [8]:
#Find how many events have a CountryCode
merged_df = pd.read_csv(output_path)
print(f"Number of events with CountryCode: {len(merged_df[merged_df['Country'].notnull()])}")

missing_country_code = merged_df[merged_df['Country'].isnull()]

#Find how many of the missing CountryCode have RawPlacename
missing_raw_placename = missing_country_code[missing_country_code['Raw Placename'].notnull()]
print(f"Number of events with missing CountryCode but with RawPlacename: {len(missing_raw_placename)}")



Number of events with CountryCode: 434142
Number of events with missing CountryCode but with RawPlacename: 352466


In [13]:
#get unique values in Country column
unique_countries = merged_df['Country'].unique()
print(f"Unique countries: {unique_countries}")

Unique countries: [nan 'PSE' 'AZE' 'MAR' 'CHE' 'UKR' 'BRA' 'ZAF' 'HUN' 'ISR' 'KOR' 'LBN'
 'BGR' 'ESP' 'CHL' 'IND' 'SYR' 'ITA' 'COM' 'DEU' 'IDN' 'TZA' 'PRK' 'AFG'
 'GBR' 'BEL' 'UZB' 'EGY' 'VEN' 'PHL' 'SSD' 'YEM' 'THA' 'PER' 'USA' 'CZE'
 'VNM' 'RUS' 'JPN' 'ARM' 'PRI' 'COL' 'AUS' 'NLD' 'NOR' 'ARG' 'IRN' 'FRA'
 'PAK' 'KAZ' 'GEO' 'CHN' 'SWE' 'MLT' 'IRQ' 'TWN' 'LTU' 'BGD' 'ARE' 'MEX'
 'SLV' 'MYS' 'DZA' 'CUB' 'MDG' 'PNG' 'SGP' 'ROU' 'BLR' 'NCL' 'CAN' 'BEN'
 'JOR' 'BIH' 'TUR' 'NZL' 'EST' 'ETH' 'TUN' 'COD' 'SOM' 'HKG' 'BHR' 'KHM'
 'LBR' 'KGZ' 'IRL' 'QAT' 'GUY' 'POL' 'CIV' 'MLI' 'PAN' 'SDN' 'ISL' 'MOZ'
 'GRC' 'NER' 'VGB' 'MMR' 'PRT' 'COG' 'ZMB' 'GTM' 'ECU' 'TKM' 'LVA' 'MTQ'
 'CAF' 'LBY' 'NGA' 'RWA' 'KWT' 'ZWE' 'NAM' 'KEN' 'FIN' 'BDI' 'SEN' 'LUX'
 'AGO' 'SVK' 'URY' 'CMR' 'HRV' 'BOL' 'AUT' 'BRN' 'UGA' 'VIR' 'HTI' 'STP'
 'NIC' 'BFA' 'MDA' 'SYC' 'MKD' 'SVN' 'ATG' 'DOM' 'BRB' 'SAU' 'CRI' 'GAB'
 'VAT' 'TJK' 'MNG' 'MHL' 'DNK' 'CYP' 'HND' 'VCT' 'MDV' 'GRD' 'JAM' 'TCA'
 'ALB' 'SLE' 'TGO' 'GHA' 'TCD' 'LCA

In [10]:
#show values of RawPlacename where CountryCode is missing
print("RawPlacename values where CountryCode is missing:")

placenames = missing_country_code['Raw Placename'].unique()
for placename in placenames:
    print(placename)

RawPlacename values where CountryCode is missing:
Uttar Pradesh 's Deoria
northeastern Kharkiv region
nan
Uttar Pradesh
Hyderabad
Gaza Strip
Kerem Shalom
British
high level EU officials; foreign ministers
Sunday
Saint Therese - Al - Haddat
Lake City
south Donetsk area
central Israel
Kharkov direction
Hamra Street in Beirut
Telangana
Himachal Pradesh
Yerevan 's central Republic Square
Guo Haiyan
Chandigarh
2024
SANA
May 6 ,
Sydney Opera House
Stelmakhovka
north of occupied Jerusalem
Graivoron district of Russia 's Belgorod region
Jung District
last cabinet meeting
International Criminal Court
May 26 ,
Indonesian
downtown Yerevan
San Francisco
Uganda
Criminal Courts; international law obligations; Justice; arrogance
Belgorod region
Hoshiarpur
Djerba island
Kim Jong Un
Vivek Vihar
Gaza , targeting central Israel
party memorabilia; blue colored flags
West Bengal
Chilean police
Sydney Writers Festival
Northern Territory
northern occupied Palestine
Monday last week
Tel Aviv in central Israel

In [11]:
#print count of each unique value of RawPlacename
placename_counts = missing_country_code['Raw Placename'].value_counts()
print("Count of each unique value of RawPlacename where CountryCode is missing:")
for placename, count in placename_counts.items():
    print(f"{placename}: {count}")

Count of each unique value of RawPlacename where CountryCode is missing:
Gaza Strip: 6757
Wednesday: 4792
Tuesday: 4738
Thursday: 4638
Monday: 4205
Friday: 3147
Delhi: 2842
Ukraine: 2376
The Hague: 1784
Sunday: 1656
Moscow: 1531
Uttar Pradesh: 1372
Nagorno - Karabakh: 1349
Donetsk: 1283
Saturday: 1259
West Bengal: 1139
eastern Ukraine: 1110
Gaza: 996
Hong Kong: 980
Parliament: 952
here: 901
Israeli: 860
Ukrainian: 812
Geneva: 812
southern Lebanon: 800
Supreme Court: 781
yesterday: 775
Federal Supreme Court: 757
Sumy region: 700
Zaporozhye: 655
Peru: 646
Telangana: 592
Senate: 588
Zaporizhzhia: 581
Zaporozhye region: 580
parliament: 558
Himachal Pradesh: 558
São Paulo: 539
UN Security Council: 538
Peruvian: 530
Black Sea: 524
southern Ukraine: 523
Istanbul: 501
National Assembly: 465
southern Israel: 457
Donbass: 455
European Parliament: 449
Congress: 444
northern Iraq: 436
today: 435
International Court of Justice: 431
Red Sea: 429
Lok Sabha: 416
capital: 411
Washington: 408
Seoul: 406

In [18]:
#print place names with more than 5 occurrences
print("Place names with more than 5 occurrences:")
for placename, count in placename_counts.items():
    if count > 5:
        print(f"{placename}: {count}")

# Print place names with more than 10 occurrences
print("\nPlace names with more than 10 occurrences:")
for placename, count in placename_counts.items():
    if count > 10:
        print(f"{placename}: {count}")

Place names with more than 5 occurrences:
Gaza Strip: 6757
Wednesday: 4792
Tuesday: 4738
Thursday: 4638
Monday: 4205
Friday: 3147
Delhi: 2842
Ukraine: 2376
The Hague: 1784
Sunday: 1656
Moscow: 1531
Uttar Pradesh: 1372
Nagorno - Karabakh: 1349
Donetsk: 1283
Saturday: 1259
West Bengal: 1139
eastern Ukraine: 1110
Gaza: 996
Hong Kong: 980
Parliament: 952
here: 901
Israeli: 860
Ukrainian: 812
Geneva: 812
southern Lebanon: 800
Supreme Court: 781
yesterday: 775
Federal Supreme Court: 757
Sumy region: 700
Zaporozhye: 655
Peru: 646
Telangana: 592
Senate: 588
Zaporizhzhia: 581
Zaporozhye region: 580
parliament: 558
Himachal Pradesh: 558
São Paulo: 539
UN Security Council: 538
Peruvian: 530
Black Sea: 524
southern Ukraine: 523
Istanbul: 501
National Assembly: 465
southern Israel: 457
Donbass: 455
European Parliament: 449
Congress: 444
northern Iraq: 436
today: 435
International Court of Justice: 431
Red Sea: 429
Lok Sabha: 416
capital: 411
Washington: 408
Seoul: 406
Lima: 403
southern Gaza: 386
n

In [20]:

from geopy.geocoders import Nominatim
from tqdm import tqdm

def fill_countries_from_placename(
    df: pd.DataFrame,
    placename_col: str = 'Raw Placename',
    country_col: str = 'Country',
    min_count: int = 10,
    user_agent: str = 'my_geocoder'
) -> (pd.DataFrame, dict):
    """
    For rows where country_col is null but placename_col is non-null,
    find all placenames with >min_count occurrences, geocode them once
    to get a country, and fill those missing country values.

    Returns:
        - df_filled: DataFrame with country_col filled where possible
        - place_to_country: dict mapping placename -> resolved country (or None)
    """
    # 1. Mask rows needing fill
    mask_missing = df[country_col].isna() & df[placename_col].notna()
    
    # 2. Count placename frequencies in those rows
    counts = df.loc[mask_missing, placename_col].value_counts()
    frequent = counts[counts > min_count].index.tolist()
    
    # 3. Set up geocoder and cache
    geolocator = Nominatim(user_agent=user_agent)
    place_to_country = {}
    
    # 4. Geocode each frequent placename
    for place in tqdm(frequent, desc='Geocoding frequent places'):
        try:
            loc = geolocator.geocode(place, exactly_one=True, addressdetails=True, timeout=10)
            country = loc.raw['address'].get('country') if loc and 'address' in loc.raw else None
        except Exception:
            country = None
        place_to_country[place] = country
    
    # 5. Fill the DataFrame
    sel = mask_missing & df[placename_col].isin(place_to_country)
    df.loc[sel, country_col] = df.loc[sel, placename_col].map(place_to_country)
    
    return df, place_to_country


# Fill countries from placenames
country_df, place_to_country = fill_countries_from_placename(
    merged_df,
    placename_col='Raw Placename',
    country_col='Country',
    min_count=10,
    user_agent='my_geocoder'
)

# Save the updated DataFrame
country_df.to_csv("datasets/merged_events_with_country.csv", index=False)

Geocoding frequent places: 100%|██████████| 3141/3141 [2:30:47<00:00,  2.88s/it]  



In [21]:
print(place_to_country)

{'Gaza Strip': 'Australia', 'Wednesday': 'السودان', 'Tuesday': 'Chile', 'Thursday': 'United States', 'Monday': 'United States', 'Friday': 'Nigeria', 'Delhi': 'India', 'Ukraine': 'Україна', 'The Hague': 'Nederland', 'Sunday': 'Liberia', 'Moscow': 'Россия', 'Uttar Pradesh': 'India', 'Nagorno - Karabakh': 'Հայաստան', 'Donetsk': 'Україна', 'Saturday': 'Liberia', 'West Bengal': 'India', 'eastern Ukraine': 'Україна', 'Gaza': 'Moçambique', 'Hong Kong': '中国', 'Parliament': 'Danmark', 'here': 'United Kingdom', 'Israeli': 'ישראל', 'Ukrainian': 'Україна', 'Geneva': 'Schweiz/Suisse/Svizzera/Svizra', 'southern Lebanon': 'United States', 'Supreme Court': 'India', 'yesterday': 'España', 'Federal Supreme Court': 'Brasil', 'Sumy region': 'Україна', 'Zaporozhye': 'Україна', 'Peru': 'Perú', 'Telangana': 'India', 'Senate': 'România', 'Zaporizhzhia': 'Україна', 'Zaporozhye region': None, 'parliament': 'Danmark', 'Himachal Pradesh': 'India', 'São Paulo': 'Brasil', 'UN Security Council': None, 'Peruvian': 'P

In [22]:
# Drop the rows with missing CountryCode
country_df.dropna(subset=['Country'], inplace=True)
# Reset index
country_df.reset_index(drop=True, inplace=True)

# Save the cleaned DataFrame
country_df.to_csv("datasets/cleaned_events.csv", index=False)

# Load the cleaned DataFrame
cleaned_df = pd.read_csv("datasets/cleaned_events.csv")

cleaned_df.head()

Unnamed: 0,Event ID,Event Date,Event Type,Event Mode,Event Intensity,Quad Code,Contexts,Actor Name,Actor Country,Actor COW,...,GeoNames ID,Raw Placename,Feature Type,Source,Publication Date,Story People,Story Organizations,Story Locations,Language,Version
0,20240526-9075-ddbdb4c6fd56_REQUEST,2024-05-26,REQUEST,,0.0,VERBAL CONFLICT,military | terrorism,Hama,Syria,652,...,281132.0,Gaza,ADM1,Al Jazeera English,2024-05-26,Abu Obeida | Osama Hamdan,Qassam Brigades | the World Court ’s | Hamas |...,Gaza Strip | Rafaḩ | State of Israel | Arab Re...,English,NGEC_coder-Vers001-b1-Run-001
1,20240526-9268-3e5252a90526_AID,2024-05-26,AID,,10.0,MATERIAL COOPERATION,,Ukhnaagiin Khürelsükh,Mongolia,712,...,587084.0,Baku,PPLC,Trend News Agency,2024-05-26,Ukhnaagiin Khürelsükh | Ilham Aliyev,Trend | UNFCCC,Republic of Azerbaijan | Baku | Mongolia,English,NGEC_coder-Vers001-b1-Run-001
2,20240526-9350-6f7d1f553415_AID,2024-05-26,AID,,10.0,MATERIAL COOPERATION,,,Morocco,600,...,2542007.0,Morocco,PCLI,Trend News Agency,2024-05-26,Mohammed VI | Ilham Aliyev,Trend,Republic of Azerbaijan | Kingdom of Morocco,English,NGEC_coder-Vers001-b1-Run-001
3,20240526-9528-fc360e9c8146_REQUEST,2024-05-26,REQUEST,,0.0,VERBAL CONFLICT,,Volodymyr Zelenskyy,Ukraine,369,...,2658434.0,Switzerland,PCLI,DW English,2024-05-26,Volodymyr Zelenskyy | Zelenskyy | Charles Mich...,European Council,Ukraine | Russian Federation | People’s Republ...,English,NGEC_coder-Vers001-b1-Run-001
4,20240526-9528-fc360e9c8146_ACCUSE,2024-05-26,ACCUSE,,-3.0,VERBAL CONFLICT,,,Ukraine,369,...,706483.0,Kharkiv,PPLA,DW English,2024-05-26,Volodymyr Zelenskyy | Zelenskyy | Charles Mich...,European Council,Ukraine | Russian Federation | People’s Republ...,English,NGEC_coder-Vers001-b1-Run-001


In [23]:
gc.collect()

57

In [24]:
#drop columns where country code has special characters
cleaned_df = cleaned_df[~cleaned_df['Country'].str.contains(r'[^a-zA-Z0-9]', na=False)]

cleaned_df.head()

Unnamed: 0,Event ID,Event Date,Event Type,Event Mode,Event Intensity,Quad Code,Contexts,Actor Name,Actor Country,Actor COW,...,GeoNames ID,Raw Placename,Feature Type,Source,Publication Date,Story People,Story Organizations,Story Locations,Language,Version
0,20240526-9075-ddbdb4c6fd56_REQUEST,2024-05-26,REQUEST,,0.0,VERBAL CONFLICT,military | terrorism,Hama,Syria,652,...,281132.0,Gaza,ADM1,Al Jazeera English,2024-05-26,Abu Obeida | Osama Hamdan,Qassam Brigades | the World Court ’s | Hamas |...,Gaza Strip | Rafaḩ | State of Israel | Arab Re...,English,NGEC_coder-Vers001-b1-Run-001
1,20240526-9268-3e5252a90526_AID,2024-05-26,AID,,10.0,MATERIAL COOPERATION,,Ukhnaagiin Khürelsükh,Mongolia,712,...,587084.0,Baku,PPLC,Trend News Agency,2024-05-26,Ukhnaagiin Khürelsükh | Ilham Aliyev,Trend | UNFCCC,Republic of Azerbaijan | Baku | Mongolia,English,NGEC_coder-Vers001-b1-Run-001
2,20240526-9350-6f7d1f553415_AID,2024-05-26,AID,,10.0,MATERIAL COOPERATION,,,Morocco,600,...,2542007.0,Morocco,PCLI,Trend News Agency,2024-05-26,Mohammed VI | Ilham Aliyev,Trend,Republic of Azerbaijan | Kingdom of Morocco,English,NGEC_coder-Vers001-b1-Run-001
3,20240526-9528-fc360e9c8146_REQUEST,2024-05-26,REQUEST,,0.0,VERBAL CONFLICT,,Volodymyr Zelenskyy,Ukraine,369,...,2658434.0,Switzerland,PCLI,DW English,2024-05-26,Volodymyr Zelenskyy | Zelenskyy | Charles Mich...,European Council,Ukraine | Russian Federation | People’s Republ...,English,NGEC_coder-Vers001-b1-Run-001
4,20240526-9528-fc360e9c8146_ACCUSE,2024-05-26,ACCUSE,,-3.0,VERBAL CONFLICT,,,Ukraine,369,...,706483.0,Kharkiv,PPLA,DW English,2024-05-26,Volodymyr Zelenskyy | Zelenskyy | Charles Mich...,European Council,Ukraine | Russian Federation | People’s Republ...,English,NGEC_coder-Vers001-b1-Run-001


In [25]:
import pycountry

def clean_events(
    df: pd.DataFrame,
    date_col: str = "Event Date",
    country_col: str = "Country"
) -> pd.DataFrame:
    # 1. drop unwanted columns
    drop_cols = [
        "Event Mode", "Actor COW", "Primary Actor Sector", "Actor Sectors",
        "Wikipedia Actor ID", "Recipient COW", "Primary Recipient Sector",
        "Recipient Sectors", "Wikipedia Recipient ID", "Placename", "City",
        "District", "Province", "Latitude", "Longitude", "GeoNames ID",
        "Feature Type", "Source", "Story People", "Story Organizations",
        "Story Locations"
    ]
    df = df.drop(columns=drop_cols, errors="ignore")
    
    # 2. parse your event-date column
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    
    # 3. normalize country codes using pycountry
    def to_alpha3(country):
        try:
            return pycountry.countries.lookup(str(country)).alpha_3
        except LookupError:
            return country  # If not found, keep original
    
    df[country_col] = df[country_col].apply(to_alpha3)
    
    # 4. clean all text columns: strip + title-case
    text_cols = df.select_dtypes(include="object").columns
    for col in text_cols:
        df[col] = (
            df[col]
            .astype(str)
            .str.strip()
            .str.title()
            .replace({"Nan": pd.NA})
        )
    
    return df

# Clean the DataFrame
final_cleaned_df = clean_events(
    cleaned_df,
    date_col="Event Date",
    country_col="Country"
)

# Save the cleaned DataFrame
final_cleaned_df.to_csv("datasets/final_cleaned_events.csv", index=False)

In [26]:
gc.collect()

20

In [30]:
#load the cleaned DataFrame
final_df = pd.read_csv("datasets/final_cleaned_events.csv")

# Drop rows where Actor Name Raw and Recipient Name Raw are both null
final_df.dropna(subset=['Actor Name Raw', 'Recipient Name Raw'], inplace=True)

# Reset index
final_df.reset_index(drop=True, inplace=True)

# Save the cleaned DataFrame
final_df.to_csv("datasets/final_cleaned_events.csv", index=False)

In [31]:
#Check for column-wise null values in the cleaned DataFrame
null_counts = final_df.isnull().sum()
print("Null values in each column:")
print(null_counts[null_counts > 0])

Null values in each column:
Contexts             153558
Actor Name           230542
Actor Country         91402
Actor Title           81883
Recipient Name       218097
Recipient Country    114429
Recipient Title      105753
dtype: int64


In [12]:
from collections import Counter

# Load data from a text file
def load_data(file_path, sep=","):
    """
    Reads your CSV (or TSV) into a DataFrame and
    normalizes column names.
    """
    try:
        df = pd.read_csv(file_path, sep=sep, dtype=str, low_memory=False)
        # strip whitespace from the column names
        df.columns = df.columns.str.strip()
        print(f"Successfully loaded {file_path}. Columns: {df.columns.tolist()}")
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return pd.DataFrame()


# Extract actor-recipient pairs
def extract_actor_recipient_pairs(df):
    """
    Extracts actor-recipient pairs from the DataFrame.
    Args:
        df (pd.DataFrame): DataFrame containing event data.
    Returns:
        pd.DataFrame: A DataFrame with actor-recipient pairs, excluding rows with missing values in these specific columns.
    """
    # Ensure required columns exist
    required_cols = ["Actor Name", "Recipient Name", "Event Date"]
    if not all(col in df.columns for col in required_cols):
        missing = [col for col in required_cols if col not in df.columns]
        print(f"Warning: Missing required columns for pair extraction: {missing}. Returning empty DataFrame.")
        return pd.DataFrame(columns=required_cols)
        
    pairs = df.loc[:, required_cols]
    return pairs.dropna()  # Remove rows with missing values in "Actor Name", "Recipient Name", or "Event Date"

# Count the frequency of each actor-recipient pair, excluding pairs with 'None'
def get_most_frequent_relations(df_pairs):
    """
    Counts the frequency of each actor-recipient pair, excluding relations with 'None'.
    Args:
        df_pairs (pd.DataFrame): DataFrame containing actor-recipient pairs and event dates.
    Returns:
        dict: A dictionary of actor-recipient pairs with their counts.
    """
    if df_pairs.empty:
        return Counter()

    # Convert DataFrame rows to a list of tuples (actor, recipient)
    # Ensure 'None' (as string) is handled, and also check for actual None/NaN if dtype wasn't strictly string
    pair_tuples = [
        (str(row["Actor Name"]).strip(), str(row["Recipient Name"]).strip())
        for _, row in df_pairs.iterrows()
        if pd.notna(row["Actor Name"]) and 'none' not in str(row["Actor Name"]).lower() and \
            pd.notna(row["Recipient Name"]) and 'none' not in str(row["Recipient Name"]).lower()
    ]

    # Count occurrences of each pair
    pair_counts = Counter(pair_tuples)
    return pair_counts

# Filter the DataFrame for pairs with a minimum occurrence count
def filter_pairs_by_occurrence(df, pair_counts, min_count=1):
    """
    Filters the original DataFrame for actor-recipient pairs that occur at least `min_count` times.
    Args:
        df (pd.DataFrame): Original DataFrame containing event data.
        pair_counts (dict): Dictionary of actor-recipient pairs with their counts.
        min_count (int): Minimum count for the pair to be included in the result.
    Returns:
        pd.DataFrame: A filtered DataFrame with only pairs having >= `min_count` occurrences.
    """
    if not pair_counts: # If pair_counts is empty
        print(f"No pair counts to filter by for min_count={min_count}. Returning empty DataFrame.")
        return pd.DataFrame(columns=df.columns)

    # Get the pairs that occur at least `min_count` times
    valid_pairs = {pair for pair, count in pair_counts.items() if count >= min_count}

    if not valid_pairs:
        print(f"No pairs meet the minimum occurrence count of {min_count}. Returning empty DataFrame.")
        return pd.DataFrame(columns=df.columns)

    # Filter the original DataFrame to include only these pairs
    # Ensure 'Actor Name' and 'Recipient Name' columns exist in df
    if "Actor Name" not in df.columns or "Recipient Name" not in df.columns:
        print("Warning: 'Actor Name' or 'Recipient Name' not in DataFrame for occurrence filtering. Returning empty DataFrame.")
        return pd.DataFrame(columns=df.columns)

    filtered_df = df[df.apply(
        lambda row: (str(row["Actor Name"]).strip(), str(row["Recipient Name"]).strip()) in valid_pairs, axis=1
    )].copy() # Use .copy() to avoid SettingWithCopyWarning

    return filtered_df

# Sort by date and split the last 5 relations as test, rest as train
def split_by_date(df, date_column):
    """
    For each relation type (actor-recipient pair), sort by date and split the last 5 as test.
    Args:
        df (pd.DataFrame): DataFrame with actor-recipient pairs and event dates.
        date_column (str): The column with the event date.
    Returns:
        pd.DataFrame, pd.DataFrame: The train and test DataFrames.
    """
    if df.empty:
        print("Input DataFrame for split_by_date is empty. Returning empty train/test sets.")
        return pd.DataFrame(columns=df.columns), pd.DataFrame(columns=df.columns)

    # Ensure required columns exist
    if not all(col in df.columns for col in ["Actor Name", "Recipient Name", date_column]):
        print("Warning: Missing required columns for splitting by date. Returning empty train/test sets.")
        return pd.DataFrame(columns=df.columns), pd.DataFrame(columns=df.columns)

    # Convert the date column to datetime if it's not already
    try:
        df.loc[:, date_column] = pd.to_datetime(df[date_column], errors='coerce')
        # Drop rows where date conversion failed
        df.dropna(subset=[date_column], inplace=True)
        if df.empty:
            print("DataFrame is empty after attempting date conversion and dropping NaT. Returning empty train/test sets.")
            return pd.DataFrame(columns=df.columns), pd.DataFrame(columns=df.columns)
    except Exception as e:
        print(f"Error converting '{date_column}' to datetime: {e}. Returning empty train/test sets.")
        return pd.DataFrame(columns=df.columns), pd.DataFrame(columns=df.columns)


    # List to hold train and test data
    train_data = []
    test_data = []

    # Group by actor-recipient pairs
    grouped = df.groupby(["Actor Name", "Recipient Name"])

    if grouped.ngroups == 0:
        print("No groups found for actor-recipient pairs. Returning original df as train, empty as test (or empty if original was empty).")
        return df.copy(), pd.DataFrame(columns=df.columns)


    for (actor, recipient), group in grouped:
        # Sort the group by date
        group = group.sort_values(by=date_column)

        if len(group) > 5: # Ensure there's enough data to split
            test_data.append(group.tail(5))
            train_data.append(group.head(len(group) - 5))
        else: # Otherwise, put all in train (or all in test if you prefer for very small groups)
            train_data.append(group)
            # test_data.append(pd.DataFrame(columns=group.columns)) # Add empty df to test if group too small

    # Concatenate all train and test data
    train_df = pd.concat(train_data).reset_index(drop=True) if train_data else pd.DataFrame(columns=df.columns)
    test_df = pd.concat(test_data).reset_index(drop=True) if test_data else pd.DataFrame(columns=df.columns)

    return train_df, test_df

# Main execution block
if __name__ == "__main__":
    # Specify the path to the text file containing event data
    input_file_path = "datasets/final_cleaned_events.csv"
    output_dir = "datasets/country_specific_sets" 

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # --- Configuration ---
    target_countries = ["AFG", "IND", "RUS"]  # List of countries to process
    country_column_name = "Country"  
    min_pair_occurrences = 10        
    event_date_column = "Event Date"

    print(f"Loading main dataset from: {input_file_path}")
    main_df = load_data(input_file_path)

    if main_df.empty:
        print("Failed to load the main dataset. Exiting.")
    else:
        if country_column_name not in main_df.columns:
            print(f"Error: Country column '{country_column_name}' not found in the dataset. Exiting.")
        else:
            for country_name in target_countries:
                print(f"\n--- Processing data for: {country_name} ---")

                # Filter DataFrame for the current country
                country_df = main_df[main_df[country_column_name].astype(str).str.strip().str.upper() == country_name].copy()

                if country_df.empty:
                    print(f"No data found for {country_name}. Skipping.")
                    continue
                
                print(f"Found {len(country_df)} events for {country_name}.")

                # 1. Extract actor-recipient pairs for the country
                actor_recipient_df_country = extract_actor_recipient_pairs(country_df)
                if actor_recipient_df_country.empty:
                    print(f"No valid actor-recipient pairs extracted for {country_name}. Skipping further processing for this country.")
                    continue

                # 2. Get the most frequent actor-recipient pairs for the country
                pair_counts_country = get_most_frequent_relations(actor_recipient_df_country)
                if not pair_counts_country:
                    print(f"No frequent relations found for {country_name}. Skipping further processing for this country.")
                    continue
                # print(f"Most frequent pairs for {country_name}: {pair_counts_country.most_common(5)}")


                # 3. Filter the country-specific DataFrame for pairs that occur `min_pair_occurrences`+ times
                filtered_df_country = filter_pairs_by_occurrence(country_df, pair_counts_country, min_count=min_pair_occurrences)
                if filtered_df_country.empty:
                    print(f"No pairs met the minimum occurrence count of {min_pair_occurrences} for {country_name}. Skipping train/test split for this country.")
                    continue
                print(f"Filtered DataFrame for {country_name} (pairs >= {min_pair_occurrences} occurrences): {len(filtered_df_country)} rows.")


                # 4. Split the filtered data by date
                train_df_country, test_df_country = split_by_date(filtered_df_country, event_date_column)

                # 5. Save the train and test DataFrames to CSV files
                if not train_df_country.empty:
                    train_output_path = os.path.join(output_dir, f"train_data_{country_name.replace(' ', '_')}.csv")
                    train_df_country.to_csv(train_output_path, index=False)
                    print(f"Saved training data for {country_name} to: {train_output_path} ({len(train_df_country)} rows)")
                else:
                    print(f"No training data generated for {country_name}.")

                if not test_df_country.empty:
                    test_output_path = os.path.join(output_dir, f"test_data_{country_name.replace(' ', '_')}.csv")
                    test_df_country.to_csv(test_output_path, index=False)
                    print(f"Saved test data for {country_name} to: {test_output_path} ({len(test_df_country)} rows)")
                else:
                    print(f"No test data generated for {country_name}.")
    print("\n--- Script finished ---")


Loading main dataset from: datasets/final_cleaned_events.csv
Successfully loaded datasets/final_cleaned_events.csv. Columns: ['Event ID', 'Event Date', 'Event Type', 'Event Intensity', 'Quad Code', 'Contexts', 'Actor Name', 'Actor Country', 'Actor Title', 'Actor Name Raw', 'Recipient Name', 'Recipient Country', 'Recipient Title', 'Recipient Name Raw', 'Country', 'Raw Placename', 'Publication Date', 'Language', 'Version']

--- Processing data for: AFG ---
Found 3723 events for AFG.
Filtered DataFrame for AFG (pairs >= 10 occurrences): 114 rows.
Saved training data for AFG to: datasets/country_specific_sets/train_data_AFG.csv (99 rows)
Saved test data for AFG to: datasets/country_specific_sets/test_data_AFG.csv (15 rows)

--- Processing data for: IND ---
Found 60115 events for IND.
Filtered DataFrame for IND (pairs >= 10 occurrences): 5125 rows.
Saved training data for IND to: datasets/country_specific_sets/train_data_IND.csv (4020 rows)
Saved test data for IND to: datasets/country_speci

In [8]:
from collections import defaultdict

# Input and output directories
country_sets_dir = "datasets/country_specific_sets"
combined_output_dir = "datasets/country_sets"
os.makedirs(combined_output_dir, exist_ok=True)

# Find all CSV files
csv_files = glob.glob(os.path.join(country_sets_dir, "*.csv"))
print(f"Found {len(csv_files)} CSV files in {country_sets_dir}.")

# Group files by country code (AFG, IND, RUS)
country_file_map = defaultdict(list)
for csv_file in csv_files:
    filename = os.path.basename(csv_file)
    # Extract country code: train_data_IND.csv → IND
    parts = filename.split("_")
    if len(parts) == 3:
        country_code = parts[2].replace(".csv", "")
        country_file_map[country_code].append(csv_file)

# Combine and save per country
for country_code, files in country_file_map.items():
    dfs = []
    for file_path in files:
        df = pd.read_csv(file_path)
        dfs.append(df)
        print(f"Loaded {file_path} with shape {df.shape}")
    combined_df = pd.concat(dfs, ignore_index=True)
    output_path = os.path.join(combined_output_dir, f"combined_data_{country_code}.csv")
    combined_df.to_csv(output_path, index=False)
    print(f"Saved {output_path} with shape {combined_df.shape}")


Found 6 CSV files in datasets/country_specific_sets.
Loaded datasets/country_specific_sets/train_data_AFG.csv with shape (99, 19)
Loaded datasets/country_specific_sets/test_data_AFG.csv with shape (15, 19)
Saved datasets/country_sets/combined_data_AFG.csv with shape (114, 19)
Loaded datasets/country_specific_sets/test_data_RUS.csv with shape (55, 19)
Loaded datasets/country_specific_sets/train_data_RUS.csv with shape (128, 19)
Saved datasets/country_sets/combined_data_RUS.csv with shape (183, 19)
Loaded datasets/country_specific_sets/train_data_IND.csv with shape (4020, 19)
Loaded datasets/country_specific_sets/test_data_IND.csv with shape (1105, 19)
Saved datasets/country_sets/combined_data_IND.csv with shape (5125, 19)


In [2]:
def split_train_test_by_date(df, date_col='Event Date', test_size=None, n_recent_dates=None):
    """
    Splits a DataFrame into train and test sets based on date.
    
    Parameters:
    - df: pandas.DataFrame containing the data.
    - date_col: Name of the date column to split on (will be converted to datetime).
    - test_size: Float (fraction) or int (number of rows) for test set size.
    - n_recent_dates: Int number of most recent unique dates to include in the test set.
    
    Returns:
    - train_df, test_df: two pandas.DataFrames.
    """
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
    
    if test_size is not None:
        df_sorted = df.sort_values(date_col)
        if isinstance(test_size, float):
            n_test = int(len(df_sorted) * test_size)
        elif isinstance(test_size, int):
            n_test = test_size
        else:
            raise ValueError("test_size must be float or int")
        test_df = df_sorted.tail(n_test)
        train_df = df_sorted.iloc[:-n_test] if n_test < len(df_sorted) else pd.DataFrame(columns=df.columns)
    elif n_recent_dates is not None:
        unique_dates = sorted(df[date_col].dt.date.dropna().unique())
        recent_dates = unique_dates[-n_recent_dates:]
        test_df = df[df[date_col].dt.date.isin(recent_dates)]
        train_df = df[~df[date_col].dt.date.isin(recent_dates)]
    else:
        raise ValueError("Specify either test_size or n_recent_dates")

    return train_df, test_df

# Demonstration on each country file
files = {
    'AFG': '../data/country_sets/combined_data_AFG.csv',
    'IND': '../data/country_sets/combined_data_IND.csv',
    'RUS': '../data/country_sets/combined_data_RUS.csv'
}

for country, path in files.items():
    df = pd.read_csv(path)
    # Example: take last 20% as test
    train_frac, test_frac = split_train_test_by_date(df, test_size=0.2)
    print(f"{country} (fraction split) → Train: {len(train_frac)} rows, Test: {len(test_frac)} rows")
    
    # Example: take last 2 unique dates as test
    train_dates, test_dates = split_train_test_by_date(df, n_recent_dates=2)
    print(f"{country} (last 2 dates) → Train: {len(train_dates)} rows, Test: {len(test_dates)} rows")



AFG (fraction split) → Train: 92 rows, Test: 22 rows
AFG (last 2 dates) → Train: 111 rows, Test: 3 rows
IND (fraction split) → Train: 4100 rows, Test: 1025 rows
IND (last 2 dates) → Train: 5110 rows, Test: 15 rows
RUS (fraction split) → Train: 147 rows, Test: 36 rows
RUS (last 2 dates) → Train: 180 rows, Test: 3 rows
