READING ORIGINAL DATASET AND EXTRACTING ALL RELATION PAIRS WITH 20+ FREQUENCY
SPLITTING INTO TRAINING AND TESTING DATA

### IMPORT STATEMENT

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from io import StringIO
import random 
import pickle
import os
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
%pip install scipy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### FINDING MORE FREQUENT RELATIONS

In [3]:
import pandas as pd
from collections import Counter

# Load data from a text file
def load_data(file_path):
    """
    Reads a tab-delimited text file and loads it into a pandas DataFrame.
    Args:
        file_path (str): Path to the text file containing event data.
    Returns:
        pd.DataFrame: A pandas DataFrame with the event data.
    """
    with open(file_path, 'r') as file:
        # Read all lines from the file
        lines = file.readlines()
    
    # Split lines into columns using tab as the delimiter
    data = [line.strip().split("\t") for line in lines]
    
    # Create a DataFrame using the first row as headers
    return pd.DataFrame(data[1:], columns=data[0])

# Extract actor-recipient pairs
def extract_actor_recipient_pairs(df):
    """
    Extracts actor-recipient pairs from the DataFrame.
    Args:
        df (pd.DataFrame): DataFrame containing event data.
    Returns:
        pd.DataFrame: A DataFrame with actor-recipient pairs, excluding rows with missing values.
    """
    pairs = df.loc[:, ["Actor Name", "Recipient Name"]]  # Select only the relevant columns
    return pairs.dropna()  # Remove rows with missing values

# Count the frequency of each actor-recipient pair, excluding pairs with 'None'
def get_most_frequent_relations(df):
    """
    Counts the frequency of each actor-recipient pair, excluding relations with 'None'.
    Args:
        df (pd.DataFrame): DataFrame containing actor-recipient pairs.
    Returns:
        list: A list of tuples with actor-recipient pairs and their counts, sorted by frequency.
    """
    pairs = extract_actor_recipient_pairs(df)  # Get actor-recipient pairs
    
    # Convert DataFrame rows to a list of tuples (actor, recipient)
    pair_tuples = [
        (row["Actor Name"], row["Recipient Name"]) 
        for _, row in pairs.iterrows()
        if 'None' not in row["Actor Name"] and 'None' not in row["Recipient Name"]
    ]
    
    # Count occurrences of each pair
    pair_counts = Counter(pair_tuples)
    return pair_counts.most_common()  # Return pairs sorted by frequency

# Main execution block
if __name__ == "__main__":
    # Specify the path to the text file containing event data
    file_path = "data_original.txt"

    # Load the data into a DataFrame
    df = load_data(file_path)

    # Get the most frequent actor-recipient pairs
    most_frequent_relations = get_most_frequent_relations(df)

    # Print results
    print("Most Frequent Actor-Recipient Pairs (Excluding 'None' relations):")
    for pair, count in most_frequent_relations:
        print(f"{pair}: {count} occurrences")


FileNotFoundError: [Errno 2] No such file or directory: 'data_original.txt'

### CREATING TRAINING AND TEST DATASET ( 20+ FREQUENCY PAIRS - 5 pairs in test dataset)

In [5]:
import pandas as pd
from collections import Counter

# Load data from a text file
def load_data(file_path):
    """
    Reads a tab-delimited text file and loads it into a pandas DataFrame.
    Args:
        file_path (str): Path to the text file containing event data.
    Returns:
        pd.DataFrame: A pandas DataFrame with the event data.
    """
    with open(file_path, 'r') as file:
        # Read all lines from the file
        lines = file.readlines()
    
    # Split lines into columns using tab as the delimiter
    data = [line.strip().split("\t") for line in lines]
    
    # Create a DataFrame using the first row as headers
    return pd.DataFrame(data[1:], columns=data[0])

# Extract actor-recipient pairs
def extract_actor_recipient_pairs(df):
    """
    Extracts actor-recipient pairs from the DataFrame.
    Args:
        df (pd.DataFrame): DataFrame containing event data.
    Returns:
        pd.DataFrame: A DataFrame with actor-recipient pairs, excluding rows with missing values.
    """
    pairs = df.loc[:, ["Actor Name", "Recipient Name", "Event Date"]]  # Select relevant columns
    return pairs.dropna()  # Remove rows with missing values

# Count the frequency of each actor-recipient pair, excluding pairs with 'None'
def get_most_frequent_relations(df):
    """
    Counts the frequency of each actor-recipient pair, excluding relations with 'None'.
    Args:
        df (pd.DataFrame): DataFrame containing actor-recipient pairs.
    Returns:
        dict: A dictionary of actor-recipient pairs with their counts.
    """
    pairs = extract_actor_recipient_pairs(df)  # Get actor-recipient pairs
    
    # Convert DataFrame rows to a list of tuples (actor, recipient)
    pair_tuples = [
        (row["Actor Name"], row["Recipient Name"]) 
        for _, row in pairs.iterrows()
        if 'None' not in row["Actor Name"] and 'None' not in row["Recipient Name"]
    ]
    
    # Count occurrences of each pair
    pair_counts = Counter(pair_tuples)
    return pair_counts  # Return a dictionary of pair counts

# Filter the DataFrame for pairs with 50+ occurrences
def filter_pairs_by_occurrence(df, pair_counts, min_count=1):
    """
    Filters the DataFrame for actor-recipient pairs that occur 20+ times.
    Args:
        df (pd.DataFrame): Original DataFrame containing event data.
        pair_counts (dict): Dictionary of actor-recipient pairs with their counts.
        min_count (int): Minimum count for the pair to be included in the result.
    Returns:
        pd.DataFrame: A filtered DataFrame with only pairs having 50+ occurrences.
    """
    # Get the pairs that occur at least `min_count` times
    valid_pairs = {pair for pair, count in pair_counts.items() if count >= min_count}
    
    # Filter the original DataFrame to include only these pairs
    filtered_df = df[df.apply(
        lambda row: (row["Actor Name"], row["Recipient Name"]) in valid_pairs, axis=1)]
    
    return filtered_df

# Sort by date and split the last 5 relations as test, rest as train
def split_by_date(df, date_column):
    """
    For each relation type (actor-recipient pair), sort by date and split the last 5 as test.
    Args:
        df (pd.DataFrame): DataFrame with actor-recipient pairs and event dates.
        date_column (str): The column with the event date.
    Returns:
        pd.DataFrame, pd.DataFrame: The train and test DataFrames.
    """
    # Convert the date column to datetime if it's not already
    df[date_column] = pd.to_datetime(df[date_column])
    
    # List to hold train and test data
    train_data = []
    test_data = []
    
    # Group by actor-recipient pairs
    grouped = df.groupby(["Actor Name", "Recipient Name"])
    
    for (actor, recipient), group in grouped:
        # Sort the group by date
        group = group.sort_values(by=date_column)
        
        # Split the last 5 relations as the test set
        test_data.append(group.tail(5))
        
        # All other relations are used for the train set
        train_data.append(group.head(len(group) - 5))
    
    # Concatenate all train and test data
    train_df = pd.concat(train_data)
    test_df = pd.concat(test_data)
    
    return train_df, test_df

# Main execution block
if __name__ == "__main__":
    # Specify the path to the text file containing event data
    file_path = "../datasets/data_original.txt"

    # Load the data into a DataFrame
    df = load_data(file_path)

    # Get the most frequent actor-recipient pairs
    pair_counts = get_most_frequent_relations(df)

    # Filter the DataFrame for pairs that occur 50+ times
    filtered_df = filter_pairs_by_occurrence(df, pair_counts, min_count=10)

    # Split the data by date, keeping the last 5 events for each relation as test
    train_df, test_df = split_by_date(filtered_df, "Event Date")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[date_column] = pd.to_datetime(df[date_column])


In [7]:
len(train_df), len(test_df)

(2710, 855)

In [8]:
# Save the DataFrame to a CSV file in the local folder
train_df.to_csv('train_data_new3.csv', index=False)
test_df.to_csv('test_data_new3.csv', index=False)
print("Done")

Done


In [11]:
train_df['Event Type'].value_counts()

Event Type
ACCUSE      290
THREATEN     80
COERCE       78
SANCTION     59
REQUEST      57
ASSAULT      46
RETREAT      21
CONCEDE      20
MOBILIZE     19
PROTEST      18
CONSULT       1
AID           1
Name: count, dtype: int64

In [9]:
print("Testing Data:")
test_df.head()

Testing Data:


Unnamed: 0,Event ID,Event Date,Event Type,Event Mode,Event Intensity,Quad Code,Contexts,Actor Name,Actor Country,Actor COW,...,GeoNames ID,Raw Placename,Feature Type,Source,Publication Date,Story People,Story Organizations,Story Locations,Language,Version
2641,20240528-6755-11854c072b7a_THREATEN,2024-05-28,THREATEN,,-3.5,VERBAL CONFLICT,military | terrorism,Benjamin Netanyahu,Israel,666,...,281102.0,Rafah,PPLA2,Evening Standard,2024-05-28,BENJAMIN NETANYAHU,the Palestinian Red Crescent | UN | Hamas | Th...,Ireland | Gaza Strip | Rafaḩ | State of Israel,English,NGEC_coder-Vers001-b1-Run-001
7692,20240601-5863-95272a49f7df_REQUEST,2024-06-01,REQUEST,,0.0,VERBAL CONFLICT,military | terrorism,Benjamin Netanyahu,Israel,666,...,272103.0,Lebanon,PCLI,Al Jazeera English,2024-06-01,Joe Biden | Benjamin Netanyahu | Biden,Hamas | Hezbollah,Gaza Strip | Lebanon | United States | State o...,English,NGEC_coder-Vers001-b1-Run-001
7595,20240601-5418-f85a7f0c50c4_REQUEST,2024-06-01,REQUEST,,0.0,VERBAL CONFLICT,terrorism,Benjamin Netanyahu,Israel,666,...,,Joe Biden,,Agence France Presse,2024-06-01,Benjamin Netanyahu | Netanyahu | Biden | Joe B...,Hamas,United States | Gaza Province | State of Israel,English,NGEC_coder-Vers001-b1-Run-001
184220,20240624-6038-ed29ffa8bec4_THREATEN,2024-06-24,THREATEN,,-3.5,VERBAL CONFLICT,terrorism,Benjamin Netanyahu,Israel,666,...,,Gaza Strip,,EFE News Service,2024-06-24,Joe Biden | Benjamin Netanyahu | Netanyahu,Channel 14 | Hamas | Security Council | Hezbol...,United States | Lebanon | State of Israel,English,NGEC_coder-Vers001-b1-Run-001
185080,20240625-4243-814143fbea89_REQUEST,2024-06-25,REQUEST,,0.0,VERBAL CONFLICT,military | terrorism,Benjamin Netanyahu,Israel,666,...,294640.0,Israel,PCLI,USA Today,2024-06-25,Biden | Joe Biden | Netanyahu,Channel 14 | Hamas | the U.N. Security Council...,Lebanon | Gaza Strip | Islamic Republic of Ira...,English,NGEC_coder-Vers001-b1-Run-001


In [10]:
# Save the DataFrame to a CSV file in the local folder
test_df.to_csv('test_data.csv', index=False)
print("Testing Data saved")

Testing Data saved
