In [2]:
import random
from pathlib import Path


In [3]:

def load_and_split_data(file_path, train_split_ratio=0.8, random_seed=42):
    """
    Load a text file of prompts, shuffle them reproducibly, and split them
    into training and testing sets.

    Every line is stripped of leading/trailing whitespace, and lines that are
    empty after stripping are discarded. The remaining lines are shuffled with
    a private random generator seeded by ``random_seed``, so the module-level
    ``random`` state used by the rest of the program is left untouched.

    Args:
        file_path (Path or str): The path to the text file (UTF-8, one prompt
            per line; a leading byte-order mark is tolerated).
        train_split_ratio (float): The proportion of data to allocate to the
            training set. Must lie in the closed interval [0, 1].
        random_seed (int): Seed for reproducibility of the shuffle.

    Returns:
        tuple[list[str], list[str]]: ``(train_set, test_set)``. The training
        set holds the first ``int(len(prompts) * train_split_ratio)`` shuffled
        prompts and the test set holds the rest. Both lists are empty when
        ``file_path`` does not point to an existing file.

    Raises:
        ValueError: If ``train_split_ratio`` is outside [0, 1] (including NaN).
    """
    # Reject nonsensical ratios up front: a negative ratio would silently slice
    # from the end of the list, and a ratio above 1 would silently empty the
    # test set.
    if not 0.0 <= train_split_ratio <= 1.0:
        raise ValueError(
            f"train_split_ratio must be between 0 and 1, got {train_split_ratio!r}"
        )

    print(f"Loading data from: {file_path}")

    # A missing file is reported and treated as "no data" rather than raising.
    if not Path(file_path).is_file():
        print(f"Error: File not found at {file_path}")
        return [], []

    # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading BOM, which
    # str.strip() would not remove and which would otherwise end up glued to
    # the first prompt.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        prompts = [text for text in stripped_lines if text]

    # Use a dedicated generator instead of random.seed(): it produces the same
    # permutation for a given seed without resetting the global RNG state.
    random.Random(random_seed).shuffle(prompts)

    # Everything before the split index is training data, the rest is test data.
    split_index = int(len(prompts) * train_split_ratio)
    train_set = prompts[:split_index]
    test_set = prompts[split_index:]

    print(f"  - Total prompts: {len(prompts)}")
    print(f"  - Training set size: {len(train_set)}")
    print(f"  - Test set size: {len(test_set)}\n")

    return train_set, test_set

# --- Main script ---

In [4]:

# Directory that holds the emotion prompt files. Kept in one place so the
# location only has to be changed once.
EMOTION_PROMPTS_DIR = Path('/workspace/MATS-research/data/emotion_user_prompts')

# Define the file paths (plain strings, so the values match what later cells
# may expect from these names).
happy_filepath = str(EMOTION_PROMPTS_DIR / 'happiness.txt')
sad_filepath = str(EMOTION_PROMPTS_DIR / 'sadness.txt')

# Load and split the happiness data (default split ratio and seed).
happy_train, happy_test = load_and_split_data(happy_filepath)

# Load and split the sadness data the same way.
sad_train, sad_test = load_and_split_data(sad_filepath)


Loading data from: /workspace/MATS-research/data/emotion_user_prompts/happiness.txt
  - Total prompts: 505
  - Training set size: 404
  - Test set size: 101

Loading data from: /workspace/MATS-research/data/emotion_user_prompts/sadness.txt
  - Total prompts: 532
  - Training set size: 425
  - Test set size: 107

