In [4]:
import pandas as pd
import numpy as np
import re
import os
import nlpaug.augmenter.word as naw
import nltk

# Uncomment the following lines on the first run to download necessary NLTK data
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

class Task1Preprocessor:
    def __init__(self, augment=False):
        """
        :param augment: Whether to enable data augmentation.
                        (Set to True for Training Set, False for Validation/Test Sets!)
        """
        self.augment = augment

        # Initialize augmenter: Synonym Replacement
        # This method is safer for regression tasks as it usually preserves
        # the emotional polarity (valence) of the sentence better than back-translation.
        if self.augment:
            print(">>> [Init] Initializing Data Augmentation Model (WordNet)...")
            self.aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.2) # Replace 20% of words

    def clean_text(self, text, is_words_mode=False):
        """
        Specific cleaning logic for Task 1 data.
        """
        if pd.isna(text):
            return ""

        text = str(text)

        # 1. Handle excessive dots/ellipses (e.g., User 3: "mood .................")
        # Replace 2 or more consecutive dots with a single dot to reduce token length
        text = re.sub(r'\.{2,}', '.', text)

        # 2. Handle User 10 style lists (is_words=True)
        # Convert "Happy , Sad , Calm" -> "Happy Sad Calm"
        # Removing commas helps BERT/RoBERTa process them as tokens rather than punctuation.
        if is_words_mode:
            text = text.replace(' , ', ' ').replace(',', ' ')

        # 3. Fix contractions separated by spaces (Common issue in SemEval data)
        # Example: "I ' m" -> "I am", "don ' t" -> "do not"
        text = re.sub(r"\s+'\s+m\b", " am", text)   # I ' m -> I am
        text = re.sub(r"\s+'\s+s\b", " is", text)   # it ' s -> it is
        text = re.sub(r"\s+'\s+t\b", " not", text)  # don ' t -> do not
        text = re.sub(r"\s+'\s+re\b", " are", text) # you ' re -> you are
        text = re.sub(r"\s+'\s+ll\b", " will", text)# I ' ll -> I will
        text = re.sub(r"\s+'\s+ve\b", " have", text)# I ' ve -> I have

        # 4. Normalize whitespace (remove extra spaces)
        text = re.sub(r'\s+', ' ', text).strip()

        return text

    def process_dataframe(self, df):
        """
        Main processing function to handle cleaning, sorting, and augmentation.
        """
        print(f"Starting processing for {len(df)} records...")

        # --- Step 0: Temporal Sorting (CRITICAL) ---
        # Even for Task 1 (Standard Prediction), sorting by time is essential
        # if your models (Transformer/LSTM) utilize context windows from previous posts.
        if 'timestamp' in df.columns:
            print(">>> [Sorting] Sorting data by User ID and Timestamp...")
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.sort_values(by=['user_id', 'timestamp']).reset_index(drop=True)

        # --- Step 1: Text Cleaning ---
        print(">>> [Cleaning] Cleaning text data...")
        # Using list comprehension for better performance than iterrows
        df['processed_text'] = [
            self.clean_text(row['text'], row['is_words'])
            for _, row in df.iterrows()
        ]

        # --- Step 2: Data Augmentation (Training Set Only) ---
        if self.augment:
            print(">>> [Augmentation] Applying data augmentation (this may take a few minutes)...")
            augmented_rows = []

            for _, row in df.iterrows():
                original_text = row['processed_text']

                # Only augment if the sentence is long enough (> 3 words)
                # We skip very short sentences or single words to avoid noise.
                if len(original_text.split()) > 3:
                    try:
                        # nlpaug returns a list, we take the first item
                        aug_text = self.aug.augment(original_text)[0]

                        # Copy all info from the current row (IDs, Labels, etc.)
                        new_row = row.copy()
                        new_row['processed_text'] = aug_text
                        new_row['is_augmented'] = True # Mark as augmented data

                        augmented_rows.append(new_row)
                    except:
                        pass # Skip if augmentation fails

            # Merge augmented data into the main DataFrame
            if augmented_rows:
                aug_df = pd.DataFrame(augmented_rows)
                df['is_augmented'] = False # Mark original data

                # Append augmented data to the end
                df = pd.concat([df, aug_df], ignore_index=True)
                print(f"Augmentation complete. Added samples: {len(aug_df)}, Total samples: {len(df)}")
            else:
                print("No augmented samples were generated.")
        else:
            # If augmentation is off, ensure the column still exists for consistency
            df['is_augmented'] = False

        return df

# ==========================================
# Main Execution Block
# ==========================================

if __name__ == "__main__":
    # 1. Define File Path
    # Logic: Look for the 'data' folder in the parent directory
    base_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in locals() else os.getcwd()

    # Path: ../data/train_release_3sep2025/train_subtask1.csv
    # Adjust this path if your folder structure is different
    input_file = os.path.join(base_dir, "..", "data", "train_release_3sep2025", "train_subtask1.csv")

    # Fallback: Check current directory if the above path doesn't exist
    if not os.path.exists(input_file):
        print(f"File not found at {input_file}, checking current directory...")
        input_file = "train_subtask1.csv"

    print(f"Attempting to load data from: {os.path.abspath(input_file)}")

    try:
        # Load the CSV file
        df = pd.read_csv(input_file)

        # 2. Instantiate Preprocessor
        # Set augment=True to generate Training data
        # Set augment=False to generate Validation/Test data
        processor = Task1Preprocessor(augment=True)

        # 3. Run Processing
        df_processed = processor.process_dataframe(df)

        # 4. Preview specific cleaning results (e.g., Checking User 3's dots)
        print("\n--- Cleaning Preview (User 3) ---")
        user3_sample = df_processed[df_processed['user_id'] == 3].head(1)
        if not user3_sample.empty:
            print(f"Original: {user3_sample['text'].values[0][:50]}...")
            print(f"Cleaned : {user3_sample['processed_text'].values[0][:50]}...")

        # 5. Save the processed file
        output_file = "train_subtask1_processed.csv"
        df_processed.to_csv(output_file, index=False)
        print(f"\nSuccess! Processed file saved to: {output_file}")

    except FileNotFoundError:
        print(f"[Error] File not found. Please ensure {input_file} exists.")
    except Exception as e:
        print(f"[Error] An unknown error occurred: {e}")

Attempting to load data from: /Users/van/Desktop/LLM/2025/Project/SemEval2026-EmoVA/data/train_release_3sep2025/train_subtask1.csv
>>> [Init] Initializing Data Augmentation Model (WordNet)...
Starting processing for 2764 records...
>>> [Sorting] Sorting data by User ID and Timestamp...
>>> [Cleaning] Cleaning text data...
>>> [Augmentation] Applying data augmentation (this may take a few minutes)...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave

No augmented samples were generated.

--- Cleaning Preview (User 3) ---
Original: I ' ve been feeling just fine . Nice and relaxed ....
Cleaned : I have been feeling just fine . Nice and relaxed ....

Success! Processed file saved to: train_subtask1_processed.csv


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave