In [2]:
import pandas as pd
import numpy as np
import re
import os
import nlpaug.augmenter.word as naw
import nltk

# Uncomment these lines on the first run to download necessary NLTK data
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

class Task2aPreprocessor:
    """
    Preprocessing pipeline for SemEval Task 2a (Ecological Essays).
    Handles text cleaning, temporal sorting, missing target masking, and augmentation.
    """
    def __init__(self, augment=False):
        """
        :param augment: Boolean. Set to True for Training set, False for Val/Test sets.
        """
        self.augment = augment
        if self.augment:
            print(">>> Initializing Data Augmentation Model (Synonym Replacement)...")
            # Using Synonym augmentation to preserve emotional polarity for regression
            self.aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.2)

    def clean_text(self, text, is_words_mode=False):
        """
        Specific cleaning logic for ecological essays and word lists.
        """
        if pd.isna(text): return ""
        text = str(text)

        # 1. Fix excessive punctuation (e.g., "I'm fine...........") -> "I'm fine."
        text = re.sub(r'\.{2,}', '.', text)

        # 2. Handle 'Word List' format (is_words=True)
        # Converts "Happy , Sad , Calm" -> "Happy Sad Calm" for better BERT tokenization
        if is_words_mode:
            text = text.replace(' , ', ' ').replace(',', ' ')

        # 3. Normalize whitespace and expand simple contractions
        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r"I ' m", "I am", text)
        text = re.sub(r"it ' s", "it is", text)
        text = re.sub(r"don ' t", "do not", text)

        return text

    def process(self, file_path, output_path):
        """
        Main processing execution.
        """
        print(f"--- Processing File: {file_path} ---")

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Input file not found: {file_path}")

        df = pd.read_csv(file_path)
        print(f"Original dataset shape: {df.shape}")

        # --- Step 1: Strict Temporal Sorting (CRITICAL for Task 2a) ---
        # Since this is a forecasting task, data MUST be ordered by User and Time.
        if 'timestamp' in df.columns:
            print("Sorting data by User ID and Timestamp...")
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.sort_values(by=['user_id', 'timestamp'])

        # --- Step 2: Text Cleaning ---
        print("Cleaning text data...")
        df['processed_text'] = df.apply(
            lambda row: self.clean_text(row['text'], row['is_words']), axis=1
        )

        # --- Step 3: Handle Missing Targets (NaNs at sequence ends) ---
        # Task 2a data has NaNs in 'state_change' columns for the last entry of each user.
        target_cols = ['state_change_valence', 'state_change_arousal']

        # Create a Mask column: 1 = Valid Data, 0 = Padding/NaN
        # Use this mask in the Loss Function to ignore these rows.
        df['valid_target_mask'] = 1

        for col in target_cols:
            if col in df.columns:
                # Identify NaNs
                mask = df[col].isna()
                count_nan = mask.sum()

                if count_nan > 0:
                    print(f"Found {count_nan} missing values in '{col}'. Marking invalid in mask.")
                    df.loc[mask, 'valid_target_mask'] = 0

                    # Fill NaNs with 0.0 to prevent code crashes (Mask handles the logic)
                    df[col] = df[col].fillna(0.0)

        # --- Step 4: Data Augmentation (Training Set Only) ---
        if self.augment:
            print("Applying Data Augmentation (this may take a few minutes)...")
            augmented_rows = []

            for _, row in df.iterrows():
                original_text = row['processed_text']

                # Only augment if sentence is long enough (> 3 words)
                if len(original_text.split()) > 3:
                    try:
                        # Generate augmented text
                        aug_text = self.aug.augment(original_text)[0]

                        new_row = row.copy()
                        new_row['processed_text'] = aug_text
                        new_row['is_augmented'] = True # Flag as augmented

                        # Note: Augmented data inherits the 'valid_target_mask' of the original
                        augmented_rows.append(new_row)
                    except:
                        pass # Skip if augmentation fails

            # Merge augmented data
            aug_df = pd.DataFrame(augmented_rows)
            df['is_augmented'] = False
            df = pd.concat([df, aug_df], ignore_index=True)
            print(f"Augmentation Complete. Added {len(aug_df)} new samples.")
            print(f"Total dataset size: {len(df)}")

        # Save to CSV
        df.to_csv(output_path, index=False)
        print(f"Processing finished! File saved to: {output_path}")
        return df

# ==========================================
# Main Execution
# ==========================================
if __name__ == "__main__":
    # Define file paths (Adjust filenames as needed)
    input_filename = os.path.join("..", "data","train_release_3sep2025", "train_subtask2a.csv")
    output_filename = "train_subtask2a_processed.csv"

    # Initialize Preprocessor
    # Set augment=True for Training Data, False for Validation Data
    processor = Task2aPreprocessor(augment=True)

    try:
        processor.process(input_filename, output_filename)

        # Preview specific columns to verify logic
        print("\n--- Data Preview (User 1 Tail) ---")
        df_result = pd.read_csv(output_filename)
        cols = ['user_id', 'timestamp', 'state_change_valence', 'valid_target_mask']
        # Show User 1 to check if the last NaN row was handled correctly
        print(df_result[df_result['user_id'] == 1][cols].tail())

    except Exception as e:
        print(f"\n[Error] An error occurred: {e}")

>>> Initializing Data Augmentation Model (Synonym Replacement)...
--- Processing File: ../data/train_release_3sep2025/train_subtask2a.csv ---
Original dataset shape: (2764, 10)
Sorting data by User ID and Timestamp...
Cleaning text data...
Found 137 missing values in 'state_change_valence'. Marking invalid in mask.
Found 137 missing values in 'state_change_arousal'. Marking invalid in mask.
Applying Data Augmentation (this may take a few minutes)...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave

Augmentation Complete. Added 0 new samples.
Total dataset size: 2764
Processing finished! File saved to: train_subtask2a_processed.csv

--- Data Preview (User 1 Tail) ---
   user_id            timestamp  state_change_valence  valid_target_mask
0        1  2021-06-09 12:41:57                   0.0                  1
1        1  2021-06-11 12:01:45                  -2.0                  1
2        1  2021-06-13 13:15:07                   2.0                  1
3        1  2021-06-16 12:03:12                   0.0                  1
4        1  2021-06-17 12:38:38                   0.0                  0


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave