In [1]:
import pandas as pd
import numpy as np
import re
import os
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import stopwords

# ==========================================
# Uncomment these lines on the first run
# ==========================================
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords') 

class Task2aSmartPreprocessor:
    """
    Smart Preprocessing pipeline for SemEval Task 2a (Forecasting).
    Combines strict temporal sorting, NaN masking, and sentiment-aware text cleaning.

    Typical use: ``Task2aSmartPreprocessor(augment=True).process(in_csv, out_csv)``.
    """

    # Typographic stand-ins for the ASCII apostrophe (curly quotes, backtick,
    # acute accent). They are normalised first so contractions such as
    # "don’t" are expanded instead of being split into two stopwords.
    _APOSTROPHE_VARIANTS = re.compile("[\u2018\u2019`\u00b4]")

    # Negative contractions whose stem changes when expanded. They must run
    # before the generic "n't" rule, which would otherwise emit "wo not" /
    # "ca not" (or "won not" for the tokenised form "won ' t").
    _IRREGULAR_NEGATIONS = (
        (re.compile(r"\bwon(?:'|\s+'\s+)t\b|\bwo\s+n't\b", re.IGNORECASE), "will not"),
        (re.compile(r"\bcan(?:'|\s+'\s+)t\b|\bca\s+n't\b", re.IGNORECASE), "can not"),
        (re.compile(r"\bshan(?:'|\s+'\s+)t\b|\bsha\s+n't\b", re.IGNORECASE), "shall not"),
    )

    # Generic "<stem>n't" / "<stem>n ' t" -> "<stem> not" (case-insensitive).
    _REGULAR_NEGATION = re.compile(r"n(?:'|\s+'\s+)t\b", re.IGNORECASE)

    # Space-separated contraction suffixes, e.g. "I ' m" -> "I am".
    _SPACED_CONTRACTIONS = (
        (re.compile(r"\s+'\s+t\b"), " not"),
        (re.compile(r"\s+'\s+m\b"), " am"),
        (re.compile(r"\s+'\s+s\b"), " is"),
        (re.compile(r"\s+'\s+re\b"), " are"),
        (re.compile(r"\s+'\s+ll\b"), " will"),
        (re.compile(r"\s+'\s+ve\b"), " have"),
    )

    _NON_LETTERS = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, augment=False):
        """
        :param augment: Boolean. Enable data augmentation (True for Training, False for Val/Test).
        """
        self.augment = augment

        print(">>> [Init] Loading Stopwords with Exceptions...")
        # 1. Load NLTK default stopwords
        stop_words_set = set(stopwords.words('english'))

        # 2. Define Whitelist: Words critical for sentiment that MUST NOT be removed
        exceptions = {
            'no', 'not', 'nor', 'neither', 'never', 'none', # Negation
            'but', 'however', 'although', 'except',         # Contrast
            'against', 'cannot', "can't", "don't"           # Other negatives
        }

        # 3. Remove exceptions from the stopword list
        self.stop_words = stop_words_set - exceptions
        print(f"    - Preserved {len(exceptions)} emotional keywords (e.g., not, but...)")

        if self.augment:
            print(">>> [Init] Initializing Augmentation Model (WordNet)...")
            self.aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.2)

    def clean_text(self, text, is_words_mode=False):
        """
        Smart Cleaning Logic:
        1. Fix contractions (to protect 'not').
        2. Remove ALL special characters and numbers (keep only A-Z).
        3. Remove stopwords (except whitelist).

        :param text: Raw text; NaN/None yields an empty string.
        :param is_words_mode: Accepted for interface compatibility; the
            cleaning logic does not use it.
        :return: Cleaned, single-space-separated string (original casing kept).
        """
        if pd.isna(text):
            return ""
        text = str(text)

        # --- 1. Fix Contractions FIRST ---
        # "n't" must become "not" before punctuation is stripped, otherwise
        # the negation is lost ("don't" -> "don t" -> two stopwords).
        text = self._APOSTROPHE_VARIANTS.sub("'", text)
        for pattern, replacement in self._IRREGULAR_NEGATIONS:
            text = pattern.sub(replacement, text)
        text = self._REGULAR_NEGATION.sub(" not", text)
        for pattern, replacement in self._SPACED_CONTRACTIONS:
            text = pattern.sub(replacement, text)

        # --- 2. Remove Special Characters & Digits ---
        # Keep ONLY alphabets (a-z, A-Z) and whitespace.
        text = self._NON_LETTERS.sub(' ', text)

        # --- 3. Normalize Whitespace ---
        text = self._WHITESPACE.sub(' ', text).strip()

        # --- 4. Smart Stopword Removal ---
        # self.stop_words already excludes the sentiment whitelist.
        kept = [w for w in text.split() if w.lower() not in self.stop_words]
        return " ".join(kept)

    def _mask_missing_targets(self, df):
        """Flag rows with missing targets in 'valid_target_mask' and zero-fill them."""
        target_cols = ['state_change_valence', 'state_change_arousal']
        df['valid_target_mask'] = 1 # Default: 1 = Valid Data

        for col in target_cols:
            if col not in df.columns:
                continue
            mask = df[col].isna()
            if mask.any():
                count = mask.sum()
                print(f"    - Found {count} missing values in '{col}' (End of sequences).")

                # Mark as invalid (0) for Loss Function masking
                df.loc[mask, 'valid_target_mask'] = 0

                # Fill with 0.0 to prevent code crashes
                df[col] = df[col].fillna(0.0)
        return df

    def _build_augmented_rows(self, df):
        """Return augmented copies of the rows whose cleaned text has > 2 words."""
        positions = []
        texts = []
        failures = 0
        first_error = None

        for pos, text in enumerate(df['processed_text']):
            # Augment only if sentence is long enough (> 2 words)
            if len(text.split()) <= 2:
                continue
            try:
                result = self.aug.augment(text)
            except Exception as exc:
                # Best effort: one bad row must not abort the run, but the
                # failure is counted and reported instead of being hidden.
                failures += 1
                if first_error is None:
                    first_error = exc
                continue

            # The augmenter may hand back a list of candidates or a plain
            # string; indexing a string with [0] would keep one character.
            if isinstance(result, (list, tuple)):
                result = result[0] if result else None
            if not result:
                failures += 1
                continue

            positions.append(pos)
            texts.append(result)

        if failures:
            print(f"    - Skipped {failures} rows whose augmentation failed "
                  f"(first error: {first_error!r})")

        # Copying from the frame (rather than rebuilding from row Series)
        # keeps column dtypes; augmented rows inherit 'valid_target_mask'.
        aug_df = df.iloc[positions].copy()
        aug_df['processed_text'] = texts
        aug_df['is_augmented'] = True # Mark as augmented
        return aug_df

    def process(self, file_path, output_path):
        """
        Run the full pipeline on one CSV file and write the result.

        Steps: optional sort by (user_id, timestamp), text cleaning into
        'processed_text', missing-target masking into 'valid_target_mask',
        and, when ``self.augment`` is set, appending augmented copies that
        are marked in 'is_augmented'.

        :param file_path: Path of the input CSV; must contain a 'text' column.
        :param output_path: Path the processed CSV is written to.
        :return: The processed DataFrame.
        :raises FileNotFoundError: If ``file_path`` does not exist.
        :raises KeyError: If the input has no 'text' column.
        """
        print(f"--- Processing File: {file_path} ---")

        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Error: Input file not found at {file_path}")

        df = pd.read_csv(file_path)
        print(f"Original dataset shape: {df.shape}")

        if 'text' not in df.columns:
            raise KeyError(f"Input file {file_path} has no 'text' column")

        # --- Step 1: Strict Temporal Sorting (CRITICAL for Task 2a) ---
        # Data MUST be sorted by User and Time for LSTM/Transformer
        if 'timestamp' in df.columns:
            print(">>> [Sorting] Sorting data by User ID and Timestamp...")
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.sort_values(by=['user_id', 'timestamp'])

        # --- Step 2: Smart Text Cleaning ---
        print(">>> [Cleaning] Applying Smart Cleaning (Removing noise, keeping 'not')...")
        # 'is_words' is optional; a plain list also copes with an empty frame,
        # where DataFrame.apply(axis=1) would not return a column.
        if 'is_words' in df.columns:
            word_flags = df['is_words']
        else:
            word_flags = [False] * len(df)
        df['processed_text'] = [
            self.clean_text(raw, flag) for raw, flag in zip(df['text'], word_flags)
        ]

        # --- Step 3: Handle Missing Targets (Task 2a Specific) ---
        # In forecasting, the last entry of a user often has NaN for 'state_change'
        df = self._mask_missing_targets(df)

        # --- Step 4: Data Augmentation (Training Only) ---
        if self.augment:
            print(">>> [Augmentation] Applying data augmentation...")
            # Set the flag up front so the output schema is the same whether
            # or not any row turns out to be long enough to augment.
            df['is_augmented'] = False
            aug_df = self._build_augmented_rows(df)

            # Merge augmented data (appended after all original rows)
            if not aug_df.empty:
                df = pd.concat([df, aug_df], ignore_index=True)
                print(f"Augmentation complete. Added {len(aug_df)} samples.")

        # Save to CSV
        df.to_csv(output_path, index=False)
        print(f"Processing finished! File saved to: {output_path}")
        return df

# ==========================================
# Main Execution Block
# ==========================================
if __name__ == "__main__":
    # Resolve the data folder relative to this script. Notebook sessions
    # define no __file__, so fall back to the current working directory.
    # globals() is checked explicitly so the lookup does not depend on this
    # code sitting at module scope.
    if '__file__' in globals():
        base_dir = os.path.dirname(os.path.abspath(__file__))
    else:
        base_dir = os.getcwd()

    # Path: ../data/train_release_3sep2025/train_subtask2a.csv
    input_filename = os.path.join(base_dir, "..", "data", "train_release_3sep2025", "train_subtask2a.csv")

    # Fallback: Check local directory
    if not os.path.exists(input_filename):
        print("File not found in parent directory. Checking current directory...")
        input_filename = "train_subtask2a.csv"

    output_filename = "train_subtask2a_processed_smart.csv"

    # Instantiate and Run
    # Set augment=True for Training
    processor = Task2aSmartPreprocessor(augment=True)

    # Top-level boundary: report any failure instead of dumping a traceback.
    try:
        df_result = processor.process(input_filename, output_filename)

        # Verify Results
        print("\n--- Cleaning Verification (First Sample) ---")
        if df_result.empty:
            print("(no rows to display)")
        else:
            sample = df_result.iloc[0]
            # str() guards against a NaN (float) text cell, which cannot be sliced.
            print(f"Original: {str(sample['text'])[:60]}...")
            print(f"Cleaned : {sample['processed_text'][:60]}...")
            print(f"Mask    : {sample['valid_target_mask']}")

    except Exception as e:
        print(f"\n[Error] An error occurred: {e}")

>>> [Init] Loading Stopwords with Exceptions...
    - Preserved 14 emotional keywords (e.g., not, but...)
>>> [Init] Initializing Augmentation Model (WordNet)...
--- Processing File: /Users/van/Desktop/LLM/2025/Project/SemEval2026-EmoVA/src/../data/train_release_3sep2025/train_subtask2a.csv ---
Original dataset shape: (2764, 10)
>>> [Sorting] Sorting data by User ID and Timestamp...
>>> [Cleaning] Applying Smart Cleaning (Removing noise, keeping 'not')...
    - Found 137 missing values in 'state_change_valence' (End of sequences).
    - Found 137 missing values in 'state_change_arousal' (End of sequences).
>>> [Augmentation] Applying data augmentation...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave

Processing finished! File saved to: train_subtask2a_processed_smart.csv

--- Cleaning Verification (First Sample) ---
Original: I feel good .   I caught up on some sleep . Work went well f...
Cleaned : feel good caught sleep Work went well relaxed happy sone alc...
Mask    : 1


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave