In [None]:
import pandas as pd
import numpy as np
import re
import os
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import stopwords

# Uncomment these lines on the first run to download necessary NLTK data
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords') 

class SmartPreprocessor:
    """
    Smart Preprocessing Pipeline for SemEval Task 1.

    Key Features:
    1. Fixes contractions (e.g., "don't" -> "do not", "can't" -> "can not")
       so that every negation survives as the standalone word "not".
    2. Removes special characters but preserves alphabets.
    3. Smart Stop Word Removal: Removes generic words (e.g., "the", "is")
       but strictly preserves sentiment-critical words (e.g., "not", "but", "nor").
    4. Optional WordNet synonym augmentation that appends one augmented copy
       of every cleaned row containing at least two words.
    """

    # Negations whose stem changes when expanded; a blanket "n't" -> " not"
    # rewrite would leave junk stems behind ("ca not", "wo not", "ai not").
    _IRREGULAR_NEGATIONS = {
        "can't": "can not",
        "won't": "will not",
        "shan't": "shall not",
        "ain't": "is not",
        "cannot": "can not",
    }
    _IRREGULAR_NEGATION_RE = re.compile(
        r"\b(?:can't|won't|shan't|ain't|cannot)\b", re.IGNORECASE
    )
    # Tokenized negation such as "don ' t" -> "don't" (expanded afterwards).
    _SPACED_NEGATION_RE = re.compile(r"n\s+'\s+t\b", re.IGNORECASE)
    # Regular negation: "don't" -> "do not", "DIDN'T" -> "DID not".
    _GENERIC_NEGATION_RE = re.compile(r"n't\b", re.IGNORECASE)
    # Tokenized clitics ("I ' ve" -> "I have"). The "' t" entry is a fallback
    # for a stray "' t" that is not preceded by an "n".
    _SPACED_CLITICS = (
        (re.compile(r"\s+'\s+m\b"), " am"),
        (re.compile(r"\s+'\s+s\b"), " is"),
        (re.compile(r"\s+'\s+t\b"), " not"),
        (re.compile(r"\s+'\s+re\b"), " are"),
        (re.compile(r"\s+'\s+ll\b"), " will"),
        (re.compile(r"\s+'\s+ve\b"), " have"),
    )
    _REPEATED_PUNCT_RE = re.compile(r'[.,]{2,}')
    _NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, augment=False):
        """
        Build the customized stop word set and, optionally, the augmenter.

        :param augment: Whether to enable data augmentation (True for Training, False for Val/Test).
        """
        self.augment = augment
        # Always defined so the attribute exists even when augmentation is off.
        self.aug = None

        print(">>> [Init] Loading and customizing stop word list...")
        # 1. Load default NLTK stop words
        stop_words_set = set(stopwords.words('english'))

        # 2. Define a Whitelist of "Sentiment Reversal Words"
        # These words must NEVER be removed as they flip the sentiment polarity.
        exceptions = {
            'no', 'not', 'nor', 'neither', 'never', 'none',  # Negations
            'but', 'however', 'although', 'except',          # Contrast/Transition
            'against', 'cannot', "can't", "don't"            # Other negative expressions
        }

        # 3. Remove exceptions from the stop word set (Keep them safe)
        self.stop_words = stop_words_set - exceptions

        print(f">>> [Init] Key emotional words preserved: {exceptions}")

        if self.augment:
            print(">>> [Init] Initializing Data Augmentation Model (WordNet)...")
            self.aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.2)

    def clean_text(self, text, is_words_mode=False):
        """
        Apply the cleaning logic to a single text string.

        :param text: Raw text; missing values (None/NaN) yield an empty string.
        :param is_words_mode: When True, commas are treated as word separators.
            Step 3 replaces every non-letter with a space anyway, so this flag
            does not change the final result; it is kept for interface
            compatibility.
        :return: Cleaned text containing only letters, single-space separated,
            with stop words (as defined by ``self.stop_words``) removed.
        """
        if pd.isna(text):
            return ""
        text = str(text)

        # --- 0. Normalize typographic apostrophes so the rules below match ---
        text = text.replace("\u2019", "'").replace("\u2018", "'")

        # --- 1. Fix Contractions (CRITICAL STEP) ---
        # Negations are expanded first so that "n't" always becomes the
        # whitelisted word "not" and survives stop word removal.
        text = self._SPACED_NEGATION_RE.sub("n't", text)      # don ' t -> don't
        text = self._IRREGULAR_NEGATION_RE.sub(
            lambda m: self._IRREGULAR_NEGATIONS[m.group(0).lower()], text
        )                                                      # can't -> can not
        text = self._GENERIC_NEGATION_RE.sub(" not", text)    # don't -> do not

        # Remaining tokenized clitics: "I ' ve" -> "I have", "it ' s" -> "it is"
        for pattern, replacement in self._SPACED_CLITICS:
            text = pattern.sub(replacement, text)

        # --- 2. Handle Excessive Punctuation ---
        # "......" -> " "
        text = self._REPEATED_PUNCT_RE.sub(' ', text)
        if is_words_mode:
            text = text.replace(',', ' ')

        # --- 3. Remove Special Characters (Keep Alphabets Only) ---
        # Anything that is not a letter (a-z, A-Z) or whitespace becomes a
        # space; this removes numbers and punctuation.
        text = self._NON_ALPHA_RE.sub(' ', text)

        # --- 4. Normalize Whitespace ---
        text = self._WHITESPACE_RE.sub(' ', text).strip()

        # --- 5. Smart Stop Word Removal ---
        # Whitelisted words ('not', 'but', ...) are absent from
        # self.stop_words, so they remain in the text.
        kept_words = [w for w in text.split() if w.lower() not in self.stop_words]
        return " ".join(kept_words)

    def process_dataframe(self, df):
        """
        Main processing loop: Sorting -> Cleaning -> Augmentation.

        The input frame is not modified; a processed copy is returned.

        :param df: DataFrame with a 'text' column. Optional columns:
            'timestamp' (triggers chronological sorting, per 'user_id' when
            that column exists) and 'is_words' (forwarded to ``clean_text``).
        :return: DataFrame with added 'processed_text' and 'is_augmented'
            columns. When augmentation is enabled, augmented copies of rows
            are appended at the end with ``is_augmented=True``.
        :raises KeyError: If the 'text' column is missing.
        """
        print(f"Starting processing for {len(df)} records...")

        if 'text' not in df.columns:
            raise KeyError('text')

        # Work on a copy so the caller's DataFrame is never mutated.
        df = df.copy()

        # 0. Temporal Sorting
        if 'timestamp' in df.columns:
            print(">>> [Sorting] Sorting data by User ID and Timestamp...")
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            # Sort per user when possible; fall back to timestamp only.
            sort_keys = [c for c in ('user_id', 'timestamp') if c in df.columns]
            df = df.sort_values(by=sort_keys).reset_index(drop=True)

        # 1. Cleaning
        print(">>> [Cleaning] Executing smart cleaning (Preserving not/but)...")
        if 'is_words' in df.columns:
            # A missing flag (NaN) must not be interpreted as True.
            word_flags = [False if pd.isna(flag) else bool(flag) for flag in df['is_words']]
        else:
            word_flags = [False] * len(df)
        df['processed_text'] = [
            self.clean_text(raw, flag) for raw, flag in zip(df['text'], word_flags)
        ]

        # Every returned frame carries the marker column, whichever path is taken.
        df['is_augmented'] = False

        # 2. Augmentation (Training Set Only)
        if self.augment:
            print(">>> [Augmentation] Applying data augmentation...")
            positions = []
            augmented_texts = []
            failures = 0
            first_error = None

            for position, original_text in enumerate(df['processed_text'].tolist()):
                # Only augment if sentence has enough length (>= 2 words)
                if len(original_text.split()) < 2:
                    continue
                try:
                    result = self.aug.augment(original_text)
                except Exception as exc:
                    # Best effort: skip the row but keep the failure visible.
                    failures += 1
                    if first_error is None:
                        first_error = exc
                    continue

                # The augmenter may return a list of strings or a bare string
                # (the return type is understood to differ between nlpaug
                # releases -- confirm against the installed version).
                if isinstance(result, (list, tuple)):
                    if not result:
                        continue
                    result = result[0]
                positions.append(position)
                augmented_texts.append(str(result))

            if failures:
                print(f">>> [Augmentation] Warning: {failures} rows could not be "
                      f"augmented (first error: {first_error!r})")

            if positions:
                # Copy the source rows positionally so column dtypes are kept.
                aug_df = df.iloc[positions].copy()
                aug_df['processed_text'] = augmented_texts
                aug_df['is_augmented'] = True

                # Append augmented data to the end
                df = pd.concat([df, aug_df], ignore_index=True)
                print(f"Augmentation complete. Total dataset size: {len(df)}")
            else:
                print("No augmented samples were generated.")

        return df

# ==========================================
# Main Execution Block
# ==========================================
if __name__ == "__main__":
    # --- Locate the input CSV ---
    # Scripts expose __file__; interactive sessions (e.g. notebooks) do not,
    # in which case the current working directory is used as the anchor.
    if '__file__' in locals():
        base_dir = os.path.dirname(os.path.abspath(__file__))
    else:
        base_dir = os.getcwd()

    # Preferred location: the data folder one level above the anchor directory.
    input_file = os.path.join(
        base_dir,
        "..",
        "data",
        "train_release_3sep2025",
        "train_subtask1.csv",
    )

    # Fallback: a copy of the CSV sitting in the current working directory.
    if not os.path.exists(input_file):
        print("File not found in parent directory. Checking current directory...")
        input_file = "train_subtask1.csv"

    print(f"Loading data from: {os.path.abspath(input_file)}")

    try:
        # --- Load and process ---
        df = pd.read_csv(input_file)

        # Augmentation is switched on because this is the training split;
        # validation data would use augment=False.
        processor = SmartPreprocessor(augment=True)
        df_processed = processor.process_dataframe(df)

        # --- Sanity check of the cleaning rules on a fixed sentence ---
        print("\n--- Cleaning Verification (Testing logic) ---")
        test_text = "I am not happy but I am fine. 123!!"
        print(f"Original: {test_text}")
        print(f"Cleaned : {processor.clean_text(test_text)}")
        print("(Expected: 'not happy but fine' -> Stop words/numbers removed, keywords kept)")

        # --- Persist the processed frame next to the working directory ---
        output_file = "train_subtask1_smart_processed.csv"
        df_processed.to_csv(output_file, index=False)
        print(f"\nSuccess! Processed file saved to: {output_file}")

    except Exception as err:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(f"[Error] An error occurred: {err}")

Attempting to load data from: /Users/van/Desktop/LLM/2025/Project/SemEval2026-EmoVA/data/train_release_3sep2025/train_subtask1.csv
>>> [Init] Initializing Data Augmentation Model (WordNet)...
Starting processing for 2764 records...
>>> [Sorting] Sorting data by User ID and Timestamp...
>>> [Cleaning] Cleaning text data...
>>> [Augmentation] Applying data augmentation (this may take a few minutes)...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave

No augmented samples were generated.

--- Cleaning Preview (User 3) ---
Original: I ' ve been feeling just fine . Nice and relaxed ....
Cleaned : I have been feeling just fine . Nice and relaxed ....

Success! Processed file saved to: train_subtask1_processed.csv


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave