In [3]:
import pandas as pd
import numpy as np
import re
import nlpaug.augmenter.word as naw
import nltk
import os

# Download required NLTK resources on first run
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

class DataPreprocessor:
    def __init__(self, augment=False):
        """
        :param augment: Whether to enable data augmentation.
                        (Set to True for Training Set, False for Validation/Test Sets!)
        """
        self.augment = augment

        # Initialize augmenter: Synonym Replacement
        # This method is safer for regression tasks as it usually preserves
        # the emotional polarity (valence) of the sentence.
        if self.augment:
            print("Initializing Data Augmentation Model (WordNet)...")
            self.aug = naw.SynonymAug(aug_src='wordnet', aug_p=0.2) # Replace 20% of words

    def clean_text(self, text, is_words_mode=False):
        """
        Specific cleaning logic for SemEval Task 2
        """
        if pd.isna(text):
            return ""

        text = str(text)

        # 1. Handle excessive dots like User 3: "I'm in a good mood ................."
        # Replace 2 or more consecutive dots with a single dot to reduce token length
        text = re.sub(r'\.{2,}', '.', text)

        # 2. Handle User 10 style lists: "Happy , Sad , Calm" (is_words=True)
        # If it is a word list, remove commas to make it cleaner for BERT processing
        if is_words_mode:
            text = text.replace(' , ', ' ')
            text = text.replace(',', ' ')

        # 3. Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # 4. Simple contraction expansion (can be expanded if needed)
        text = re.sub(r"I ' m", "I am", text)
        text = re.sub(r"don ' t", "do not", text)
        text = re.sub(r"it ' s", "it is", text)

        return text

    def process_dataframe(self, df):
        """
        Main processing function
        """
        print(f"Starting processing for {len(df)} records...")

        # --- Step 1: Text Cleaning ---
        # We process row by row to handle the 'is_words' logic correctly
        clean_texts = []
        for index, row in df.iterrows():
            cleaned = self.clean_text(row['text'], is_words_mode=row['is_words'])
            clean_texts.append(cleaned)

        df['processed_text'] = clean_texts

        # --- Step 2: Data Augmentation (Training Set Only) ---
        if self.augment:
            print("Applying data augmentation (this may take a while)...")
            augmented_rows = []

            # Iterate through original data to generate augmented samples
            for index, row in df.iterrows():
                original_text = row['processed_text']

                # Only augment if the sentence is long enough ( > 3 words)
                if len(original_text.split()) > 3:
                    try:
                        # nlpaug returns a list, we take the first item
                        aug_text = self.aug.augment(original_text)[0]

                        # Copy all info from the current row (IDs, Labels, etc.)
                        new_row = row.copy()
                        new_row['processed_text'] = aug_text

                        # Mark this row as augmented (useful for analysis, optional)
                        new_row['is_augmented'] = True

                        augmented_rows.append(new_row)
                    except:
                        pass # Skip if augmentation fails

            # Create a DataFrame for the augmented data
            aug_df = pd.DataFrame(augmented_rows)
            df['is_augmented'] = False # Mark original data as False

            # Merge: Original Data + Augmented Data
            df = pd.concat([df, aug_df], ignore_index=True)
            print(f"Augmentation complete. Original size: {len(df)-len(aug_df)}, Total size after aug: {len(df)}")

        return df

# ==========================================
# Usage Example (You can run this block to test)
# ==========================================
import os  # 记得在文件最开头加上这行 import

# ... (上面的 DataPreprocessor 类保持不变) ...

# ==========================================
# Main Execution Block
# ==========================================

if __name__ == "__main__":
    # 1. Define file path using os.path to handle '..' (parent directory) correctly
    # Structure: Current Script -> Go Up (..) -> data -> train_release... -> csv
    file_path = os.path.join("..", "data", "train_release_3sep2025", "train_subtask1.csv")

    print(f"Attempting to load data from: {file_path}")

    try:
        # Load the CSV file
        df = pd.read_csv(file_path)
        print(f"Successfully loaded! Dataset shape: {df.shape}")

        # 2. Instantiate preprocessor (Enable augmentation for Training)
        # NOTE: Set augment=False if you are processing validation data!
        processor = DataPreprocessor(augment=True)

        # 3. Run processing
        df_processed = processor.process_dataframe(df)

        # 4. View results (Preview first 5 rows)
        print("\n--- Processed Data Preview ---")
        print(df_processed[['text', 'processed_text', 'is_words']].head())

        # 5. Save the processed file for Arman
        # Saving it in the same folder as the input file, or locally
        output_path = "train_subtask1_processed.csv"
        df_processed.to_csv(output_path, index=False)
        print(f"\nSaved processed data to: {output_path}")

    except FileNotFoundError:
        print(f"\n[Error] File not found at: {os.path.abspath(file_path)}")
        print("Please check if your script is in the 'src' or 'code' folder and 'data' is next to it.")
    except Exception as e:
        print(f"\n[Error] An error occurred: {e}")

    # 5. Save for Arman (Dataloader)
    # df_processed.to_csv("train_processed_augmented.csv", index=False)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/van/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/van/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Attempting to load data from: ../data/train_release_3sep2025/train_subtask1.csv
Successfully loaded! Dataset shape: (2764, 8)
Initializing Data Augmentation Model (WordNet)...
Starting processing for 2764 records...
Applying data augmentation (this may take a while)...


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave

Augmentation complete. Original size: 2764, Total size after aug: 2764

--- Processed Data Preview ---
                                                text  \
0  I ' ve been feeling just fine . Nice and relax...   
1  I ' ve been feeling pretty good . This is my l...   
2  Still feeling sick today and sneezing alot wit...   
3  I ' m still feeling kinda sickly but I powered...   
4  I ' m feeling so much better today ! Seems lik...   

                                      processed_text  is_words  
0  I ' ve been feeling just fine . Nice and relax...     False  
1  I ' ve been feeling pretty good . This is my l...     False  
2  Still feeling sick today and sneezing alot wit...     False  
3  I am still feeling kinda sickly but I powered ...     False  
4  I am feeling so much better today ! Seems like...     False  

Saved processed data to: train_subtask1_processed.csv


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/van/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package ave