In [None]:
!pip install nlpaug nltk numpy pandas matplotlib

In [None]:
import itertools
import random
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import nlpaug.augmenter.word as naw
import nltk
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Fetch the NLTK resources by id: the WordNet corpus and the English
# averaged-perceptron tagger (presumably needed by the WordNet synonym
# augmenter used later in this notebook — verify against nlpaug's requirements).
nltk.download(info_or_id='wordnet')
nltk.download(info_or_id='averaged_perceptron_tagger_eng')

In [None]:
# Google Drive project folder used when the notebook runs on Colab.
BASE_DRIVE_DIR = Path("/content/drive/MyDrive/NLP-Clarity")

# The local ./data folder is the default; on Colab, mount Drive and use the
# project's data folder there instead.
DATA_DIR = Path("data")
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_DIR = BASE_DRIVE_DIR / "data"

# Input and output CSVs live side by side in the selected data folder.
TRAIN_PATH = DATA_DIR / "train.csv"
AUGMENTED_PATH = DATA_DIR / "train_augmented.csv"

DATA_DIR.mkdir(parents=True, exist_ok=True)

# If the selected folder has no train.csv, report it and fall back to the
# local copy (when one exists), writing the augmented file next to it.
if not TRAIN_PATH.exists():
    print(f"File not found: {TRAIN_PATH}")
    if Path("data/train.csv").exists():
        TRAIN_PATH, AUGMENTED_PATH = Path("data/train.csv"), Path("data/train_augmented.csv")

# Read the training split and report its dimensions.
df = pd.read_csv(TRAIN_PATH)
print("Original shape: {}".format(df.shape))

# Rows per evasion label; the oversampling step below iterates over these counts.
label_counts = df['evasion_label'].value_counts()
print("\nClass distribution:", label_counts, sep="\n")

# Bar chart of the class balance before augmentation.
label_counts.plot.bar(title='Original Class Distribution')
plt.show()

In [None]:

# WordNet-backed synonym replacement; aug_p=0.3 targets roughly 30% of the
# words in each text as replacement candidates.
# NOTE(review): aug_max is left at its library default, which may cap how many
# words actually change in long texts — confirm against the nlpaug docs.
aug = naw.SynonymAug(
    aug_src='wordnet',
    aug_p=0.3,
)

def augment_text(text):
    """Return a synonym-augmented copy of ``text`` using the module-level ``aug``.

    Augmentation is best-effort: a failure never raises, but it is reported
    through ``warnings.warn`` so that a broken augmenter (for example a missing
    resource) cannot silently produce unchanged "augmented" rows.

    Args:
        text: Value to augment. Anything that is not a ``str`` (such as a
            missing value) is returned unchanged.

    Returns:
        The first augmented string when the augmenter returns a list, the
        augmenter's result itself otherwise, or the original ``text`` when it
        is not a string, the result list is empty, or augmentation fails.
    """
    if not isinstance(text, str):
        return text
    try:
        res = aug.augment(text)
    except Exception as exc:
        # Keep the pipeline running, but make the failure visible.
        warnings.warn(
            f"Augmentation failed, returning original text: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return text
    if isinstance(res, list):
        # An empty result list carries no augmented text; keep the original.
        return res[0] if res else text
    return res

# Quick sanity check: show one sentence before and after augmentation.
text = "The president decided to ignore the question about the economy."
print("Original: " + text)
print("Augmented: {}".format(augment_text(text)))

In [None]:
TARGET_COUNT = 1000  # every class below this size is topped up to it
SEED = 42            # fixed seed so the augmented rows are repeatable

# Synonym replacement is random. Seed the global RNGs so that re-running this
# cell yields the same augmented rows.
# NOTE(review): assumes nlpaug draws from Python's `random` and NumPy's global
# generator — confirm against the installed nlpaug version.
random.seed(SEED)
np.random.seed(SEED)

augmented_rows = []

for label, count in label_counts.items():
    n_needed = TARGET_COUNT - count
    if n_needed <= 0:
        # Class already has at least TARGET_COUNT rows.
        continue

    print(f"Augmenting '{label}': need {n_needed} more samples")

    class_rows = df[df['evasion_label'] == label].to_dict('records')

    # Walk the class's rows round-robin until exactly n_needed new rows exist.
    # cycle() over an empty list yields nothing, so this cannot loop forever.
    sources = itertools.islice(itertools.cycle(class_rows), n_needed)
    for row in tqdm(sources, total=n_needed):
        # Copy all columns, then replace only the two text fields.
        new_row = row.copy()
        new_row['question'] = augment_text(row['question'])
        new_row['interview_answer'] = augment_text(row['interview_answer'])
        augmented_rows.append(new_row)

In [None]:
# Turn the generated records into a frame with one row per augmented sample.
df_aug = pd.DataFrame(augmented_rows)

# Append the augmented rows to the originals, then shuffle with a fixed seed
# and renumber the index.
df_final = (
    pd.concat([df, df_aug], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("New shape: {}".format(df_final.shape))
print("\nNew Class distribution:", df_final['evasion_label'].value_counts(), sep="\n")

In [None]:
# Write the combined dataset to disk without the positional index column.
df_final.to_csv(path_or_buf=AUGMENTED_PATH, index=False)
print("Saved augmented dataset to {}".format(AUGMENTED_PATH))