In [1]:
import sys
from pathlib import Path
import os

# Make the repository root importable so `from logs import log` resolves whether
# the kernel's working directory is notebooks/ or the repository root itself.
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == 'notebooks' else current_dir
# Keep the root at the front of sys.path (same import precedence as before), but
# skip the insert when it is already there so that re-running this cell does not
# stack duplicate entries.
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from transformers import AutoModel
from logs import log

In [None]:
import wandb

# Start wandb in "disabled" mode so this notebook does not log anything to it.
# NOTE(review): no visible cell uses wandb afterwards — confirm the call is still needed.
wandb.init(mode="disabled")

# ---- CONFIG ----
# Label-space sizes; presumably the number of "type" and "manifestation" classes
# used by the later subtasks — TODO confirm against the dataset description.
NUM_TYPES, NUM_MANIFESTATIONS = 5, 6
# Language code; it is spliced into every dataset path used by this notebook.
lang = "eng"
datasets_merge = True
# Unused experiment settings kept for reference:
# trial_id = "0000NG3"
# model_names = ['bert-base-uncased', "UBC-NLP/MARBERTv2", "microsoft/deberta-v3-base", "FacebookAI/xlm-roberta-large", "0ssamaak0/roberta-base-LEGO_emotions", "FacebookAI/roberta-base"]
# model_name = model_names[-1]

In [3]:
# Every split for a language is stored in a file named "<lang>.csv".
csv_name = lang + ".csv"

# Training splits of the three subtasks.
train_1 = pd.read_csv("../dev_phase/subtask1/train/" + csv_name)
train_2 = pd.read_csv("../dev_phase/subtask2/train/" + csv_name)
train_3 = pd.read_csv("../dev_phase/subtask3/train/" + csv_name)

# Development split of subtask 1.
dev_df = pd.read_csv("../dev_phase/subtask1/dev/" + csv_name)

In [4]:
# Outer-join the three subtask tables on (id, text) so each text appears once
# and carries the label columns of every subtask.
merged = train_1.merge(train_2, on=["id", "text"], how="outer")
merged = merged.merge(train_3, on=["id", "text"], how="outer")

# Cells left empty by the outer join are filled with 0, then every non-key
# column is cast to int.
label_cols = [c for c in merged.columns if c not in ("id", "text")]
train_df = merged.fillna(0).astype(dict.fromkeys(label_cols, int))
print(f"Total training examples: {len(train_df)}")

Total training examples: 3380


In [5]:
from openai import OpenAI
import os

# The API key is read from the OPENAI_API_KEY environment variable so that no
# credential is stored in the notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def paraphrase_text(text: str, model: str = "gpt-5.2-2025-12-11") -> str:
    """Paraphrase text using OpenAI API while preserving meaning and tone.

    Sends ``text`` as the user message of a chat completion (through the
    module-level ``client``) together with a fixed system prompt that asks for
    a meaning-preserving rewrite.

    Args:
        text: The text to paraphrase.
        model: Name of the chat-completion model to call.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.

    Raises:
        ValueError: If the first choice carries no text (``content`` is
            ``None``) or only whitespace. An empty paraphrase is never a usable
            augmentation, so it is reported instead of being returned.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": "You are a paraphrasing assistant. Rephrase the given text while preserving its exact meaning, tone, and sentiment. You can change the wording tone, formality, if it's in standard Arabic you can use dialect or vice versa. The goal is to augment this data to make the model more robust on polarization and hate speech detection. Do everything but keep the same meaning. Output ONLY the paraphrased text, nothing else."
            },
            {
                "role": "user",
                "content": text
            }
        ],
    )
    # `content` is optional in the API response; guard before calling .strip()
    # so the failure is an explicit, descriptive error rather than an
    # AttributeError on None.
    content = response.choices[0].message.content
    paraphrased = content.strip() if content is not None else ""
    if not paraphrased:
        raise ValueError("Model returned no paraphrase text")
    return paraphrased

In [6]:
# Fraction of the training set to paraphrase (0.25 = 25% of the rows).
AUGMENT_RATIO = 0.25
# Fixed seed so the same rows are selected on every run.
AUGMENT_SEED = 42
sample_df = train_df.sample(frac=AUGMENT_RATIO, random_state=AUGMENT_SEED)
print(f"Samples to augment: {len(sample_df)}")

Samples to augment: 845


In [7]:
# Paraphrase each sampled row. The copy keeps every other column unchanged;
# only "text" is replaced and "id" gets an "_aug" suffix to mark it as augmented.
augmented_rows = []

progress = tqdm(sample_df.iterrows(), total=len(sample_df), desc="Paraphrasing")
for row_label, source_row in progress:
    try:
        augmented = source_row.copy()
        augmented["text"] = paraphrase_text(source_row["text"])
        augmented["id"] = source_row["id"] + "_aug"
    except Exception as err:
        # Best effort: report the failing row and move on to the next one.
        print(f"Error paraphrasing row {row_label}: {err}")
    else:
        augmented_rows.append(augmented)

augmented_df = pd.DataFrame(augmented_rows)
print(f"Successfully augmented: {len(augmented_df)} examples")

Paraphrasing:   0%|          | 0/845 [00:00<?, ?it/s]

Successfully augmented: 845 examples


In [9]:
# Stack the paraphrased rows underneath the original ones with a fresh index.
final_df = pd.concat([train_df, augmented_df], axis=0, ignore_index=True)
print(
    f"Final dataset size: {len(final_df)} "
    f"(original: {len(train_df)}, augmented: {len(augmented_df)})"
)

# Persist the combined frame (all label columns) next to the subtask 1 data.
merged_out_path = f"../dev_phase/subtask1/train/{lang}_augmented_1.csv"
final_df.to_csv(merged_out_path, index=False)
print("Saved augmented dataset!")

# To skip the paraphrasing step on a later run, reload the saved file instead:
# final_df = pd.read_csv(f"../dev_phase/subtask1/train/{lang}_augmented_1.csv")


Final dataset size: 4225 (original: 3380, augmented: 845)
Saved augmented dataset!


In [11]:
# Project the combined frame back onto the column layout of each subtask's
# original training file.
train_1_aug, train_2_aug, train_3_aug = (
    final_df[source.columns].copy() for source in (train_1, train_2, train_3)
)

# Write one augmented file per subtask.
outputs = {
    f"../dev_phase/subtask1/train/{lang}_augmented1.csv": train_1_aug,
    f"../dev_phase/subtask2/train/{lang}_augmented1.csv": train_2_aug,
    f"../dev_phase/subtask3/train/{lang}_augmented1.csv": train_3_aug,
}
for out_path, frame in outputs.items():
    frame.to_csv(out_path, index=False)

print("Saved split augmented datasets for subtask1/2/3.")

Saved split augmented datasets for subtask1/2/3.
