In [None]:
!pip install torch transformers datasets pandas scikit-learn numpy
!pip install accelerate -U

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m354.7/354.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.5.2
    Uninstalling accelerate-1.5.2:
      Successfully uninstalled accelerate-1.5.2
Successfully installed accelerate-1.6.0


In [None]:
!pip install torch transformers pandas scikit-learn accelerate -q
!WANDB_DISABLED=true

import pandas as pd
import re
import numpy as np
import warnings
from sklearn.metrics import classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import torch
from torch.utils.data import Dataset
import os

# Environment flags for the tokenizer / W&B integration, and silence Python warnings
os.environ.update(TOKENIZERS_PARALLELISM="false", WANDB_DISABLED="true")
warnings.filterwarnings(action="ignore")

# Fix torch's RNG seed and switch on cuDNN benchmark mode
torch.manual_seed(seed=42)
torch.backends.cudnn.benchmark = True

# -------------------------
# 1. Language-Specific Preprocessing
# -------------------------
def preprocess_text(text, lang):
    """Strip URLs, @mentions and #hashtags from ``text``.

    Args:
        text: Any value; it is converted with ``str()`` before cleaning.
        lang: Language code. Accepted for interface symmetry; currently unused.

    Returns:
        The cleaned string with leading/trailing whitespace removed.
    """
    noise_pattern = r"http\S+|www\S+|@\w+|#\w+"
    without_noise = re.sub(noise_pattern, "", str(text))
    return without_noise.strip()


# -------------------------
# 2.  Data Loading
# -------------------------
def load_single_task_data(lang, split="train"):
    """Load cleaned texts and integer labels from ``{split}_{lang}_l1.csv``.

    The file must contain a ``text`` column and a ``{lang}_a1`` annotation
    column. Rows whose annotation is missing, ``NL``/``NaN`` (case-insensitive),
    non-numeric, or not a whole number are skipped.

    Args:
        lang: Language code used in the file name and annotation column name.
        split: Split prefix of the file name (default ``"train"``).

    Returns:
        ``(texts, labels)`` as two parallel lists. Both are empty when the file
        is missing, lacks the required columns, or cannot be parsed; in that
        case an error line is printed instead of raising.
    """
    import csv

    annot_col = f"{lang}_a1"
    file_path = f"{split}_{lang}_l1.csv"

    try:
        # First verify file exists
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Missing file: {file_path}")

        # Read the header with a real CSV parser so quoted column names and a
        # UTF-8 BOM do not cause a spurious "missing column" failure.
        with open(file_path, 'r', newline='', encoding='utf-8-sig') as f:
            header = next(csv.reader(f), [])
        if "text" not in header or annot_col not in header:
            raise ValueError(f"Missing required columns in {file_path}")

        # Load with error-tolerant parsing; labels stay strings so that
        # markers such as "NL" survive until the filtering step below.
        df = pd.read_csv(
            file_path,
            usecols=["text", annot_col],
            engine='python',
            on_bad_lines='skip',
            dtype={'text': str, annot_col: str}
        )

    except Exception as e:
        print(f" Error loading {file_path}: {str(e)}")
        return [], []

    texts, labels = [], []
    for raw_text, raw_label in zip(df["text"], df[annot_col]):
        if pd.isna(raw_label):
            continue
        token = str(raw_label).strip()
        if token.upper() in ('NL', 'NAN'):
            continue
        # Parse the number directly. The previous `.replace('.0', '')` trick
        # mangled values such as "1.05" -> "15" and "1.00" -> "10".
        try:
            number = float(token)
        except ValueError:
            continue
        # Rejects fractional values as well as inf/nan.
        if not number.is_integer():
            continue
        texts.append(preprocess_text(raw_text, lang))
        labels.append(int(number))

    return texts, labels

# -------------------------
# 3. Model Setup
# -------------------------
# Pretrained checkpoint name to fine-tune for each language code.
MODEL_MAP = dict(
    en="distilroberta-base",
    hi="bert-base-multilingual-cased",
    ta="bert-base-multilingual-cased",
)

# -------------------------
# 4. Dataset Class
# -------------------------
class SilentDataset(Dataset):
    """Map-style dataset over pre-tokenized encodings and their labels.

    ``encodings`` must be indexable by ``'input_ids'`` and ``'attention_mask'``;
    ``labels`` must be indexable by sample position. Both are stored as given.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The label sequence defines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict with input ids, mask and a long label tensor."""
        sample = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# -------------------------
# 5. Silent Training Function
# -------------------------
def train_task1(lang):
    """Fine-tune and evaluate a binary sequence classifier for one language.

    Loads the train/test splits with ``load_single_task_data``, tokenizes them
    to fixed-length (64 token) tensors, fine-tunes ``MODEL_MAP[lang]`` for three
    epochs and prints weighted F1 and accuracy on the test split.

    Args:
        lang: Language code; must be a key of ``MODEL_MAP``.

    Returns:
        None. Results are printed. If either split yields no samples the
        function prints a notice and returns without training.
    """
    # Load data
    train_texts, train_labels = load_single_task_data(lang, "train")
    test_texts, test_labels = load_single_task_data(lang, "test")

    # The loader signals failure by returning empty lists; tokenizing or
    # training on them would only fail later with a far less helpful error.
    if not train_texts or not test_texts:
        print(f" Skipping {lang}: no usable samples "
              f"(train={len(train_texts)}, test={len(test_texts)})")
        return

    use_cuda = torch.cuda.is_available()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_MAP[lang])

    def encode(texts):
        # Identical tokenization settings for both splits.
        return tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=64,
            return_tensors="pt"
        )

    # Create datasets
    train_dataset = SilentDataset(encode(train_texts), train_labels)
    test_dataset = SilentDataset(encode(test_texts), test_labels)

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_MAP[lang],
        num_labels=2
    ).to("cuda" if use_cuda else "cpu")

    # Training config
    training_args = TrainingArguments(
        output_dir=f"./results_{lang}",
        per_device_train_batch_size=64,
        num_train_epochs=3,
        learning_rate=3e-5,
        fp16=use_cuda,
        logging_steps=1000,
        report_to="none",
        save_strategy="no",
        disable_tqdm=True,
        # Request the fused optimizer only on GPU; fall back to plain AdamW
        # on CPU so the CPU path already supported above keeps working.
        optim="adamw_torch_fused" if use_cuda else "adamw_torch"
    )

    # Metrics with zero_division handled
    def compute_metrics(p):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            preds = p.predictions.argmax(-1)
            report = classification_report(
                p.label_ids,
                preds,
                output_dict=True,
                zero_division=0
            )
            return {
                'f1': report['weighted avg']['f1-score'],
                'accuracy': report['accuracy']
            }

    # Train and evaluate
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    results = trainer.evaluate(test_dataset)

    # Print clean results
    print(f"\n{lang.upper()}:")
    print(f"F1  → {results['eval_f1']:.3f}")
    print(f"ACC → {results['eval_accuracy']:.3f}")
    print("⎯"*30)

# -------------------------
# 6. Main Execution
# -------------------------
if __name__ == "__main__":
    # Train and report on each supported language in turn.
    for lang in ("en", "hi", "ta"):
        train_task1(lang)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 0.8313, 'train_samples_per_second': 472.77, 'train_steps_per_second': 10.827, 'train_loss': 0.496605290306939, 'epoch': 3.0}
{'eval_loss': 0.5446405410766602, 'eval_f1': 0.6617169470463541, 'eval_accuracy': 0.7639484978540773, 'eval_runtime': 0.254, 'eval_samples_per_second': 917.407, 'eval_steps_per_second': 118.121, 'epoch': 3.0}

EN:
F1  → 0.662
ACC → 0.764
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 13.5981, 'train_samples_per_second': 276.877, 'train_steps_per_second': 4.412, 'train_loss': 0.46761350631713866, 'epoch': 3.0}
{'eval_loss': 0.461620032787323, 'eval_f1': 0.7084533113944879, 'eval_accuracy': 0.7980769230769231, 'eval_runtime': 1.5894, 'eval_samples_per_second': 588.91, 'eval_steps_per_second': 73.614, 'epoch': 3.0}

HI:
F1  → 0.708
ACC → 0.798
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 16.5614, 'train_samples_per_second': 279.506, 'train_steps_per_second': 4.529, 'train_loss': 0.5161113993326822, 'epoch': 3.0}
{'eval_loss': 0.4446749985218048, 'eval_f1': 0.8132746142083558, 'eval_accuracy': 0.8146417445482866, 'eval_runtime': 1.5727, 'eval_samples_per_second': 408.227, 'eval_steps_per_second': 51.505, 'epoch': 3.0}

TA:
F1  → 0.813
ACC → 0.815
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯


In [None]:
# %% [code]
!pip install torch transformers datasets pandas scikit-learn accelerate -q
!WANDB_DISABLED=true

import os
import re
import torch
import numpy as np
import pandas as pd
import warnings
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Environment flags, warning suppression and a fixed torch seed
os.environ.update(TOKENIZERS_PARALLELISM="false", WANDB_DISABLED="true")
warnings.filterwarnings(action="ignore")
torch.manual_seed(seed=42)

# -------------------------------
# Utility function to clean text
# -------------------------------
def clean_text(text):
    """Return ``str(text)`` with URLs, @mentions and #hashtags removed and edges trimmed."""
    noise_pattern = r"http\S+|www\S+|@\w+|#\w+"
    without_noise = re.sub(noise_pattern, "", str(text))
    return without_noise.strip()

# -------------------------------
# Custom Dataset for Hate Speech Classification
# -------------------------------
class HateSpeechDataset(Dataset):
    """Dataset that cleans and tokenizes all texts up front.

    Args:
        texts: Iterable of raw texts; each is passed through ``clean_text``.
        labels: Sequence of labels, indexable by sample position.
        tokenizer: Callable invoked once on the full list of cleaned texts with
            truncation, ``max_length`` padding and ``return_tensors="pt"``.
        max_length: Sequence length passed to the tokenizer (default 128).
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = list(map(clean_text, texts))
        self.labels = labels
        self.encodings = tokenizer(
            self.texts,
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )

    def __len__(self):
        # The label sequence defines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` plus a long ``labels`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# -------------------------------
# Define a metric function for evaluation
# -------------------------------
def compute_metrics(p):
    """Compute accuracy and weighted F1 from an evaluation prediction object.

    Args:
        p: Object exposing ``predictions`` (scores, arg-maxed over axis 1)
            and ``label_ids`` (reference labels).

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    predicted = np.argmax(p.predictions, axis=1)
    reference = p.label_ids
    return {
        "accuracy": accuracy_score(reference, predicted),
        "f1": f1_score(reference, predicted, average='weighted', zero_division=0),
    }

# -------------------------------
# Function to prepare data for each language
# -------------------------------
def _split_binary_frame(df, source, train_frac=0.8, seed=42):
    """Check that ``df['label']`` holds two classes, then split into train/test lists."""
    found = sorted(df["label"].unique().tolist())
    if len(found) < 2:
        # A single-class label column almost always means the file was parsed
        # with the wrong separator or the label strings did not match; training
        # on it would report a meaningless perfect score.
        raise ValueError(
            f"{source}: expected both classes after label mapping, found {found}"
        )
    train_df = df.sample(frac=train_frac, random_state=seed)
    test_df = df.drop(train_df.index)
    return (
        train_df["text"].tolist(),
        train_df["label"].tolist(),
        test_df["text"].tolist(),
        test_df["label"].tolist(),
    )


def prepare_data_for_language(lang):
    """Load binary-labelled train/test texts for one language.

    * ``"en"``: the ``tweet_eval``/``hate`` dataset via ``load_dataset``.
    * ``"hi"``: ``hindi_2021.csv``, column ``task_1`` (``NOT`` -> 0, ``HOF`` -> 1),
      random 80/20 split.
    * ``"ta"``: ``tamil_offensive_full_train.csv`` read as header-less TSV with
      columns text/label; ``not-Tamil`` rows are dropped, ``Not_offensive`` -> 0,
      every other label -> 1, random 80/20 split.

    Args:
        lang: One of ``"en"``, ``"hi"``, ``"ta"``.

    Returns:
        ``(train_texts, train_labels, test_texts, test_labels)`` as lists.

    Raises:
        ValueError: If ``lang`` is not supported, or if a local CSV/TSV yields
            fewer than two distinct classes after label mapping.
    """
    if lang == "hi":
        df = pd.read_csv("hindi_2021.csv")

        # Only keep rows with task_1 labels 'NOT' or 'HOF'; copy so the label
        # column is added to an independent frame rather than a slice.
        df = df[df["task_1"].isin(["NOT", "HOF"])].copy()

        # Binary label: 0 = NOT, 1 = HOF
        df["label"] = df["task_1"].map({"NOT": 0, "HOF": 1})

        train_texts, train_labels, test_texts, test_labels = _split_binary_frame(
            df, "hindi_2021.csv"
        )
        print(f"HI: Loaded custom HASOC 2021 — Train: {len(train_texts)}, Test: {len(test_texts)}")
        return train_texts, train_labels, test_texts, test_labels

    if lang == "ta":
        # Load the TSV file without headers
        df = pd.read_csv("tamil_offensive_full_train.csv", sep="\t", header=None,
                         names=["text", "label"], quoting=3)

        # Rows with a missing text or label cannot be classified. Previously a
        # missing label fell into the catch-all "offensive" class below.
        df = df.dropna(subset=["text", "label"]).copy()
        df["label"] = df["label"].astype(str).str.strip()

        # Drop rows labeled "not-Tamil"
        df = df[df["label"] != "not-Tamil"].copy()

        # Convert labels: Not_offensive → 0, anything else (any kind of offensive) → 1
        df["label"] = (df["label"] != "Not_offensive").astype(int)

        train_texts, train_labels, test_texts, test_labels = _split_binary_frame(
            df, "tamil_offensive_full_train.csv"
        )
        print(f"TA: Loaded Tamil dataset — Train: {len(train_texts)}, Test: {len(test_texts)}")
        return train_texts, train_labels, test_texts, test_labels

    if lang != "en":
        raise ValueError("Language must be one of: en, hi, ta")

    dataset = load_dataset("tweet_eval", "hate")

    # Prefer the conventional split names, falling back to whatever exists.
    available_splits = list(dataset.keys())
    train_split = "train" if "train" in available_splits else available_splits[0]
    test_split = "test" if "test" in available_splits else ("validation" if "validation" in available_splits else available_splits[-1])

    # Binary classification: only use labels 0 or 1
    train_data = dataset[train_split].filter(lambda x: x["label"] in [0, 1])
    test_data = dataset[test_split].filter(lambda x: x["label"] in [0, 1])

    # Materialize as plain lists so every branch returns the same types.
    train_texts = list(train_data["text"])
    train_labels = list(train_data["label"])
    test_texts = list(test_data["text"])
    test_labels = list(test_data["label"])

    print(f"{lang.upper()}: Train samples = {len(train_texts)}, Test samples = {len(test_texts)}")
    return train_texts, train_labels, test_texts, test_labels


# -------------------------------
# Main loop: Train classifier for each language
# -------------------------------
# Pretrained checkpoint to fine-tune for each language code.
model_map = {
    "en": "distilroberta-base",
    "hi": "bert-base-multilingual-cased",
    "ta": "bert-base-multilingual-cased"
}

# Train and evaluate one classifier per language. A failure while preparing
# the data for a language is reported and that language is skipped.
for lang in ["en", "hi", "ta"]:
    print(f"\n=== Training hate speech classifier for {lang.upper()} ===")
    try:
        train_texts, train_labels, test_texts, test_labels = prepare_data_for_language(lang)
    except Exception as e:
        print(f"\u274c Data preparation failed for {lang}: {str(e)}")
        continue

    # Tokenization happens inside HateSpeechDataset, once per split.
    MODEL_NAME = model_map[lang]
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_dataset = HateSpeechDataset(train_texts, train_labels, tokenizer)
    test_dataset = HateSpeechDataset(test_texts, test_labels, tokenizer)

    # Fresh two-class classification model for every language.
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    # Evaluate and checkpoint once per epoch; the checkpoint with the best
    # "f1" is reloaded when training finishes.
    training_args = TrainingArguments(
        output_dir=f"./results_hate_{lang}",
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=5000,  # High number to suppress intermediate logging
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none"
    )

    # NOTE(review): the test split is also the eval set that drives
    # best-checkpoint selection, so the reported score is not from data that
    # was held out of model selection — confirm this is intended.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()
    # With no argument, evaluate() uses the eval_dataset given above.
    results = trainer.evaluate()

    # Print only F1 and Accuracy
    print(f"{lang.upper()} Classifier Results — Accuracy: {results['eval_accuracy']:.4f}, F1: {results['eval_f1']:.4f}")



=== Training hate speech classifier for EN ===
EN: Train samples = 9000, Test samples = 2970


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.822275,0.579461,0.550812


EN Classifier Results — Accuracy: 0.5795, F1: 0.5508

=== Training hate speech classifier for HI ===
HI: Loaded custom HASOC 2021 — Train: 3675, Test: 919


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.485768,0.774755,0.762774


HI Classifier Results — Accuracy: 0.7748, F1: 0.7628

=== Training hate speech classifier for TA ===
TA: Loaded Tamil dataset — Train: 28111, Test: 6954


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.6e-05,1.0,1.0


TA Classifier Results — Accuracy: 1.0000, F1: 1.0000


In [None]:
!pip install torch transformers pandas scikit-learn accelerate -q
!WANDB_DISABLED=true

import pandas as pd
import re
import numpy as np
import warnings
from sklearn.metrics import f1_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments
)
import torch
from torch.utils.data import Dataset
import os

# Configuration: environment flags, warning suppression, seed, cuDNN benchmark mode
os.environ.update(TOKENIZERS_PARALLELISM="false", WANDB_DISABLED="true")
warnings.filterwarnings(action="ignore")
torch.manual_seed(seed=42)
torch.backends.cudnn.benchmark = True

# Pretrained checkpoint name to fine-tune for each language code.
MODEL_MAP = dict(
    en="distilroberta-base",
    hi="bert-base-multilingual-cased",
    ta="bert-base-multilingual-cased",
)

def load_joint_data(lang, split="train"):
    """Load texts that carry both a level-1 and a level-3 binary label.

    Reads ``{split}_{lang}_l1.csv`` and ``{split}_{lang}_l3.csv`` (columns
    ``text`` plus ``{lang}_a1`` / ``{lang}_a3``), inner-joins them on ``text``
    and keeps only rows whose cleaned text is non-empty and whose two labels
    are both exactly 0 or 1.

    Args:
        lang: Language code used in file and column names.
        split: Split prefix of the file names (default ``"train"``).

    Returns:
        ``(texts, labels1, labels3)`` as three parallel lists. All three are
        empty when a file cannot be loaded or the merge fails; the reason is
        printed instead of raised.
    """
    def safe_load(suffix):
        try:
            df = pd.read_csv(
                f"{split}_{lang}_l{suffix}.csv",
                usecols=["text", f"{lang}_a{suffix}"],
                engine='python',
                on_bad_lines='skip',
                dtype={f"{lang}_a{suffix}": str}  # Read as string to handle NaN
            )
            return df.rename(columns={f"{lang}_a{suffix}": f"label{suffix}"})
        except Exception as e:
            print(f" Error loading {split}_{lang}_l{suffix}.csv: {str(e)}")
            return pd.DataFrame()

    def parse_label(raw):
        """Return 0 or 1 for a valid binary annotation, otherwise None."""
        if pd.isna(raw):
            return None
        token = str(raw).strip()
        if token in ('', 'nan', 'NaN', 'NL'):
            return None
        # Parse the number directly. The previous `.replace('.0', '')` trick
        # turned e.g. "1.00" into "10", silently discarding a valid label.
        try:
            number = float(token)
        except ValueError:
            return None
        return int(number) if number in (0.0, 1.0) else None

    # Load and merge data
    l1_df = safe_load(1)
    l3_df = safe_load(3)

    if l1_df.empty or l3_df.empty:
        print(f"Missing label files for {lang} {split}")
        return [], [], []

    # NOTE(review): an inner join on raw text multiplies rows when the same
    # text occurs more than once in either file — confirm texts are unique.
    try:
        merged_df = pd.merge(l1_df, l3_df, on="text", how="inner")
    except Exception as e:
        print(f"Merge failed: {str(e)}")
        return [], [], []

    texts, labels1, labels3 = [], [], []
    rows = zip(merged_df["text"], merged_df["label1"], merged_df["label3"])
    for raw_text, raw_label1, raw_label3 in rows:
        if pd.isna(raw_text):
            continue

        # Strip AFTER removing URLs/mentions/hashtags so that a text made up
        # only of such tokens becomes empty and is dropped, and kept texts do
        # not retain stray leading/trailing whitespace.
        text = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", str(raw_text)).strip()
        if not text:
            continue

        label1 = parse_label(raw_label1)
        label3 = parse_label(raw_label3)
        if label1 is None or label3 is None:
            continue

        texts.append(text)
        labels1.append(label1)
        labels3.append(label3)

    print(f"Loaded {len(texts)} clean samples for {lang} {split}")
    return texts, labels1, labels3

# -------------------------------
# Updated RobustDataset (returns combined labels)
# -------------------------------
class RobustDataset(Dataset):
    """Dataset pairing tokenized inputs with two label vectors.

    Only ``input_ids`` and ``attention_mask`` are retained from ``encodings``.
    Each sample exposes both targets in a single ``labels`` tensor of two
    elements: ``labels1`` first, ``labels3`` second.
    """

    def __init__(self, encodings, labels1, labels3):
        kept_fields = ('input_ids', 'attention_mask')
        self.encodings = {field: encodings[field] for field in kept_fields}
        self.labels1 = torch.tensor(labels1, dtype=torch.long)
        self.labels3 = torch.tensor(labels3, dtype=torch.long)

        # All three sequences must describe the same number of samples.
        n_rows = len(self.encodings['input_ids'])
        assert n_rows == len(self.labels1) == len(self.labels3), \
            f"Data mismatch in {len(self.labels1)} samples"

    def __len__(self):
        return len(self.labels1)

    def __getitem__(self, idx):
        """Return input ids, attention mask and the combined two-element label tensor."""
        sample = {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
        }
        target_pair = [self.labels1[idx].item(), self.labels3[idx].item()]
        sample['labels'] = torch.tensor(target_pair, dtype=torch.long)
        return sample

# -------------------------------
# Updated SafeClassifier (expects combined labels)
# -------------------------------
class SafeClassifier(torch.nn.Module):
    """Shared encoder with two independent two-way classification heads.

    ``forward`` returns a dict with the keys ``loss``, ``logits1`` (from
    ``gendered_head``) and ``logits3`` (from ``explicit_head``), in that order.
    """

    def __init__(self, model_name):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        width = self.base_model.config.hidden_size
        self.gendered_head = torch.nn.Linear(width, 2)
        self.explicit_head = torch.nn.Linear(width, 2)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and both heads.

        Args:
            input_ids: Passed positionally to the base model.
            attention_mask: Passed to the base model as ``attention_mask``.
            labels: Optional tensor indexed as ``labels[:, 0]`` (target for
                ``logits1``) and ``labels[:, 1]`` (target for ``logits3``).

        Returns:
            Dict with ``loss`` (sum of both cross-entropy losses, or ``None``
            when ``labels`` is ``None``), ``logits1`` and ``logits3``.
        """
        encoded = self.base_model(input_ids, attention_mask=attention_mask)
        # Representation at position 0 of the sequence axis feeds both heads.
        first_position = encoded.last_hidden_state[:, 0, :]

        result = {
            "loss": None,
            "logits1": self.gendered_head(first_position),
            "logits3": self.explicit_head(first_position),
        }

        if labels is not None:
            cross_entropy = torch.nn.functional.cross_entropy
            gendered_loss = cross_entropy(result["logits1"], labels[:, 0])
            explicit_loss = cross_entropy(result["logits3"], labels[:, 1])
            result["loss"] = gendered_loss + explicit_loss

        return result

# -------------------------------
# Training Function for Joint Model
# -------------------------------
def train_joint_model(lang):
    """Train and evaluate the two-head ``SafeClassifier`` for one language.

    Loads jointly-labelled train/test data with ``load_joint_data``, checks
    that every label vector contains both classes, tokenizes to 64 tokens,
    trains for three epochs and prints weighted F1 / accuracy for both tasks.

    Args:
        lang: Language code; must be a key of ``MODEL_MAP``.

    Returns:
        None. Progress and results are printed. The function returns early
        (after printing why) when a class is absent from a split or dataset
        construction fails; errors raised during training/evaluation are
        caught and reported as "Training aborted".
    """
    print(f"\nStarting {lang.upper()} Training")

    # Load data with validation
    train_texts, train_l1, train_l3 = load_joint_data(lang, "train")
    test_texts, test_l1, test_l3 = load_joint_data(lang, "test")

    # Check class balance
    def check_labels(name, labels):
        zeros = sum(1 for l in labels if l == 0)
        ones = sum(1 for l in labels if l == 1)
        print(f"{name} - 0: {zeros}, 1: {ones}")
        return zeros > 0 and ones > 0

    valid_train = check_labels("Train Label1", train_l1) and check_labels("Train Label3", train_l3)
    valid_test = check_labels("Test Label1", test_l1) and check_labels("Test Label3", test_l3)

    if not (valid_train and valid_test):
        print(f" Insufficient class balance for {lang}")
        return

    # Tokenization (identical settings for both splits)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_MAP[lang])

    def encode(texts):
        return tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=64,
            return_tensors="pt"
        )

    train_encodings = encode(train_texts)
    test_encodings = encode(test_texts)

    # Create validated datasets
    try:
        train_dataset = RobustDataset(train_encodings, train_l1, train_l3)
        test_dataset = RobustDataset(test_encodings, test_l1, test_l3)
        print(f" Dataset Sizes - Train: {len(train_dataset)}, Test: {len(test_dataset)}")
    except Exception as e:
        print(f" Dataset creation failed: {str(e)}")
        return

    # Model setup
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SafeClassifier(MODEL_MAP[lang]).to(device)

    # Training config
    training_args = TrainingArguments(
        output_dir=f"./results_{lang}",
        per_device_train_batch_size=32,
        num_train_epochs=3,
        learning_rate=2e-5,
        fp16=torch.cuda.is_available(),
        logging_steps=50,
        report_to="none",
        save_strategy="no",  # Not saving checkpoints in this example
        disable_tqdm=True
    )

    def compute_metrics(p):
        """Score both heads; ``p.label_ids`` is indexed as an (N, 2) array."""
        # prediction_step below yields a (logits1, logits3) pair; the other
        # layouts are still accepted for backward compatibility.
        if isinstance(p.predictions, dict):
            logits1 = p.predictions["logits1"]
            logits3 = p.predictions["logits3"]
        else:
            if len(p.predictions) == 3:
                _, logits1, logits3 = p.predictions
            elif len(p.predictions) == 2:
                logits1, logits3 = p.predictions
            else:
                raise ValueError("Unexpected format for p.predictions")

        preds1 = np.argmax(logits1, axis=1)
        preds3 = np.argmax(logits3, axis=1)

        labels_array = p.label_ids
        labels1 = labels_array[:, 0]
        labels3 = labels_array[:, 1]

        gendered_f1 = f1_score(labels1, preds1, average='weighted', zero_division=0)
        gendered_acc = accuracy_score(labels1, preds1)
        explicit_f1 = f1_score(labels3, preds3, average='weighted', zero_division=0)
        explicit_acc = accuracy_score(labels3, preds3)

        return {
            'eval_gendered_f1': gendered_f1,
            'eval_gendered_acc': gendered_acc,
            'eval_explicit_f1': explicit_f1,
            'eval_explicit_acc': explicit_acc
        }

    class ValidationTrainer(Trainer):
        """Trainer adapted to a model that returns a dict with two logit tensors."""

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            outputs = model(**inputs)
            loss = outputs["loss"]
            return (loss, outputs) if return_outputs else loss

        def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
            # Let the Trainer move/cast the batch exactly as it does for
            # training, instead of relying on the dataloader having done so.
            inputs = self._prepare_inputs(inputs)
            with torch.no_grad():
                outputs = model(**inputs)

            loss = outputs.get("loss")
            if loss is not None:
                loss = loss.detach()
            if prediction_loss_only:
                return (loss, None, None)

            # Return only the logits (not the scalar loss) as predictions, so
            # the evaluation loop gathers per-sample tensors only.
            logits = (outputs["logits1"].detach(), outputs["logits3"].detach())
            return (loss, logits, inputs.get("labels"))

    try:
        trainer = ValidationTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=compute_metrics,
            tokenizer=tokenizer
        )

        print(" Starting training...")
        trainer.train()

        print("\n Final Evaluation:")
        results = trainer.evaluate()
        print(f"{lang.upper()} Results:")
        print(f"Gendered Abuse - F1: {results['eval_gendered_f1']:.3f} | Acc: {results['eval_gendered_acc']:.3f}")
        print(f"Explicit Language - F1: {results['eval_explicit_f1']:.3f} | Acc: {results['eval_explicit_acc']:.3f}")
        print("⎯" * 50)

    except Exception as e:
        print(f" Training aborted: {str(e)}")

# Execute training for each language
if __name__ == "__main__":
    # Train the joint model for each supported language in turn.
    for lang in ("en", "hi", "ta"):
        train_joint_model(lang)



🚀 Starting EN Training
✅ Loaded 24 clean samples for en train
✅ Loaded 92 clean samples for en test
📊 Train Label1 - 0: 18, 1: 6
📊 Train Label3 - 0: 7, 1: 17
📊 Test Label1 - 0: 67, 1: 25
📊 Test Label3 - 0: 34, 1: 58
📦 Dataset Sizes - Train: 24, Test: 92
🔥 Starting training...
{'train_runtime': 0.1917, 'train_samples_per_second': 375.532, 'train_steps_per_second': 15.647, 'train_loss': 1.3854878743489583, 'epoch': 3.0}

🧪 Final Evaluation:
{'eval_gendered_f1': 0.6137544435329506, 'eval_gendered_acc': 0.7282608695652174, 'eval_explicit_f1': 0.199447895100069, 'eval_explicit_acc': 0.3695652173913043, 'eval_loss': 1.34207284450531, 'eval_runtime': 0.1108, 'eval_samples_per_second': 830.355, 'eval_steps_per_second': 108.307, 'epoch': 3.0}
EN Results:
Gendered Abuse - F1: 0.614 | Acc: 0.728
Explicit Language - F1: 0.199 | Acc: 0.370
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯

🚀 Starting HI Training
✅ Loaded 28 clean samples for hi train
✅ Loaded 471 clean samples for hi test
📊 Train