In [None]:

!pip install torch transformers datasets pandas scikit-learn numpy
!pip install accelerate -U  

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

Baseline 1 (Task 1)

In [None]:
'''!pip install torch transformers pandas scikit-learn accelerate -q
!WANDB_DISABLED=true

import pandas as pd
import re
import numpy as np
import warnings
from sklearn.metrics import classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import torch
from torch.utils.data import Dataset
import os


os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")


torch.manual_seed(42)
torch.backends.cudnn.benchmark = True


def preprocess_text(text, lang):
    text = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", str(text))
    return text.strip()


def load_single_task_data(lang, split="train"):
    annot_col = f"{lang}_a1"
    file_path = f"{split}_{lang}_l1.csv"

    try:
        
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Missing file: {file_path}")

        
        with open(file_path, 'r') as f:
            header = f.readline().strip().split(',')
            if "text" not in header or annot_col not in header:
                raise ValueError(f"Missing required columns in {file_path}")

        
        df = pd.read_csv(
            file_path,
            usecols=["text", annot_col],
            engine='python',
            on_bad_lines='skip',
            dtype={'text': str, annot_col: str}
        )

    except Exception as e:
        print(f" Error loading {file_path}: {str(e)}")
        return [], []

    texts, labels = [], []
    for _, row in df.iterrows():
        val = row[annot_col]
        if pd.notna(val) and str(val).strip().upper() not in ['NL', 'NAN']:
            try:
                label = int(float(str(val).replace('.0','')))
                texts.append(preprocess_text(row["text"], lang))
                labels.append(label)
            except:
                continue

    return texts, labels

MODEL_MAP = {
    "en": "distilroberta-base",
    "hi": "bert-base-multilingual-cased",
    "ta": "bert-base-multilingual-cased"
}


class SilentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels)

def train_task1(lang):

    train_texts, train_labels = load_single_task_data(lang, "train")
    test_texts, test_labels = load_single_task_data(lang, "test")


    tokenizer = AutoTokenizer.from_pretrained(MODEL_MAP[lang])

    train_encodings = tokenizer(
        train_texts,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors="pt"
    )

    test_encodings = tokenizer(
        test_texts,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors="pt"
    )

    train_dataset = SilentDataset(train_encodings, train_labels)
    test_dataset = SilentDataset(test_encodings, test_labels)


    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_MAP[lang],
        num_labels=2
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    training_args = TrainingArguments(
        output_dir=f"./results_{lang}",
        per_device_train_batch_size=64,
        num_train_epochs=3,
        learning_rate=3e-5,
        fp16=torch.cuda.is_available(),
        logging_steps=1000,
        report_to="none",
        save_strategy="no",
        disable_tqdm=True,
        optim="adamw_torch_fused"
    )

    def compute_metrics(p):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            preds = p.predictions.argmax(-1)
            report = classification_report(
                p.label_ids,
                preds,
                output_dict=True,
                zero_division=0
            )
            return {
                'f1': report['weighted avg']['f1-score'],
                'accuracy': report['accuracy']
            }


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    results = trainer.evaluate(test_dataset)

    print(f"\n{lang.upper()}:")
    print(f"F1  → {results['eval_f1']:.3f}")
    print(f"ACC → {results['eval_accuracy']:.3f}")
    print("⎯"*30)


if __name__ == "__main__":
    for lang in ["en", "hi", "ta"]:
        train_task1(lang)'''

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 2.078, 'train_samples_per_second': 189.121, 'train_steps_per_second': 4.331, 'train_loss': 0.496605290306939, 'epoch': 3.0}
{'eval_loss': 0.5446405410766602, 'eval_f1': 0.6617169470463541, 'eval_accuracy': 0.7639484978540773, 'eval_runtime': 0.33, 'eval_samples_per_second': 705.968, 'eval_steps_per_second': 90.897, 'epoch': 3.0}

EN:
F1  → 0.662
ACC → 0.764
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 10.8966, 'train_samples_per_second': 345.521, 'train_steps_per_second': 5.506, 'train_loss': 0.46761350631713866, 'epoch': 3.0}
{'eval_loss': 0.461620032787323, 'eval_f1': 0.7084533113944879, 'eval_accuracy': 0.7980769230769231, 'eval_runtime': 1.8679, 'eval_samples_per_second': 501.11, 'eval_steps_per_second': 62.639, 'epoch': 3.0}

HI:
F1  → 0.708
ACC → 0.798
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'train_runtime': 13.5487, 'train_samples_per_second': 341.657, 'train_steps_per_second': 5.536, 'train_loss': 0.5161113993326822, 'epoch': 3.0}
{'eval_loss': 0.4446749985218048, 'eval_f1': 0.8132746142083558, 'eval_accuracy': 0.8146417445482866, 'eval_runtime': 1.0116, 'eval_samples_per_second': 634.608, 'eval_steps_per_second': 80.067, 'epoch': 3.0}

TA:
F1  → 0.813
ACC → 0.815
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯


Baseline 2 (Task 1)

In [None]:
!pip install torch transformers pandas scikit-learn accelerate -q
!WANDB_DISABLED=true

import pandas as pd
import re
import numpy as np
import warnings
from sklearn.metrics import classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import torch
from torch.utils.data import Dataset
import os

# Process-wide settings for this cell: quiet the tokenizers fork warning,
# switch off Weights & Biases via its environment flag, and hide Python warnings.
os.environ.update(TOKENIZERS_PARALLELISM="false", WANDB_DISABLED="true")
warnings.filterwarnings(action="ignore")

# Let cuDNN auto-tune its kernels and fix the torch RNG seed.
torch.backends.cudnn.benchmark = True
torch.manual_seed(42)

def preprocess_text(text, lang):
    """Strip social-media noise and keep only the target language's script.

    Args:
        text: Raw input. Missing values (None / NaN / pd.NA) yield "". Any
            other non-string scalar is converted with ``str`` first.
        lang: "hi" keeps Devanagari (U+0900-U+097F), "ta" keeps Tamil
            (U+0B80-U+0BFF); any other value keeps ASCII letters only.

    Returns:
        The cleaned text with surrounding whitespace removed, or "" when the
        input is missing, blank, or the literal string "nan"/"NaN".
    """
    if pd.isna(text):
        return ""

    # Convert before using string methods: the previous version called
    # text.strip() on the raw value and raised AttributeError for ints/floats.
    text = str(text)
    if text.strip() in ('', 'nan', 'NaN'):
        return ""

    # Mentions, hashtags and URLs first, then anything wrapped in <>, {} or [].
    text = re.sub(r"@\w+|#\w+|https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>|{.*?}|\[.*?\]", "", text)

    # Drop every character that is neither whitespace nor in the kept script.
    outside_script = {
        "hi": r"[^\u0900-\u097F\s]",
        "ta": r"[^\u0B80-\u0BFF\s]",
    }.get(lang, r"[^a-zA-Z\s]")
    text = re.sub(outside_script, "", text)

    return text.strip()


def load_single_task_data(lang, split="train"):
    """Load and clean the task-1 CSV for one language and split.

    Reads ``{split}_{lang}_l1.csv`` from the working directory, keeps the
    ``text`` and ``{lang}_a1`` columns (matched case-insensitively), cleans the
    text with ``preprocess_text`` and normalises the labels to 0/1.

    Args:
        lang: Language code used in the file name and label column name.
        split: File-name prefix, e.g. "train" or "test".

    Returns:
        ``(texts, labels)`` as two parallel lists. ``([], [])`` is returned,
        after printing a message, when the file cannot be read or no valid
        rows remain.
    """
    target_col = f"{lang}_a1".lower()
    file_path = f"{split}_{lang}_l1.csv"
    wanted_cols = {"text", target_col}

    try:
        df = pd.read_csv(
            file_path,
            usecols=lambda col: col.lower() in wanted_cols,
            # Only the two selected columns are read, so a blanket string dtype
            # is safe. A per-column dict would be case-sensitive and miss
            # headers such as "EN_A1", leaving numeric labels that break `.str`.
            dtype="string",
            engine='python',
            on_bad_lines='warn'
        )

        df.columns = df.columns.str.lower()

        df["text"] = df["text"].apply(lambda x: preprocess_text(x, lang))
        # .copy() so the label assignment below writes to an independent frame
        # instead of a filtered view of the original.
        df = df[df["text"].str.len() > 5].copy()

        # NOTE(review): 'NL' and the literal string 'NAN' are mapped to class 0
        # here, whereas truly missing values are dropped below - confirm that
        # treating 'NL' as the negative class is intended.
        df[target_col] = (
            df[target_col]
            .str.strip().str.upper()
            .replace({
                'NL': '0', 'NAN': '0',
                '0.0': '0', '1.0': '1',
                '0.': '0', '1.': '1'
            }, regex=False)
        )

        # isin() is False for missing values, so this also drops unlabelled rows.
        df = df[df[target_col].isin(['0', '1'])]

        if df.empty:
            print(f"No valid data in {file_path}")
            return [], []

        return df["text"].tolist(), df[target_col].astype(int).tolist()

    except Exception as e:
        # Best-effort loader: report the problem and let the caller skip this language.
        print(f"Critical error in {file_path}: {str(e)}")
        return [], []

# Every language in this baseline is fine-tuned from the same checkpoint.
MODEL_MAP = dict.fromkeys(("en", "hi", "ta"), "xlm-roberta-base")

class SilentDataset(Dataset):
    """Map-style dataset pairing pre-computed encodings with integer labels.

    ``encodings`` must support ``encodings[key][idx]`` for the keys
    'input_ids' and 'attention_mask'; ``labels`` is an indexable sequence with
    one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {
            field: self.encodings[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        # Labels are stored raw and converted to a long tensor on access.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

def train_task1(lang):
    """Fine-tune and evaluate the task-1 binary classifier for one language.

    Loads the train/test CSVs through ``load_single_task_data``, tokenizes them
    to fixed-length tensors, fine-tunes ``MODEL_MAP[lang]`` with the Hugging
    Face ``Trainer`` and prints weighted F1 and accuracy on the test split.

    Args:
        lang: Language code; must be a key of ``MODEL_MAP``.

    Returns:
        None. Results are printed. The language is skipped, with a message,
        when there is too little data or only one class in the training set.
    """
    print(f"\n{'='*40}")
    print(f" Starting {lang.upper()} training")
    print(f"{'='*40}")

    train_texts, train_labels = load_single_task_data(lang, "train")
    test_texts, test_labels = load_single_task_data(lang, "test")

    # The previous threshold was max(10, len(train_labels)//2). Texts and labels
    # always have the same length, so the "half the labels" term could never
    # trigger; the effective minimum was, and remains, 10 training rows.
    min_train, min_test = 10, 5
    if len(train_texts) < min_train or len(test_texts) < min_test:
        print(f" Skipping {lang}: Insufficient data (train: {len(train_texts)}, test: {len(test_texts)})")
        return

    if len(set(train_labels)) < 2:
        print(f"Skipping {lang}: Only one class present")
        return

    use_cuda = torch.cuda.is_available()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_MAP[lang])

    def encode(texts):
        # Same tokenization settings for both splits.
        return tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors="pt"
        )

    train_dataset = SilentDataset(encode(train_texts), train_labels)
    test_dataset = SilentDataset(encode(test_texts), test_labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_MAP[lang],
        num_labels=2
    ).to("cuda" if use_cuda else "cpu")

    training_args = TrainingArguments(
        output_dir=f"./results_{lang}",
        per_device_train_batch_size=8 if len(train_texts) < 100 else 16,
        num_train_epochs=10 if len(train_texts) < 500 else 5,
        learning_rate=1e-5,
        fp16=use_cuda,
        logging_steps=50,
        save_strategy="no",
        disable_tqdm=True,
        # The fused AdamW kernel is CUDA-only in older PyTorch releases, so
        # fall back to the regular implementation when running on CPU.
        optim="adamw_torch_fused" if use_cuda else "adamw_torch",
        gradient_accumulation_steps=2,
        # Disable experiment trackers explicitly instead of relying on the
        # deprecated WANDB_DISABLED environment variable.
        report_to="none"
    )

    def compute_metrics(p):
        preds = p.predictions.argmax(-1)
        return {
            'f1': classification_report(
                p.label_ids, preds,
                output_dict=True, zero_division=0
            )['weighted avg']['f1-score'],
            # Plain float so the value is safe for logging/serialisation.
            'accuracy': float((preds == p.label_ids).mean())
        }

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    try:
        trainer.train()
        results = trainer.evaluate()

        print(f"\n{lang.upper()} Results:")
        print(f"F1 Score: {results['eval_f1']:.3f}")
        print(f"Accuracy: {results['eval_accuracy']:.3f}")
        print(f"Train Samples: {len(train_texts)}")
        print(f"Test Samples: {len(test_texts)}")

    except Exception as e:
        # Best-effort: report the failure and continue with the next language.
        print(f" Training failed: {str(e)}")

    print(f"{'='*40}\n")

if __name__ == "__main__":
    # Run the task-1 baseline once per language, in this fixed order.
    for lang in ("en", "hi", "ta"):
        train_task1(lang)


🚀 Starting EN training


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{'train_runtime': 9.6888, 'train_samples_per_second': 132.111, 'train_steps_per_second': 4.128, 'train_loss': 0.5703863620758056, 'epoch': 10.0}
{'eval_loss': 0.5477120280265808, 'eval_f1': 0.6648204736440031, 'eval_accuracy': 0.7662337662337663, 'eval_runtime': 0.4361, 'eval_samples_per_second': 529.725, 'eval_steps_per_second': 66.502, 'epoch': 10.0}

EN Results:
F1 Score: 0.665
Accuracy: 0.766
Train Samples: 128
Test Samples: 231


🚀 Starting HI training


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{'loss': 0.5914, 'grad_norm': 6.322108268737793, 'learning_rate': 6.620689655172415e-06, 'epoch': 1.7241379310344827}
{'loss': 0.5324, 'grad_norm': 6.2976250648498535, 'learning_rate': 3.172413793103449e-06, 'epoch': 3.4482758620689653}
{'train_runtime': 35.9822, 'train_samples_per_second': 127.285, 'train_steps_per_second': 4.03, 'train_loss': 0.5436136048415612, 'epoch': 5.0}
{'eval_loss': 0.500669538974762, 'eval_f1': 0.670997116124665, 'eval_accuracy': 0.7707736389684814, 'eval_runtime': 1.5795, 'eval_samples_per_second': 441.905, 'eval_steps_per_second': 55.713, 'epoch': 5.0}

HI Results:
F1 Score: 0.671
Accuracy: 0.771
Train Samples: 916
Test Samples: 698


🚀 Starting TA training


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


{'loss': 0.71, 'grad_norm': 4.152376174926758, 'learning_rate': 7.958333333333333e-06, 'epoch': 1.0416666666666667}
{'loss': 0.6594, 'grad_norm': 13.928925514221191, 'learning_rate': 5.8750000000000005e-06, 'epoch': 2.0833333333333335}
{'loss': 0.626, 'grad_norm': 10.112614631652832, 'learning_rate': 3.7916666666666666e-06, 'epoch': 3.125}
{'loss': 0.5745, 'grad_norm': 14.257893562316895, 'learning_rate': 1.7083333333333334e-06, 'epoch': 4.166666666666667}
{'train_runtime': 66.4372, 'train_samples_per_second': 115.222, 'train_steps_per_second': 3.612, 'train_loss': 0.6268714189529419, 'epoch': 5.0}
{'eval_loss': 0.5190352201461792, 'eval_f1': 0.7553968654700792, 'eval_accuracy': 0.753968253968254, 'eval_runtime': 2.3722, 'eval_samples_per_second': 265.573, 'eval_steps_per_second': 33.302, 'epoch': 5.0}

TA Results:
F1 Score: 0.755
Accuracy: 0.754
Train Samples: 1531
Test Samples: 630



Baseline 1 (Task 2)


In [None]:
'''!pip install torch transformers datasets pandas scikit-learn accelerate -q
!WANDB_DISABLED=true

import os
import re
import torch
import numpy as np
import pandas as pd
import warnings
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")
torch.manual_seed(42)

def clean_text(text):
    return re.sub(r"http\S+|www\S+|@\w+|#\w+", "", str(text)).strip()

class HateSpeechDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = [clean_text(t) for t in texts]
        self.labels = labels
        self.encodings = tokenizer(self.texts, truncation=True, padding="max_length",
                                    max_length=max_length, return_tensors="pt")

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds, average='weighted', zero_division=0)
    return {"accuracy": acc, "f1": f1}

def prepare_data_for_language(lang):
    if lang == "en":
        dataset = load_dataset("tweet_eval", "hate")
    elif lang == "hi":
        df = pd.read_csv("hindi_2021.csv")  

        
        df = df[df["task_1"].isin(["NOT", "HOF"])]

        df["label"] = df["task_1"].map({"NOT": 0, "HOF": 1})

        train_frac = 0.8
        train_df = df.sample(frac=train_frac, random_state=42)
        test_df = df.drop(train_df.index)

        train_texts = train_df["text"].tolist()
        train_labels = train_df["label"].tolist()
        test_texts = test_df["text"].tolist()
        test_labels = test_df["label"].tolist()

        print(f"HI: Loaded custom HASOC 2021 — Train: {len(train_texts)}, Test: {len(test_texts)}")
        return train_texts, train_labels, test_texts, test_labels

    elif lang == "ta":
        df = pd.read_csv("tamil_offensive_full_train.csv", sep="\t", header=None, names=["text", "label"], quoting=3)

        df = df[df["label"] != "not-Tamil"]

        df["label"] = df["label"].apply(lambda x: 0 if x == "Not_offensive" else 1)

        train_df = df.sample(frac=0.8, random_state=42)
        test_df = df.drop(train_df.index)

        train_texts = train_df["text"].tolist()
        train_labels = train_df["label"].tolist()
        test_texts = test_df["text"].tolist()
        test_labels = test_df["label"].tolist()

        print(f"TA: Loaded Tamil dataset — Train: {len(train_texts)}, Test: {len(test_texts)}")
        return train_texts, train_labels, test_texts, test_labels

    else:
        raise ValueError("Language must be one of: en, hi, ta")

    available_splits = list(dataset.keys())
    train_split = "train" if "train" in available_splits else available_splits[0]
    test_split = "test" if "test" in available_splits else ("validation" if "validation" in available_splits else available_splits[-1])

    train_data = dataset[train_split]
    test_data = dataset[test_split]

    train_data = train_data.filter(lambda x: x["label"] in [0, 1])
    test_data = test_data.filter(lambda x: x["label"] in [0, 1])

    train_texts = train_data["text"]
    train_labels = train_data["label"]
    test_texts = test_data["text"]
    test_labels = test_data["label"]

    print(f"{lang.upper()}: Train samples = {len(train_texts)}, Test samples = {len(test_texts)}")
    return train_texts, train_labels, test_texts, test_labels

model_map = {
    "en": "distilroberta-base",
    "hi": "bert-base-multilingual-cased",
    "ta": "bert-base-multilingual-cased"
}

for lang in ["en", "hi", "ta"]:
    print(f"\n=== Training hate speech classifier for {lang.upper()} ===")
    try:
        train_texts, train_labels, test_texts, test_labels = prepare_data_for_language(lang)
    except Exception as e:
        print(f"\u274c Data preparation failed for {lang}: {str(e)}")
        continue

    MODEL_NAME = model_map[lang]
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    train_dataset = HateSpeechDataset(train_texts, train_labels, tokenizer)
    test_dataset = HateSpeechDataset(test_texts, test_labels, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    training_args = TrainingArguments(
        output_dir=f"./results_hate_{lang}",
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        logging_steps=5000, 
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer
    )

    trainer.train()
    results = trainer.evaluate()

    print(f"{lang.upper()} Classifier Results — Accuracy: {results['eval_accuracy']:.4f}, F1: {results['eval_f1']:.4f}")'''



=== Training hate speech classifier for EN ===
EN: Train samples = 9000, Test samples = 2970


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.822275,0.579461,0.550812


EN Classifier Results — Accuracy: 0.5795, F1: 0.5508

=== Training hate speech classifier for HI ===
HI: Loaded custom HASOC 2021 — Train: 3675, Test: 919


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.485768,0.774755,0.762774


HI Classifier Results — Accuracy: 0.7748, F1: 0.7628

=== Training hate speech classifier for TA ===
TA: Loaded Tamil dataset — Train: 28111, Test: 6954


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.6e-05,1.0,1.0


TA Classifier Results — Accuracy: 1.0000, F1: 1.0000


Baseline 2 (Task 2)

In [None]:
!pip install torch transformers pandas scikit-learn -q
!WANDB_DISABLED=true

import os
import re
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW 
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import f1_score, accuracy_score
from tqdm import tqdm

# Quiet the tokenizers fork warning, fix the torch RNG seed and choose the
# device used by the manual training loop below.
os.environ.update(TOKENIZERS_PARALLELISM="false")
torch.manual_seed(115)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def clean_text(text):
    """Remove URLs, @mentions and #hashtags from ``text`` and trim whitespace.

    The input is converted with ``str`` first, so non-string values are
    accepted. URLs are matched only as ``http://``/``https://`` links or as
    tokens starting with ``www.``; the earlier unanchored ``http\\S+|www\\S+``
    pattern also ate ordinary words (e.g. "awwww" became "a").

    Args:
        text: Raw text or any value convertible with ``str``.

    Returns:
        The cleaned string. Inner whitespace left by removed tokens is kept.
    """
    return re.sub(r"https?://\S+|\bwww\.\S+|@\w+|#\w+", "", str(text)).strip()


def _split_frame(df, train_frac, seed):
    """Shuffle-split ``df`` into train/test text and label lists."""
    # A fresh 0..n-1 index guarantees drop() removes exactly the sampled rows.
    df = df.reset_index(drop=True)
    train_df = df.sample(frac=train_frac, random_state=seed)
    test_df = df.drop(train_df.index)
    return (train_df['text'].tolist(), train_df['label'].tolist(),
            test_df['text'].tolist(), test_df['label'].tolist())


def load_data(lang, data_dir="/kaggle/input/alternateabusedatasets", seed=115, train_frac=0.8):
    """Load texts and binary labels for one language as a train/test split.

    Args:
        lang: "en" (tweet_eval "hate" from the Hugging Face hub), "hi"
            (``hindi_2021.csv``) or "ta" (``tamil_offensive_full_train.csv``).
        data_dir: Directory holding the Hindi and Tamil files.
        seed: ``random_state`` for the Hindi/Tamil split. Without it the split
            changed on every run because ``torch.manual_seed`` does not seed
            pandas.
        train_frac: Fraction of rows sampled into the training split.

    Returns:
        ``(train_texts, train_labels, test_texts, test_labels)`` as lists.

    Raises:
        ValueError: For an unsupported ``lang`` (previously the function fell
            through and returned None), or when no labelled rows remain.
    """
    if lang == "en":
        from datasets import load_dataset
        dataset = load_dataset("tweet_eval", "hate")
        train_data = dataset['train'].filter(lambda x: x["label"] in [0, 1])
        test_data = dataset['test'].filter(lambda x: x["label"] in [0, 1])
        # list() gives the same plain-list return type as the other branches.
        return (list(train_data['text']), list(train_data['label']),
                list(test_data['text']), list(test_data['label']))

    if lang == "hi":
        df = pd.read_csv(os.path.join(data_dir, "hindi_2021.csv"))
        df = df[df["task_1"].isin(["NOT", "HOF"])].copy()
        df['label'] = df['task_1'].map({"NOT": 0, "HOF": 1})

    elif lang == "ta":
        df = pd.read_csv(os.path.join(data_dir, "tamil_offensive_full_train.csv"),
                         sep="\t", header=None, names=["text", "label"], quoting=3)
        # Rows without a label used to be counted as offensive, because
        # NaN != "Not_offensive". Drop them and trim stray whitespace instead.
        df = df.dropna(subset=["label"]).copy()
        df['label'] = df['label'].astype(str).str.strip()
        df = df[df['label'] != "not-Tamil"].copy()
        df['label'] = (df['label'] != "Not_offensive").astype(int)

    else:
        raise ValueError(f"Unsupported language {lang!r}; expected one of: en, hi, ta")

    df = df.dropna(subset=["text"])
    if df.empty:
        raise ValueError(f"No labelled rows found for {lang!r} in {data_dir}")
    if df['label'].nunique() < 2:
        # A single-class dataset trains a degenerate model that reports perfect scores.
        print(f"WARNING: only one class present in the {lang} data - check the label parsing")

    return _split_frame(df, train_frac, seed)


def preprocess_data(texts, labels, tokenizer, max_length=128):
    """Clean, tokenize and pack examples into a ``TensorDataset``.

    Args:
        texts: Iterable of raw texts; each is passed through ``clean_text``.
        labels: Sequence of labels accepted by ``torch.tensor``.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' when given a list of texts.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        A ``TensorDataset`` whose items are (input_ids, attention_mask, label).
    """
    cleaned = list(map(clean_text, texts))
    batch = tokenizer(cleaned, truncation=True, padding='max_length', max_length=max_length)

    input_ids = torch.tensor(batch['input_ids'])
    attention_mask = torch.tensor(batch['attention_mask'])
    targets = torch.tensor(labels)
    return TensorDataset(input_ids, attention_mask, targets)

def _evaluate(model, data_loader):
    """Run ``model`` over ``data_loader`` and return (accuracy, weighted F1)."""
    model.eval()
    all_preds, all_labels = [], []
    # One no_grad context for the whole pass; no gradients are needed here.
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(data_loader, desc="Evaluating"):
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device)
            )
            all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            all_labels.extend(labels.numpy())

    acc = accuracy_score(all_labels, all_preds)
    # zero_division=0 keeps the score defined when a class is never predicted.
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
    return acc, f1


def train_model(lang, model_name, num_epochs=3, batch_size=16):
    """Fine-tune ``model_name`` on the data for ``lang`` with a manual loop.

    After every epoch the model is evaluated on the test split and the mean
    training loss, accuracy and weighted F1 are printed.

    Args:
        lang: Language code passed to ``load_data``.
        model_name: Checkpoint name for the tokenizer and classifier.
        num_epochs: Number of passes over the training data.
        batch_size: Batch size for both data loaders.

    Returns:
        A list with one dict per epoch holding "epoch", "loss", "accuracy"
        and "f1". Earlier versions returned None, so existing callers that
        ignore the result are unaffected.

    Raises:
        ValueError: If either split is empty.
    """
    train_texts, train_labels, test_texts, test_labels = load_data(lang)
    if len(train_texts) == 0 or len(test_texts) == 0:
        # Fail early with a clear message instead of a ZeroDivisionError below.
        raise ValueError(
            f"No data to train on for {lang!r} "
            f"(train: {len(train_texts)}, test: {len(test_texts)})"
        )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

    train_dataset = preprocess_data(train_texts, train_labels, tokenizer)
    test_dataset = preprocess_data(test_texts, test_labels, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=2e-5)

    history = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        print(f"\nEpoch {epoch+1}/{num_epochs}")
        for batch in tqdm(train_loader, desc="Training"):
            inputs = {
                'input_ids': batch[0].to(device),
                'attention_mask': batch[1].to(device),
                'labels': batch[2].to(device)
            }

            optimizer.zero_grad()
            # Passing labels makes the model return the classification loss.
            loss = model(**inputs).loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        mean_loss = total_loss / len(train_loader)
        acc, f1 = _evaluate(model, test_loader)
        print(f"Loss: {mean_loss:.4f} | Acc: {acc:.4f} | F1: {f1:.4f}")
        history.append({"epoch": epoch + 1, "loss": mean_loss, "accuracy": acc, "f1": f1})

    return history


# Checkpoint fine-tuned for each language code in this baseline.
models = dict(
    en="distilbert-base-uncased",
    hi="bert-base-multilingual-cased",
    ta="bert-base-multilingual-cased",
)

# Train one classifier per language: Tamil first, then Hindi, then English.
for lang in ("ta", "hi", "en"):
    print(f"\nTraining {lang.upper()} model ({models[lang]})")
    train_model(lang, models[lang], num_epochs=2)


Training TA model (bert-base-multilingual-cased)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Baseline 1 (Task 3)


In [None]:
'''!pip install torch transformers pandas scikit-learn accelerate -q
!WANDB_DISABLED=true

import pandas as pd
import re
import numpy as np
import warnings
from sklearn.metrics import f1_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments
)
import torch
from torch.utils.data import Dataset
import os


os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")
torch.manual_seed(42)
torch.backends.cudnn.benchmark = True

MODEL_MAP = {
    "en": "distilroberta-base",
    "hi": "bert-base-multilingual-cased",
    "ta": "bert-base-multilingual-cased"
}

def load_joint_data(lang, split="train"):
    """Load texts that carry both a label-1 and a label-3 annotation.

    Reads ``{split}_{lang}_l1.csv`` and ``{split}_{lang}_l3.csv``, inner-joins
    them on the ``text`` column and keeps only rows whose cleaned text is
    non-empty and whose two labels are both exactly 0 or 1.

    Args:
        lang: Language code used in the file names and label column names.
        split: File-name prefix, e.g. "train" or "test".

    Returns:
        ``(texts, labels1, labels3)`` as three parallel lists. Three empty
        lists are returned when either file cannot be read or is empty, or
        when the merge fails.
    """
    def safe_load(suffix):
        # Best effort: an unreadable file yields an empty frame, not an exception.
        try:
            df = pd.read_csv(
                f"{split}_{lang}_l{suffix}.csv",
                usecols=["text", f"{lang}_a{suffix}"],
                engine='python',
                on_bad_lines='skip',
                dtype={f"{lang}_a{suffix}": str}
            )
            return df.rename(columns={f"{lang}_a{suffix}": f"label{suffix}"})
        except Exception as e:
            print(f"⚠️ Error loading {split}_{lang}_l{suffix}.csv: {str(e)}")
            return pd.DataFrame()

    def parse_label(raw):
        # Return 0 or 1, or None for missing, "NL", non-numeric or non-binary
        # cells. Comparing the float against 0.0/1.0 avoids silently
        # truncating values such as "0.5" to 0.
        value = str(raw).strip()
        if value in ('', 'nan', 'NaN', 'NL'):
            return None
        try:
            number = float(value)
        except ValueError:
            return None
        return int(number) if number in (0.0, 1.0) else None

    l1_df = safe_load(1)
    l3_df = safe_load(3)

    if l1_df.empty or l3_df.empty:
        print(f"Missing label files for {lang} {split}")
        return [], [], []

    try:
        merged_df = pd.merge(l1_df, l3_df, on="text", how="inner")
    except Exception as e:
        print(f" Merge failed: {str(e)}")
        return [], [], []

    texts, labels1, labels3 = [], [], []
    rows = zip(merged_df["text"], merged_df["label1"], merged_df["label3"])
    for raw_text, raw_label1, raw_label3 in rows:
        # Strip AFTER removing URLs, mentions and hashtags so that a text made
        # only of those tokens collapses to "" and is dropped.
        text = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", str(raw_text)).strip()
        if not text:
            continue

        label1 = parse_label(raw_label1)
        label3 = parse_label(raw_label3)
        if label1 is None or label3 is None:
            continue

        texts.append(text)
        labels1.append(label1)
        labels3.append(label3)

    print(f" Loaded {len(texts)} clean samples for {lang} {split}")
    return texts, labels1, labels3

class RobustDataset(Dataset):
    """Torch dataset pairing tokenizer output with two label sequences.

    Each item is a dict holding ``input_ids``, ``attention_mask`` and a
    length-2 long tensor ``labels`` ordered ``[label1, label3]``.
    """

    def __init__(self, encodings, labels1, labels3):
        # Only the two entries that are handed out per item are retained.
        self.encodings = {
            field: encodings[field] for field in ('input_ids', 'attention_mask')
        }
        self.labels1 = torch.tensor(labels1, dtype=torch.long)
        self.labels3 = torch.tensor(labels3, dtype=torch.long)

        # All three sequences must describe the same number of samples.
        row_count = len(self.encodings['input_ids'])
        assert row_count == len(self.labels1) == len(self.labels3), f"Data mismatch in {len(self.labels1)} samples"

    def __getitem__(self, idx):
        label_pair = torch.tensor(
            [column[idx].item() for column in (self.labels1, self.labels3)],
            dtype=torch.long,
        )
        sample = {field: values[idx] for field, values in self.encodings.items()}
        sample['labels'] = label_pair
        return sample

    def __len__(self):
        return len(self.labels1)

class SafeClassifier(torch.nn.Module):
    """Shared encoder feeding two independent 2-way linear heads.

    The hidden state at sequence position 0 is passed to both heads. When
    ``labels`` is supplied, column 0 is scored against the first head and
    column 1 against the second, and the loss is the plain sum of the two
    cross-entropy terms.
    """

    def __init__(self, model_name):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        width = self.base_model.config.hidden_size
        self.gendered_head = torch.nn.Linear(width, 2)
        self.explicit_head = torch.nn.Linear(width, 2)

    def forward(self, input_ids, attention_mask, labels=None):
        encoded = self.base_model(input_ids, attention_mask=attention_mask)
        first_position = encoded.last_hidden_state[:, 0, :]

        logits1 = self.gendered_head(first_position)
        logits3 = self.explicit_head(first_position)

        if labels is None:
            loss = None
        else:
            cross_entropy = torch.nn.functional.cross_entropy
            loss = cross_entropy(logits1, labels[:, 0]) + cross_entropy(logits3, labels[:, 1])
        return {"loss": loss, "logits1": logits1, "logits3": logits3}

def train_joint_model(lang):
    """Train and evaluate the two-head SafeClassifier for one language.

    Loads the "train" and "test" splits with load_joint_data, aborts early if
    any of the four label lists lacks a 0 or a 1, tokenizes both splits to a
    fixed length of 64, trains with a Trainer subclass and prints F1 and
    accuracy for both heads on the test split.

    Args:
        lang: Language code; selects the data files and the MODEL_MAP entry.

    Returns:
        None. Results and failures are reported with print().
    """
    print(f"\nStarting {lang.upper()} Training")

    train_texts, train_l1, train_l3 = load_joint_data(lang, "train")
    test_texts, test_l1, test_l3 = load_joint_data(lang, "test")

    def check_labels(name, labels):
        """Print the 0/1 counts of `labels`; True only if both values occur."""
        zeros = sum(1 for l in labels if l == 0)
        ones = sum(1 for l in labels if l == 1)
        print(f"📊 {name} - 0: {zeros}, 1: {ones}")
        return zeros > 0 and ones > 0

    # `and` short-circuits: when the Label1 check fails, the Label3 check of
    # the same split is neither run nor printed.
    valid_train = check_labels("Train Label1", train_l1) and check_labels("Train Label3", train_l3)
    valid_test = check_labels("Test Label1", test_l1) and check_labels("Test Label3", test_l3)

    # Empty label lists (e.g. after a load failure) also end up here.
    if not (valid_train and valid_test):
        print(f" Insufficient class balance for {lang}")
        return

    tokenizer = AutoTokenizer.from_pretrained(MODEL_MAP[lang])

    # Both splits are truncated/padded to exactly 64 tokens.
    train_encodings = tokenizer(
        train_texts,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors="pt"
    )

    test_encodings = tokenizer(
        test_texts,
        truncation=True,
        padding='max_length',
        max_length=64,
        return_tensors="pt"
    )

    # RobustDataset asserts that encodings and labels have matching lengths;
    # a failure is reported and ends the run for this language.
    try:
        train_dataset = RobustDataset(train_encodings, train_l1, train_l3)
        test_dataset = RobustDataset(test_encodings, test_l1, test_l3)
        print(f" Dataset Sizes - Train: {len(train_dataset)}, Test: {len(test_dataset)}")
    except Exception as e:
        print(f" Dataset creation failed: {str(e)}")
        return

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SafeClassifier(MODEL_MAP[lang]).to(device)

    # fp16 is enabled only when CUDA is available; no checkpoints are saved.
    training_args = TrainingArguments(
        output_dir=f"./results_{lang}",
        per_device_train_batch_size=32,
        num_train_epochs=3,
        learning_rate=2e-5,
        fp16=torch.cuda.is_available(),
        logging_steps=50,
        report_to="none",
        save_strategy="no",  
        disable_tqdm=True
    )
    def compute_metrics(p):
        """Compute weighted F1 and accuracy for both heads.

        Accepts predictions as a dict with "logits1"/"logits3", a 3-sequence
        (first element ignored) or a 2-sequence; anything else raises
        ValueError. Label column 0 is scored against logits1, column 1
        against logits3.
        """
        if isinstance(p.predictions, dict):
            logits1 = p.predictions["logits1"]
            logits3 = p.predictions["logits3"]
        else:
            if len(p.predictions) == 3:
                _, logits1, logits3 = p.predictions
            elif len(p.predictions) == 2:
                logits1, logits3 = p.predictions
            else:
                raise ValueError("Unexpected format for p.predictions")

        preds1 = np.argmax(logits1, axis=1)
        preds3 = np.argmax(logits3, axis=1)

        labels_array = p.label_ids  
        labels1 = labels_array[:, 0]
        labels3 = labels_array[:, 1]

        gendered_f1 = f1_score(labels1, preds1, average='weighted', zero_division=0)
        gendered_acc = accuracy_score(labels1, preds1)
        explicit_f1 = f1_score(labels3, preds3, average='weighted', zero_division=0)
        explicit_acc = accuracy_score(labels3, preds3)

        return {
            'eval_gendered_f1': gendered_f1,
            'eval_gendered_acc': gendered_acc,
            'eval_explicit_f1': explicit_f1,
            'eval_explicit_acc': explicit_acc
        }

    class ValidationTrainer(Trainer):
        """Trainer that reads the loss from the model's dict output."""

        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            # The model computes its own loss; extra keyword arguments are ignored.
            outputs = model(**inputs)
            loss = outputs["loss"]
            return (loss, outputs) if return_outputs else loss

        def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
            # Returns the full output dict (including its "loss" entry) as the
            # predictions; prediction_loss_only and ignore_keys are not used.
            with torch.no_grad():
                outputs = model(**inputs)
            loss = outputs.get("loss")
            return (loss, outputs, inputs.get("labels"))

    # Any exception raised while building the trainer, training or evaluating
    # is caught below and only printed.
    try:
        trainer = ValidationTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            compute_metrics=compute_metrics,
            tokenizer=tokenizer
        )

        print( "Starting training...")
        trainer.train()

        # Evaluation uses eval_dataset, i.e. the test split.
        print("\n Final Evaluation:")
        results = trainer.evaluate()
        print(f"{lang.upper()} Results:")
        print(f"Gendered Abuse - F1: {results['eval_gendered_f1']:.3f} | Acc: {results['eval_gendered_acc']:.3f}")
        print(f"Explicit Language - F1: {results['eval_explicit_f1']:.3f} | Acc: {results['eval_explicit_acc']:.3f}")
        print("⎯" * 50)

    except Exception as e:
        print(f" Training aborted: {str(e)}")

if __name__ == "__main__":
    # Run the joint training routine once per language code, in this order.
    for lang in ["en", "hi", "ta"]:
        train_joint_model(lang)'''



🚀 Starting EN Training
✅ Loaded 24 clean samples for en train
✅ Loaded 92 clean samples for en test
📊 Train Label1 - 0: 18, 1: 6
📊 Train Label3 - 0: 7, 1: 17
📊 Test Label1 - 0: 67, 1: 25
📊 Test Label3 - 0: 34, 1: 58
📦 Dataset Sizes - Train: 24, Test: 92
🔥 Starting training...
{'train_runtime': 0.2037, 'train_samples_per_second': 353.412, 'train_steps_per_second': 14.726, 'train_loss': 1.3854878743489583, 'epoch': 3.0}

🧪 Final Evaluation:
{'eval_gendered_f1': 0.6137544435329506, 'eval_gendered_acc': 0.7282608695652174, 'eval_explicit_f1': 0.199447895100069, 'eval_explicit_acc': 0.3695652173913043, 'eval_loss': 1.34207284450531, 'eval_runtime': 0.1704, 'eval_samples_per_second': 540.045, 'eval_steps_per_second': 70.441, 'epoch': 3.0}
EN Results:
Gendered Abuse - F1: 0.614 | Acc: 0.728
Explicit Language - F1: 0.199 | Acc: 0.370
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯

🚀 Starting HI Training
✅ Loaded 28 clean samples for hi train
✅ Loaded 471 clean samples for hi test
📊 Train 

Baseline 2 (task3)

In [None]:
!pip install torch transformers pandas scikit-learn accelerate -q
!WANDB_DISABLED=true

import pandas as pd
import re
import numpy as np
import warnings
from sklearn.metrics import f1_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments
)
import torch
from torch.utils.data import Dataset
import os

# Process-wide setup: tokenizer / W&B environment flags, silenced warnings,
# cuDNN benchmark mode and a fixed torch seed.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "WANDB_DISABLED": "true",
})
warnings.filterwarnings("ignore")
torch.backends.cudnn.benchmark = True
torch.manual_seed(42)

# Every language code maps to the same checkpoint name.
MODEL_MAP = dict.fromkeys(("en", "hi", "ta"), "xlm-roberta-base")

def tokenize_data(texts, lang):
    """Tokenize ``texts`` with the tokenizer registered for ``lang`` in MODEL_MAP.

    Every sequence is truncated or padded to exactly 128 tokens and the
    encodings are requested as PyTorch tensors. The tokenizer is loaded anew
    on every call.
    """
    encode = AutoTokenizer.from_pretrained(MODEL_MAP[lang])
    options = dict(
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors="pt",
    )
    return encode(texts, **options)

class RobustDataset(Dataset):
    """Torch dataset pairing tokenizer output with two label sequences.

    Each item is a dict holding ``input_ids``, ``attention_mask`` and a
    length-2 long tensor ``labels`` ordered ``[label1, label3]``.
    """

    def __init__(self, encodings, labels1, labels3):
        # Only the two entries that are handed out per item are retained.
        self.encodings = {
            field: encodings[field] for field in ('input_ids', 'attention_mask')
        }
        self.labels1 = torch.tensor(labels1, dtype=torch.long)
        self.labels3 = torch.tensor(labels3, dtype=torch.long)

    def __getitem__(self, idx):
        sample = {field: values[idx] for field, values in self.encodings.items()}
        sample['labels'] = torch.stack([self.labels1[idx], self.labels3[idx]])
        return sample

    def __len__(self):
        return len(self.labels1)


def load_joint_data(lang, split="train"):
    """Load cleaned texts and two binary label lists from a single CSV.

    Reads ``{split}_{lang}_l1.csv``, which must provide the columns ``text``,
    ``{lang}_a1`` and ``{lang}_a3``.

    * URLs, @mentions and #hashtags are removed from the text, which is then
      cut to 512 characters; rows whose cleaned text has 5 characters or
      fewer are dropped.
    * Labels are stripped and upper-cased; "NL" and "NAN" are mapped to "0",
      "0.0"/"1.0" to "0"/"1". Rows whose label is not "0" or "1" afterwards
      (including missing values) are dropped.

    Args:
        lang: Language code used in the file name and label column names.
        split: File-name prefix, e.g. "train" or "test".

    Returns:
        ``(texts, labels1, labels3)`` as three parallel lists; three empty
        lists if anything goes wrong while reading or cleaning (the error is
        printed).
    """
    file_path = f"{split}_{lang}_l1.csv"
    targets = [f"{lang}_a1", f"{lang}_a3"]

    try:
        df = pd.read_csv(
            file_path,
            usecols=["text"] + targets,
            dtype={'text': 'string', **{t: 'string' for t in targets}},
            engine='python',
            on_bad_lines='warn'
        )

        text = df["text"].apply(lambda x: re.sub(r"http\S+|www\S+|@\w+|#\w+", "", str(x))[:512])

        # Build one row mask instead of assigning columns into successively
        # filtered slices of the frame (chained assignment).
        keep = text.str.len() > 5
        normalised = {}
        for t in targets:
            normalised[t] = df[t].str.strip().str.upper().replace({'NL':'0', 'NAN':'0', '0.0':'0', '1.0':'1'})
            keep &= normalised[t].isin(['0', '1'])

        return (
            text[keep].tolist(),
            normalised[targets[0]][keep].astype(int).tolist(),
            normalised[targets[1]][keep].astype(int).tolist(),
        )

    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return [], [], []

class BalancedMultiTaskClassifier(torch.nn.Module):
    """Shared encoder with two 2-way heads and class-weighted losses.

    ``class_weights`` holds one per-class weight sequence for each head
    (first head at index 0, second head at index 1). The two cross-entropy
    losses are mixed with a softmax over the learnable ``task_weights``
    parameter.
    """

    def __init__(self, model_name, class_weights):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(model_name)
        hidden = self.base_model.config.hidden_size
        self.gendered_head = torch.nn.Linear(hidden, 2)
        self.explicit_head = torch.nn.Linear(hidden, 2)
        # Kept as plain tensors in a list; forward() moves them to the device
        # of the logits on every call.
        self.class_weights = [torch.tensor(w, dtype=torch.float) for w in class_weights]
        self.task_weights = torch.nn.Parameter(torch.ones(2))

    def forward(self, input_ids, attention_mask, labels=None):
        encoded = self.base_model(input_ids, attention_mask=attention_mask)
        first_position = encoded.last_hidden_state[:, 0, :]

        logits1 = self.gendered_head(first_position)
        logits3 = self.explicit_head(first_position)
        result = {"logits1": logits1, "logits3": logits3}

        # Without labels only the logits are returned (no "loss" key).
        if labels is None:
            return result

        mix = torch.softmax(self.task_weights, dim=0)
        cross_entropy = torch.nn.functional.cross_entropy
        loss1 = cross_entropy(
            logits1, labels[:,0],
            weight=self.class_weights[0].to(logits1.device)
        )
        loss3 = cross_entropy(
            logits3, labels[:,1],
            weight=self.class_weights[1].to(logits3.device)
        )
        return {"loss": mix[0]*loss1 + mix[1]*loss3, **result}

def create_metrics_fn(lang):
    """Build the metrics callback handed to the Trainer.

    ``lang`` is accepted but not used by the returned function.

    The callback reads the two logit arrays from ``p.predictions`` (either a
    dict with "logits1"/"logits3" or a sequence whose first two entries are
    those arrays) and returns the weighted F1 of each head against the
    matching column of ``p.label_ids``.
    """
    def compute_metrics(p):
        raw = p.predictions
        if isinstance(raw, dict):
            logits1, logits3 = raw["logits1"], raw["logits3"]
        else:
            logits1, logits3 = raw[0], raw[1]

        truth = np.array(p.label_ids)
        gendered_pred = np.argmax(logits1, axis=1)
        explicit_pred = np.argmax(logits3, axis=1)

        metrics = {}
        metrics['eval_gendered_f1'] = f1_score(truth[:, 0], gendered_pred, average='weighted')
        metrics['eval_explicit_f1'] = f1_score(truth[:, 1], explicit_pred, average='weighted')
        return metrics

    return compute_metrics

def train_robust_joint_model(lang):
    """Train and evaluate the class-weighted two-head model for one language.

    Loads the "train" and "test" splits with load_joint_data, derives
    per-class loss weights from the training labels, trains a
    BalancedMultiTaskClassifier with the Hugging Face Trainer and prints the
    weighted F1 of both heads on the test split.

    Args:
        lang: Language code; selects the data files and the MODEL_MAP entry.

    Returns:
        None. Results and failures are reported with print().
    """
    print(f"\n Training {lang.upper()} Model")

    train_texts, train_l1, train_l3 = load_joint_data(lang, "train")
    test_texts, test_l1, test_l3 = load_joint_data(lang, "test")

    # load_joint_data returns empty lists on failure; there is nothing to
    # train or evaluate on then, so stop before loading any model.
    if not train_texts or not test_texts:
        print(f"No usable data for {lang}; skipping")
        return

    def get_weights(labels):
        # Inverse-frequency weights; the +1 keeps a class with zero samples
        # from causing a division by zero.
        counts = np.bincount(labels, minlength=2) + 1
        return [sum(counts)/counts[0], sum(counts)/counts[1]]

    # A label list with a single distinct value falls back to uniform weights.
    class_weights = [
        get_weights(train_l1) if len(np.unique(train_l1)) > 1 else [1,1],
        get_weights(train_l3) if len(np.unique(train_l3)) > 1 else [1,1]
    ]

    use_cuda = torch.cuda.is_available()

    model = BalancedMultiTaskClassifier(
        MODEL_MAP[lang],
        class_weights=class_weights
    ).to("cuda" if use_cuda else "cpu")

    training_args = TrainingArguments(
        output_dir=f"./results_{lang}",
        per_device_train_batch_size=8,
        num_train_epochs=10,
        learning_rate=1e-5,
        gradient_accumulation_steps=4,
        fp16=use_cuda,
        logging_steps=50,
        report_to="none",
        save_strategy="no",
        disable_tqdm=True,
        # Request the fused AdamW kernel only alongside CUDA, matching the
        # CPU fallback already used for the device and fp16.
        optim="adamw_torch_fused" if use_cuda else "adamw_torch"
    )

    train_dataset = RobustDataset(tokenize_data(train_texts, lang), train_l1, train_l3)
    test_dataset = RobustDataset(tokenize_data(test_texts, lang), test_l1, test_l3)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=create_metrics_fn(lang)
    )

    # Failures during training or evaluation are printed, not raised, so a
    # caller looping over languages can continue with the next one.
    try:
        trainer.train()
        results = trainer.evaluate()
        print(f"\n📊 {lang.upper()} Results:")
        print(f"Gendered Abuse F1: {results['eval_gendered_f1']:.3f}")
        print(f"Explicit Content F1: {results['eval_explicit_f1']:.3f}")
    except Exception as e:
        print(f"Training failed: {str(e)}")

if __name__ == "__main__":
    # Train and evaluate one joint model per language code, in this order.
    for lang in ["en", "hi", "ta"]:
        train_robust_joint_model(lang)


🚀 Training EN Model
{'train_runtime': 4.5576, 'train_samples_per_second': 52.659, 'train_steps_per_second': 2.194, 'train_loss': 0.5304481983184814, 'epoch': 10.0}
{'eval_gendered_f1': 0.5362983645791783, 'eval_explicit_f1': 0.42113943028485756, 'eval_loss': 0.7066855430603027, 'eval_runtime': 0.3526, 'eval_samples_per_second': 260.9, 'eval_steps_per_second': 34.03, 'epoch': 10.0}

📊 EN Results:
Gendered Abuse F1: 0.536
Explicit Content F1: 0.421

🚀 Training HI Model
{'train_runtime': 2.1606, 'train_samples_per_second': 175.879, 'train_steps_per_second': 4.628, 'train_loss': 0.4835947513580322, 'epoch': 5.0}
{'eval_gendered_f1': 0.07337590631804419, 'eval_explicit_f1': 0.2113785852327242, 'eval_loss': 0.7066513299942017, 'eval_runtime': 0.9592, 'eval_samples_per_second': 491.047, 'eval_steps_per_second': 61.511, 'epoch': 5.0}

📊 HI Results:
Gendered Abuse F1: 0.073
Explicit Content F1: 0.211

🚀 Training TA Model
{'loss': 0.6791, 'grad_norm': 4.513556957244873, 'learning_rate': 2.00000