In [3]:
import os
import re
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, Trainer, TrainingArguments

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")

MODEL_MAP = {
    "en": "distilroberta-base",
    "hi": "bert-base-multilingual-cased",
    "ta": "bert-base-multilingual-cased"
}

def preprocess_text(text, lang):
    text = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", str(text))
    return text.strip()

def load_multitask_data(lang, split="train"):
    label1_col = f"{lang}_a1"
    label3_col = f"{lang}_a3"

    # Select the correct base directory depending on the split.
    if split.lower() == "train":
        base_dir = "/kaggle/input/uli-dataset/uli_dataset-main/training"
    elif split.lower() == "test":
        base_dir = "/kaggle/input/uli-dataset/uli_dataset-main/testing"
    else:
        base_dir = ""

    file_path = os.path.join(base_dir, f"{split}_{lang}_l1.csv")

    if not os.path.exists(file_path):
        print(f"Missing file: {file_path}")
        return [], [], []

    try:
        df = pd.read_csv(file_path, engine="python", on_bad_lines='skip')
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return [], [], []

    if "text" not in df.columns or label1_col not in df.columns or label3_col not in df.columns:
        print(f"Missing required columns in {file_path}")
        return [], [], []

    texts, label1s, label3s = [], [], []
    for _, row in df.iterrows():
        try:
            text = preprocess_text(row["text"], lang)
            l1 = int(float(str(row[label1_col]).replace('.0', '')))
            l3 = int(float(str(row[label3_col]).replace('.0', '')))
            texts.append(text)
            label1s.append(l1)
            label3s.append(l3)
        except Exception as e:
            continue
    return texts, label1s, label3s


class MultiTaskDataset(Dataset):
    def __init__(self, encodings, labels1, labels3):
        self.encodings = encodings
        self.labels1 = labels1
        self.labels3 = labels3

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels1': torch.tensor(self.labels1[idx], dtype=torch.long),
            'labels3': torch.tensor(self.labels3[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels1)

class MultiTaskBiLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300, hidden_size=128, dropout_prob=0.2, num_classes=2):
        
        super(MultiTaskBiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )
        self.dropout = nn.Dropout(dropout_prob)
        
        self.classifier1 = nn.Linear(2 * hidden_size, num_classes)
        self.classifier3 = nn.Linear(2 * hidden_size, num_classes) 
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask=None, labels1=None, labels3=None):
        embedded = self.embedding(input_ids) 
        lstm_out, _ = self.lstm(embedded)  
        if attention_mask is not None:
            mask = attention_mask.unsqueeze(-1)  # (batch_size, seq_len, 1)
            lstm_out = lstm_out * mask  # Zero-out pads
            lengths = mask.sum(1) 
            rep = lstm_out.sum(1) / lengths.clamp(min=1e-9)
        else:
            rep = lstm_out.mean(dim=1)
        
        rep = self.dropout(rep)
        logits1 = self.classifier1(rep)
        logits3 = self.classifier3(rep)
        
        loss = None
        if labels1 is not None and labels3 is not None:
            loss1 = self.loss_fct(logits1, labels1)
            loss3 = self.loss_fct(logits3, labels3)
            loss = (loss1 + loss3) / 2.0

        return {
            'loss': loss,
            'logits1': logits1,
            'logits3': logits3
        }

def compute_multitask_metrics(eval_pred):
    if isinstance(eval_pred.predictions, tuple):
        logits1, logits3 = eval_pred.predictions
    elif isinstance(eval_pred.predictions, dict):
        logits1 = eval_pred.predictions.get('logits1')
        logits3 = eval_pred.predictions.get('logits3')
    else:
        raise TypeError("Unexpected type for predictions.")

    if isinstance(eval_pred.label_ids, dict):
        labels1 = eval_pred.label_ids.get('labels1')
        labels3 = eval_pred.label_ids.get('labels3')
    elif isinstance(eval_pred.label_ids, (list, tuple)) and len(eval_pred.label_ids) == 2:
        labels1, labels3 = eval_pred.label_ids
    else:
        raise TypeError("Unexpected type for label_ids.")

    pred1 = np.argmax(logits1, axis=1)
    pred3 = np.argmax(logits3, axis=1)
    report1 = classification_report(labels1, pred1, output_dict=True, zero_division=0)
    report3 = classification_report(labels3, pred3, output_dict=True, zero_division=0)

    return {
        "f1_task1": report1["weighted avg"]["f1-score"],
        "acc_task1": report1["accuracy"],
        "f1_task3": report3["weighted avg"]["f1-score"],
        "acc_task3": report3["accuracy"]
    }

def train_multitask(lang):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_MAP[lang])
    train_texts, train_labels1, train_labels3 = load_multitask_data(lang, "train")
    test_texts, test_labels1, test_labels3 = load_multitask_data(lang, "test")
    train_enc = tokenizer(train_texts, padding="max_length", truncation=True, max_length=64, return_tensors="pt")
    test_enc = tokenizer(test_texts, padding="max_length", truncation=True, max_length=64, return_tensors="pt")

    train_dataset = MultiTaskDataset(train_enc, train_labels1, train_labels3)
    test_dataset = MultiTaskDataset(test_enc, test_labels1, test_labels3)
    vocab_size = tokenizer.vocab_size
    model = MultiTaskBiLSTM(vocab_size=vocab_size, embedding_dim=300, hidden_size=128)
    model.config = type("DummyConfig", (), {})()

    args = TrainingArguments(
        output_dir=f"./multitask_bilstm_{lang}",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        num_train_epochs=3,
        learning_rate=3e-5,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="no",
        report_to="none",
        disable_tqdm=True
    )

    def custom_compute_metrics(eval_pred):
        return compute_multitask_metrics(eval_pred)

    def collate_fn(batch):
        input_ids = torch.stack([item["input_ids"] for item in batch])
        attention_mask = torch.stack([item["attention_mask"] for item in batch])
        labels1 = torch.stack([item["labels1"] for item in batch])
        labels3 = torch.stack([item["labels3"] for item in batch])
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels1": labels1,
            "labels3": labels3
        }
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=collate_fn,
        compute_metrics=custom_compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()
    print(f"\n{lang.upper()} Results:")
    print(f"Gendered Abuse Task - F1: {metrics.get('eval_f1_task1', 0):.3f}, Accuracy: {metrics.get('eval_acc_task1', 0):.3f}")
    print(f"Explicit Language Task - F1: {metrics.get('eval_f1_task3', 0):.3f}, Accuracy: {metrics.get('eval_acc_task3', 0):.3f}")
    print("⎯" * 30)

if __name__ == "__main__":
    for lang in ["en", "hi", "ta"]:
        train_multitask(lang)


{'eval_loss': 0.6825830340385437, 'eval_f1_task1': 0.6348073591287848, 'eval_acc_task1': 0.7065217391304348, 'eval_f1_task3': 0.4775268210050818, 'eval_acc_task3': 0.5869565217391305, 'eval_runtime': 0.0289, 'eval_samples_per_second': 3187.24, 'eval_steps_per_second': 34.644, 'epoch': 1.0}
{'eval_loss': 0.6822375655174255, 'eval_f1_task1': 0.6348073591287848, 'eval_acc_task1': 0.7065217391304348, 'eval_f1_task3': 0.4775268210050818, 'eval_acc_task3': 0.5869565217391305, 'eval_runtime': 0.0245, 'eval_samples_per_second': 3758.483, 'eval_steps_per_second': 40.853, 'epoch': 2.0}
{'eval_loss': 0.6820657849311829, 'eval_f1_task1': 0.6415396198004893, 'eval_acc_task1': 0.717391304347826, 'eval_f1_task3': 0.4775268210050818, 'eval_acc_task3': 0.5869565217391305, 'eval_runtime': 0.0236, 'eval_samples_per_second': 3905.785, 'eval_steps_per_second': 42.454, 'epoch': 3.0}
{'train_runtime': 0.1431, 'train_samples_per_second': 503.063, 'train_steps_per_second': 20.961, 'train_loss': 0.6854918797810

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

{'eval_loss': 0.6881389617919922, 'eval_f1_task1': 0.6047152771175968, 'eval_acc_task1': 0.5668789808917197, 'eval_f1_task3': 0.5227893125982297, 'eval_acc_task3': 0.5796178343949044, 'eval_runtime': 0.1008, 'eval_samples_per_second': 4673.855, 'eval_steps_per_second': 39.693, 'epoch': 1.0}
{'eval_loss': 0.6874972581863403, 'eval_f1_task1': 0.6180360314495809, 'eval_acc_task1': 0.583864118895966, 'eval_f1_task3': 0.5208272088845338, 'eval_acc_task3': 0.5796178343949044, 'eval_runtime': 0.0963, 'eval_samples_per_second': 4888.793, 'eval_steps_per_second': 41.518, 'epoch': 2.0}
{'eval_loss': 0.6871768832206726, 'eval_f1_task1': 0.6276601483361461, 'eval_acc_task1': 0.5966029723991507, 'eval_f1_task3': 0.5188205855590644, 'eval_acc_task3': 0.5796178343949044, 'eval_runtime': 0.093, 'eval_samples_per_second': 5064.273, 'eval_steps_per_second': 43.009, 'epoch': 3.0}
{'train_runtime': 0.3965, 'train_samples_per_second': 189.142, 'train_steps_per_second': 7.566, 'train_loss': 0.69209655125935

In [1]:
import os
import re
import warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from sklearn.metrics import classification_report
from transformers import (AutoTokenizer, Trainer, TrainingArguments, 
                          XLMRobertaModel)

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")

def preprocess_text(text, lang):
    """Removing URLs, mentions, hashtags and extra whitespaces."""
    text = re.sub(r"http\S+|www\S+|@\w+|#\w+", "", str(text))
    return text.strip()

def load_multitask_data(lang, split="train"):
    label1_col = f"{lang}_a1"
    label3_col = f"{lang}_a3"

    if split.lower() == "train":
        base_dir = "/kaggle/input/uli-dataset/uli_dataset-main/training"
    elif split.lower() == "test":
        base_dir = "/kaggle/input/uli-dataset/uli_dataset-main/testing"
    else:
        base_dir = ""

    file_path = os.path.join(base_dir, f"{split}_{lang}_l1.csv")

    if not os.path.exists(file_path):
        print(f"Missing file: {file_path}")
        return [], [], []

    try:
        df = pd.read_csv(file_path, engine="python", on_bad_lines='skip')
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return [], [], []

    if "text" not in df.columns or label1_col not in df.columns or label3_col not in df.columns:
        print(f"Missing required columns in {file_path}")
        return [], [], []

    texts, label1s, label3s = [], [], []
    for _, row in df.iterrows():
        try:
            text = preprocess_text(row["text"], lang)
            l1 = int(float(str(row[label1_col]).replace('.0', '')))
            l3 = int(float(str(row[label3_col]).replace('.0', '')))
            texts.append(text)
            label1s.append(l1)
            label3s.append(l3)
        except Exception as e:
            continue
    return texts, label1s, label3s

class MultiTaskDataset(Dataset):
    def __init__(self, encodings, labels1, labels3):
        self.encodings = encodings
        self.labels1 = labels1
        self.labels3 = labels3

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels1': torch.tensor(self.labels1[idx], dtype=torch.long),
            'labels3': torch.tensor(self.labels3[idx], dtype=torch.long)
        }

    def __len__(self):
        return len(self.labels1)

class MultiTaskXRoBERTa(nn.Module):
    def __init__(self, num_classes=2, dropout_prob=0.2):
        super(MultiTaskXRoBERTa, self).__init__()
        self.xroberta = XLMRobertaModel.from_pretrained("xlm-roberta-large")
        hidden_size = self.xroberta.config.hidden_size
        
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier1 = nn.Linear(hidden_size, num_classes)
        self.classifier3 = nn.Linear(hidden_size, num_classes)
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels1=None, labels3=None):
        outputs = self.xroberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state.mean(dim=1)
        pooled_output = self.dropout(pooled_output)
        logits1 = self.classifier1(pooled_output)
        logits3 = self.classifier3(pooled_output)
        
        loss = None
        if labels1 is not None and labels3 is not None:
            loss1 = self.loss_fct(logits1, labels1)
            loss3 = self.loss_fct(logits3, labels3)
            loss = (loss1 + loss3) / 2.0
        
        return {
            'loss': loss,
            'logits1': logits1,
            'logits3': logits3
        }

def compute_multitask_metrics(eval_pred):
    if isinstance(eval_pred.predictions, tuple):
        logits1, logits3 = eval_pred.predictions
    elif isinstance(eval_pred.predictions, dict):
        logits1 = eval_pred.predictions.get('logits1')
        logits3 = eval_pred.predictions.get('logits3')
    else:
        raise TypeError("Unexpected type for predictions.")

    if isinstance(eval_pred.label_ids, dict):
        labels1 = eval_pred.label_ids.get('labels1')
        labels3 = eval_pred.label_ids.get('labels3')
    elif isinstance(eval_pred.label_ids, (list, tuple)) and len(eval_pred.label_ids) == 2:
        labels1, labels3 = eval_pred.label_ids
    else:
        raise TypeError("Unexpected type for label_ids.")

    pred1 = np.argmax(logits1, axis=1)
    pred3 = np.argmax(logits3, axis=1)
    
    report1 = classification_report(labels1, pred1, output_dict=True, zero_division=0)
    report3 = classification_report(labels3, pred3, output_dict=True, zero_division=0)

    return {
        "precision_task1": report1["weighted avg"]["precision"],
        "recall_task1": report1["weighted avg"]["recall"],
        "f1_task1": report1["weighted avg"]["f1-score"],
        "acc_task1": report1["accuracy"],
        "precision_task3": report3["weighted avg"]["precision"],
        "recall_task3": report3["weighted avg"]["recall"],
        "f1_task3": report3["weighted avg"]["f1-score"],
        "acc_task3": report3["accuracy"]
    }

def train_multitask(lang):
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
    
    train_texts, train_labels1, train_labels3 = load_multitask_data(lang, "train")
    test_texts, test_labels1, test_labels3 = load_multitask_data(lang, "test")

    train_enc = tokenizer(train_texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    test_enc = tokenizer(test_texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    train_dataset = MultiTaskDataset(train_enc, train_labels1, train_labels3)
    test_dataset = MultiTaskDataset(test_enc, test_labels1, test_labels3)

    model = MultiTaskXRoBERTa(num_classes=2, dropout_prob=0.2)
    model.config = type("DummyConfig", (), {})() 

    args = TrainingArguments(
        output_dir=f"./multitask_xroberta_{lang}",
        per_device_train_batch_size=8,  
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        learning_rate=3e-5,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="no",
        report_to="none",
        save_total_limit=1,
        disable_tqdm=False  # tqdm progress bars
    )

    def custom_compute_metrics(eval_pred):
        return compute_multitask_metrics(eval_pred)

    def collate_fn(batch):
        input_ids = torch.stack([item["input_ids"] for item in batch])
        attention_mask = torch.stack([item["attention_mask"] for item in batch])
        labels1 = torch.stack([item["labels1"] for item in batch])
        labels3 = torch.stack([item["labels3"] for item in batch])
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels1": labels1,
            "labels3": labels3
        }
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=collate_fn,
        compute_metrics=custom_compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()

    save_path = f"best_model_{lang}.pth"
    torch.save(model.state_dict(), save_path)
    print(f"Saved best model for {lang.upper()} to {save_path}")

    print(f"\n{'='*40}")
    print(f"Results for language: {lang.upper()}")
    print(f"{'-'*40}")
    print("Gendered Abuse Task:")
    print(f"  Precision: {metrics.get('eval_precision_task1', 0):.3f}")
    print(f"  Recall:    {metrics.get('eval_recall_task1', 0):.3f}")
    print(f"  F1 Score:  {metrics.get('eval_f1_task1', 0):.3f}")
    print(f"  Accuracy:  {metrics.get('eval_acc_task1', 0):.3f}")
    print(f"{'-'*40}")
    print("Explicit Language Task:")
    print(f"  Precision: {metrics.get('eval_precision_task3', 0):.3f}")
    print(f"  Recall:    {metrics.get('eval_recall_task3', 0):.3f}")
    print(f"  F1 Score:  {metrics.get('eval_f1_task3', 0):.3f}")
    print(f"  Accuracy:  {metrics.get('eval_acc_task3', 0):.3f}")
    print(f"{'='*40}\n")

if __name__ == "__main__":
    for lang in ["en", "hi", "ta"]:
        train_multitask(lang)


2025-04-15 15:32:26.574823: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744731147.023458      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744731147.150472      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Precision Task1,Recall Task1,F1 Task1,Acc Task1,Precision Task3,Recall Task3,F1 Task3,Acc Task3
1,No log,0.638697,0.530364,0.728261,0.613754,0.728261,0.331876,0.576087,0.421139,0.576087
2,No log,0.647282,0.530364,0.728261,0.613754,0.728261,0.331876,0.576087,0.421139,0.576087
3,No log,0.650696,0.530364,0.728261,0.613754,0.728261,0.331876,0.576087,0.421139,0.576087


Saved best model for EN to best_model_en.pth

Results for language: EN
----------------------------------------
Gendered Abuse Task:
  Precision: 0.530
  Recall:    0.728
  F1 Score:  0.614
  Accuracy:  0.728
----------------------------------------
Explicit Language Task:
  Precision: 0.332
  Recall:    0.576
  F1 Score:  0.421
  Accuracy:  0.576



Epoch,Training Loss,Validation Loss,Precision Task1,Recall Task1,F1 Task1,Acc Task1,Precision Task3,Recall Task3,F1 Task3,Acc Task3
1,No log,0.588825,0.6339,0.796178,0.705832,0.796178,0.405696,0.636943,0.495675,0.636943
2,No log,0.573084,0.6339,0.796178,0.705832,0.796178,0.405696,0.636943,0.495675,0.636943
3,No log,0.575952,0.6339,0.796178,0.705832,0.796178,0.405696,0.636943,0.495675,0.636943


Saved best model for HI to best_model_hi.pth

Results for language: HI
----------------------------------------
Gendered Abuse Task:
  Precision: 0.634
  Recall:    0.796
  F1 Score:  0.706
  Accuracy:  0.796
----------------------------------------
Explicit Language Task:
  Precision: 0.406
  Recall:    0.637
  F1 Score:  0.496
  Accuracy:  0.637



Epoch,Training Loss,Validation Loss,Precision Task1,Recall Task1,F1 Task1,Acc Task1,Precision Task3,Recall Task3,F1 Task3,Acc Task3
1,No log,0.661805,0.719207,0.672355,0.625698,0.672355,0.369067,0.607509,0.459178,0.607509
2,No log,0.603759,0.73999,0.730375,0.717313,0.730375,0.649248,0.624573,0.516794,0.624573
3,No log,0.552215,0.728423,0.730375,0.727425,0.730375,0.844293,0.83959,0.835072,0.83959


Saved best model for TA to best_model_ta.pth

Results for language: TA
----------------------------------------
Gendered Abuse Task:
  Precision: 0.728
  Recall:    0.730
  F1 Score:  0.727
  Accuracy:  0.730
----------------------------------------
Explicit Language Task:
  Precision: 0.844
  Recall:    0.840
  F1 Score:  0.835
  Accuracy:  0.840

