In [1]:
!pip install torch transformers pandas scikit-learn indic-transliteration -q
!pip install indic-transliteration

!WANDB_DISABLED=true

import pandas as pd
import re
import numpy as np
import warnings
from sklearn.metrics import classification_report, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
from transformers import MarianMTModel, MarianTokenizer
import os

# Configuration
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")

torch.manual_seed(42)
torch.backends.cudnn.benchmark = True

# ======================
# Indic Preprocessing
# ======================

# This class is responsible for cleaning the input text. It removes unwanted characters like URLs,
# mentions, and hashtags. If the language is Hindi or Tamil, it transliterates text written in Roman
# script into the native script (e.g., Devanagari for Hindi). This helps our model better understand
# native language text, especially if it was typed in an inconsistent or Romanized way.

class IndicTextProcessor:
    def __init__(self, lang):
        self.lang = lang
        self.script_map = {
            'hi': sanscript.DEVANAGARI,
            'ta': sanscript.TAMIL,
            'en': None
        }

    def clean(self, text):
        text = re.sub(r"@\w+|#\w+|https?://\S+|www\.\S+", "", str(text))
        text = text.strip()
        if self.lang != 'en':
            try:
                text = transliterate(text, sanscript.ITRANS, self.script_map[self.lang])
            except:
                pass
        return text

# ======================
# Back Translation
# ======================

# This class performs back translation, a data augmentation technique. It takes English text,
# translates it into an intermediate language (e.g., Hindi), and then back into English.
# This helps introduce slight variations in wording, which can make the model more robust
# and generalize better to unseen data.

class BackTranslator:
    def __init__(self, src_lang="en", mid_lang="hi"):
        self.en_to_mid_tok = MarianTokenizer.from_pretrained(f'Helsinki-NLP/opus-mt-{src_lang}-{mid_lang}')
        self.en_to_mid_model = MarianMTModel.from_pretrained(f'Helsinki-NLP/opus-mt-{src_lang}-{mid_lang}')
        self.mid_to_en_tok = MarianTokenizer.from_pretrained(f'Helsinki-NLP/opus-mt-{mid_lang}-{src_lang}')
        self.mid_to_en_model = MarianMTModel.from_pretrained(f'Helsinki-NLP/opus-mt-{mid_lang}-{src_lang}')
        self.en_to_mid_model.to("cuda" if torch.cuda.is_available() else "cpu")
        self.mid_to_en_model.to("cuda" if torch.cuda.is_available() else "cpu")

    def translate(self, texts, model, tokenizer):
        encoded = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(model.device)
        generated = model.generate(**encoded, max_length=128)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)

    def back_translate(self, texts):
        with torch.no_grad():
            mid = self.translate(texts, self.en_to_mid_model, self.en_to_mid_tok)
            return self.translate(mid, self.mid_to_en_model, self.mid_to_en_tok)

# ======================
# Enhanced Data Loading
# ======================

# This function reads training or test CSV files for a given language (English, Hindi, or Tamil).
# It uses the IndicTextProcessor to clean the text and filters out noisy or invalid label entries.
# For English training data, it also performs back translation to augment the dataset.

def load_data(lang, split):
    processor = IndicTextProcessor(lang)
    target_col = f"{lang}_a1"
    file_path = f"{split}_{lang}_l1.csv"

    try:
        df = pd.read_csv(
            file_path,
            usecols=["text", target_col],
            dtype={'text': 'string', target_col: 'string'},
            engine='python',
            on_bad_lines='warn'
        )

        texts, labels = [], []
        for _, row in df.iterrows():
            text = processor.clean(row["text"])
            label_str = str(row[target_col]).strip().upper()

            if text and label_str not in ['NL', 'NAN', '']:
                try:
                    label = int(float(label_str.replace('.0', '')))
                    if label in {0, 1}:
                        texts.append(text)
                        labels.append(label)
                except:
                    continue

        # Add back-translated English examples
        if lang == "en" and split == "train" and len(texts) > 100:
            bt = BackTranslator(mid_lang="hi")
            aug_texts = bt.back_translate(texts[:100])
            texts.extend(aug_texts)
            labels.extend(labels[:100])

        return texts, labels
    except Exception as e:
        print(f"Error loading {file_path}: {str(e)}")
        return [], []

# ======================
# Dataset Class
# ======================
class AbuseDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
# This class converts cleaned text and labels into a format that can be used by the Hugging Face Trainer.
# It uses the tokenizer to turn each sentence into token IDs and attention masks,
# and stores the labels for classification. This is essential for batching and feeding data into the model.




# Focal loss is a variation of cross-entropy loss that gives more importance to hard-to-classify examples.
# It helps improve performance when the dataset is imbalanced (i.e., one class appears much more than another),
# which is common in abuse detection tasks.


# ======================
# Focal Loss & Training
# ======================
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, weight=None):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.weight = weight

    def forward(self, logits, labels):
        ce_loss = nn.CrossEntropyLoss(weight=self.weight, reduction='none')(logits, labels)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()

class FocalTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = FocalLoss(weight=class_weights.to(model.device))
        loss = loss_fct(logits.view(-1, 2), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

#Hugging Face's Trainer class by default uses standard loss functions.
#Here, we override the compute_loss method to use FocalLoss instead.
#This allows us to train models that are more sensitive to the under-represented class.

# ======================
# Training Pipeline
# ======================
def train_advanced(lang):
    MODEL_MAP = {
        "en": "distilroberta-base",
        "hi": "ai4bharat/indic-bert",
        "ta": "ai4bharat/indic-bert"
    }

    # Load data
    train_texts, train_labels = load_data(lang, "train")
    test_texts, test_labels = load_data(lang, "test")

    if len(train_texts) < 10 or len(test_texts) < 5:
        print(f" Insufficient data for {lang.upper()}")
        return

    tokenizer = AutoTokenizer.from_pretrained(MODEL_MAP[lang])
    train_dataset = AbuseDataset(train_texts, train_labels, tokenizer)
    test_dataset = AbuseDataset(test_texts, test_labels, tokenizer)

    # Compute class weights
    classes, counts = np.unique(train_labels, return_counts=True)
    if len(classes) == 1:
        cw = torch.tensor([1.0, 1.0])
    else:
        cw = torch.tensor([len(train_labels) / c for c in counts], dtype=torch.float32).sqrt()
    global class_weights
    class_weights = cw

    # Model setup
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_MAP[lang],
        num_labels=2,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    def compute_metrics(p):
        preds = p.predictions.argmax(-1)
        return {
            'f1': f1_score(p.label_ids, preds, average='weighted'),
            'accuracy': (preds == p.label_ids).mean()
        }

    training_args = TrainingArguments(
        output_dir=f"./results_adv_{lang}",
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        num_train_epochs=3,
        learning_rate=3e-5,
        fp16=torch.cuda.is_available(),
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_steps=100,
        report_to="none",
        optim="adamw_torch_fused",

        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )


    trainer = FocalTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    print(f"\n{'='*40}")
    print(f" Training {lang.upper()} ({len(train_dataset)} samples)")
    print(f"{'='*40}")

    trainer.train()
    results = trainer.evaluate()

    print(f"\n Final Metrics for {lang.upper()}:")
    print(f"F1  → {results['eval_f1']:.3f}")
    print(f"ACC → {results['eval_accuracy']:.3f}")
    print("⎯" * 30)

# ======================
# Run All
# ======================
if __name__ == "__main__":
    for lang in ["en", "hi", "ta"]:
        train_advanced(lang)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m69.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/813k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/304M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/304M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Training EN (231 samples)


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.374601,0.661717,0.763948
2,No log,0.373637,0.661717,0.763948
3,No log,0.37341,0.661717,0.763948



 Final Metrics for EN:
F1  → 0.662
ACC → 0.764
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Training HI (1255 samples)


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.377285,0.708155,0.797861
2,No log,0.377486,0.708155,0.797861
3,No log,0.377425,0.708155,0.797861


model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]


 Final Metrics for HI:
F1  → 0.708
ACC → 0.798
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 Training TA (1543 samples)


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.379275,0.427018,0.580997
2,No log,0.37963,0.427018,0.580997
3,No log,0.37988,0.494439,0.593458



 Final Metrics for TA:
F1  → 0.494
ACC → 0.593
⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯⎯


In [3]:
from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define source and destination paths
languages = ["en", "hi", "ta"]  # or just ["en"] if only one language
drive_base_dir = "/content/drive/MyDrive/hate_speech_checkpoints"

for lang in languages:
    src_dir = f"./results_adv_{lang}"
    dest_dir = os.path.join(drive_base_dir, f"{lang}_checkpoints")

    if os.path.exists(src_dir):
        print(f" Copying {lang.upper()} checkpoints to Drive...")
        shutil.copytree(src_dir, dest_dir, dirs_exist_ok=True)
        print(f" Saved to: {dest_dir}")
    else:
        print(f" Checkpoint directory not found for {lang.upper()}: {src_dir}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 Copying EN checkpoints to Drive...
 Saved to: /content/drive/MyDrive/hate_speech_checkpoints/en_checkpoints
 Copying HI checkpoints to Drive...
 Saved to: /content/drive/MyDrive/hate_speech_checkpoints/hi_checkpoints
 Copying TA checkpoints to Drive...
 Saved to: /content/drive/MyDrive/hate_speech_checkpoints/ta_checkpoints
