<a href="https://colab.research.google.com/github/Ankyytt/Hindi_Hate_Speech_Detection/blob/main/IndiCBertFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets peft accelerate evaluate bitsandbytes scikit-learn torch

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# =====================================================
# FINAL Advanced IndicBERT + LoRA Training Script
# ¬† ¬†(ALL FIXES INCLUDED)
# =====================================================

# STEP 1: Install necessary libraries (MUST BE RUN IN YOUR ENVIRONMENT!)
# !pip install -q transformers datasets peft accelerate evaluate bitsandbytes scikit-learn torch

import os
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import numpy as np
import evaluate

# =====================================================
# 1Ô∏è Load & Prepare Dataset (MODIFIED FOR ALL ERRORS)
# =====================================================
FILE_PATH = "hate_speech_hindi_final.csv"
LABEL_COLUMNS = ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']

print("Attempting robust file load...")
try:
    df = pd.read_csv(
        FILE_PATH,
        encoding='latin-1',
        sep=',',
        on_bad_lines='skip',
        engine='python'
    )
except Exception as e:
    print(f"FATAL ERROR: Could not read file. Error: {e}")
    # In a real script, you might raise the error or stop here
    raise e

# Filter columns
required_cols = ['text'] + LABEL_COLUMNS
df = df[required_cols].copy()

# üõë NEW FIX: Ensure 'text' is string and handle NaNs for tokenizer üõë
df['text'] = df['text'].astype(str).fillna('')

# Convert all label columns to integer type
for col in LABEL_COLUMNS:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

# Create a mock object for label management
class MockMlb:
    def __init__(self, classes):
        self.classes_ = classes
mlb = MockMlb(classes=LABEL_COLUMNS)

# Split the data
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

# =====================================================
# 2Ô∏è Tokenize & Format Data
# =====================================================
MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

label_cols = LABEL_COLUMNS
train_ds = train_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])
val_ds = val_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])
test_ds = test_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])

train_ds.set_format("torch")
val_ds.set_format("torch")
test_ds.set_format("torch")

# =====================================================
# 3Ô∏è Create DataLoaders & Class Weights
# =====================================================
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=16)
eval_dataloader = DataLoader(val_ds, batch_size=16)
test_dataloader = DataLoader(test_ds, batch_size=16)

class_counts = train_df[label_cols].sum()
total_samples = len(train_df)
class_weights = torch.tensor(total_samples / (len(label_cols) * class_counts), dtype=torch.float32)

# =====================================================
# 4Ô∏è Load Model & Apply LoRA
# =====================================================
id2label = {i: label for i, label in enumerate(mlb.classes_)}
label2id = {label: i for i, label in enumerate(mlb.classes_)}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(mlb.classes_), problem_type="multi_label_classification",
    id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

peft_config = LoraConfig(task_type="SEQ_CLS", r=8, lora_alpha=16, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# =====================================================
# 5Ô∏è Improved Training Setup
# =====================================================
LEARNING_RATE = 3e-5
NUM_EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = NUM_EPOCHS * len(train_dataloader)
num_warmup_steps = int(0.1 * num_training_steps)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
class_weights = class_weights.to(device)

# =====================================================
# 6Ô∏è Manual Training & Evaluation Loop
# =====================================================
progress_bar = tqdm(range(num_training_steps))
metric = evaluate.load("f1")
best_f1 = 0
best_model_path = "best_peft_model"

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch in train_dataloader:
        labels = batch.pop("labels").to(device)
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)
        loss = loss_fct(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    model.eval()
    all_preds = []
    all_labels = []
    for batch in eval_dataloader:
        labels = batch.pop("labels")
        inputs = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predictions = (torch.sigmoid(logits) > 0.5).cpu().numpy().astype(int)
        all_preds.extend(predictions)
        all_labels.extend(labels.numpy().astype(int))

    all_preds_flat = np.array(all_preds).flatten()
    all_labels_flat = np.array(all_labels).flatten()
    f1_score = metric.compute(predictions=all_preds_flat, references=all_labels_flat, average="micro")["f1"]

    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS} | Validation F1 (micro): {f1_score:.4f}")

    if f1_score > best_f1:
        best_f1 = f1_score
        model.save_pretrained(best_model_path)
        print(f" New best model saved to '{best_model_path}' with F1: {best_f1:.4f}")

print("\n\n New model training complete! ")
print(f"The best model (LoRA adapter weights) was saved to '{best_model_path}' with a validation F1 score of {best_f1:.4f}.")
print("You can now re-run the debugging/prediction script to test this new, improved model.")

Attempting robust file load...


Map:   0%|          | 0/25529 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

Map:   0%|          | 0/3192 [00:00<?, ? examples/s]

Map:   0%|          | 0/25529 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

Map:   0%|          | 0/3192 [00:00<?, ? examples/s]

  class_weights = torch.tensor(total_samples / (len(label_cols) * class_counts), dtype=torch.float32)


config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 298,757 || all params: 278,343,946 || trainable%: 0.1073


  0%|          | 0/7980 [00:00<?, ?it/s]

Downloading builder script: 0.00B [00:00, ?B/s]


Epoch 1/5 | Validation F1 (micro): 0.9285
 New best model saved to 'best_peft_model' with F1: 0.9285

Epoch 2/5 | Validation F1 (micro): 0.9230

Epoch 3/5 | Validation F1 (micro): 0.9218

Epoch 4/5 | Validation F1 (micro): 0.9208

Epoch 5/5 | Validation F1 (micro): 0.9220


 New model training complete! 
The best model (LoRA adapter weights) was saved to 'best_peft_model' with a validation F1 score of 0.9285.
You can now re-run the debugging/prediction script to test this new, improved model.


In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
import torch
import numpy as np
from typing import List, Dict, Tuple

# --- Configuration ---
MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"
PEFT_MODEL_PATH = "best_peft_model"
LABEL_COLUMNS = ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# üõë FINAL OPTIMIZED THRESHOLD üõë
FINAL_THRESHOLD = 0.37

# --- Step 1: Load Model and Tokenizer ---

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 1. Load the base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL_COLUMNS),
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True
)

# 2. Attach the trained PEFT adapter weights
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_PATH)
model.to(device)
model.eval()

print(f"\nModel and PEFT adapter loaded successfully from '{PEFT_MODEL_PATH}'.")

# --- Step 2: Define Prediction Function (Using Final Threshold) ---

def predict_multilabel_text(text: str, model, tokenizer, device, threshold: float = FINAL_THRESHOLD) -> List[str]:
    """Tokenizes text and returns predicted labels using the optimized threshold."""

    texts = [text]

    encoding = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    inputs = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Apply sigmoid to logits
    probs = torch.sigmoid(outputs.logits).cpu().numpy().flatten()

    # Get binary predictions (1 or 0) based on new threshold
    predictions = (probs >= threshold).astype(int)

    # Map predictions to label names
    predicted_labels = [LABEL_COLUMNS[i] for i, pred in enumerate(predictions) if pred == 1]

    return predicted_labels

# --- Step 3: Manual Inputs for Testing ---

manual_texts = [
    # Test Case 1: Non-Hate/Neutral
    "‡§Ü‡§ú ‡§ï‡§æ ‡§Æ‡•å‡§∏‡§Æ ‡§¨‡§π‡•Å‡§§ ‡§∏‡•Å‡§π‡§æ‡§®‡§æ ‡§π‡•à‡•§",

    # Test Case 2: Hate, Vulgar, Defamation (Stronger Example)
    "‡§Ø‡§π ‡§®‡•á‡§§‡§æ ‡§ï‡•á‡§µ‡§≤ ‡§ù‡•Ç‡§† ‡§¨‡•ã‡§≤‡§§‡§æ ‡§π‡•à ‡§î‡§∞ ‡§µ‡§π ‡§™‡•Ç‡§∞‡•Ä ‡§§‡§∞‡§π ‡§∏‡•á ‡§¶‡•á‡§∂‡§¶‡•ç‡§∞‡•ã‡§π‡•Ä ‡§π‡•à‡•§",

    # Test Case 3: Violence/Threat (Check for 'violence' label)
    "‡§â‡§∏‡•á ‡§ö‡•å‡§∞‡§æ‡§π‡•á ‡§™‡§∞ ‡§≤‡§ü‡§ï‡§æ ‡§¶‡•á‡§®‡§æ ‡§ö‡§æ‡§π‡§ø‡§è ‡§§‡§æ‡§ï‡§ø ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§∏‡§¨‡§ï ‡§Æ‡§ø‡§≤‡•á‡•§",
]

print("\n" + "=" * 50)
print(f"FINAL PREDICTION RESULTS (Threshold: {FINAL_THRESHOLD})")
print("=" * 50)

for i, text in enumerate(manual_texts):
    predicted_labels = predict_multilabel_text(text, model, tokenizer, device)

    print(f"\n--- Test Case {i+1} ---")
    print(f"Text (Hindi): {text}")
    print(f"Predicted Labels: {predicted_labels}")

print("\n" + "=" * 50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model and PEFT adapter loaded successfully from 'best_peft_model'.

FINAL PREDICTION RESULTS (Threshold: 0.37)

--- Test Case 1 ---
Text (Hindi): ‡§Ü‡§ú ‡§ï‡§æ ‡§Æ‡•å‡§∏‡§Æ ‡§¨‡§π‡•Å‡§§ ‡§∏‡•Å‡§π‡§æ‡§®‡§æ ‡§π‡•à‡•§
Predicted Labels: ['hate', 'non-hate', 'violence']

--- Test Case 2 ---
Text (Hindi): ‡§Ø‡§π ‡§®‡•á‡§§‡§æ ‡§ï‡•á‡§µ‡§≤ ‡§ù‡•Ç‡§† ‡§¨‡•ã‡§≤‡§§‡§æ ‡§π‡•à ‡§î‡§∞ ‡§µ‡§π ‡§™‡•Ç‡§∞‡•Ä ‡§§‡§∞‡§π ‡§∏‡•á ‡§¶‡•á‡§∂‡§¶‡•ç‡§∞‡•ã‡§π‡•Ä ‡§π‡•à‡•§
Predicted Labels: ['hate', 'non-hate', 'violence']

--- Test Case 3 ---
Text (Hindi): ‡§â‡§∏‡•á ‡§ö‡•å‡§∞‡§æ‡§π‡•á ‡§™‡§∞ ‡§≤‡§ü‡§ï‡§æ ‡§¶‡•á‡§®‡§æ ‡§ö‡§æ‡§π‡§ø‡§è ‡§§‡§æ‡§ï‡§ø ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§∏‡§¨‡§ï ‡§Æ‡§ø‡§≤‡•á‡•§
Predicted Labels: ['hate', 'non-hate', 'violence']



In [None]:
# =====================================================
# FINAL OPTIMIZED IndicBERT + LoRA TRAINING SCRIPT
# (Includes ALL fixes and optimized hyperparameters)
# =====================================================

# STEP 1: Install necessary libraries (MUST BE RUN IN YOUR ENVIRONMENT!)
# !pip install -q transformers datasets peft accelerate evaluate bitsandbytes scikit-learn torch

import os
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import numpy as np
import evaluate

# =====================================================
# 1Ô∏è Load & Prepare Dataset
# =====================================================
FILE_PATH = "hate_speech_hindi_final.csv"
LABEL_COLUMNS = ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']

print("Attempting robust file load...")
try:
    df = pd.read_csv(
        FILE_PATH,
        encoding='latin-1',
        sep=',',
        on_bad_lines='skip',
        engine='python'
    )
except Exception as e:
    print(f"FATAL ERROR: Could not read file. Error: {e}")
    raise e

# Filter columns and perform data cleaning
required_cols = ['text'] + LABEL_COLUMNS
df = df[required_cols].copy()

# Fix: Ensure 'text' is string and handle NaNs for tokenizer
df['text'] = df['text'].astype(str).fillna('')

# Convert all label columns to integer type
for col in LABEL_COLUMNS:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

# Mock object for label management
class MockMlb:
    def __init__(self, classes):
        self.classes_ = classes
mlb = MockMlb(classes=LABEL_COLUMNS)

# Split the data
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

# =====================================================
# 2Ô∏è Tokenize & Format Data
# =====================================================
MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

label_cols = LABEL_COLUMNS
train_ds = train_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])
val_ds = val_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])
test_ds = test_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])

train_ds.set_format("torch")
val_ds.set_format("torch")
test_ds.set_format("torch")

# =====================================================
# 3Ô∏è Create DataLoaders & Class Weights
# =====================================================
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=16)
eval_dataloader = DataLoader(val_ds, batch_size=16)
test_dataloader = DataLoader(test_ds, batch_size=16)

class_counts = train_df[label_cols].sum()
total_samples = len(train_df)
class_weights = torch.tensor(total_samples / (len(label_cols) * class_counts), dtype=torch.float32)

# =====================================================
# 4Ô∏è Load Model & Apply LoRA (FIXED DROPOUT)
# =====================================================
id2label = {i: label for i, label in enumerate(mlb.classes_)}
label2id = {label: i for i, label in enumerate(mlb.classes_)}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(mlb.classes_), problem_type="multi_label_classification",
    id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

# üõë FIX 1: Increased LoRA Dropout for better regularization üõë
peft_config = LoraConfig(task_type="SEQ_CLS", r=8, lora_alpha=16, lora_dropout=0.2)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# =====================================================
# 5Ô∏è Improved Training Setup (FIXED LR & EPOCHS)
# =====================================================
# üõë FIX 2: Reduced Learning Rate for slower, more stable learning üõë
LEARNING_RATE = 1e-5
# üõë FIX 3: Reduced Epochs to prevent later overfitting üõë
NUM_EPOCHS = 3
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = NUM_EPOCHS * len(train_dataloader)
num_warmup_steps = int(0.1 * num_training_steps)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
class_weights = class_weights.to(device)

# =====================================================
# 6Ô∏è Manual Training & Evaluation Loop
# =====================================================
progress_bar = tqdm(range(num_training_steps))
metric = evaluate.load("f1")
best_f1 = 0
best_model_path = "best_peft_model"

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch in train_dataloader:
        labels = batch.pop("labels").to(device)
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)
        loss = loss_fct(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    model.eval()
    all_preds = []
    all_labels = []
    for batch in eval_dataloader:
        labels = batch.pop("labels")
        inputs = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predictions = (torch.sigmoid(logits) > 0.5).cpu().numpy().astype(int)
        all_preds.extend(predictions)
        all_labels.extend(labels.numpy().astype(int))

    all_preds_flat = np.array(all_preds).flatten()
    all_labels_flat = np.array(all_labels).flatten()
    f1_score = metric.compute(predictions=all_preds_flat, references=all_labels_flat, average="micro")["f1"]

    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS} | Validation F1 (micro): {f1_score:.4f}")

    if f1_score > best_f1:
        best_f1 = f1_score
        model.save_pretrained(best_model_path)
        print(f" New best model saved to '{best_model_path}' with F1: {best_f1:.4f}")

print("\n\n New model training complete! ")
print(f"The best model (LoRA adapter weights) was saved to '{best_model_path}' with a validation F1 score of {best_f1:.4f}.")
print("You must re-run the prediction script next to test the new, stable model.")

Attempting robust file load...


Map:   0%|          | 0/25529 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

Map:   0%|          | 0/3192 [00:00<?, ? examples/s]

Map:   0%|          | 0/25529 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

Map:   0%|          | 0/3192 [00:00<?, ? examples/s]

  class_weights = torch.tensor(total_samples / (len(label_cols) * class_counts), dtype=torch.float32)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 298,757 || all params: 278,343,946 || trainable%: 0.1073


  0%|          | 0/4788 [00:00<?, ?it/s]


Epoch 1/3 | Validation F1 (micro): 0.9287
 New best model saved to 'best_peft_model' with F1: 0.9287

Epoch 2/3 | Validation F1 (micro): 0.9287

Epoch 3/3 | Validation F1 (micro): 0.9287


 New model training complete! 
The best model (LoRA adapter weights) was saved to 'best_peft_model' with a validation F1 score of 0.9287.
You must re-run the prediction script next to test the new, stable model.


In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
import torch
import numpy as np
from typing import List, Dict, Tuple

# --- Configuration ---
MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"
PEFT_MODEL_PATH = "best_peft_model"
LABEL_COLUMNS = ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# üõë FINAL OPTIMIZED THRESHOLD (Test Value) üõë
FINAL_THRESHOLD = 0.37

# --- Step 1: Load Model and Tokenizer ---

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 1. Load the base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL_COLUMNS),
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True
)

# 2. Attach the trained PEFT adapter weights
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_PATH)
model.to(device)
model.eval()

print(f"\nModel and PEFT adapter loaded successfully from '{PEFT_MODEL_PATH}'.")

# --- Step 2: Define Prediction Function (Using Final Threshold) ---

def predict_multilabel_text(text: str, model, tokenizer, device, threshold: float = FINAL_THRESHOLD) -> List[str]:
    """Tokenizes text and returns predicted labels using the optimized threshold."""

    texts = [text]

    encoding = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    inputs = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Apply sigmoid to logits
    probs = torch.sigmoid(outputs.logits).cpu().numpy().flatten()

    # Get binary predictions (1 or 0) based on new threshold
    predictions = (probs >= threshold).astype(int)

    # Map predictions to label names
    predicted_labels = [LABEL_COLUMNS[i] for i, pred in enumerate(predictions) if pred == 1]

    return predicted_labels

# --- Step 3: Manual Inputs for Testing ---

manual_texts = [
    # Test Case 1: Non-Hate/Neutral
    "‡§Ü‡§ú ‡§ï‡§æ ‡§Æ‡•å‡§∏‡§Æ ‡§¨‡§π‡•Å‡§§ ‡§∏‡•Å‡§π‡§æ‡§®‡§æ ‡§π‡•à‡•§",

    # Test Case 2: Hate, Vulgar, Defamation (Stronger Example)
    "‡§Ø‡§π ‡§®‡•á‡§§‡§æ ‡§ï‡•á‡§µ‡§≤ ‡§ù‡•Ç‡§† ‡§¨‡•ã‡§≤‡§§‡§æ ‡§π‡•à ‡§î‡§∞ ‡§µ‡§π ‡§™‡•Ç‡§∞‡•Ä ‡§§‡§∞‡§π ‡§∏‡•á ‡§¶‡•á‡§∂‡§¶‡•ç‡§∞‡•ã‡§π‡•Ä ‡§π‡•à‡•§",

    # Test Case 3: Violence/Threat (Check for 'violence' label)
    "‡§â‡§∏‡•á ‡§ö‡•å‡§∞‡§æ‡§π‡•á ‡§™‡§∞ ‡§≤‡§ü‡§ï‡§æ ‡§¶‡•á‡§®‡§æ ‡§ö‡§æ‡§π‡§ø‡§è ‡§§‡§æ‡§ï‡§ø ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§∏‡§¨‡§ï ‡§Æ‡§ø‡§≤‡•á‡•§",
]

print("\n" + "=" * 50)
print(f"FINAL PREDICTION RESULTS (Threshold: {FINAL_THRESHOLD})")
print("=" * 50)

for i, text in enumerate(manual_texts):
    predicted_labels = predict_multilabel_text(text, model, tokenizer, device)

    print(f"\n--- Test Case {i+1} ---")
    print(f"Text (Hindi): {text}")
    print(f"Predicted Labels: {predicted_labels}")

print("\n" + "=" * 50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model and PEFT adapter loaded successfully from 'best_peft_model'.

FINAL PREDICTION RESULTS (Threshold: 0.37)

--- Test Case 1 ---
Text (Hindi): ‡§Ü‡§ú ‡§ï‡§æ ‡§Æ‡•å‡§∏‡§Æ ‡§¨‡§π‡•Å‡§§ ‡§∏‡•Å‡§π‡§æ‡§®‡§æ ‡§π‡•à‡•§
Predicted Labels: ['hate', 'non-hate', 'vulgar']

--- Test Case 2 ---
Text (Hindi): ‡§Ø‡§π ‡§®‡•á‡§§‡§æ ‡§ï‡•á‡§µ‡§≤ ‡§ù‡•Ç‡§† ‡§¨‡•ã‡§≤‡§§‡§æ ‡§π‡•à ‡§î‡§∞ ‡§µ‡§π ‡§™‡•Ç‡§∞‡•Ä ‡§§‡§∞‡§π ‡§∏‡•á ‡§¶‡•á‡§∂‡§¶‡•ç‡§∞‡•ã‡§π‡•Ä ‡§π‡•à‡•§
Predicted Labels: ['hate', 'non-hate', 'violence']

--- Test Case 3 ---
Text (Hindi): ‡§â‡§∏‡•á ‡§ö‡•å‡§∞‡§æ‡§π‡•á ‡§™‡§∞ ‡§≤‡§ü‡§ï‡§æ ‡§¶‡•á‡§®‡§æ ‡§ö‡§æ‡§π‡§ø‡§è ‡§§‡§æ‡§ï‡§ø ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§∏‡§¨‡§ï ‡§Æ‡§ø‡§≤‡•á‡•§
Predicted Labels: ['hate', 'non-hate']



In [None]:
# =====================================================
# FINAL OPTIMIZED IndicBERT + LoRA TRAINING SCRIPT
# (Includes ALL fixes and optimized loss function)
# =====================================================

# STEP 1: Install necessary libraries (MUST BE RUN IN YOUR ENVIRONMENT!)
# !pip install -q transformers datasets peft accelerate evaluate bitsandbytes scikit-learn torch

import os
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import numpy as np
import evaluate

# =====================================================
# 1Ô∏è Load & Prepare Dataset
# =====================================================
FILE_PATH = "hate_speech_hindi_final.csv"
LABEL_COLUMNS = ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']

print("Attempting robust file load...")
try:
    df = pd.read_csv(
        FILE_PATH,
        encoding='latin-1',
        sep=',',
        on_bad_lines='skip',
        engine='python'
    )
except Exception as e:
    print(f"FATAL ERROR: Could not read file. Error: {e}")
    raise e

# Filter columns and perform data cleaning
required_cols = ['text'] + LABEL_COLUMNS
df = df[required_cols].copy()

# Fix: Ensure 'text' is string and handle NaNs for tokenizer
df['text'] = df['text'].astype(str).fillna('')

# Convert all label columns to integer type
for col in LABEL_COLUMNS:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

# Mock object for label management
class MockMlb:
    def __init__(self, classes):
        self.classes_ = classes
mlb = MockMlb(classes=LABEL_COLUMNS)

# Split the data
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
val_ds = Dataset.from_pandas(val_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

# =====================================================
# 2Ô∏è Tokenize & Format Data
# =====================================================
MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

label_cols = LABEL_COLUMNS
train_ds = train_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])
val_ds = val_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])
test_ds = test_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])

train_ds.set_format("torch")
val_ds.set_format("torch")
test_ds.set_format("torch")

# =====================================================
# 3Ô∏è Create DataLoaders & Class Weights (OPTIMIZED)
# =====================================================
train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=16)
eval_dataloader = DataLoader(val_ds, batch_size=16)
test_dataloader = DataLoader(test_ds, batch_size=16)

# üõë FIX: Calculate Positive Weights (Pos_Weight) using inverse class frequency üõë
class_counts = train_df[label_cols].sum()
total_samples = len(train_df)
total_negative_counts = total_samples - class_counts

# Pos_Weight = (Negative Samples) / (Positive Samples). This scales up the minority class loss.
pos_weights = total_negative_counts / class_counts
class_weights = torch.tensor(pos_weights.values, dtype=torch.float32)

# =====================================================
# 4Ô∏è Load Model & Apply LoRA (OPTIMIZED HYPERPARAMETERS)
# =====================================================
id2label = {i: label for i, label in enumerate(mlb.classes_)}
label2id = {label: i for i, label in enumerate(mlb.classes_)}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(mlb.classes_), problem_type="multi_label_classification",
    id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

# Optimized LoRA Dropout (0.2)
peft_config = LoraConfig(task_type="SEQ_CLS", r=8, lora_alpha=16, lora_dropout=0.2)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# =====================================================
# 5Ô∏è Improved Training Setup (OPTIMIZED HYPERPARAMETERS)
# =====================================================
LEARNING_RATE = 1e-5 # Optimized LR
NUM_EPOCHS = 3       # Optimized Epochs
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
num_training_steps = NUM_EPOCHS * len(train_dataloader)
num_warmup_steps = int(0.1 * num_training_steps)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
class_weights = class_weights.to(device)

# =====================================================
# 6Ô∏è Manual Training & Evaluation Loop (FIXED LOSS FUNCTION CALL)
# =====================================================
progress_bar = tqdm(range(num_training_steps))
metric = evaluate.load("f1")
best_f1 = 0
best_model_path = "best_peft_model"

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch in train_dataloader:
        labels = batch.pop("labels").to(device)
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs)

        # üõë FIX: Use Pos_Weight in BCEWithLogitsLoss to target minority classes üõë
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights)
        loss = loss_fct(outputs.logits, labels)

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    model.eval()
    all_preds = []
    all_labels = []
    for batch in eval_dataloader:
        labels = batch.pop("labels")
        inputs = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predictions = (torch.sigmoid(logits) > 0.5).cpu().numpy().astype(int)
        all_preds.extend(predictions)
        all_labels.extend(labels.numpy().astype(int))

    all_preds_flat = np.array(all_preds).flatten()
    all_labels_flat = np.array(all_labels).flatten()
    f1_score = metric.compute(predictions=all_preds_flat, references=all_labels_flat, average="micro")["f1"]

    print(f"\nEpoch {epoch+1}/{NUM_EPOCHS} | Validation F1 (micro): {f1_score:.4f}")

    if f1_score > best_f1:
        best_f1 = f1_score
        model.save_pretrained(best_model_path)
        print(f" New best model saved to '{best_model_path}' with F1: {best_f1:.4f}")

print("\n\n Final model training complete! ")
print(f"The best model (LoRA adapter weights) was saved to '{best_model_path}' with a validation F1 score of {best_f1:.4f}.")
print("\nüî• NEXT STEP: Rerun your prediction script to confirm the loss function fix worked! üî•")

Attempting robust file load...


Map:   0%|          | 0/25529 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

Map:   0%|          | 0/3192 [00:00<?, ? examples/s]

Map:   0%|          | 0/25529 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

Map:   0%|          | 0/3192 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 298,757 || all params: 278,343,946 || trainable%: 0.1073


  0%|          | 0/4788 [00:00<?, ?it/s]


Epoch 1/3 | Validation F1 (micro): 0.7509
 New best model saved to 'best_peft_model' with F1: 0.7509

Epoch 2/3 | Validation F1 (micro): 0.7386

Epoch 3/3 | Validation F1 (micro): 0.7366


 Final model training complete! 
The best model (LoRA adapter weights) was saved to 'best_peft_model' with a validation F1 score of 0.7509.

üî• NEXT STEP: Rerun your prediction script to confirm the loss function fix worked! üî•


In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
import torch
import numpy as np
from typing import List

# --- Configuration ---
MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"
PEFT_MODEL_PATH = "best_peft_model"
LABEL_COLUMNS = ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Using the estimated optimal threshold
FINAL_THRESHOLD = 0.37

# --- Step 1: Load Model and Tokenizer ---

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 1. Load the base model
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL_COLUMNS),
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True
)

# 2. Attach the trained PEFT adapter weights
model = PeftModel.from_pretrained(base_model, PEFT_MODEL_PATH)
model.to(device)
model.eval()

print(f"\nModel and PEFT adapter loaded successfully from '{PEFT_MODEL_PATH}'.")

# --- Step 2: Define Prediction Function ---

def predict_multilabel_text(text: str, model, tokenizer, device, threshold: float = FINAL_THRESHOLD) -> List[str]:
    """Tokenizes text and returns predicted labels using the optimized threshold."""

    texts = [text]

    encoding = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    inputs = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Apply sigmoid to logits
    probs = torch.sigmoid(outputs.logits).cpu().numpy().flatten()

    # Get binary predictions (1 or 0) based on threshold
    predictions = (probs >= threshold).astype(int)

    # Map predictions to label names
    predicted_labels = [LABEL_COLUMNS[i] for i, pred in enumerate(predictions) if pred == 1]

    return predicted_labels

# --- Step 3: Manual Inputs for Testing ---

manual_texts = [
    # Test Case 1: Non-Hate/Neutral - Should predict ONLY 'non-hate' or []
    "‡§Ü‡§ú ‡§ï‡§æ ‡§Æ‡•å‡§∏‡§Æ ‡§¨‡§π‡•Å‡§§ ‡§∏‡•Å‡§π‡§æ‡§®‡§æ ‡§π‡•à‡•§",

    # Test Case 2: Hate, Vulgar, Defamation - Should predict 'hate' and potentially 'defamation'
    "‡§Ø‡§π ‡§®‡•á‡§§‡§æ ‡§ï‡•á‡§µ‡§≤ ‡§ù‡•Ç‡§† ‡§¨‡•ã‡§≤‡§§‡§æ ‡§π‡•à ‡§î‡§∞ ‡§µ‡§π ‡§™‡•Ç‡§∞‡•Ä ‡§§‡§∞‡§π ‡§∏‡•á ‡§¶‡•á‡§∂‡§¶‡•ç‡§∞‡•ã‡§π‡•Ä ‡§π‡•à‡•§",

    # Test Case 3: Violence/Threat - Should predict 'violence'
    "‡§â‡§∏‡•á ‡§ö‡•å‡§∞‡§æ‡§π‡•á ‡§™‡§∞ ‡§≤‡§ü‡§ï‡§æ ‡§¶‡•á‡§®‡§æ ‡§ö‡§æ‡§π‡§ø‡§è ‡§§‡§æ‡§ï‡§ø ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§∏‡§¨‡§ï ‡§Æ‡§ø‡§≤‡•á‡•§",
]

print("\n" + "=" * 50)
print(f"FINAL PREDICTION RESULTS (Threshold: {FINAL_THRESHOLD})")
print("=" * 50)

for i, text in enumerate(manual_texts):
    predicted_labels = predict_multilabel_text(text, model, tokenizer, device)

    print(f"\n--- Test Case {i+1} ---")
    print(f"Text (Hindi): {text}")
    print(f"Predicted Labels: {predicted_labels}")

print("\n" + "=" * 50)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model and PEFT adapter loaded successfully from 'best_peft_model'.

FINAL PREDICTION RESULTS (Threshold: 0.37)

--- Test Case 1 ---
Text (Hindi): ‡§Ü‡§ú ‡§ï‡§æ ‡§Æ‡•å‡§∏‡§Æ ‡§¨‡§π‡•Å‡§§ ‡§∏‡•Å‡§π‡§æ‡§®‡§æ ‡§π‡•à‡•§
Predicted Labels: ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']

--- Test Case 2 ---
Text (Hindi): ‡§Ø‡§π ‡§®‡•á‡§§‡§æ ‡§ï‡•á‡§µ‡§≤ ‡§ù‡•Ç‡§† ‡§¨‡•ã‡§≤‡§§‡§æ ‡§π‡•à ‡§î‡§∞ ‡§µ‡§π ‡§™‡•Ç‡§∞‡•Ä ‡§§‡§∞‡§π ‡§∏‡•á ‡§¶‡•á‡§∂‡§¶‡•ç‡§∞‡•ã‡§π‡•Ä ‡§π‡•à‡•§
Predicted Labels: ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']

--- Test Case 3 ---
Text (Hindi): ‡§â‡§∏‡•á ‡§ö‡•å‡§∞‡§æ‡§π‡•á ‡§™‡§∞ ‡§≤‡§ü‡§ï‡§æ ‡§¶‡•á‡§®‡§æ ‡§ö‡§æ‡§π‡§ø‡§è ‡§§‡§æ‡§ï‡§ø ‡§≤‡•ã‡§ó‡•ã‡§Ç ‡§ï‡•ã ‡§∏‡§¨‡§ï ‡§Æ‡§ø‡§≤‡•á‡•§
Predicted Labels: ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']



In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Note: Removed IntervalStrategy as it seems unsupported
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model
import torch
import numpy as np
import evaluate
from torch.nn import BCEWithLogitsLoss

# =====================================================
# 1Ô∏è Configuration & Data Preparation
# =====================================================
FILE_PATH = "hate_speech_hindi_final.csv"
LABEL_COLUMNS = ['defamation', 'hate', 'non-hate', 'violence', 'vulgar']
MODEL_NAME = "ai4bharat/IndicBERTv2-MLM-only"

# Data Loading (assuming previous robust setup)
df = pd.read_csv(FILE_PATH, encoding='latin-1', sep=',', on_bad_lines='skip', engine='python')
df = df[['text'] + LABEL_COLUMNS].copy()
df['text'] = df['text'].astype(str).fillna('')
for col in LABEL_COLUMNS:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
mlb = type('MockMlb', (object,), {'classes_': LABEL_COLUMNS})()

# Split & Tokenize Data
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_ds = train_test_split(temp_df, test_size=0.5, random_state=42)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

train_ds = Dataset.from_pandas(train_df, preserve_index=False).map(tokenize_function, batched=True)
val_ds = Dataset.from_pandas(val_df, preserve_index=False).map(tokenize_function, batched=True)

label_cols = LABEL_COLUMNS
train_ds = train_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])
val_ds = val_ds.map(lambda x: {"labels": np.array([x[c] for c in label_cols], dtype=np.float32)}, remove_columns=label_cols+['text'])

# =====================================================
# 2Ô∏è Class Weights & Model Initialization
# =====================================================
class_counts = train_df[label_cols].sum()
total_samples = len(train_df)
total_negative_counts = total_samples - class_counts
pos_weights = total_negative_counts / class_counts
pos_weights_tensor = torch.tensor(pos_weights.values, dtype=torch.float32)

id2label = {i: label for i, label in enumerate(mlb.classes_)}
label2id = {label: i for i, label in enumerate(mlb.classes_)}

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(mlb.classes_), problem_type="multi_label_classification",
    id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

peft_config = LoraConfig(task_type="SEQ_CLS", r=8, lora_alpha=16, lora_dropout=0.2)
model = get_peft_model(model, peft_config)

# =====================================================
# 3Ô∏è Custom Trainer for Weighted Loss
# =====================================================
class WeightedTrainer(Trainer):
    """Custom Trainer to apply the calculated pos_weight to the BCE loss."""
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        weights = pos_weights_tensor.to(labels.device)
        loss_fct = BCEWithLogitsLoss(pos_weight=weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# =====================================================
# 4Ô∏è Training Arguments & Execution (SIMPLIFIED FOR COMPATIBILITY)
# =====================================================
OUTPUT_DIR = "trainer_peft_model"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=0,
    weight_decay=0.01,
    learning_rate=1e-5,
    logging_dir='./logs',
    logging_steps=100,
    # üõë FIX: Removed unsupported evaluation and save strategy arguments üõë
)

# Initialize metric for compute_metrics function
metric = evaluate.load("f1")

def compute_metrics(p):
    predictions = (torch.sigmoid(torch.tensor(p.predictions)) > 0.5).int().flatten()
    references = p.label_ids.flatten().astype(int)
    return metric.compute(predictions=predictions, references=references, average="micro")

# Initialize Trainer with Custom Loss
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start training
print("\nStarting Training with Hugging Face Trainer (Simplified)...")
trainer.train()

# Save the final adapter weights from the last epoch
trainer.model.save_pretrained("final_trainer_peft_model")
print("\nFinal model saved successfully to 'final_trainer_peft_model'")


Map:   0%|          | 0/25529 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

Map:   0%|          | 0/25529 [00:00<?, ? examples/s]

Map:   0%|          | 0/3191 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-MLM-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 3}.



Starting Training with Hugging Face Trainer (Simplified)...


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: