<a href="https://colab.research.google.com/github/Chandramadi/fine_tuning_bert-base-multilingual-cased/blob/main/Fine_Tuning_bert_base_multilingual_cased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Step 1: Install All Required Dependencies

In [None]:
# Core transformers and datasets
!pip install -q transformers datasets accelerate scikit-learn

# LoRA + quantization support (bitsandbytes + PEFT)
!pip install bitsandbytes
!pip install -q peft

In [None]:
import bitsandbytes as bnb
from google.colab import userdata
import os
from datasets import Dataset

## Step 2: Load mBERT Model & Tokenizer (Quantized + LoRA)

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# Hub id of the multilingual BERT checkpoint (short form, without the "google-bert/" namespace).
model_name = "bert-base-multilingual-cased"

# 8-bit quantization config.
# Only int8 options are set here: the bnb_4bit_* options apply to 4-bit loading
# (load_in_4bit=True), so passing them together with load_in_8bit=True was misleading.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,  # enables CPU fallback
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Label mapping: label2id is the single source of truth; the inverse mapping and the
# number of classes are derived from it so the three can never disagree.
label2id = {"bn": 0, "gu": 1, "hi": 2, "english": 3}
id2label = {idx: name for name, idx in label2id.items()}

# Load the quantized base model with a classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    num_labels=len(label2id),  # must match the number of classes
    id2label=id2label,
    label2id=label2id,
)

# Prepare the quantized model for LoRA fine-tuning
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings; adapters are attached to the modules named "query" and "value".
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"
)

# Wrap the model with the adapters and report how many parameters will be trained.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


## Step 3 — Load and Preprocess the Samanantar Dataset

### Extract and Prepare Samples from the Dataset

In [None]:
from datasets import load_dataset

# Fetch the first 1000 training rows of each Samanantar language configuration,
# keyed by language code.
languages = ['bn', 'gu', 'hi']
dataset = {
    lang: load_dataset("ai4bharat/samanantar", lang, split="train[:1000]")
    for lang in languages
}

### Now combine the text and labels:

In [None]:
# Build (text, label) records for the classifier.

langs = ['bn', 'gu', 'hi']
label2id = dict(zip(langs, range(len(langs))))
label2id['english'] = len(label2id)  # English is appended as the last class

# Indian-language sentences: the "tgt" field of each row, labelled with its language.
indic_samples = [
    {"text": row["tgt"], "label": label2id[lang]}
    for lang in langs
    for row in dataset[lang]
]

# English sentences: the "src" field of the same rows, all labelled "english".
english_samples = [
    {"text": row["src"], "label": label2id["english"]}
    for lang in langs
    for row in dataset[lang]
]

# Indian-language records first, then English ones.
samples = indic_samples + english_samples
raw_dataset = Dataset.from_list(samples)
print(raw_dataset)


### Split the Dataset into Train, Validation, and Test Sets, Then Tokenize

In [None]:
from datasets import DatasetDict

# 1. Split raw dataset: 90% train, then the held-out 10% is halved into validation and test.
train_testvalid = raw_dataset.train_test_split(test_size=0.1, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

final_dataset = DatasetDict({
    'train': train_testvalid['train'],
    'validation': test_valid['train'],
    'test': test_valid['test'],
})

# 2. Tokenization function
def tokenize_function(example):
    """Tokenize the "text" field of a batch of examples.

    Over-long inputs are truncated, but nothing is padded here. The Trainer is
    created with the tokenizer, so its default collator pads each batch to the
    longest sequence in that batch. Padding every sentence to the tokenizer's
    maximum length up front (the previous ``padding="max_length"``) made every
    batch as long as the longest possible input.

    Args:
        example: A batch mapping that contains a "text" list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(example["text"], truncation=True)

# 3. Tokenize datasets
tokenized_datasets = final_dataset.map(tokenize_function, batched=True)

# 4. Rename 'label' to 'labels'
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

# 5. Set format for PyTorch; only these three columns are returned when indexing.
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 6. Extract subsets for training/eval/test
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

# 7. Optional sanity check
print(train_dataset[0])


## Step 4 - Model Training using Hugging Face Transformers

### 4.1 - Import the Trainer and related utilities:

In [None]:
from transformers import Trainer, TrainingArguments, EvalPrediction
import numpy as np
from sklearn.metrics import accuracy_score

### 4.2 - Define a metrics function:

In [None]:
def compute_metrics(eval_pred: EvalPrediction):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Evaluation output that unpacks into (logits, labels).

    Returns:
        A dict with a single "accuracy" entry.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the highest logit along axis 1.
    predicted = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}


### 4.3 - Set up training arguments:

In [None]:
import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases, and
# the old keyword is rejected by recent ones. The install cell does not pin transformers,
# so use whichever keyword the installed version actually accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    num_train_epochs=3,
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=2e-4,
    remove_unused_columns=False,  # Required for LoRA + PEFT
    fp16=True,
    save_total_limit=2,
    # Evaluate once per epoch, matching the per-epoch checkpoint schedule above.
    **{_eval_strategy_kwarg: "epoch"},
)


### 4.4 - Create the Trainer:

In [None]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class` and deprecate
# the `tokenizer` keyword; older releases only know `tokenizer`. The install cell does not
# pin transformers, so use whichever keyword the installed Trainer accepts.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Trainer for the LoRA-wrapped model: trains on the train split, evaluates on the
# validation split and reports accuracy via compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)


### 4.5 - Start training:

In [None]:
import os

# Sets CUDA_LAUNCH_BLOCKING=1 in this process's environment before training starts.
# NOTE(review): this is normally a debugging switch (synchronous kernel launches), and the
# model was already loaded onto a device in Step 2 — confirm it still has an effect at
# this point, and consider dropping it for regular training runs.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

In [None]:
# Run fine-tuning with the Trainer built in step 4.4 (settings come from training_args).
trainer.train()

## Step 5 — Evaluate on Test Set

In [None]:
# Evaluate on the held-out test split, which is separate from the train and validation
# splits passed to the Trainer. Left as the last expression so the returned value is
# shown as the cell output.
trainer.evaluate(test_dataset)

## Step 6 - Print Predicted Labels with Their Confidence Scores

In [None]:
from transformers import AutoTokenizer
import torch
import torch.nn.functional as F

# Read the label names from the model config instead of re-typing them, so the printed
# labels always match the mapping the model was loaded with.
id2label = model.config.id2label

# Input sentence(s)
sentences = [
    "Prime Minister Modi met the French President today.",  # English
    "इनपुट उपकरण को ऑनलाइन आज़माएं",  # Hindi
    "প্রধানমন্ত্রী আজ ফরাসি রাষ্ট্রপতির সঙ্গে সাক্ষাৎ করেছেন",  # Bengali
    "પ્રધાનમંત્રીએ આજે ફ્રાંસના રાષ્ટ્રપતિ સાથે બેઠક કરી",  # Gujarati
    "helllo",  # Junk/unknown
]

# Tokenize the sentences as one padded batch on the model's device
inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True).to(model.device)

# Switch to inference mode so dropout is disabled and the probabilities are
# deterministic, regardless of which cell was run last.
model.eval()

# Forward pass through the model without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax over the class dimension to get probabilities
probs = F.softmax(outputs.logits, dim=-1)

# Get predicted class IDs
preds = torch.argmax(probs, dim=-1)

# Confidence threshold (adjustable): predictions below it are reported as "unknown"
CONFIDENCE_THRESHOLD = 0.80

# Print sentence, predicted label, confidence, and all class probabilities
for i, (sent, prob, pred_id) in enumerate(zip(sentences, probs, preds)):
    confidence = prob[pred_id].item()
    label = id2label[pred_id.item()] if confidence >= CONFIDENCE_THRESHOLD else "unknown"
    class_probs = {id2label[j]: round(p.item(), 4) for j, p in enumerate(prob)}

    print(f"Sentence {i+1}: {sent}")
    print(f"➡️ Predicted: {label} | Confidence: {confidence:.4f}")
    print(f"📊 Class Probabilities: {class_probs}\n")
