In [1]:
# Install required packages
!pip install transformers datasets peft accelerate sentencepiece pandas

import pandas as pd
import torch
from datasets import Dataset
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType



In [3]:
# 1. Load CSV
# =======================
# Replace 'santali_eng.csv' with your file path
df = pd.read_csv("santali_english_final.csv",on_bad_lines='skip')  # assuming first column = English, second = Santali

# Flip columns: Santali -> English
df = df[[df.columns[1], df.columns[0]]]  # now first col = Santali, second = English
df.columns = ["src", "tgt"]

dataset = Dataset.from_pandas(df)

In [4]:
# =======================
# 2. Tokenizer + Model
# =======================
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer.src_lang = "sat_OLCK"  # custom code for Santali

def preprocess(batch):
    inputs = tokenizer(batch["src"], truncation=True, padding="max_length", max_length=64)
    labels = tokenizer(batch["tgt"], truncation=True, padding="max_length", max_length=64)
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/13182 [00:00<?, ? examples/s]

In [5]:
# =======================
# 3. LoRA Setup
# =======================
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

model = MBartForConditionalGeneration.from_pretrained(model_name)
model = get_peft_model(model, lora_config)


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [6]:
# =======================
# 4. Training Arguments
# =======================
training_args = TrainingArguments(
    output_dir="./mbart_lora_santali",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_steps=10,
    save_strategy="no",
    fp16=True if torch.cuda.is_available() else False,
    learning_rate=3e-4,
     report_to=[]
)

In [7]:
# =======================
# 5. Trainer
# =======================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

Step,Training Loss
10,11.8271
20,10.7728
30,10.429
40,10.0375
50,9.6
60,9.4829
70,9.4108
80,9.3899
90,9.2805
100,9.1762


TrainOutput(global_step=32955, training_loss=9.038537460834155, metrics={'train_runtime': 4292.1811, 'train_samples_per_second': 15.356, 'train_steps_per_second': 7.678, 'total_flos': 8957076418068480.0, 'train_loss': 9.038537460834155, 'epoch': 5.0})

In [8]:
# Ensure model is in eval mode
model.eval()

# Example Santali sentences
test_sentences = [
    "ᱤᱧ ᱛᱮᱦᱮᱧ ᱤᱧᱟᱜ ᱠᱟᱱᱫᱷᱟ",
    "ᱵᱟᱹᱛᱤ ᱠᱟᱱᱛᱤ"
]

# Tokenize input sentences
inputs = tokenizer(test_sentences, return_tensors="pt", padding=True, truncation=True, max_length=64)

# Move inputs to the same device as the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate translations
translated_tokens = model.generate(**inputs, max_length=64)

# Decode the generated tokens
translations = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)

# Print results
for src, tgt in zip(test_sentences, translations):
    print(f"Santali: {src}")
    print(f"English: {tgt}\n")


Santali: ᱤᱧ ᱛᱮᱦᱮᱧ ᱤᱧᱟᱜ ᱠᱟᱱᱫᱷᱟ
English: She is brushing the goat.

Santali: ᱵᱟᱹᱛᱤ ᱠᱟᱱᱛᱤ
English: The lights turned teal.



In [9]:
!pip install evaluate sacrebleu

import evaluate
import torch

# Load BLEU metric
bleu = evaluate.load("sacrebleu")

# Example: use last 10 sentences from your dataset as test set
test_dataset = dataset.shuffle(seed=42).select(range(10))  # adjust as needed

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

predictions = []
references = []

for item in test_dataset:
    src_text = item["src"]
    tgt_text = item["tgt"]

    # Tokenize and move to device
    inputs = tokenizer(src_text, return_tensors="pt", truncation=True, padding=True, max_length=64).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predictions.append(pred_text)
    references.append([tgt_text])  # sacrebleu expects list of references

# Compute BLEU score
bleu_score = bleu.compute(predictions=predictions, references=references)
print(f"BLEU score: {bleu_score['score']:.2f}")


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py

Downloading builder script: 0.00B [00:00, ?B/s]

BLEU score: 3.88


In [10]:
!pip install sacrebleu evaluate

import evaluate
import torch

# Load BLEU metric with smoothing
bleu = evaluate.load("sacrebleu")

# Example: use last 10 sentences from your dataset as test set
test_dataset = dataset.shuffle(seed=42).select(range(10))  # adjust size as needed

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

predictions = []
references = []

for item in test_dataset:
    src_text = item["src"]
    tgt_text = item["tgt"]

    # Tokenize and move to device
    inputs = tokenizer(src_text, return_tensors="pt", truncation=True, padding=True, max_length=64).to(device)
    outputs = model.generate(**inputs, max_length=64)
    pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predictions.append(pred_text)
    references.append([tgt_text])  # sacrebleu expects list of references

# Compute BLEU with smoothing
bleu_score = bleu.compute(predictions=predictions, references=references, smooth_method="exp")
# smooth_method="exp" is recommended for tiny datasets

print(f"Smoothed BLEU score: {bleu_score['score']:.2f}")


Smoothed BLEU score: 3.88
