In [1]:
!pip install -q transformers datasets accelerate peft sentencepiece sacrebleu


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from google.colab import files
uploaded = files.upload()  # upload parallel_data.csv and san_eng_dict.csv
uploaded


Saving santali_english_final.csv to santali_english_final.csv




In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

DATASET_PATH = "santali_english_final.csv"  # rename if needed

df = pd.read_csv(DATASET_PATH, encoding='utf-8', engine='python', on_bad_lines='skip')

train_df, test_df = train_test_split(df, test_size=0.17, shuffle=True, random_state=42)

train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)

print("Train:", len(train_df), "rows")
print("Test:", len(test_df), "rows")


Train: 10940 rows
Test: 2241 rows


In [5]:
# utils (inline – no separate files needed)
import csv
import re

def load_dictionary(csv_path):
    mapping = {}
    with open(csv_path, encoding='utf-8') as f:
        reader = csv.reader(f)
        next(reader, None)
        for row in reader:
            if len(row) >= 2:
                eng = row[0].strip()
                san = row[1].strip()
                mapping[san] = eng
    return mapping

def split_tokens(text):
    return text.split()

def augment(text, dictionary):
    words = split_tokens(text)
    out = []
    for w in words:
        if w in dictionary:
            out.append(f"{w} ({dictionary[w]})")
        else:
            out.append(w)
    return " ".join(out)

def post_correct(pred, source, dictionary):
    for w in split_tokens(source):
        if w in dictionary and dictionary[w] not in pred:
            pred += f" ({dictionary[w]})"
    return pred


In [6]:
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import torch
import pandas as pd

def train_model(
    train_csv="train.csv",
    dict_csv="san_eng_dict.csv",
    model_name="facebook/mbart-large-50-many-to-many-mmt",
    output_dir="mbart_lora_out",
    epochs=1,
    batch_size=2,
    max_len=128
):

    print("Loading tokenizer & model...")
    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
    tokenizer.src_lang = "sat_OLCK"
    tokenizer.tgt_lang = "en_XX"

    model = MBartForConditionalGeneration.from_pretrained(model_name)
    model.config.dropout = 0.3
    model.config.attention_dropout = 0.3

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Load dictionary
    dictionary = load_dictionary(dict_csv)
    print("Dictionary entries:", len(dictionary))

    # -------------------------------------------------------------
    # CLEAN & PREPARE DATA
    # -------------------------------------------------------------
    print("Loading & cleaning training data...")
    df = pd.read_csv(train_csv, encoding="utf-8", engine="python", on_bad_lines="skip")

    # Force text columns to strings and strip
    df["english"] = df["english"].astype(str).fillna("").str.strip()
    df["santhali"] = df["santhali"].astype(str).fillna("").str.strip()

    # Remove empty rows
    df = df[(df["english"] != "") & (df["santhali"] != "")]

    # Remove rows containing list-like mess or brackets
    df = df[~df["english"].str.contains(r"[\[\]]", regex=True)]
    df = df[~df["santhali"].str.contains(r"[\[\]]", regex=True)]

    print("After cleaning:", len(df), "rows")

    # -------------------------------------------------------------
    # AUGMENT DATA WITH DICTIONARY HINTS
    # -------------------------------------------------------------
    augmented_rows = []
    for _, row in df.iterrows():
        san = str(row["santhali"])
        eng = str(row["english"])

        san_aug = augment(san, dictionary)  # add dictionary hints

        # Force these to plain strings (defensive)
        augmented_rows.append({
            "input": str(san_aug),
            "target": str(eng)
        })

    dataset = Dataset.from_list(augmented_rows)

    # -------------------------------------------------------------
    # TOKENIZATION (safe new API + extra cleaning)
    # -------------------------------------------------------------
    def tokenize(batch):
        # Ensure all batch items are strings and clean them
        clean_inputs = []
        clean_targets = []
        for inp, tgt in zip(batch.get("input", []), batch.get("target", [])):
            if not isinstance(inp, str):
                inp = str(inp)
            if not isinstance(tgt, str):
                tgt = str(tgt)

            # Remove accidental list-like markers and excessive whitespace
            inp = inp.replace("[", "").replace("]", "").strip()
            tgt = tgt.replace("[", "").replace("]", "").strip()

            clean_inputs.append(inp)
            clean_targets.append(tgt)

        enc_inputs = tokenizer(
            clean_inputs,
            truncation=True,
            padding="max_length",
            max_length=max_len
        )
        enc_labels = tokenizer(
            text_target=clean_targets,
            truncation=True,
            padding="max_length",
            max_length=max_len
        )

        enc_inputs["labels"] = enc_labels["input_ids"]
        return enc_inputs

    # DEBUG: find any remaining non-string inputs before tokenization
    bad_rows = []
    for i, ex in enumerate(dataset):
        inp = ex.get("input", None)
        tgt = ex.get("target", None)
        if not isinstance(inp, str):
            bad_rows.append((i, "input", type(inp), inp))
        if not isinstance(tgt, str):
            bad_rows.append((i, "target", type(tgt), tgt))

    print("Bad rows found before tokenization:", len(bad_rows))
    if len(bad_rows) > 0:
        for b in bad_rows[:10]:
            print("Example bad entry:", b)
        raise ValueError("Found non-string inputs in dataset. Inspect the examples printed above.")

    dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names)

    # -------------------------------------------------------------
    # LoRA SETUP
    # -------------------------------------------------------------
    lora = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.15,
        task_type=TaskType.SEQ_2_SEQ_LM,
    )

    model = get_peft_model(model, lora)

    # -------------------------------------------------------------
    # TRAINING ARGS (wandb disabled)
    # -------------------------------------------------------------
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        num_train_epochs=epochs,
        learning_rate=1e-4,
        fp16=torch.cuda.is_available(),
        logging_steps=20,
        save_total_limit=2,
        remove_unused_columns=False,
        report_to="none"  # <--- DISABLE WANDB CLEANLY
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    )

    # -------------------------------------------------------------
    # TRAINING
    # -------------------------------------------------------------
    print("Starting training...")
    trainer.train()

    # SAVE
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("Training complete. Model saved to:", output_dir)


In [7]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"


In [8]:
train_model(train_csv="train.csv", dict_csv="san_eng_dict.csv")


Loading tokenizer & model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Dictionary entries: 10151
Loading & cleaning training data...
After cleaning: 10940 rows
Bad rows found before tokenization: 0


Map:   0%|          | 0/10940 [00:00<?, ? examples/s]

Starting training...


Step,Training Loss
20,11.1635
40,10.5444
60,10.2221
80,10.1104
100,10.0606
120,9.9839
140,9.9043
160,9.8308
180,9.8902
200,9.8633


Training complete. Model saved to: mbart_lora_out


In [9]:
def load_trained_model(model_dir="mbart_lora_out", base_model="facebook/mbart-large-50-many-to-many-mmt"):
    # Load tokenizer from base model (it contains sat_OLCK)
    tokenizer = MBart50TokenizerFast.from_pretrained(base_model)
    tokenizer.src_lang = "sat_OLCK"
    tokenizer.tgt_lang = "en_XX"

    # Load LoRA fine-tuned model
    model = MBartForConditionalGeneration.from_pretrained(model_dir)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    return tokenizer, model

tokenizer, model = load_trained_model()
print("Model Loaded!")


Model Loaded!


In [10]:
def translate_san_to_eng(sentence):
    inputs = tokenizer(sentence, return_tensors="pt").to(model.device)

    generated = model.generate(
        **inputs,
        max_length=128,
        num_beams=5,
        early_stopping=True
    )

    return tokenizer.decode(generated[0], skip_special_tokens=True)


In [11]:
print(translate_san_to_eng("ᱤᱧ ᱛᱮᱦᱮᱧ"))      # I am going home
print(translate_san_to_eng("ᱡᱚᱦᱲᱟᱹ ᱟᱠᱷᱟ"))  # What is your name?


We are walking.
We are walking.


In [12]:
def translate_with_dict(san):
    san_aug = augment(san, load_dictionary("san_eng_dict.csv"))
    return translate_san_to_eng(san_aug)

# Test
print(translate_with_dict("ᱤᱧ ᱛᱮᱦᱮᱧ"))


I am relaxing today.


In [13]:
!pip install sacrebleu
import sacrebleu
import pandas as pd

df_test = pd.read_csv("test.csv")

preds = []
refs = []

for _, row in df_test.iterrows():
    san = row["santhali"]
    eng = row["english"]

    pred = translate_san_to_eng(san)
    preds.append(pred)
    refs.append([eng])

print("BLEU:", sacrebleu.corpus_bleu(preds, refs).score)


BLEU: 64.34588841607616
