<a href="https://colab.research.google.com/github/Dhruv3110/Roman-to-Devanagari-Transliteration-System/blob/main/transliteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup, then load the dataset and convert it to a pandas DataFrame

In [None]:
!pip install nbstripout


In [None]:
# Mount Google Drive at /content/drive, requesting a fresh mount each time
# (force_remount=True).
# NOTE(review): no cell shown in this notebook reads or writes under
# /content/drive — the training output_dir and the saved adapter both use
# relative paths — so confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


In [None]:
!pip install datasets unsloth jiwer evaluate sacrebleu

In [None]:
from huggingface_hub import login
from datasets import load_dataset, Dataset
import pandas as pd
import re
import numpy as np
import torch
from unsloth import FastLanguageModel
import os
from transformers import TrainingArguments
from unsloth.trainer import SFTTrainer
from jiwer import wer, cer
import evaluate
import numpy as np
from tqdm import tqdm
# Rebind the imported name `tqdm` to a pass-through: tqdm(iterable, **kwargs)
# now simply returns the iterable, so loops written against tqdm still run but
# render no progress bar.
tqdm = lambda x, **kwargs: x


# Authenticate with the Hugging Face Hub.
# NOTE(review): login() is called without arguments, so it presumably prompts
# for a token interactively, which would block an unattended "Run All" — confirm.
login()

# Fetch the "train" split of the Hinglish/Hindi transliteration dataset and
# convert it to a pandas DataFrame for the cleaning/augmentation steps below.
ds = load_dataset("codebyam/Hinglish-Hindi-Transliteration-Dataset", split="train")
df = ds.to_pandas()
# Last expression of the cell: displays the raw frame.
df


Rename columns

In [None]:
# Rename the source column 'Hinglish' to 'Roman'; soft_clean_row (next cell)
# reads row["Roman"] and row["Hindi"].
# Note: `df` is rebound here, so the name no longer refers to the frame as loaded.
df = df.rename(columns={'Hinglish': 'Roman'})
print(df.head())

SOFT CLEANING

In [None]:
def soft_clean_row(row):
    """Normalise one Roman/Devanagari sentence pair, or reject it.

    Parameters
    ----------
    row : mapping
        Must provide ``row["Roman"]`` and ``row["Hindi"]``.

    Returns
    -------
    dict or None
        ``{"roman": ..., "hindi": ...}`` holding the cleaned strings, or
        ``None`` when either field is not a string or is empty once cleaned.
    """
    roman, hindi = row["Roman"], row["Hindi"]

    if not isinstance(roman, str) or not isinstance(hindi, str):
        return None

    # Roman side: lower-case, then keep only a-z, digits, whitespace and the
    # punctuation ' ! ? . , -
    roman = re.sub(r"[^a-z0-9\s'!?.,\-]", "", roman.lower())

    # Hindi side: keep the Devanagari block (U+0900-U+097F), whitespace, '!'
    # and '?'. The danda (U+0964) lies inside that block, so it needs no entry
    # of its own; the previous pattern carried a mis-decoded danda that
    # whitelisted three Cyrillic letters instead.
    hindi = re.sub(r"[^\u0900-\u097F\s!?]", "", hindi)

    # Strip AFTER filtering: removing characters can expose leading/trailing
    # whitespace, or leave nothing but whitespace, which must count as empty.
    roman = roman.strip()
    hindi = hindi.strip()

    if not roman or not hindi:
        return None

    return {"roman": roman, "hindi": hindi}

# soft_clean_row returns a dict for usable rows and None for rejected ones, so
# the row-wise apply yields a Series of dicts/None; dropna() discards the rejects.
cleaned_rows = df.apply(soft_clean_row, axis=1).dropna()

# Expand the dicts into "roman"/"hindi" columns with one constructor call,
# keeping the surviving row labels. (.apply(pd.Series) gives the same frame but
# builds a Series object per row, which is needlessly slow on a full dataset.)
df = pd.DataFrame(cleaned_rows.tolist(), index=cleaned_rows.index)
print("After cleaning:", len(df))


NOISE DETECTION

In [None]:
# Chat-style abbreviations and fillers; each matching token adds one point.
HINGLISH_SLANG = {
    "m", "me", "mai", "main",
    "yr", "yaar", "bro", "bruh",
    "kr", "krr", "kya", "kyu", "kyun",
    "h", "hu", "hun", "rha", "rhi", "rhe",
    "plz", "pls", "sry", "thx", "bht",
    "lol", "lmao",
}

# Vowel-dropped shorthand tokens; each matching token adds one point.
HINGLISH_CONSONANT_TOKENS = {
    "kl", "tb", "jb", "thk",
    "hn", "thn", "kch", "smj",
}


def noise_score(roman):
    """Count informal-spelling signals in a Roman-script string.

    Per-token signals add one point per matching whitespace-separated token;
    whole-string signals add at most one point each. A token can score under
    several headings (for instance "kl" is in the consonant-token set and is
    also vowel-free), so the result is a weighted tally, not a token count.

    Parameters
    ----------
    roman : str
        The Roman-script text to score.

    Returns
    -------
    int
        Zero when no signal fires; larger values mean more signals.
    """
    words = roman.split()

    # --- per-token signals -------------------------------------------------
    slang_hits = len([w for w in words if w in HINGLISH_SLANG])
    shorthand_hits = len([w for w in words if w in HINGLISH_CONSONANT_TOKENS])
    vowelless_hits = len(
        [w for w in words if re.fullmatch(r"[bcdfghjklmnpqrstvwxyz]+", w)]
    )

    # --- whole-string signals ----------------------------------------------
    string_patterns = (
        r"(.)\1{2,}",       # a character (other than newline) 3+ times in a row
        r"[a-z]\d[a-z]",    # a digit sandwiched between two letters
        r"([!?.,])\1{1,}",  # the same punctuation mark repeated
    )
    pattern_hits = sum(1 for pat in string_patterns if re.search(pat, roman))

    return slang_hits + shorthand_hits + vowelless_hits + pattern_hits

# Score every cleaned Roman string, then derive a 0/1 flag that is 1 whenever
# at least one noise signal fired.
row_scores = df["roman"].map(noise_score)
df["noise_score"] = row_scores
df["is_noisy"] = row_scores.gt(0).astype(int)

# Fraction of rows flagged as noisy.
print("Noise ratio:", df["is_noisy"].mean())


DATA AUGMENTATION (Clean + Controlled)

In [None]:
def augment_pair(roman, hindi, noise):
    """Generate spelling variants of a Roman string that share one Hindi target.

    The returned list always starts with the untouched ``(roman, hindi)`` pair.
    It is followed by one variant per digraph present in ``roman`` (every
    occurrence of that digraph shortened), and finally by a variant in which
    the first vowel is stretched.

    Parameters
    ----------
    roman : str
        Roman-script source text.
    hindi : str
        Devanagari target, copied unchanged into every pair.
    noise : int
        Noise score of ``roman``; the stretched vowel is repeated
        ``2 + min(noise, 2)`` times.

    Returns
    -------
    list of tuple
        ``(roman_variant, hindi)`` pairs.
    """
    # Digraph -> shorter spellings, in the order the variants are emitted.
    simplifications = (
        ("aa", ("a",)),
        ("ee", ("i",)),
        ("oo", ("u",)),
        ("sh", ("s",)),
        ("ch", ("c",)),
        ("th", ("t",)),
    )

    variants = [(roman, hindi)]
    variants.extend(
        (roman.replace(digraph, short), hindi)
        for digraph, shorts in simplifications
        if digraph in roman
        for short in shorts
    )

    # Stretch only the first vowel; text without a vowel comes back unchanged
    # and then contributes no extra pair.
    repeat = 2 + min(noise, 2)
    stretched = re.sub(r"([aeiou])", r"\1" * repeat, roman, count=1)
    if stretched != roman:
        variants.append((stretched, hindi))

    return variants

# Expand every cleaned row into its list of spelling variants and flatten the
# result into a single list of (roman, hindi) pairs.
augmented = [
    pair
    for roman_text, hindi_text, score in df[["roman","hindi","noise_score"]].values
    for pair in augment_pair(roman_text, hindi_text, score)
]

# From here on `df` is the augmented frame: only "roman" and "hindi" remain
# (the noise columns are not carried over).
df = pd.DataFrame(augmented, columns=["roman", "hindi"])
print("After augmentation:", len(df))
df

PROMPT CREATION

In [None]:
def make_prompt(row):
    """Render one training example as an instruction / input / output prompt.

    Parameters
    ----------
    row : mapping
        Must provide ``row['roman']`` (source text) and ``row['hindi']``
        (target text).

    Returns
    -------
    str
        The fixed instruction header, the Roman text under "### Input:" and
        the Hindi text under "### Output:". The string ends immediately after
        the Hindi text.

    NOTE(review): nothing is appended after the Hindi text here, so unless an
    end-of-sequence token is added at tokenization time the model gets no
    explicit stop signal — confirm.
    """
    roman_text = row['roman']
    hindi_text = row['hindi']
    return (
        "### Instruction:\n"
        "Convert Hinglish (Roman Hindi) into Hindi (Devanagari script).\n"
        "Be robust to slang, shortcuts, repeated letters & noisy spellings.\n"
        "\n"
        "### Input:\n"
        f"{roman_text}\n"
        "\n"
        "### Output:\n"
        f"{hindi_text}"
    )

# Render the full training text (instruction + input + output) for every row
# into a new "prompt" column, then print the first one as a sanity check.
df["prompt"] = df.apply(make_prompt, axis=1)
print(df.iloc[0]["prompt"])


Convert the augmented DataFrame back to a Hugging Face Dataset (only now, after all pandas-side processing is done)

In [None]:
from datasets import Dataset

# Wrap the augmented frame (columns: roman, hindi, prompt) as a Hugging Face
# Dataset; preserve_index=False presumably keeps the pandas index from
# becoming an extra column.
train_ds = Dataset.from_pandas(df, preserve_index=False)


Load Model + LoRA

In [None]:
from unsloth import FastLanguageModel

BASE_MODEL = "google/gemma-2b-it"

# Load the base checkpoint and its tokenizer through Unsloth, asking for 4-bit
# weights and a 512-token maximum sequence length.
load_options = {
    "max_seq_length": 512,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(BASE_MODEL, **load_options)

# Attach LoRA adapters to the q_proj / v_proj modules only.
lora_options = {
    "r": 32,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "target_modules": ["q_proj", "v_proj"],
}
model = FastLanguageModel.get_peft_model(model, **lora_options)

print("Gemma 2B + LoRA loaded")


Tokenization

In [None]:
def tokenize(batch):
    """Tokenize the "prompt" field of a batch to a fixed length of 256.

    Parameters
    ----------
    batch : mapping
        Must provide ``batch["prompt"]`` (the rendered training texts).

    Returns
    -------
    The result of calling the notebook-level ``tokenizer`` with truncation
    enabled, ``padding="max_length"`` and ``max_length=256``.
    """
    prompts = batch["prompt"]
    encoded = tokenizer(
        prompts,
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize the whole training set in batches.
tokenized = train_ds.map(tokenize, batched=True)

# Keep only the model inputs; every other column (including the raw "prompt"
# text) is dropped.
model_input_columns = {"input_ids", "attention_mask"}
columns_to_drop = [
    name for name in tokenized.column_names if name not in model_input_columns
]
tokenized = tokenized.remove_columns(columns_to_drop)

print("Tokenization DONE")


TRAINING

In [None]:
import os
# Set WANDB_DISABLED before the trainer is built — presumably to keep the
# Weights & Biases integration from starting a run or prompting for a login.
os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments
from unsloth.trainer import SFTTrainer

# Pick the mixed-precision mode that matches the hardware instead of
# hard-coding fp16. Unsloth loads the model in bfloat16 on GPUs that support it
# and rejects a trainer configured for the other half-precision type, so a
# fixed fp16=True only works on GPUs without bf16 support (e.g. a Colab T4).
from unsloth import is_bfloat16_supported

use_bf16 = is_bfloat16_supported()

args = TrainingArguments(
    output_dir="./gemma2b_hinglish_noise_aug",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size: 2 * 4 = 8 per device
    learning_rate=2e-4,
    num_train_epochs=5,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=20,
    save_steps=300,
    warmup_ratio=0.1,
)

# Supervised fine-tuning on the pre-tokenized dataset.
# NOTE(review): `tokenized` only carries input_ids / attention_mask (the
# "prompt" column was removed after tokenization), yet dataset_text_field is
# still set to "prompt" — presumably ignored for an already-tokenized dataset;
# confirm against the installed trl/unsloth versions.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=tokenized,
    dataset_text_field="prompt",
)

# Run the training loop configured by `args`.
trainer.train()


SAVE MODEL

In [None]:
# Write the model and tokenizer to a directory relative to the working
# directory (not under the mounted Drive).
# NOTE(review): `model` is the object returned by get_peft_model, so this
# presumably stores the LoRA adapter rather than a merged full model — confirm
# before relying on the directory as a standalone checkpoint.
save_path = "gemma2b_hinglish_lora"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved at:", save_path)


SMALL EVAL SET + TEXT GENERATION


In [None]:
# Take up to 200 rows (fixed seed, so the sample is repeatable) for a quick
# evaluation.
# NOTE(review): the rows are drawn from `df`, the same augmented frame that
# train_ds was built from, so every evaluation example was also a training
# example. The scores computed from this sample measure fit to the training
# data, not held-out generalisation; split off an evaluation set before
# augmentation/training to get a meaningful number.
eval_df = df.sample(n=min(200, len(df)), random_state=42)
eval_ds = Dataset.from_pandas(eval_df, preserve_index=False)
def generate_text(prompt, max_new_tokens=64):
    """Greedy-decode a continuation of ``prompt`` with the fine-tuned model.

    Parameters
    ----------
    prompt : str
        Text fed to the notebook-level ``tokenizer`` / ``model``.
    max_new_tokens : int, optional
        Upper bound on generated tokens (default 64, the value previously
        hard-coded).

    Returns
    -------
    str
        The decoded first sequence returned by ``model.generate`` with special
        tokens skipped. Callers in this notebook expect it to contain the
        prompt followed by the continuation and split on "### Output:".
    """
    # The model may still be in training mode after trainer.train(), in which
    # case the LoRA dropout (0.05) stays active and perturbs generation.
    # Switching to eval mode makes greedy decoding deterministic.
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# chrF and BLEU come from the `evaluate` hub; CER / WER come from jiwer.
bleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")

preds, refs = [], []

for ex in tqdm(eval_ds, desc="Evaluating"):
    # Build the prompt with the SAME template the training text was rendered
    # with (make_prompt), leaving the output slot empty for the model to fill.
    # A hand-copied prompt with different instruction wording queries the
    # fine-tuned model with text it was never trained on.
    prompt = make_prompt({"roman": ex["roman"], "hindi": ""})

    gen = generate_text(prompt)

    # generate_text returns prompt + continuation. Keep what follows the first
    # "### Output:" marker (the one that belongs to the prompt) and stop at any
    # further "###" heading, in case the model goes on to write another block.
    if "### Output:" in gen:
        gen = gen.split("### Output:", 1)[1].split("###", 1)[0]
    gen = gen.strip()

    preds.append(gen)
    refs.append(ex["hindi"])


# Corpus-level error rates (jiwer takes the reference first, then the hypothesis).
cer_score = cer(refs, preds)
wer_score = wer(refs, preds)
chrf_score = chrf.compute(predictions=preds, references=refs)["score"]
bleu_score = bleu.compute(
    predictions=preds,
    references=[[r] for r in refs]
)["score"]

# Sentence-level summaries: exact string match, and mean of (1 - per-sentence
# CER). The latter can go negative for a prediction much longer than its
# reference.
exact_match = np.mean([p == r for p, r in zip(preds, refs)])
sentence_char_acc = np.mean([
    1 - cer([r], [p]) for p, r in zip(preds, refs)
])
print("===== Evaluation Results =====")
print(f"CER  : {cer_score:.4f}")
print(f"WER  : {wer_score:.4f}")
print(f"chrF : {chrf_score:.2f}")
print(f"BLEU : {bleu_score:.2f}")
print(f"Exact Match Accuracy: {exact_match:.4f}")
print("Sentence-level Char Accuracy:", sentence_char_acc)


INFERENCE

In [None]:
def generate_text(prompt, max_new_tokens=128):
    """Greedy-decode a continuation of ``prompt`` for interactive inference.

    This redefinition replaces the ``generate_text`` defined in the evaluation
    cell; the only difference a caller sees is the larger default token budget.

    Parameters
    ----------
    prompt : str
        Text fed to the notebook-level ``tokenizer`` / ``model``.
    max_new_tokens : int, optional
        Upper bound on generated tokens (default 128, the value previously
        hard-coded).

    Returns
    -------
    str
        The decoded first sequence returned by ``model.generate`` with special
        tokens skipped; ``transliterate`` splits it on "### Output:".
    """
    # The model may still be in training mode after trainer.train(), which
    # keeps the LoRA dropout active; eval mode makes the output deterministic.
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # do_sample=False is spelled out so decoding does not depend on the
        # checkpoint's generation defaults and matches the evaluation cell.
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
        )
    return tokenizer.decode(output[0], skip_special_tokens=True)

def transliterate(text):
    """Transliterate Roman-script Hinglish into Devanagari with the tuned model.

    Parameters
    ----------
    text : str
        User-supplied Roman-script text.

    Returns
    -------
    str
        The model's answer with surrounding whitespace removed, or an empty
        string when nothing usable remains of ``text`` after normalisation.
    """
    # Normalise the input the same way the training inputs were normalised in
    # soft_clean_row (lower-case, restricted character set); the model has only
    # ever seen text in that form.
    roman = re.sub(r"[^a-z0-9\s'!?.,\-]", "", text.lower()).strip()
    if not roman:
        return ""

    # Reuse the training template with an empty output slot, so the instruction
    # wording at inference time is exactly what the model was fine-tuned on.
    prompt = make_prompt({"roman": roman, "hindi": ""})
    out = generate_text(prompt)

    # Keep what follows the prompt's own "### Output:" marker and stop at any
    # further "###" heading the model might emit afterwards.
    if "### Output:" in out:
        out = out.split("### Output:", 1)[1].split("###", 1)[0]
    return out.strip()

# Interactive prompt: type Roman-script text to transliterate, or "exit" /
# "quit" to stop.
while True:
    try:
        text = input("Hinglish: ").strip()
    except (EOFError, KeyboardInterrupt):
        # No more input, or the user interrupted the cell: leave the loop
        # cleanly instead of ending the notebook run with a traceback.
        break
    if text.lower() in ("exit", "quit"):
        break
    if not text:
        # Nothing typed; ask again rather than sending an empty prompt.
        continue
    print("Hindi:", transliterate(text))
