<a href="https://colab.research.google.com/github/Dhruv3110/Roman-to-Devanagari-Transliteration-System/blob/main/transliteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load Dataset -> Pandas

In [7]:
!pip install nbstripout




In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
!pip install datasets unsloth jiwer evaluate sacrebleu

In [None]:
from huggingface_hub import login
from datasets import load_dataset, Dataset
import pandas as pd
import re
import numpy as np
import torch
from unsloth import FastLanguageModel
import os
from transformers import TrainingArguments
from unsloth.trainer import SFTTrainer
from jiwer import wer, cer
import evaluate
import numpy as np
from tqdm import tqdm
tqdm = lambda x, **kwargs: x


login()

ds = load_dataset("codebyam/Hinglish-Hindi-Transliteration-Dataset", split="train")
df = ds.to_pandas()
df


Rename columns

In [2]:
df = df.rename(columns={'Hinglish': 'Roman'})
print(df.head())

                           Roman                      Hindi
0  haan,aaj ka match kaisa raha?   हाँ, आज का मैच कैसा रहा?
1        kya game khel rahi thi?       क्या गेम खेल रही थी?
2       abey,kaunsa level pe ho?      अबे कौनसा लेवल पर हो?
3       kya party mein aana hai?    क्या पार्टी में आना है?
4    abey,kya game khel rahi ho?  अबे, क्या गेम खेल रही हो?


SOFT CLEANING

In [3]:
def soft_clean_row(row):
    roman, hindi = row["Roman"], row["Hindi"]

    if not isinstance(roman, str) or not isinstance(hindi, str):
        return None

    roman = roman.lower().strip()
    roman = re.sub(r"[^a-z0-9\s'!?.,\-]", "", roman)

    hindi = hindi.strip()
    hindi = re.sub(r"[^\u0900-\u097F\s।!?]", "", hindi)

    if not roman or not hindi:
        return None

    return {"roman": roman, "hindi": hindi}

df = df.apply(soft_clean_row, axis=1).dropna().apply(pd.Series)
print("After cleaning:", len(df))


After cleaning: 1999


NOISE DETECTION

In [4]:
HINGLISH_SLANG = {
    "m","me","mai","main","yr","yaar","bro","bruh","kr","krr","kya","kyu","kyun",
    "h","hu","hun","rha","rhi","rhe","plz","pls","sry","thx","bht","lol","lmao"
}

HINGLISH_CONSONANT_TOKENS = {"kl","tb","jb","thk","hn","thn","kch","smj"}

def noise_score(roman):
    score = 0
    tokens = roman.split()

    score += bool(re.search(r"(.)\1{2,}", roman))
    score += sum(t in HINGLISH_SLANG for t in tokens)
    score += sum(t in HINGLISH_CONSONANT_TOKENS for t in tokens)
    score += sum(re.fullmatch(r"[bcdfghjklmnpqrstvwxyz]+", t) is not None for t in tokens)
    score += bool(re.search(r"[a-z]\d[a-z]", roman))
    score += bool(re.search(r"([!?.,])\1{1,}", roman))

    return score

df["noise_score"] = df["roman"].apply(noise_score)
df["is_noisy"] = (df["noise_score"] > 0).astype(int)

print("Noise ratio:", df["is_noisy"].mean())


Noise ratio: 0.22911455727863933


DATA AUGMENTATION (Clean + Controlled)

In [5]:
def augment_pair(roman, hindi, noise):
    pairs = [(roman, hindi)]
    rules = {
        "aa": ["a"], "ee": ["i"], "oo": ["u"],
        "sh": ["s"], "ch": ["c"], "th": ["t"]
    }

    for k, vs in rules.items():
        if k in roman:
            for v in vs:
                pairs.append((roman.replace(k, v), hindi))

    max_rep = 2 + min(noise, 2)
    r2 = re.sub(r"([aeiou])", r"\1"*max_rep, roman, count=1)
    if r2 != roman:
        pairs.append((r2, hindi))

    return pairs

augmented = []
for r, h, nz in df[["roman","hindi","noise_score"]].values:
    augmented.extend(augment_pair(r, h, nz))

df = pd.DataFrame(augmented, columns=["roman", "hindi"])
print("After augmentation:", len(df))
df

After augmentation: 6881


Unnamed: 0,roman,hindi
0,"haan,aaj ka match kaisa raha?",हाँ आज का मैच कैसा रहा?
1,"han,aj ka match kaisa raha?",हाँ आज का मैच कैसा रहा?
2,"haan,aaj ka matc kaisa raha?",हाँ आज का मैच कैसा रहा?
3,"haaan,aaj ka match kaisa raha?",हाँ आज का मैच कैसा रहा?
4,kya game khel rahi thi?,क्या गेम खेल रही थी?
...,...,...
6876,ye website mobile-friendly nahi hai,ये वेबसाइट मोबाइलफ्रेंडली नहीं है
6877,yee website mobile-friendly nahi hai,ये वेबसाइट मोबाइलफ्रेंडली नहीं है
6878,mujhe tech blog likhna start karna hai,मुझे टेक ब्लॉग लिखना स्टार्ट करना है
6879,mujhe tec blog likhna start karna hai,मुझे टेक ब्लॉग लिखना स्टार्ट करना है


PROMPT CREATION

In [6]:
def make_prompt(row):
    return f"""### Instruction:
Convert Hinglish (Roman Hindi) into Hindi (Devanagari script).
Be robust to slang, shortcuts, repeated letters & noisy spellings.

### Input:
{row['roman']}

### Output:
{row['hindi']}"""

df["prompt"] = df.apply(make_prompt, axis=1)
print(df.iloc[0]["prompt"])


### Instruction:
Convert Hinglish (Roman Hindi) into Hindi (Devanagari script).
Be robust to slang, shortcuts, repeated letters & noisy spellings.

### Input:
haan,aaj ka match kaisa raha?

### Output:
हाँ आज का मैच कैसा रहा?


Convert BACK to HF Dataset (only now)

In [7]:
from datasets import Dataset

train_ds = Dataset.from_pandas(df, preserve_index=False)


Load Model + LoRA

In [8]:
from unsloth import FastLanguageModel

BASE_MODEL = "google/gemma-2b-it"

model, tokenizer = FastLanguageModel.from_pretrained(
    BASE_MODEL,
    max_seq_length=512,
    load_in_4bit=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)

print("Gemma 2B + LoRA loaded")


==((====))==  Unsloth 2025.12.5: Fast Gemma patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.07G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.12.5 patched 18 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Gemma 2B + LoRA loaded


Tokenization

In [9]:
def tokenize(batch):
    return tokenizer(
        batch["prompt"],
        truncation=True,
        padding="max_length",
        max_length=256
    )

tokenized = train_ds.map(tokenize, batched=True)
tokenized = tokenized.remove_columns(
    [c for c in tokenized.column_names if c not in ["input_ids", "attention_mask"]]
)

print("Tokenization DONE")


Map:   0%|          | 0/6881 [00:00<?, ? examples/s]

Tokenization DONE


TRAINING

In [10]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments
from unsloth.trainer import SFTTrainer

args = TrainingArguments(
    output_dir="./gemma2b_hinglish_noise_aug",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=5,
    fp16=True,
    logging_steps=20,
    save_steps=300,
    warmup_ratio=0.1,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=tokenized,
    dataset_text_field="prompt",
)

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 6,881 | Num Epochs = 5 | Total steps = 4,305
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 3,686,400 of 2,509,858,816 (0.15% trained)


Step,Training Loss
20,3.4963
40,2.9042
60,2.1049
80,1.6829
100,1.0902
120,0.7455
140,0.5889
160,0.4495
180,0.3957
200,0.3891


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using

TrainOutput(global_step=4305, training_loss=0.2460044237379413, metrics={'train_runtime': 7422.1463, 'train_samples_per_second': 4.635, 'train_steps_per_second': 0.58, 'total_flos': 1.0492963418800128e+17, 'train_loss': 0.2460044237379413, 'epoch': 5.0})

SAVE MODEL

In [11]:
save_path = "gemma2b_hinglish_lora"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved at:", save_path)


Model saved at: gemma2b_hinglish_lora


SMALL EVAL SET + TEXT GENERATION


In [12]:
# Take 200 samples for quick evaluation
eval_df = df.sample(n=min(200, len(df)), random_state=42)
eval_ds = Dataset.from_pandas(eval_df, preserve_index=False)
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [13]:
bleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")

preds, refs = [], []

for ex in tqdm(eval_ds, desc="Evaluating"):
    prompt = f"""### Instruction:
Convert Hinglish (Roman Hindi) into Hindi (Devanagari).
Handle slang, shortcuts & noisy spellings.

### Input:
{ex['roman']}

### Output:
"""

    gen = generate_text(prompt)

    # Extract only model output
    if "### Output:" in gen:
        gen = gen.split("### Output:")[-1].strip()
    else:
        gen = gen.strip()

    preds.append(gen)
    refs.append(ex["hindi"])


cer_score = cer(refs, preds)
wer_score = wer(refs, preds)
chrf_score = chrf.compute(predictions=preds, references=refs)["score"]
bleu_score = bleu.compute(
    predictions=preds,
    references=[[r] for r in refs]
)["score"]

exact_match = np.mean([p == r for p, r in zip(preds, refs)])
sentence_char_acc = np.mean([
    1 - cer([r], [p]) for p, r in zip(preds, refs)
])
print("===== Evaluation Results =====")
print(f"CER  : {cer_score:.4f}")
print(f"WER  : {wer_score:.4f}")
print(f"chrF : {chrf_score:.2f}")
print(f"BLEU : {bleu_score:.2f}")
print(f"Exact Match Accuracy: {exact_match:.4f}")
print("Sentence-level Char Accuracy:", sentence_char_acc)


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Evaluating: 100%|██████████| 200/200 [07:59<00:00,  2.40s/it]

===== Evaluation Results =====
CER  : 0.0428
WER  : 0.0637
chrF : 92.57
BLEU : 87.15
Exact Match Accuracy: 0.6400
Sentence-level Char Accuracy: 0.9521991508414283





INFERENCE

In [14]:
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=128)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def transliterate(text):
    prompt = f"""### Instruction:
Convert Hinglish to Hindi (Devanagari).
Handle slang and noisy spellings.

### Input:
{text}

### Output:
"""
    out = generate_text(prompt)
    return out.split("### Output:")[-1].strip()

while True:
    text = input("Hinglish: ")
    if text.lower() in ["exit", "quit"]:
        break
    print("Hindi:", transliterate(text))


Hinglish: Kese hain aap log
Hindi: केसे हैं आप लोग
Hinglish: mein aacha hu'
Hindi: मैं अच्छा हूँ
Hinglish: exit
