Install Dependencies

In [None]:
!pip install unsloth evaluate jiwer rouge-score sacrebleu

Phonetic Noise Augmentation Logic

In [None]:
import random
import re
import unicodedata
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from unsloth import FastLanguageModel
from unsloth.trainer import SFTTrainer
from transformers import TrainingArguments
import evaluate
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

# Substring -> replacement options used to imitate inconsistent romanized
# spellings. Keys are searched as substrings of each lowercased word.
PHONETIC_SWAPS = {
    "v": ["w"], "w": ["v"], "z": ["j"], "j": ["z"], "f": ["ph"], "ph": ["f"],
    "ee": ["i"], "i": ["ee"], "oo": ["u"], "u": ["oo"], "s": ["sh"], "sh": ["s"], "y": ["i"]
}
# Whole-word chat abbreviations; a word must match a key exactly to be replaced.
SLANG_MAP = {"hai": ["h", "ey"], "kya": ["ky"], "nahi": ["nhi", "nai"], "raha": ["rha"]}

def inject_phonetic_noise(text, p=0.3):
    """Return a noisy, re-cased copy of a romanized sentence.

    The text is split on whitespace and every word is lowercased, then passed
    through four independent random perturbations:

    1. whole-word slang substitution (probability ``p``),
    2. phonetic substring swaps (probability 0.2 per applicable swap),
    3. removal of one interior vowel for words longer than 4 characters
       (probability 0.15),
    4. re-casing: capitalized with probability 0.2, otherwise upper-cased
       with probability 0.05.

    Args:
        text: Sentence to perturb. Non-string values are returned unchanged.
        p: Probability of replacing a word found in ``SLANG_MAP``.

    Returns:
        The perturbed words joined by single spaces (original whitespace runs
        are therefore collapsed), or ``text`` itself when it is not a string.

    Note:
        Uses the global ``random`` module; seed it beforehand for
        reproducible output.
    """
    if not isinstance(text, str):
        return text

    augmented_words = []
    for w in text.split():
        low_word = w.lower()

        # 1. Slang substitution (exact whole-word match only).
        if low_word in SLANG_MAP and random.random() < p:
            low_word = random.choice(SLANG_MAP[low_word])

        # 2. Phonetic swaps. Only patterns present *before* any swap are
        #    eligible, so a swap cannot be undone or re-expanded by a later
        #    entry in the table (e.g. v -> w immediately followed by w -> v).
        pre_swap_word = low_word
        for char, variants in PHONETIC_SWAPS.items():
            if char in pre_swap_word and char in low_word and random.random() < 0.2:
                low_word = low_word.replace(char, random.choice(variants))

        # 3. Middle vowel drop. Work with positions rather than characters so
        #    the first and last letters are never the ones removed.
        if len(low_word) > 4 and random.random() < 0.15:
            middle_vowel_positions = [
                idx for idx in range(1, len(low_word) - 1) if low_word[idx] in "aeiou"
            ]
            if middle_vowel_positions:
                drop_at = random.choice(middle_vowel_positions)
                low_word = low_word[:drop_at] + low_word[drop_at + 1:]

        # 4. Capitalization noise, imitating proper nouns and shouted input.
        if random.random() < 0.2:
            low_word = low_word.capitalize()
        elif random.random() < 0.05:
            low_word = low_word.upper()

        augmented_words.append(low_word)

    return " ".join(augmented_words)

Data Loading & Complex Character Normalization

In [None]:
# Fetch the training split of the romanized-Hindi corpus and keep only the
# first 3000 rows as a pandas DataFrame.
ds = load_dataset(
    "sk-community/romanized_hindi",
    split="train",
)
df = ds.to_pandas().iloc[:3000]

def clean_and_normalize(row):
    """Turn one raw dataset row into a cleaned ``{"roman", "hindi"}`` pair.

    Args:
        row: Mapping (e.g. a DataFrame row) with the keys ``"Hindi"`` and
            ``"Transliterated Hindi"``.

    Returns:
        ``{"roman": ..., "hindi": ...}`` with both values stripped, or ``None``
        when either field is not a string or is empty after cleaning.
    """
    hindi_raw = row["Hindi"]
    roman_raw = row["Transliterated Hindi"]
    if not isinstance(hindi_raw, str) or not isinstance(roman_raw, str):
        return None

    # NFC composes decomposed sequences (base letter + combining mark) into a
    # single canonical form so visually identical strings compare equal.
    hindi = unicodedata.normalize('NFC', hindi_raw.strip())
    # Keep only the Devanagari block, ASCII digits, whitespace and "!" / "?".
    # The danda (U+0964) is already inside U+0900-U+097F, so it needs no
    # separate entry in the character class.
    hindi = re.sub(r"[^\u0900-\u097F0-9\s!?]", "", hindi).strip()
    roman = roman_raw.strip()

    # A pair with an empty side carries no training signal; drop it.
    if not hindi or not roman:
        return None
    return {"roman": roman, "hindi": hindi}

# Clean every row; rows rejected by clean_and_normalize (returned as None) are
# dropped before the remaining dicts are expanded into "roman"/"hindi" columns.
df = df.apply(clean_and_normalize, axis=1).dropna().apply(pd.Series).reset_index(drop=True)

# Generate multi-variants to increase dataset scale and entity robustness.
# inject_phonetic_noise draws from the global `random` module, so seed it here
# to make the augmented dataset identical across Restart & Run All.
random.seed(42)
df_noisy = df.copy()
df_noisy['roman'] = df_noisy['roman'].apply(inject_phonetic_noise)
df_final = pd.concat([df, df_noisy]).drop_duplicates().reset_index(drop=True)
print(f"Dataset expanded to {len(df_final)} rows.")

Model Setup and Phonetic Tokenization

In [None]:
from unsloth import FastLanguageModel

BASE_MODEL = "google/gemma-2b-it"

# Load the base checkpoint and its tokenizer through Unsloth
# (load_in_4bit=True, sequences up to 512 tokens).
model, tokenizer = FastLanguageModel.from_pretrained(
    BASE_MODEL,
    max_seq_length=512,
    load_in_4bit=True,
)

# Wrap the model with LoRA adapters (r=128, lora_alpha=128) on the attention
# and MLP projection layers listed below.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=128,
    lora_alpha=128,
    target_modules=lora_target_modules,
)

def tokenize(batch, max_length=128):
    """Tokenize a batch of (roman, hindi) pairs with the prompt masked out.

    Each example is rendered as ``"Hinglish: {roman}\\nHindi: {hindi}<eos>"``.
    Label positions that belong to the prompt are set to ``-100`` so the loss
    is computed only on the Devanagari target and the EOS token.

    Args:
        batch: Mapping with parallel lists under ``"roman"`` and ``"hindi"``
            (the shape ``Dataset.map(..., batched=True)`` provides).
        max_length: Truncation length, in tokens, for each example.

    Returns:
        ``{"input_ids": [...], "labels": [...]}`` where both lists hold one
        equally long token list per example.
    """
    input_ids, labels = [], []
    for r, h in zip(batch["roman"], batch["hindi"]):
        prompt_text = f"Hinglish: {r}\nHindi: "
        full_text = prompt_text + h + tokenizer.eos_token

        full_ids = tokenizer(full_text, truncation=True, max_length=max_length)["input_ids"]
        # Tokenize the prompt with the SAME special-token settings as the full
        # text. If the prompt were tokenized without special tokens while the
        # full text gets a leading BOS, the mask would be one position short
        # and the last prompt token would leak into the loss.
        # NOTE(review): assumes the tokenizer only *prepends* special tokens
        # (no automatic trailing EOS) - confirm for non-Gemma tokenizers.
        prompt_ids = tokenizer(prompt_text, truncation=True, max_length=max_length)["input_ids"]

        # Clamp in case truncation left the full sequence no longer than the prompt.
        prompt_len = min(len(prompt_ids), len(full_ids))
        # Mask prompt so model only learns to predict Devanagari sequence
        label = [-100] * prompt_len + full_ids[prompt_len:]

        input_ids.append(full_ids)
        labels.append(label)
    return {"input_ids": input_ids, "labels": labels}

# Prepare final training data
from datasets import DatasetDict

dataset = Dataset.from_pandas(df_final).shuffle(seed=42)

# df_final holds both the clean and the noisy romanization of each sentence.
# A row-level random split would place the same Hindi target in train and
# test, inflating the internal evaluation. Split on unique Hindi targets
# instead so every variant of a sentence ends up on the same side, and use a
# fixed seed so the split is reproducible.
unique_targets = df_final["hindi"].drop_duplicates().sample(frac=1.0, random_state=42)
n_test_targets = max(1, int(round(0.1 * len(unique_targets))))
test_targets = set(unique_targets.iloc[:n_test_targets])
is_test_row = df_final["hindi"].isin(test_targets)

split = DatasetDict({
    "train": Dataset.from_pandas(df_final[~is_test_row].reset_index(drop=True)).shuffle(seed=42),
    "test": Dataset.from_pandas(df_final[is_test_row].reset_index(drop=True)).shuffle(seed=42),
})
train_ds = split["train"].map(tokenize, batched=True, remove_columns=dataset.column_names)

Training and Evaluation

In [None]:
from unsloth.trainer import SFTTrainer
from transformers import TrainingArguments

# Trainer setup
from transformers import DataCollatorForSeq2Seq

# Pick the mixed-precision mode the GPU supports instead of hard-coding fp16:
# on bf16-capable GPUs the model may be loaded in bfloat16, where fp16=True is
# rejected or unstable.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer, train_dataset=train_ds,
    # train_ds is already tokenized and carries prompt-masked `labels`.
    # This collator pads input_ids and pads labels with -100, passing the
    # precomputed labels through unchanged.
    # NOTE(review): the default SFTTrainer collator may rebuild labels from
    # input_ids depending on the TRL version - confirm if removing this.
    data_collator=DataCollatorForSeq2Seq(
        tokenizer=tokenizer, padding=True, label_pad_token_id=-100
    ),
    args=TrainingArguments(
        output_dir="./hinglish_lora", per_device_train_batch_size=4,
        gradient_accumulation_steps=4, learning_rate=5e-5, num_train_epochs=2,
        fp16=not use_bf16, bf16=use_bf16,
        logging_steps=50, optim="adamw_8bit", report_to="none"
    ),
)
trainer.train()

Metric and Evaluation Utility Setup


In [None]:
import torch

def generate_text_fixed(roman):
    """Transliterate one romanized sentence with the fine-tuned model.

    The input is lowercased and wrapped in the same
    ``"Hinglish: ...\\nHindi: "`` prompt used for training. Decoding is greedy
    (``do_sample=False``) with a repetition penalty and a 3-gram repeat block.

    Args:
        roman: Romanized (Hinglish) input sentence.

    Returns:
        The newly generated text only (prompt removed), stripped and
        normalized to Unicode NFC so it can be compared directly with
        NFC-normalized references.
    """
    # Force lowercase as models often handle standardized case better for phonetics
    prompt = f"Hinglish: {roman.lower()}\nHindi: "
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id
        )

    # Extract newly generated tokens only
    input_len = inputs.input_ids.shape[1]
    decoded = tokenizer.decode(out[0][input_len:], skip_special_tokens=True).strip()
    # References are NFC-normalized before scoring; normalize predictions the
    # same way so canonically equivalent strings are not counted as errors.
    return unicodedata.normalize('NFC', decoded)

In [None]:
import evaluate
from sklearn.metrics import precision_recall_fscore_support
import unicodedata

def compute_all_metrics(preds, refs):
    """Score predicted transliterations against references.

    Args:
        preds: List of predicted sentences.
        refs: List of reference sentences, parallel to ``preds``.

    Returns:
        Dict with ``CER``, ``WER`` and ``chrF_Score`` (as returned by the
        ``evaluate`` metrics) plus position-aligned word-level
        ``Model_Word_Precision``, ``Model_Word_Recall`` and ``Model_Word_F1``.

    Word-level scores compare the i-th predicted word with the i-th reference
    word of each sentence:

    * precision = matching words / total predicted words
    * recall    = matching words / total reference words
    * F1        = harmonic mean of the two

    A binary classification score with an all-ones ground truth is not used
    because its precision is 1.0 whenever any word matches, regardless of how
    many predicted words are wrong.
    """
    # Normalize both sides identically so canonically equivalent Devanagari
    # sequences compare equal (idempotent if inputs are already NFC).
    preds = [unicodedata.normalize('NFC', p) for p in preds]
    refs = [unicodedata.normalize('NFC', r) for r in refs]

    # 1. Standard character and sequence metrics
    cer = evaluate.load("cer").compute(predictions=preds, references=refs)
    wer = evaluate.load("wer").compute(predictions=preds, references=refs)
    chrf = evaluate.load("chrf").compute(predictions=preds, references=refs)["score"]

    # 2. Position-aligned word matches, accumulated over all sentences.
    matched_words = 0
    total_pred_words = 0
    total_ref_words = 0
    for p_sent, r_sent in zip(preds, refs):
        p_words = p_sent.split()
        r_words = r_sent.split()
        total_pred_words += len(p_words)
        total_ref_words += len(r_words)
        # zip stops at the shorter sentence; surplus words on either side can
        # never match and only enlarge the corresponding denominator.
        matched_words += sum(1 for p, r in zip(p_words, r_words) if p == r)

    # 3. Word-level precision / recall / F1 (0.0 when a denominator is empty).
    prec = matched_words / total_pred_words if total_pred_words else 0.0
    rec = matched_words / total_ref_words if total_ref_words else 0.0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) else 0.0

    return {
        "CER": cer,
        "WER": wer,
        "chrF_Score": chrf,
        "Model_Word_Precision": prec,
        "Model_Word_Recall": rec,
        "Model_Word_F1": f1
    }

Evaluation on Internal Dataset

In [None]:
print("--- Internal Dataset Evaluation (Word-Level Recall/F1) ---")

# Evaluate on at most 50 shuffled examples from the held-out internal split.
internal_test = split["test"]
n_internal_eval = min(50, len(internal_test))
eval_subset_internal = internal_test.shuffle(seed=42).select(range(n_internal_eval))

preds_int, refs_int = [], []
for example in tqdm(eval_subset_internal, desc="Internal Eval"):
    # Greedy decoding with repetition penalty (see generate_text_fixed).
    preds_int.append(generate_text_fixed(example["roman"]))
    # References are NFC-normalized so character clusters compare correctly.
    refs_int.append(unicodedata.normalize('NFC', example["hindi"].strip()))

# Score the collected predictions and print each metric to four decimals.
internal_results = compute_all_metrics(preds_int, refs_int)

for name, score in internal_results.items():
    print(f"{name}: {score:.4f}")

Evaluation on External codebyam Dataset

In [None]:
from datasets import load_dataset
import unicodedata
from tqdm import tqdm

print("\n--- External Dataset Evaluation (codebyam - 50 rows) ---")

# Load the external dataset
ext_ds_full = load_dataset("codebyam/Hinglish-Hindi-Transliteration-Dataset", split="train")

# Select up to 50 rows for evaluation. The min() guard mirrors the other
# subset selections in this notebook and avoids an out-of-range selection if
# the dataset has fewer than 50 rows.
n_ext_eval = min(50, len(ext_ds_full))
eval_subset_ext = ext_ds_full.shuffle(seed=42).select(range(n_ext_eval))

preds_ext, refs_ext = [], []

for ex in tqdm(eval_subset_ext, desc="External Eval"):
    # Generate the transliteration (greedy search + repetition penalty).
    p = generate_text_fixed(ex["Hinglish"])

    # Normalize the reference to NFC so canonically equivalent Devanagari
    # sequences are compared as equal.
    r = unicodedata.normalize('NFC', ex["Hindi"].strip())

    preds_ext.append(p)
    refs_ext.append(r)

# Compute CER, WER, chrF and the word-level precision/recall/F1 scores.
external_results = compute_all_metrics(preds_ext, refs_ext)

for metric, value in external_results.items():
    print(f"{metric}: {value:.4f}")

Sample Predictions

In [None]:
print("--- Testing on 5 Samples from External Dataset (codebyam) ---")

# Take (at most) 5 shuffled rows from the external dataset.
n_ext_samples = min(5, len(ext_ds_full))
eval_subset_ext_5_samples = ext_ds_full.shuffle(seed=42).select(range(n_ext_samples))

preds_ext_5_samples, refs_ext_5_samples = [], []

for sample_no, example in enumerate(eval_subset_ext_5_samples, start=1):
    # Greedy generation via generate_text_fixed (do_sample=False).
    predicted = generate_text_fixed(example["Hinglish"])
    # NFC-normalize the reference for a fair comparison of character clusters.
    actual = unicodedata.normalize('NFC', example["Hindi"].strip())

    preds_ext_5_samples.append(predicted)
    refs_ext_5_samples.append(actual)

    print(f"\nSample {sample_no}:")
    print(f"Roman (Input): {example['Hinglish']}")
    print(f"Predicted Hindi: {predicted}")
    print(f"Actual Hindi:    {actual}")

In [None]:
print("--- Testing on 5 Samples from Internal Test Split ---")

# Take (at most) 5 shuffled rows from the internal test split.
internal_test_rows = split["test"]
n_internal_samples = min(5, len(internal_test_rows))
eval_subset_5_samples = internal_test_rows.shuffle(seed=42).select(range(n_internal_samples))

preds_5_samples, refs_5_samples = [], []

for sample_no, example in enumerate(eval_subset_5_samples, start=1):
    # Greedy generation via generate_text_fixed (do_sample=False).
    predicted = generate_text_fixed(example["roman"])
    # NFC-normalize the reference for consistent character clustering.
    actual = unicodedata.normalize('NFC', example["hindi"].strip())

    preds_5_samples.append(predicted)
    refs_5_samples.append(actual)

    print(f"\nSample {sample_no}:")
    print(f"Roman (Input): {example['roman']}")
    print(f"Predicted Hindi: {predicted}")
    print(f"Actual Hindi:    {actual}")

Performance Comparison

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Tick labels for the five plotted metrics, in plotting order.
plot_metrics = [
    "CER\n(Lower is Better)",
    "WER\n(Lower is Better)",
    "chrF / 100\n(Higher is Better)",
    "Word Recall\n(Higher is Better)",
    "Word F1\n(Higher is Better)"
]


def _metric_row(results):
    """Pull the five plotted values out of one results dict, in tick order."""
    return [
        results["CER"],
        results["WER"],
        results["chrF_Score"] / 100,  # Normalized to 0.0-1.0 scale
        results["Model_Word_Recall"],
        results["Model_Word_F1"],
    ]


# Read the values from the evaluation dictionaries; if a metric key is
# missing, report it and fall back to all-zero bars for both datasets.
try:
    internal_plot_values = _metric_row(internal_results)
    external_plot_values = _metric_row(external_results)
except KeyError as e:
    print(f"Error: Metric {e} not found. Ensure compute_all_metrics has run for both datasets.")
    internal_plot_values = [0] * 5
    external_plot_values = [0] * 5

x = np.arange(len(plot_metrics))
width = 0.35

# Grouped bars: internal dataset to the left of each tick, external to the right.
fig, ax = plt.subplots(figsize=(12, 7))
rects1 = ax.bar(
    x - width/2, internal_plot_values, width,
    label='Internal Dataset (sk-community/romanized_hindi)',
    color='#4C72B0',
)
rects2 = ax.bar(
    x + width/2, external_plot_values, width,
    label='External Dataset (codebyam/Hinglish-Hindi-Transliteration-Dataset)',
    color='#9BBBEA',
)

# Axis labels, title, ticks, legend and a dashed horizontal grid.
ax.set_ylabel('Metric Score (Normalized 0.0 - 1.0)')
ax.set_title('Performance Comparison ', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(plot_metrics, fontsize=10)
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)
# Dynamic value labels on top of bars
def autolabel(rects):
    """Write each bar's height (4 decimals) just above its top centre.

    Args:
        rects: Iterable of bar objects exposing ``get_height()``, ``get_x()``
            and ``get_width()``. Labels are drawn on the module-level ``ax``.
    """
    for bar in rects:
        bar_height = bar.get_height()
        # Anchor at the horizontal centre of the bar, at its top edge.
        top_centre = (bar.get_x() + bar.get_width() / 2, bar_height)
        ax.annotate(
            f'{bar_height:.4f}',
            xy=top_centre,
            xytext=(0, 3),  # nudge the text 3 points upward
            textcoords="offset points",
            ha='center',
            va='bottom',
            fontsize=9,
        )

autolabel(rects1)
autolabel(rects2)

# CER and WER are error rates and can exceed 1.0. Scale the y-axis to the
# tallest bar (never below the original 0-1.1 range) so bars and their value
# labels are not clipped.
tallest_bar = max([1.0, *internal_plot_values, *external_plot_values])
plt.ylim(0, tallest_bar * 1.1)
plt.tight_layout()
plt.show()