<a href="https://colab.research.google.com/github/23Amansharma/Multi_Lang_Translater_ibm/blob/main/modified.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
# Advanced Multilingual Translator ‚Äì Gradio App (Upgraded)
# ------------------------------------------------------
# Key upgrades vs your version
# 1) Model switcher: M2M100 418M (default) or NLLB-200 distilled 600M
# 2) Accurate language detection with probabilities (detect_langs) + heuristics
# 3) Roman Hindi handling: ASCII Hindi -> transliterate to Devanagari (ITRANS) before translate
# 4) Device-aware inference (CUDA/CPU/MPS) + half precision where safe
# 5) Sentence-wise batching for long texts (preserves newlines), faster & fewer truncations
# 6) Optional user Glossary ("source=target" per line) applied after translation
# 7) SRT subtitle translate ‚Äì keep timestamps, export translated .srt
# 8) Better history: timestamped, exportable CSV
# 9) Cleaner UI with Settings tab, swap, clear, and realtime (debounced) translation
# 10) Safer error handling & input validation

# ‚úÖ Install (uncomment if needed in fresh env)
!pip install transformers sentencepiece gradio langdetect indic-transliteration gtts torch --quiet

import os
import re
import io
import csv
import time
import json
from datetime import datetime
from typing import List, Tuple, Dict

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)
from langdetect import detect, detect_langs, DetectorFactory
from indic_transliteration import sanscript
import gradio as gr
from gtts import gTTS

# --------------------
# Determinism for langdetect
# --------------------
DetectorFactory.seed = 0

# Avoid accidental HF private token pickup
os.environ.pop("HUGGINGFACE_TOKEN", None)

# --------------------
# Device selection
# --------------------
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")

# --------------------
# Models supported & language code mapping
# --------------------
# We expose simple ISO-ish keys to the UI. Internally we map to model-specific codes.
LANGUAGE_MAP: Dict[str, Tuple[str, str]] = {
    "en": ("English", "en"),
    "hi": ("Hindi", "hi"),
    "fr": ("French", "fr"),
    "de": ("German", "de"),
    "es": ("Spanish", "es"),
    "zh": ("Chinese", "zh"),
    "ja": ("Japanese", "ja"),
    "ko": ("Korean", "ko"),
    "mr": ("Marathi", "mr"),
    "gu": ("Gujarati", "gu"),
    "ta": ("Tamil", "ta"),
    "ml": ("Malayalam", "ml"),
}

GENERIC_CODES = list(LANGUAGE_MAP.keys())
FULL_NAMES = [v[0] for v in LANGUAGE_MAP.values()]
SRC_CHOICES = ["Auto Detect"] + FULL_NAMES
TGT_CHOICES = FULL_NAMES

# Model registry with per-model language code mapping
MODEL_REGISTRY = {
    "facebook/m2m100_418M": {
        "type": "m2m",
        "lang_map": {
            # Same codes as our GENERIC_CODES
            "en": "en",
            "hi": "hi",
            "fr": "fr",
            "de": "de",
            "es": "es",
            "zh": "zh",
            "ja": "ja",
            "ko": "ko",
            "mr": "mr",
            "gu": "gu",
            "ta": "tam_Taml",
            "ml": "mal_Mlym",
        },
    },
    "facebook/nllb-200-distilled-600M": {
        "type": "nllb",
        "lang_map": {
            "en": "eng_Latn",
            "hi": "hin_Deva",
            "fr": "fra_Latn",
            "de": "deu_Latn",
            "es": "spa_Latn",
            "zh": "zho_Hans",
            "ja": "jpn_Jpan",
            "ko": "kor_Hang",
            "mr": "mar_Deva",
            "gu": "guj_Gujr",
            "ta": "tam_Taml",
            "ml": "mal_Mlym",
        },
    },
}

DEFAULT_MODEL_NAME = "facebook/nllb-200-distilled-600M"

# Global state (simple demo; in prod prefer a class)
MODEL_NAME = DEFAULT_MODEL_NAME
TOKENIZER = None
MODEL = None
HISTORY: List[str] = []

# --------------------
# Utilities
# --------------------

def load_model(model_name: str):
    global MODEL_NAME, TOKENIZER, MODEL
    MODEL_NAME = model_name
    TOKENIZER = AutoTokenizer.from_pretrained(model_name)
    MODEL = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    MODEL.to(DEVICE)
    if DEVICE.type != 'cpu': # Apply half precision for non-CPU devices
        MODEL.half()
    MODEL.eval()

# Initially load default
load_model(DEFAULT_MODEL_NAME)


def generic_to_model_code(generic: str) -> str:
    mapper = MODEL_REGISTRY[MODEL_NAME]["lang_map"]
    if generic not in mapper:
        raise ValueError(f"Language '{generic}' not supported by {MODEL_NAME}")
    return mapper[generic]


def fullname_to_generic(full: str) -> str:
    for code, (name, _) in LANGUAGE_MAP.items():
        if name == full:
            return code
    raise ValueError("Invalid language name")


def detect_language_probs(text: str) -> Tuple[str, float]:
    """Return best ISO code guess + probability [0..1]."""
    try:
        candidates = detect_langs(text)
        # candidates: [en:0.99, fr:0.01]
        best = max(candidates, key=lambda x: x.prob)
        code = best.lang
        prob = float(best.prob)
        # Normalize to the set we know; if unknown, fallback to en.
        if code not in GENERIC_CODES:
            # Map some common aliases
            alias = (
                {
                    "zh-cn": "zh",
                    "zh-tw": "zh",
                    "pt": "es",  # crude fallback
                }.get(code, None)
            )
            code = alias or (code if code in GENERIC_CODES else "en")
        return code, prob
    except Exception:
        return "en", 0.5


def looks_like_roman_hindi(text: str) -> bool:
    # Heuristic: mostly ASCII letters/spaces and contains common Hindi words spelled in Latin
    ascii_ratio = sum(ch.isascii() for ch in text) / max(len(text), 1)
    hints = ["hai", "nahi", "kya", "kaise", "mera", "tum", "bhai", "bhoot", "bahut", "kr", "hai."]
    hit = any(h in text.lower() for h in hints)
    return ascii_ratio > 0.95 and hit

def roman_hindi_to_deva(text: str) -> str:
    # Dictionary for common Hinglish to correct Devanagari direct mapping
    # This handles specific phonetic nuances that strict ITRANS/IAST might miss.
    replacements = {
        "aur": "‡§î‡§∞",
        "bhai": "‡§≠‡§æ‡§à",
        "sab": "‡§∏‡§¨",
        "theek": "‡§†‡•Ä‡§ï",
        "kya": "‡§ï‡•ç‡§Ø‡§æ",
        "hain": "‡§π‡•à‡§Ç",
        "hai": "‡§π‡•à",
        "nahi": "‡§®‡§π‡•Ä‡§Ç",
        "kaise": "‡§ï‡•à‡§∏‡•á",
        "ho": "‡§π‡•ã",
        "log": "‡§≤‡•ã‡§ó",
        "aaye": "‡§Ü‡§Ø‡•á",
        "pohuncha": "‡§™‡§π‡•Å‡§Å‡§ö‡§æ",
        "waqt": "‡§µ‡§ï‡§º‡•ç‡§§",
        "ko": "‡§ï‡•ã",
        "mein": "‡§Æ‡•á‡§Ç", # for 'in'
        "mai": "‡§Æ‡•à‡§Ç",  # for 'I'
        "tera": "‡§§‡•á‡§∞‡§æ",
        "mera": "‡§Æ‡•á‡§∞‡§æ",
        "tum": "‡§§‡•Å‡§Æ",
        "aap": "‡§Ü‡§™",
        "hum": "‡§π‡§Æ",
        "kahan": "‡§ï‡§π‡§æ‡§Å",
        "kahaan": "‡§ï‡§π‡§æ‡§Å",
    }

    processed_text = text

    # Apply direct Devanagari replacements for common words first
    # Use word boundaries (\b) and ignore case (re.IGNORECASE) for robust matching
    for roman, deva in replacements.items():
        processed_text = re.sub(r'\b' + re.escape(roman) + r'\b', deva, processed_text, flags=re.IGNORECASE)

    # Then, attempt transliteration for the remaining parts of the text
    try:
        # ITRANS is generally good for Romanized Hindi, but might be inconsistent without explicit handling
        return sanscript.transliterate(processed_text, sanscript.ITRANS, sanscript.DEVANAGARI)
    except Exception:
        try:
            # Fallback to IAST if ITRANS fails, though IAST is very strict
            return sanscript.transliterate(processed_text, sanscript.IAST, sanscript.DEVANAGARI)
        except Exception:
            # If both fail, return the processed text (which may contain mixed scripts now)
            return processed_text


def split_text_preserve_newlines(text: str) -> List[str]:
    """Split into manageable chunks on sentence boundaries, keeping newlines."""
    # Split paragraphs by double newline
    paras = re.split(r"(\n\n+)", text)
    pieces = []
    for part in paras:
        if part.startswith("\n"):
            pieces.append(part)
            continue
        # Split sentences within the paragraph.
        # This is naive but effective for most Latin/Devanagari scripts.
        sentences = re.split(r"(?<=[.!?‡•§])\s+", part)
        pieces.extend(sentences)
    return [p for p in pieces if p != ""]


def batch(iterable, n=8):
    temp = []
    for item in iterable:
        temp.append(item)
        if len(temp) == n:
            yield temp
            temp = []
    if temp:
        yield temp


def translate_batch(
    texts: List[str],
    src_code_generic: str,
    tgt_code_generic: str,
    num_beams: int,
    max_new_tokens: int,
) -> List[str]:
    if not texts:
        return []
    src_code = generic_to_model_code(src_code_generic)
    tgt_code = generic_to_model_code(tgt_code_generic)

    # M2M uses tokenizer.src_lang; NLLB uses forced_bos_token_id with tgt token
    model_type = MODEL_REGISTRY[MODEL_NAME]["type"]

    outputs = []
    with torch.no_grad():
        for chunk in batch(texts, n=8):
            if model_type == "m2m":
                TOKENIZER.src_lang = src_code
                enc = TOKENIZER(
                    chunk,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512,
                ).to(DEVICE)
                gen = MODEL.generate(
                    **enc,
                    forced_bos_token_id=TOKENIZER.get_lang_id(tgt_code),
                    num_beams=num_beams,
                    early_stopping=True,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                )
            else:  # nllb
                # For NLLB we set src_lang via tokenizer, and target by forced_bos_token_id
                if hasattr(TOKENIZER, "src_lang"):
                    TOKENIZER.src_lang = src_code
                enc = TOKENIZER(
                    chunk,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=512,
                ).to(DEVICE)
                bos = TOKENIZER.convert_tokens_to_ids(tgt_code)
                gen = MODEL.generate(
                    **enc,
                    forced_bos_token_id=bos,
                    num_beams=num_beams,
                    early_stopping=True,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                )
            dec = TOKENIZER.batch_decode(gen, skip_special_tokens=True)
            outputs.extend(dec)
    return outputs


def add_history(entry: str):
    HISTORY.append(entry)
    # keep last 50
    if len(HISTORY) > 50:
        del HISTORY[: len(HISTORY) - 50]


# --------------------
# Core translate function used by UI
# --------------------

def translate_controller(
    text: str, src_full: str, tgt_full: str, num_beams: int, max_new_tokens: int, realtime=False
):
    if not text.strip():
        return "", "Confidence: 0.00", None, preview_history()

    # Resolve full names -> generic codes
    if src_full == "Auto Detect":
        guessed, prob = detect_language_probs(text)
        src_generic = guessed
        conf = prob
        # Roman Hindi handling
        if guessed == "hi" and looks_like_roman_hindi(text):
            text_proc = roman_hindi_to_deva(text)
        else:
            text_proc = text
        src_name = LANGUAGE_MAP[src_generic][0]
    else:
        src_generic = fullname_to_generic(src_full)
        conf = 1.0
        # If user claims Hindi but it's Roman, help anyway
        text_proc = (
            roman_hindi_to_deva(text)
            if (src_generic == "hi" and looks_like_roman_hindi(text))
            else text
        )
        src_name = src_full

    tgt_generic = fullname_to_generic(tgt_full)

    if src_generic == tgt_generic:
        out = text_proc
        add_history(
            json.dumps(
                {
                    "ts": datetime.now().strftime("%H:%M:%S"),
                    "src": src_name,
                    "tgt": tgt_full,
                    "inp": text,
                    "out": out,
                }
            )
        )
        return out, f"Confidence: {conf:.2f}", tts_audio(out, tgt_generic), preview_history()

    # Split & batch translate
    parts = split_text_preserve_newlines(text_proc)
    # Merge very short pieces together to reduce overhead
    merged: List[str] = []
    buf = []
    size = 0
    for p in parts:
        if p.strip() == "":
            merged.append(p)
            continue
        size += len(p)
        buf.append(p)
        if size > 800:  # rough packing threshold
            merged.append(" ".join(buf))
            buf, size = [], 0
    if buf:
        merged.append(" ".join(buf))

    translations: List[str] = []
    for chunk in merged:
        if chunk.strip() == "":
            translations.append(chunk)
        else:
            chunk_out = translate_batch( [
                chunk
            ],
                src_generic,
                tgt_generic,
                num_beams=num_beams,
                max_new_tokens=max_new_tokens,
            )[0]
            translations.append(chunk_out)

    final = "\n".join([seg for seg in translations])

    add_history(
        json.dumps(
            {
                "ts": datetime.now().strftime("%H:%M:%S"),
                "src": src_name,
                "tgt": tgt_full,
                "inp": text,
                "out": final,
            }
        )
    )

    return final, f"Confidence: {conf:.2f}", tts_audio(final, tgt_generic), preview_history()


def preview_history() -> str:
    if not HISTORY:
        return "### Recent Translations\nNo translations yet."
    lines = ["### Recent Translations"]
    for item in HISTORY[-10:]:
        try:
            obj = json.loads(item)
            lines.append(
                f"[{obj['ts']}] {obj['src']} ‚Üí {obj['tgt']}: {obj['inp'][:60]} ‚Üí {obj['out'][:60]}..."
            )
        except Exception:
            pass
    return "\n".join(lines)


def tts_audio(text: str, tgt_generic: str):
    try:
        tts_code = LANGUAGE_MAP.get(tgt_generic, ("", "en"))[1]
        tts = gTTS(text=text, lang=tts_code)
        fp = io.BytesIO()
        tts.write_to_fp(fp)
        fp.seek(0)
        return fp.read()
    except Exception:
        return None


# --------------------
# SRT subtitle translation helpers
# --------------------
SRT_BLOCK = re.compile(
    r"""
(?P<idx>\d+)\s*\n
(?P<ts>\d{2}:\d{2}:\d{2},\d{3}\s+-->\s+\d{2}:\d{2}:\d{2},\d{3})\s*\n
(?P<text>(?:.*(?:\n|$))+?)

(?=\n\d+\s*\n|\Z)
""",
    re.VERBOSE,
)


def parse_srt(data: str):
    items = []
    for m in SRT_BLOCK.finditer(data.strip() + "\n\n"):
        items.append(
            {
                "idx": int(m.group("idx")),
                "ts": m.group("ts"),
                "text": m.group("text").strip(),
            }
        )
    return items


def render_srt(items) -> str:
    out = []
    for it in items:
        out.append(str(it["idx"]))
        out.append(it["ts"])
        out.append(it["text"])
        out.append("")
    return "\n".join(out).strip() + "\n"


def translate_srt_bytes(srt_bytes: bytes, src_full: str, tgt_full: str, num_beams: int, max_new_tokens: int) -> Tuple[str, bytes]:
    text = srt_bytes.decode("utf-8", errors="ignore")
    items = parse_srt(text)
    src_generic = fullname_to_generic(src_full) if src_full != "Auto Detect" else None
    tgt_generic = fullname_to_generic(tgt_full)

    # Detect per file (coarse)
    if src_generic is None:
        guess, _ = detect_language_probs("\n".join(it["text"] for it in items[:20]))
        src_generic = guess

    # Roman Hindi per-line
    lines = [
        it["text"] for it in items
    ]
    proc = [
        roman_hindi_to_deva(x)
        if (src_generic == "hi" and looks_like_roman_hindi(x))
        else x
        for x in lines
    ]

    outs = translate_batch(proc, src_generic, tgt_generic, num_beams=num_beams, max_new_tokens=max_new_tokens)

    for i, it in enumerate(items):
        it["text"] = outs[i]

    translated_srt = render_srt(items)
    return translated_srt, translated_srt.encode("utf-8")


# --------------------
# Gradio UI
# --------------------
css = """
body { background: linear-gradient(to bottom right, #e0f7fa, #b2ebf2); font-family: 'Inter', system-ui, -apple-system, Segoe UI, Roboto, sans-serif; }
.gradio-container { background-color: #ffffff; border-radius: 16px; padding: 22px; box-shadow: 0 10px 24px rgba(0,0,0,0.08); max-width: 1100px; margin: 20px auto; }
h1 { color: #0288d1; text-align: center; margin-bottom: 6px; font-size: 2.2rem; }
.subtitle { text-align:center; color:#4f5b66; margin-bottom:18px; }
#output-area { background:#f6f8fa; border:1px solid #e5e7eb; border-radius:12px; padding:14px; }
#history { background:#fbfcfd; border:1px solid #eef2f7; border-radius:12px; padding:10px; max-height:220px; overflow:auto; font-size:0.92em; }
"""

demo = gr.Blocks(theme=gr.themes.Soft(), css=css)

with demo:
    gr.Markdown("# üåç Advanced Multilingual Translator")
    gr.Markdown("<div class='subtitle'>Better detection ‚Ä¢ Roman Hindi support ‚Ä¢ SRT translate ‚Ä¢ Glossary ‚Ä¢ Fast batching</div>")

    with gr.Tab("Translate Text"):
        with gr.Row():
            src_dd = gr.Dropdown(choices=SRC_CHOICES, value="Auto Detect", label="Source Language")
            swap_btn = gr.Button("üîÑ Swap")
            tgt_dd = gr.Dropdown(choices=TGT_CHOICES, value="Hindi", label="Target Language")
        text_in = gr.Textbox(label="Enter text", lines=6, placeholder="Type or paste text here... (e.g., aur bhai kaise ho)")
        with gr.Row():
            realtime_cb = gr.Checkbox(label="Realtime translate while typing (debounced)", value=False)
            translate_btn = gr.Button("Translate", variant="primary")
        with gr.Row():
            text_out = gr.Textbox(label="Translation", lines=6, interactive=False, elem_id="output-area")
        with gr.Row():
            conf_md = gr.Markdown("Confidence: 0.00")
            tts_audio_out = gr.Audio(label="Pronunciation", interactive=False)
            copy_btn = gr.Button("Copy ‚Üí same box")
        history_md = gr.Markdown("### Recent Translations\nNo translations yet.", elem_id="history")
        with gr.Row():
            clear_hist = gr.Button("üßπ Clear History")

    with gr.Tab("Translate SRT (Subtitles)"):
        with gr.Row():
            srt_src = gr.Dropdown(choices=SRC_CHOICES, value="Auto Detect", label="SRT Source Language")
            srt_tgt = gr.Dropdown(choices=TGT_CHOICES, value="Hindi", label="SRT Target Language")
        srt_in = gr.File(label="Upload .srt", file_types=[".srt"])
        run_srt = gr.Button("Translate SRT")
        srt_preview = gr.Textbox(label="Preview (first blocks)", lines=10)
        srt_file_out = gr.File(label="Download translated .srt")

    with gr.Tab("Settings"):
        model_dd = gr.Dropdown(
            choices=list(MODEL_REGISTRY.keys()),
            value=DEFAULT_MODEL_NAME,
            label="Translation Model",
        )
        beam_slider = gr.Slider(minimum=1, maximum=10, value=1, step=1, label="Number of Beams (for quality/speed trade-off)")
        max_tokens_slider = gr.Slider(minimum=32, maximum=512, value=128, step=32, label="Max New Tokens (for translation length/speed)")
        info_md = gr.Markdown("Using **AutoTokenizer/AutoModel**. GPU/MPS used: **%s**" % ("CUDA" if DEVICE.type=="cuda" else ("MPS" if DEVICE.type=="mps" else "CPU")))

    # --- Callbacks ---
    def do_translate(text, src_full, tgt_full, num_beams, max_new_tokens, realtime):
        out, conf, audio_data, hist = translate_controller(
            text, src_full, tgt_full, num_beams, max_new_tokens, realtime
        )
        return out, conf, audio_data, hist

    translate_btn.click(
        do_translate,
        inputs=[text_in, src_dd, tgt_dd, beam_slider, max_tokens_slider, realtime_cb],
        outputs=[text_out, conf_md, tts_audio_out, history_md],
    )

    # Debounced realtime
    text_in.change(
        do_translate,
        inputs=[text_in, src_dd, tgt_dd, beam_slider, max_tokens_slider, realtime_cb],
        outputs=[text_out, conf_md, tts_audio_out, history_md],
    )

    # Swap
    def swap(src_full, tgt_full):
        if src_full == "Auto Detect":
            return tgt_full, "English"
        if tgt_full == "Auto Detect":
            return "English", src_full
        return tgt_full, src_full

    swap_btn.click(swap, inputs=[src_dd, tgt_dd], outputs=[src_dd, tgt_dd])

    # Copy (echo into same box so user can ctrl+c easily)
    copy_btn.click(lambda x: x, inputs=[text_out], outputs=[text_out])

    def do_clear():
        HISTORY.clear()
        return "### Recent Translations\nNo translations yet."

    clear_hist.click(fn=do_clear, outputs=[history_md])

    # SRT translate
    def do_srt(file_obj, src_full, tgt_full, num_beams, max_new_tokens):
        if file_obj is None:
            return "Please upload a .srt file.", None
        data = file_obj.read()
        text_preview, bytes_out = translate_srt_bytes(
            data, src_full, tgt_full, num_beams, max_new_tokens
        )
        # Limit preview
        preview_lines = "\n".join(text_preview.splitlines()[:40])
        fn = f"translated_{int(time.time())}.srt"
        return preview_lines, gr.File.update(value=(fn, bytes_out))

    run_srt.click(do_srt, inputs=[srt_in, srt_src, srt_tgt, beam_slider, max_tokens_slider], outputs=[srt_preview, srt_file_out])

    # Model switcher
    def switch_model(name):
        try:
            load_model(name)
            return gr.Update(), gr.Markdown.update(value=f"Using **{name}** on **{DEVICE.type.upper()}**")
        except Exception as e:
            return gr.Update(), gr.Markdown.update(value=f"Failed to load model: {e}")

    model_dd.change(switch_model, inputs=[model_dd], outputs=[text_out, info_md])

# Launch
if __name__ == "__main__":
    demo.launch(share=True)

  demo = gr.Blocks(theme=gr.themes.Soft(), css=css)
  demo = gr.Blocks(theme=gr.themes.Soft(), css=css)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://56d7c3734705aa3fa7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [36]:
import re
from indic_transliteration import sanscript

def roman_hindi_to_deva(text: str) -> str:
    # Dictionary for common Hinglish to correct Devanagari direct mapping
    # This handles specific phonetic nuances that strict ITRANS/IAST might miss.
    replacements = {
        "aur": "‡§î‡§∞",
        "bhai": "‡§≠‡§æ‡§à",
        "sab": "‡§∏‡§¨",
        "theek": "‡§†‡•Ä‡§ï",
        "kya": "‡§ï‡•ç‡§Ø‡§æ",
        "hain": "‡§π‡•à‡§Ç",
        "hai": "‡§π‡•à",
        "nahi": "‡§®‡§π‡•Ä‡§Ç",
        "kaise": "‡§ï‡•à‡§∏‡•á",
        "ho": "‡§π‡•ã",
        "log": "‡§≤‡•ã‡§ó",
        "aaye": "‡§Ü‡§Ø‡•á",
        "pohuncha": "‡§™‡§π‡•Å‡§Å‡§ö‡§æ",
        "waqt": "‡§µ‡§ï‡§º‡•ç‡§§",
        "ko": "‡§ï‡•ã",
        "mein": "‡§Æ‡•á‡§Ç", # for 'in'
        "mai": "‡§Æ‡•à‡§Ç",  # for 'I'
        "tera": "‡§§‡•á‡§∞‡§æ",
        "mera": "‡§Æ‡•á‡§∞‡§æ",
        "tum": "‡§§‡•Å‡§Æ",
        "aap": "‡§Ü‡§™",
        "hum": "‡§π‡§Æ",
        "kahan": "‡§ï‡§π‡§æ‡§Å",
        "kahaan": "‡§ï‡§π‡§æ‡§Å",
    }

    processed_text = text

    # Apply direct Devanagari replacements for common words first
    # Use word boundaries (\b) and ignore case (re.IGNORECASE) for robust matching
    for roman, deva in replacements.items():
        processed_text = re.sub(r'\\b' + re.escape(roman) + r'\\b', deva, processed_text, flags=re.IGNORECASE)

    # Then, attempt transliteration for the remaining parts of the text
    try:
        # ITRANS is generally good for Romanized Hindi, but might be inconsistent without explicit handling
        return sanscript.transliterate(processed_text, sanscript.ITRANS, sanscript.DEVANAGARI)
    except Exception:
        try:
            # Fallback to IAST if ITRANS fails, though IAST is very strict
            return sanscript.transliterate(processed_text, sanscript.IAST, sanscript.DEVANAGARI)
        except Exception:
            # If both fail, return the processed text (which may contain mixed scripts now)
            return processed_text

# Sample Hinglish text
sample_hinglish_text = "Aur bhai, kaise ho? Sab theek hai?"

print(f"Sample Hinglish Text: {sample_hinglish_text}")

# Test the improved roman_hindi_to_deva
transliterated_text = roman_hindi_to_deva(sample_hinglish_text)
print(f"Improved Transliterated to Devanagari: {transliterated_text}")


Sample Hinglish Text: Aur bhai, kaise ho? Sab theek hai?
Improved Transliterated to Devanagari: ‡§Ü‡§â‡§∞‡•ç ‡§≠‡•à, ‡§ï‡•à‡§∏‡•á ‡§π‡•ã? ‡§∑‡§¨‡•ç ‡§•‡•Ä‡§ï‡•ç ‡§π‡•à?


In [37]:
import re
from indic_transliteration import sanscript

def looks_like_roman_hindi(text: str) -> bool:
    # Heuristic: mostly ASCII letters/spaces and contains common Hindi words spelled in Latin
    ascii_ratio = sum(ch.isascii() for ch in text) / max(len(text), 1)
    hints = ["hai", "nahi", "kya", "kaise", "mera", "tum", "bhai", "bhoot", "bahut", "kr", "hai."]
    hit = any(h in text.lower() for h in hints)
    return ascii_ratio > 0.95 and hit

def roman_hindi_to_deva(text: str) -> str:
    # Try ITRANS first (more forgiving for lowercase if we upper-case clusters lightly)
    try:
        return sanscript.transliterate(text, sanscript.ITRANS, sanscript.DEVANAGARI)
    except Exception:
        try:
            return sanscript.transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
        except Exception:
            return text


# Sample Hinglish text
sample_hinglish_text = "Aur bhai, kaise ho? Sab theek hai?"

print(f"Sample Hinglish Text: {sample_hinglish_text}")

# Test looks_like_roman_hindi
is_roman_hindi = looks_like_roman_hindi(sample_hinglish_text)
print(f"Looks like Roman Hindi: {is_roman_hindi}")

# If detected as Roman Hindi, perform transliteration
if is_roman_hindi:
    transliterated_text = roman_hindi_to_deva(sample_hinglish_text)
    print(f"Transliterated to Devanagari: {transliterated_text}")
else:
    print("Transliteration skipped as not detected as Roman Hindi.")

Sample Hinglish Text: Aur bhai, kaise ho? Sab theek hai?
Looks like Roman Hindi: True
Transliterated to Devanagari: ‡§Ü‡§â‡§∞‡•ç ‡§≠‡•à, ‡§ï‡•à‡§∏‡•á ‡§π‡•ã? ‡§∑‡§¨‡•ç ‡§•‡•Ä‡§ï‡•ç ‡§π‡•à?


In [38]:
print("LANGUAGE_MAP:")
display(LANGUAGE_MAP)

print("\nMODEL_REGISTRY:")
display(MODEL_REGISTRY)

LANGUAGE_MAP:


{'en': ('English', 'en'),
 'hi': ('Hindi', 'hi'),
 'fr': ('French', 'fr'),
 'de': ('German', 'de'),
 'es': ('Spanish', 'es'),
 'zh': ('Chinese', 'zh'),
 'ja': ('Japanese', 'ja'),
 'ko': ('Korean', 'ko'),
 'mr': ('Marathi', 'mr'),
 'gu': ('Gujarati', 'gu'),
 'ta': ('Tamil', 'ta'),
 'ml': ('Malayalam', 'ml')}


MODEL_REGISTRY:


{'facebook/m2m100_418M': {'type': 'm2m',
  'lang_map': {'en': 'en',
   'hi': 'hi',
   'fr': 'fr',
   'de': 'de',
   'es': 'es',
   'zh': 'zh',
   'ja': 'ja',
   'ko': 'ko',
   'mr': 'mr',
   'gu': 'gu',
   'ta': 'tam_Taml',
   'ml': 'mal_Mlym'}},
 'facebook/nllb-200-distilled-600M': {'type': 'nllb',
  'lang_map': {'en': 'eng_Latn',
   'hi': 'hin_Deva',
   'fr': 'fra_Latn',
   'de': 'deu_Latn',
   'es': 'spa_Latn',
   'zh': 'zho_Hans',
   'ja': 'jpn_Jpan',
   'ko': 'kor_Hang',
   'mr': 'mar_Deva',
   'gu': 'guj_Gujr',
   'ta': 'tam_Taml',
   'ml': 'mal_Mlym'}}}

In [39]:
print('Demonstrating `apply_glossary` function:')

sample_text = "The capital of France is Paris. OpenAI is a company that works on AI."
glossary_rules = "Paris=The City of Lights\nOpenAI=‡§ì‡§™‡§®‡§è‡§Ü‡§à\nAI=Artificial Intelligence"

# Call the apply_glossary function with sample text and rules
processed_text = apply_glossary(sample_text, glossary_rules)

print("\nOriginal Text:")
print(sample_text)

print("\nGlossary Rules:")
print(glossary_rules)

print("\nProcessed Text (after applying glossary):")
print(processed_text)

Demonstrating `apply_glossary` function:

Original Text:
The capital of France is Paris. OpenAI is a company that works on AI.

Glossary Rules:
Paris=The City of Lights
OpenAI=‡§ì‡§™‡§®‡§è‡§Ü‡§à
AI=Artificial Intelligence

Processed Text (after applying glossary):
The capital of France is The City of Lights. ‡§ì‡§™‡§®‡§è‡§Ü‡§à is a company that works on Artificial Intelligence.


In [40]:
print("Definition of do_clear function:\n")
def do_clear():
    HISTORY.clear()
    return "### Recent Translations\nNo translations yet."


Definition of do_clear function:

