In [None]:
%%capture
!git clone https://github.com/AI4Bharat/IndicTrans2.git
%cd IndicTrans2/huggingface_interface

!python3 -m pip install -q nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install -q bitsandbytes scipy accelerate datasets sentencepiece

!git clone https://github.com/VarunGumma/IndicTransToolkit.git
%cd IndicTransToolkit
!python3 -m pip install --editable ./
%cd ..

In [None]:
import gc

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from IndicTransToolkit import IndicProcessor

# Runtime configuration shared by the cells below.
BATCH_SIZE = 4  # number of sentences translated per generate() call
quantization = None  # Optional: "4-bit" or "8-bit" for memory efficiency

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

def initialize_model_and_tokenizer(ckpt_dir, quantization):
    """Load the tokenizer and seq2seq model for a checkpoint, optionally quantized.

    Args:
        ckpt_dir: Checkpoint identifier or path passed to ``from_pretrained``.
        quantization: ``"4-bit"`` or ``"8-bit"`` to load through bitsandbytes;
            any other value (including ``None``) loads without quantization.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is in eval mode. Without
        quantization it is moved to ``DEVICE`` and converted to half precision
        when ``DEVICE`` is ``"cuda"``.

    Note:
        ``trust_remote_code=True`` runs tokenizer/model code shipped with the
        checkpoint; only use checkpoints you trust.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # Double quantization and compute dtype are 4-bit-only options
        # (bnb_4bit_*). The former bnb_8bit_* keywords are not recognised by
        # BitsAndBytesConfig and were ignored, so they are dropped here.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )
    # Only an unquantized model is moved/cast explicitly; a quantized model is
    # left exactly as from_pretrained returned it.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()
    model.eval()
    return tokenizer, model

def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """Translate sentences in chunks of ``BATCH_SIZE`` and return the results.

    Args:
        input_sentences: Sequence of source sentences.
        src_lang: Source language tag handed to ``ip.preprocess_batch``.
        tgt_lang: Target language tag handed to ``ip.preprocess_batch`` and
            ``ip.postprocess_batch``.
        model: Model whose ``generate`` method produces the output token ids.
        tokenizer: Tokenizer used both to encode inputs and decode outputs.
        ip: Processor providing ``preprocess_batch`` / ``postprocess_batch``.

    Returns:
        A list with the post-processed translations, in input order.
    """
    results = []
    for start in range(0, len(input_sentences), BATCH_SIZE):
        chunk = input_sentences[start:start + BATCH_SIZE]
        prepared = ip.preprocess_batch(chunk, src_lang=src_lang, tgt_lang=tgt_lang)

        # Pad to the longest sentence in this chunk and move tensors to DEVICE.
        encoded = tokenizer(
            prepared,
            truncation=True,
            padding="longest",
            return_tensors="pt",
        )
        encoded = encoded.to(DEVICE)

        # Beam search without gradient tracking.
        with torch.no_grad():
            token_ids = model.generate(
                **encoded,
                use_cache=True,
                max_length=256,
                num_beams=5,
            )

        # Decode inside the tokenizer's target-side context.
        with tokenizer.as_target_tokenizer():
            decoded = tokenizer.batch_decode(token_ids, skip_special_tokens=True)

        results.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        # Drop this chunk's input tensors and release cached CUDA blocks
        # before the next iteration.
        del encoded
        torch.cuda.empty_cache()
    return results


In [None]:
# Load model
ckpt_dir = "ai4bharat/indictrans2-indic-indic-1B"
tokenizer_hi_mai, model_hi_mai = initialize_model_and_tokenizer(ckpt_dir, quantization)
ip = IndicProcessor(inference=True)

# Hindi input sentences
hi_sentences = [
    "मैं स्कूल जा रहा हूँ।",
    "उसने मुझे एक किताब दी।",
    "हमने कल एक फिल्म देखी।",
    "मैं अपने दोस्तों से मिलने जा रहा हूँ।"
]

# Translate Hindi -> Maithili
src_lang, tgt_lang = "hin_Deva", "mai_Deva"
mai_translations = batch_translate(hi_sentences, src_lang, tgt_lang, model_hi_mai, tokenizer_hi_mai, ip)

print(f"\n{src_lang} → {tgt_lang}")
for hi, mai in zip(hi_sentences, mai_translations):
    print(f"{src_lang}: {hi}")
    print(f"{tgt_lang}: {mai}\n")

# Free memory. `del` only drops the names; run the garbage collector and
# release PyTorch's cached CUDA blocks so the memory is actually returned
# before another model is loaded.
del tokenizer_hi_mai, model_hi_mai
gc.collect()
torch.cuda.empty_cache()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

tokenization_indictrans.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


dict.SRC.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

dict.TGT.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

model.SRC:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/79.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]


hin_Deva → mai_Deva
hin_Deva: मैं स्कूल जा रहा हूँ।
mai_Deva: हम स्कूल जा रहल छी। 

hin_Deva: उसने मुझे एक किताब दी।
mai_Deva: ओ हमरा एकटा किताब देलनि। 

hin_Deva: हमने कल एक फिल्म देखी।
mai_Deva: हम सभ कल एकटा सिनेमा देखलहुँ। 

hin_Deva: मैं अपने दोस्तों से मिलने जा रहा हूँ।
mai_Deva: हम अपन मित्रसभसँ भेँट करबाक लेल जा रहल छी। 





In [None]:
# Reload model (same checkpoint as above, loaded again under new names)
tokenizer_mai_hi, model_mai_hi = initialize_model_and_tokenizer(ckpt_dir, quantization)
ip = IndicProcessor(inference=True)

# Maithili input sentences
mai_sentences = [
    "हम स्कूल जाए रहल छी।",
    "ओ हमरा एकटा किताब देलक।",
    "हमरा काल्हि एकटा फिल्म देखल।",
    "हम अपन मित्र सब सँ भेंट करए जाए रहल छी।"
]

# Translate Maithili -> Hindi
src_lang, tgt_lang = "mai_Deva", "hin_Deva"
hi_translations = batch_translate(mai_sentences, src_lang, tgt_lang, model_mai_hi, tokenizer_mai_hi, ip)

print(f"\n{src_lang} → {tgt_lang}")
for mai, hi in zip(mai_sentences, hi_translations):
    print(f"{src_lang}: {mai}")
    print(f"{tgt_lang}: {hi}\n")

# Free memory. `del` only drops the names; run the garbage collector and
# release PyTorch's cached CUDA blocks so the memory is actually returned.
del tokenizer_mai_hi, model_mai_hi
gc.collect()
torch.cuda.empty_cache()



mai_Deva → hin_Deva
mai_Deva: हम स्कूल जाए रहल छी।
hin_Deva: मैं स्कूल जा रहा हूँ। 

mai_Deva: ओ हमरा एकटा किताब देलक।
hin_Deva: उसने मुझे एक किताब दी। 

mai_Deva: हमरा काल्हि एकटा फिल्म देखल।
hin_Deva: मैंने कल एक फिल्म देखी। 

mai_Deva: हम अपन मित्र सब सँ भेंट करए जाए रहल छी।
hin_Deva: मैं अपने दोस्तों से मिलने जा रहा हूँ। 

