#**Installing the required dependencies**

In [None]:
%%capture
# Clone the IndicTrans2 repository into the current working directory.
!git clone https://github.com/AI4Bharat/IndicTrans2.git

In [None]:
%%capture
# Work from the huggingface_interface folder of the cloned repository.
# NOTE(review): the absolute path assumes the clone lives under /content (e.g. Colab) — confirm.
%cd /content/IndicTrans2/huggingface_interface

In [None]:
%%capture
!python3 -m pip install nltk sacremoses pandas regex mock transformers>=4.33.2 mosestokenizer
!python3 -c "import nltk; nltk.download('punkt')"
!python3 -m pip install bitsandbytes scipy accelerate datasets
!python3 -m pip install sentencepiece

!git clone https://github.com/VarunGumma/IndicTransTokenizer
%cd IndicTransTokenizer
!python3 -m pip install --editable ./
%cd ..

**IMPORTANT : Restart your run-time first and then run the cells below.**

## 1. Importing the required libraries:
  * transformers
  * torch
  * AutoModelForSeq2SeqLM from transformers
  * BitsAndBytesConfig from transformers
  * IndicProcessor from IndicTransTokenizer
  * IndicTransTokenizer from IndicTransTokenizer

In [None]:
import transformers
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
from transformers import AutoTokenizer
from IndicTransTokenizer import IndicTransTokenizer, IndicProcessor

In [None]:
# Number of sentences processed per step of the batch_translate loop.
BATCH_SIZE = 4
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): not referenced by the cells visible here; the model-loading cell
# defines its own lowercase `quantization` — confirm whether this is still needed.
QUANTIZATION = None


# **Setting up the model initializer and tokenizer function.**

Create a function `initialize_model_and_tokenizer` which takes in 3 arguments: `ckpt_dir`, `direction`, and `quantization`.
Inside the function, if `quantization` is `'4-bit'`, create a variable `qconfig` and instantiate it with a `BitsAndBytesConfig` that enables 4-bit loading. Else if `quantization` is `'8-bit'`, instantiate it with a `BitsAndBytesConfig` that enables 8-bit loading. Otherwise, set `qconfig` to `None`.

(To learn more, check out the documentation on [BitsAndBytesConfig](https://huggingface.co/docs/transformers/en/main_classes/quantization#transformers.BitsAndBytesConfig).)

After the conditional flow, create a variable `tokenizer` by instantiating `IndicTransTokenizer` with `direction`.

The next step involves making a model variable. Set it to AutoModelForSeq2SeqLM, which should load the pre-trained model from the checkpoint directory.

In [None]:
def initialize_model_and_tokenizer(ckpt_dir, direction, quantization):
    """
    Build the tokenizer and the sequence-to-sequence translation model.

    Args:
        ckpt_dir (str): Checkpoint location handed to ``from_pretrained``.
        direction (str): Direction string handed to ``IndicTransTokenizer``.
        quantization (str): '4-bit' or '8-bit' to load the model with the
                            matching BitsAndBytesConfig; any other value
                            (e.g. None) loads it without quantization.

    Returns:
        tuple: ``(tokenizer, model)``; the model has been switched to eval mode.
    """
    # Translate the requested level into the BitsAndBytesConfig flag enabling it.
    qconfig = None
    for level, flag in (("4-bit", "load_in_4bit"), ("8-bit", "load_in_8bit")):
        if quantization == level:
            qconfig = BitsAndBytesConfig(**{flag: True})
            break

    tokenizer = IndicTransTokenizer(direction)

    load_options = dict(
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_dir, **load_options)

    # Only an unquantized model is moved explicitly, and cast to half precision on CUDA.
    unquantized = qconfig is None
    if unquantized:
        model = model.to(DEVICE)
    if unquantized and DEVICE == "cuda":
        model.half()

    model.eval()
    return tokenizer, model


#**Helper Function to Batch Translation**

This function translates a group of sentences from one language to another using a pre-trained model. It allows you to efficiently process multiple sentences at once, converting them from the source language to the target language.

In [None]:
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    """
    Translate sentences from ``src_lang`` to ``tgt_lang`` in chunks of BATCH_SIZE.

    Args:
        input_sentences (list of str): Sentences to translate.
        src_lang (str): Source language code passed to ``ip.preprocess_batch``
                        (e.g. 'ben_Beng').
        tgt_lang (str): Target language code passed to the pre- and
                        post-processing steps (e.g. 'mai_Deva').
        model: Model whose ``generate`` method produces the output token ids.
        tokenizer: Callable tokenizer that also provides ``batch_decode``.
        ip: Processor providing ``preprocess_batch`` and ``postprocess_batch``.

    Returns:
        list of str: Translations, in the same order as ``input_sentences``.
    """
    results = []
    total = len(input_sentences)

    for start in range(0, total, BATCH_SIZE):
        chunk = input_sentences[start : start + BATCH_SIZE]
        prepared = ip.preprocess_batch(chunk, src_lang=src_lang, tgt_lang=tgt_lang)

        # Tokenize the source side, pad to the longest sentence, move to DEVICE.
        inputs = tokenizer(
            prepared,
            src=True,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True
        ).to(DEVICE)

        # Beam-search decoding with gradient tracking disabled.
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        decoded = tokenizer.batch_decode(
            output_ids.detach().cpu().tolist(),
            src=False
        )
        results.extend(ip.postprocess_batch(decoded, lang=tgt_lang))

        # Drop the batch tensors and release cached GPU memory before the next chunk.
        del inputs
        torch.cuda.empty_cache()

    return results


In [None]:
# Checkpoint, tokenizer direction and quantization level for this run.
ckpt_dir = "ai4bharat/indictrans2-indic-indic-1B"
direction = "indic-indic"
quantization = None  # load the model without quantization

tokenizer, model = initialize_model_and_tokenizer(
    ckpt_dir=ckpt_dir,
    direction=direction,
    quantization=quantization,
)

# Processor used for pre- and post-processing the sentence batches.
ip = IndicProcessor(inference=True)


The official Tokenizer is available on HF and can be used as follows:
```
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
```
  tokenizer = IndicTransTokenizer(direction)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/79.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-indic-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [None]:
%%capture
# Install the evaluation packages: sacrebleu is imported directly further down;
# rouge_score is presumably the backend of load_metric("rouge") — TODO confirm.
!pip install sacrebleu rouge_score

In [None]:
%%capture
# download the evaluate dataset and unzip it
!wget https://indictrans2-public.objectstore.e2enetworks.net/IN22_testset.zip && unzip IN22_testset.zip

In [None]:
# Bengali source sentences, one per line of the test file.
# utf-8 is given explicitly so decoding does not depend on the platform locale,
# and the trailing newline is dropped so it is neither fed to the model nor printed.
# Empty lines are kept (as empty strings) so indices stay aligned with the reference file.
with open('IN22_testset/conv/test.ben_Beng', 'r', encoding='utf-8') as f:
    input_sent = [line.rstrip('\n') for line in f]

In [None]:
src_lang, tgt_lang = "ben_Beng", "mai_Deva"

# batch_translate preprocesses every batch itself, so no standalone
# ip.preprocess_batch call over the whole input is made here: its result
# would never be used.
mai_sent = batch_translate(input_sent, src_lang, tgt_lang, model, tokenizer, ip)

# Show each source sentence followed by its translation.
for source_text, translated_text in zip(input_sent, mai_sent):
    print(source_text)
    print(translated_text)
    print()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

কিন্তু ম্যাম, গঙ্গা আর ব্রহ্মপুত্র দুটোই কি করে ভারতের দীর্ঘতম নদী হতে পারে?

परंच मैम, गंगा आ ब्रह्मपुत्र दुनू भारतक सबसँ लम्बा नदी केना भऽ सकैत अछि?

ব্রহ্মপুত্র তো স্পষ্টতই গঙ্গার থেকে দীর্ঘতর।

ब्रह्मपुत्र स्पष्ट रूपसँ गङ्गासँ पैघ अछि।

গঙ্গা দীর্ঘতম হলেও জলপ্রবাহের নিরিখে ব্রহ্মপুত্র হলো বৃহত্তম।

यद्यपि गङ्गा सभसँ नमगर अछि मुदा ब्रह्मपुत्र जल प्रवाहक हिसाबसँ सभसँ पैघ अछि।

'দক্ষিণ গঙ্গা' নামেও পরিচিত গোদাবরী হলো গঙ্গার পরে দ্বিতীয় দীর্ঘতম এবং দক্ষিণ ভারতের দীর্ঘতম নদী।

गोदावरी, जकरा दक्षिण गङ्गा सेहो कहल जाइत अछि, गङ्गाक बाद दोसर सभसँ नमगर आ दक्षिण भारतक सभसँ नमगर नदी अछि।

হিন্দু ধর্মগ্রন্থে বহু সহস্র কাল ধরে সম্মানিত এবং সমৃদ্ধ সাংস্কৃতিক ঐতিহ্যের ধারক ও বাহক হয়ে আছে নদীটি।

ई नदी हिन्दू धर्मग्रंथमे कतेको सहस्राब्दीसँ सम्मानित आ समृद्ध सांस्कृतिक धरोहरक धारक आ वाहक रहल अछि।

এবং কৃষ্ণা-গোদাবরী অববাহিকা হলো বিপন্ন অলিভ রিডলি কাছিমের ডিম পাড়ার প্রধান স্থানগুলির মধ্যে অন্যতম।

आ कृष्णा-गोदावरी बेसिन लुप्तप्राय ऑलिव

In [None]:
# Create a reference list for translated language : Maithili Language
# Maithili reference translations, one per line of the test file.
# Each entry is wrapped in a single-element list: the 'sacrebleu' metric used
# below is given one list of references per prediction.
# utf-8 is explicit so decoding does not depend on the platform locale, and the
# trailing newline is removed so it is not part of the reference text.
with open('IN22_testset/conv/test.mai_Deva', 'r', encoding='utf-8') as f:
    original_sent = [[line.rstrip('\n')] for line in f]

In [None]:
from datasets import load_metric

# Corpus-level BLEU through the 'sacrebleu' metric of the datasets library:
# the model translations are scored against the reference sentences.
metric = load_metric("sacrebleu")
bleu_score = metric.compute(predictions=mai_sent, references=original_sent)

# Report every field of the result, one per line.
for key in bleu_score:
    print(f"{key}: {bleu_score[key]}")


score: 14.646765849868627
counts: [7469, 2968, 1372, 627]
totals: [16506, 15003, 13501, 12015]
precisions: [45.250212044105176, 19.78271012464174, 10.162210206651359, 5.218476903870163]
bp: 0.9922752346995671
sys_len: 16506
ref_len: 16634


In [None]:
from datasets import load_metric

# Load the ROUGE metric using the 'rouge' implementation from the datasets library
rouge = load_metric("rouge")

# ROUGE takes one plain reference string per prediction, whereas original_sent
# holds single-element lists (the layout used for the sacrebleu metric).
# Unwrap them so the metric scores against the sentence text itself.
rouge_references = [refs[0] for refs in original_sent]

# Compute the ROUGE score over all predicted translations and their references
rouge_score = rouge.compute(predictions=mai_sent, references=rouge_references)

# NOTE(review): all-zero scores may persist for Devanagari text if the underlying
# tokenizer keeps only ASCII letters and digits — confirm, and use a script-aware
# tokenizer before reporting ROUGE for this language.
# Print out each component of the ROUGE score
for key, value in rouge_score.items():
    print(f"{key}: {value}")


rouge1: AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.0, recall=0.0, fmeasure=0.0), high=Score(precision=0.0, recall=0.0, fmeasure=0.0))
rouge2: AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.0, recall=0.0, fmeasure=0.0), high=Score(precision=0.0, recall=0.0, fmeasure=0.0))
rougeL: AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.0, recall=0.0, fmeasure=0.0), high=Score(precision=0.0, recall=0.0, fmeasure=0.0))
rougeLsum: AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.0, recall=0.0, fmeasure=0.0), high=Score(precision=0.0, recall=0.0, fmeasure=0.0))


In [None]:
from sacrebleu import corpus_bleu

# corpus_bleu expects a list of reference *streams*, each as long as the list
# of hypotheses. original_sent is laid out the other way round (one
# single-element list per sentence), so regroup it into a single stream;
# passing it unchanged misaligns hypotheses and references.
reference_streams = [[refs[0] for refs in original_sent]]
bleu_score = corpus_bleu(mai_sent, reference_streams)

# Print the BLEU score
print(f"BLEU Score: {bleu_score.score}")


BLEU Score: 30.739407647563215
