In [1]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from flask import Flask, request, jsonify
from flask_cors import CORS
import time
# from translator import NMTTranslator
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import os
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration variables for the NMT application.
LANGUAGES = {
    'en': 'English',
    'hi': 'Hindi',
    'ta': 'Tamil',
    'te': 'Telugu',
    'bn': 'Bengali',
    'mr': 'Marathi',
    'gu': 'Gujarati',
    'kn': 'Kannada',
    'ml': 'Malayalam',
    'pa': 'Punjabi',
}

# Translation model from Hugging Face.
MODEL_CONFIGS = {
    'model_name': 'facebook/mbart-large-50-many-to-many-MMT',

    # Language codes for mBART model.
    'lang_codes': {
        'en': 'en_XX',
        'hi': 'hi_IN',
        'ta': 'ta_IN',
        'te': 'te_IN',
        'bn': 'bn_IN',
        'mr': 'mr_IN',
        'gu': 'gu_IN',
        'kn': 'kn_IN',
        'ml': 'ml_IN',
        'pa': 'pa_IN',
    }
}

In [3]:


class NMTTranslator:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        try:
            print(f"Loading model: {MODEL_CONFIGS['model_name']}...")
            self.model = MBartForConditionalGeneration.from_pretrained(MODEL_CONFIGS['model_name'])
            self.tokenizer = MBart50TokenizerFast.from_pretrained(MODEL_CONFIGS['model_name'])
            print("Model and tokenizer loaded successfully.")
        except Exception as e:
            print(f"CRITICAL: Failed to load model or tokenizer. Error: {e}")

    def translate(self, text: str, src_lang: str, tgt_lang: str):
        if not self.model or not self.tokenizer:
            return {"error": "Translator model is not available. Check server logs for details."}

        try:
            src_code = MODEL_CONFIGS['lang_codes'].get(src_lang)
            tgt_code = MODEL_CONFIGS['lang_codes'].get(tgt_lang)

            if not src_code or not tgt_code:
                return {"error": f"Invalid language configuration for {src_lang} or {tgt_lang}."}

            if tgt_code not in self.tokenizer.lang_code_to_id:
                 return {"error": f"The model does not support the target language code: {tgt_code}"}

            # Setting the source language
            self.tokenizer.src_lang = src_code

            # Encode the input text
            encoded_text = self.tokenizer(text, return_tensors="pt")

            # Generate the translation
            generated_tokens = self.model.generate(
                **encoded_text,
                forced_bos_token_id=self.tokenizer.lang_code_to_id[tgt_code]
            )

            # Decode the generated tokens
            translated_text = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

            return {"translation": translated_text}

        except Exception as e:
            error_message = f"An error occurred during translation: {str(e)}"
            print(error_message)
            return {"error": "An internal error occurred during translation."}

In [4]:

# Initialize Flask app
app = Flask(__name__)
# Enable Cross-Origin Resource Sharing (CORS)
CORS(app)

# Initialize translator class
print("Initializing the translator...")
translator = NMTTranslator()
print("Translator initialized successfully.")

@app.route('/api/translate', methods=['POST'])
def translate_text():
    """
    API endpoint to handle translation requests.
    Expects a JSON payload with 'text', 'src_lang', and 'tgt_lang'.
    """
    try:
        data = request.json
        text = data.get('text', '').strip()
        src_lang = data.get('src_lang', 'en')
        tgt_lang = data.get('tgt_lang', 'hi')

        # --- Input Validation ---
        if not text:
            return jsonify({'error': 'No text provided for translation.'}), 400
        if src_lang not in LANGUAGES or tgt_lang not in LANGUAGES:
            return jsonify({'error': 'Unsupported language selected.'}), 400
        if src_lang == tgt_lang:
            return jsonify({'error': 'Source and target languages are the same.'}), 400

        # --- Perform Translation ---
        start_time = time.time()
        result = translator.translate(text, src_lang, tgt_lang)
        end_time = time.time()

        if "error" in result:
            return jsonify(result), 500

        # --- Return Successful Response ---
        return jsonify({
            'translation': result.get('translation'),
            'processing_time': round(end_time - start_time, 2)
        })

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return jsonify({'error': 'An internal server error occurred.'}), 500



Initializing the translator...
Loading model: facebook/mbart-large-50-many-to-many-MMT...
Model and tokenizer loaded successfully.
Translator initialized successfully.


In [5]:

def download_nltk_data():
    """Downloads the necessary NLTK data."""
    try:
        nltk.data.find('tokenizers/punkt')
        print("NLTK 'punkt' data already downloaded.")
    except LookupError:
        print("Downloading NLTK 'punkt' data...")
        nltk.download('punkt')  
    
    try:
        nltk.data.find('corpora/wordnet')
        print("NLTK 'wordnet' data already downloaded.")
    except LookupError:
        print("Downloading NLTK 'wordnet' data...")
        nltk.download('wordnet')
        
class TranslationEvaluator:
    def __init__(self):
        self.smoothing = SmoothingFunction().method1

    def calculate_bleu(self, reference, candidate):
        ref_tokens = nltk.word_tokenize(reference.lower())
        cand_tokens = nltk.word_tokenize(candidate.lower())
        score = sentence_bleu([ref_tokens], cand_tokens, smoothing_function=self.smoothing)
        return round(score * 100, 2)

    # Corrected method below
    def calculate_meteor(self, reference, candidate):
        # Tokenize the reference and candidate sentences first
        ref_tokens = nltk.word_tokenize(reference.lower())
        cand_tokens = nltk.word_tokenize(candidate.lower())
        
        # Pass the tokenized lists to meteor_score
        # Note: meteor_score expects a list of reference lists
        score = meteor_score([ref_tokens], cand_tokens)
        return round(score * 100, 2)

def run_evaluation(translator, test_cases):
    evaluator = TranslationEvaluator()
    results = []
    print("Running translation evaluation...")
    for i, case in enumerate(test_cases):
        print(f"  - Evaluating case {i+1}/{len(test_cases)}...")
        result = translator.translate(case['source'], case['src_lang'], case['tgt_lang'])
        if 'error' in result:
            print(f"    Error translating '{case['source']}': {result['error']}")
            continue
        pred_text = result.get('translation', '')
        bleu = evaluator.calculate_bleu(case['reference'], pred_text)
        meteor = evaluator.calculate_meteor(case['reference'], pred_text)
        results.append({
            'Source': case['source'],
            'Reference': case['reference'],
            'Prediction': pred_text,
            'BLEU': bleu,
            'METEOR': meteor
        })
    print("Evaluation complete.")
    return pd.DataFrame(results)


download_nltk_data()
nmt = NMTTranslator()
evaluation_pairs = [
    {'source': 'Hello, how are you?', 'reference': 'नमस्ते, आप कैसे हैं?', 'src_lang': 'en', 'tgt_lang': 'hi'},
    {'source': 'Good morning', 'reference': 'सुप्रभात', 'src_lang': 'en', 'tgt_lang': 'hi'},
    {'source': 'Where are you from?', 'reference': 'तुम्ही कुठून आहात?', 'src_lang': 'en', 'tgt_lang': 'mr'},
    {'source': 'This is a beautiful place', 'reference': 'हे एक सुंदर ठिकाण आहे', 'src_lang': 'en', 'tgt_lang': 'mr'},
    # {    'source': 'मी ठीक आहे', 'reference': 'I am fine', 'src_lang': 'hi', 'tgt_lang': 'en'},
    {'source': 'मी ठीक आहे', 'reference': 'I am fine', 'src_lang': 'hi', 'tgt_lang': 'en'},
    {'source': 'नमस्ते', 'reference': 'Hello', 'src_lang': 'hi', 'tgt_lang': 'en'}
]
evaluation_df = run_evaluation(nmt, evaluation_pairs)
print("\n--- Evaluation Results ---")
print(evaluation_df.to_string())
print("------------------------\n")

NLTK 'punkt' data already downloaded.
Downloading NLTK 'wordnet' data...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\IMRAN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading model: facebook/mbart-large-50-many-to-many-MMT...
Model and tokenizer loaded successfully.
Running translation evaluation...
  - Evaluating case 1/6...
  - Evaluating case 2/6...
  - Evaluating case 3/6...
  - Evaluating case 4/6...
  - Evaluating case 5/6...
  - Evaluating case 6/6...
Evaluation complete.

--- Evaluation Results ---
                      Source              Reference              Prediction    BLEU  METEOR
0        Hello, how are you?   नमस्ते, आप कैसे हैं?    नमस्ते, आप कैसे हैं?  100.00   99.77
1               Good morning               सुप्रभात                  नमस्ते    0.00    0.00
2        Where are you from?     तुम्ही कुठून आहात?          तू कोठून आलोस?    8.03   12.50
3  This is a beautiful place  हे एक सुंदर ठिकाण आहे  हे एक सुंदर स ् थळ आहे   17.57   72.12
4                 मी ठीक आहे              I am fine                 I 'm OK   11.36   33.33
5                     नमस्ते                  Hello               greetings    0.00    0.00
-----------

In [6]:
import sys
import pandas as pd
import torch
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# 1. SETUP: Define model, languages, and dataset parameters
# -----------------------------------------------------------------
# model_name = "ai4bharat/indictrans2-en-indic-1B"
model_name = "google/mt5-base"
# Important: Use the correct language codes for IndicTrans2
src_lang = "eng_Latn"  # English
tgt_lang = "hin_Deva"  # Hindi
dataset_name = "ai4bharat/samanantar"
dataset_config = "hi" # English-Hindi pair
output_dir = "results/indictrans2-finetuned-en-hi"

# For demonstration, we'll use a small subset of the data.
# Increase these numbers for a real fine-tuning job.
train_sample_size = 2000
valid_sample_size = 200


In [7]:

print(f"Transformers library version: {transformers.__version__}")

# Get the module object from which the class was imported
module_name = Seq2SeqTrainingArguments.__module__
module_object = sys.modules[module_name]

print("\n'Seq2SeqTrainingArguments' is being loaded from this file:")
print(module_object.__file__)

Transformers library version: 4.55.2

'Seq2SeqTrainingArguments' is being loaded from this file:
c:\Users\IMRAN\.conda\envs\nmt\lib\site-packages\transformers\training_args_seq2seq.py


In [8]:
# 2. DATA LOADING: Fetch the dataset and create a validation split
# -----------------------------------------------------------------
print(f"Loading '{dataset_name}' dataset ({dataset_config} configuration)...")
dataset_config = "hi"
dataset = load_dataset(dataset_name, dataset_config)

# Since there's no 'validation' split, we create one from the 'train' split.
# We'll use 10% of the training data for validation.
print("Creating a validation split from the training data...")
train_validation_split = dataset['train'].train_test_split(test_size=0.1, seed=42)

# The split creates a new DatasetDict with 'train' and 'test' keys.
# We will use its 'train' for training and its 'test' as our validation set.
full_train_dataset = train_validation_split['train']
full_valid_dataset = train_validation_split['test']

# Now, select the smaller samples for our quick fine-tuning run
# We add a check to ensure we don't request more samples than available
train_sample_size = min(train_sample_size, len(full_train_dataset))
valid_sample_size = min(valid_sample_size, len(full_valid_dataset))

train_dataset = full_train_dataset.select(range(train_sample_size))
valid_dataset = full_valid_dataset.select(range(valid_sample_size))

print(f"Loaded {len(train_dataset)} training samples and {len(valid_dataset)} validation samples.")
# Example: print(train_dataset[0]) -> {'translation': {'en': '...', 'hi': '...'}}

Loading 'ai4bharat/samanantar' dataset (hi configuration)...
Creating a validation split from the training data...
Loaded 2000 training samples and 200 validation samples.


In [9]:

# 3. TOKENIZER & MODEL: Load the pre-trained model and tokenizer
# -----------------------------------------------------------------
print(f"Loading tokenizer and model for '{model_name}'...")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True)

# Set the source and target languages for the tokenizer
tokenizer.src_lang = src_lang
tokenizer.tgt_lang = tgt_lang


Loading tokenizer and model for 'google/mt5-base'...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# 4. PREPROCESSING: Create a function to tokenize the data
# -----------------------------------------------------------------
def preprocess_function(examples):
    # The dataset has 'src' for source (English) and 'tgt' for target (Hindi)
    inputs = examples["src"]
    targets = examples["tgt"]

    # THIS IS THE FIX: Prepend the language codes to each input sentence
    # The IndicTrans2 tokenizer expects this specific format for this model.
    # prefixed_inputs = [f"{src_lang} {tgt_lang} {text}" for text in inputs]

    # The tokenizer will now correctly handle the prefixed inputs
    model_inputs = tokenizer(
        # prefixed_inputs,  # Use the prefixed inputs for indictrans2
        inputs,  # Use the original inputs for google/mt5-base
        text_target=targets,
        max_length=128,
        truncation=True
    )
    return model_inputs

print("Preprocessing datasets...")
# No changes needed here, the fix is entirely within the function above
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, remove_columns=["idx", "src", "tgt"])
tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True, remove_columns=["idx", "src", "tgt"])

Preprocessing datasets...


In [11]:
# %pip uninstall -y transformers
# %pip cache purge
# %pip install transformers accelerate

In [12]:

print(f"Transformers library version: {transformers.__version__}")

# Get the module object from which the class was imported
module_name = Seq2SeqTrainingArguments.__module__
module_object = sys.modules[module_name]

print("\n'Seq2SeqTrainingArguments' is being loaded from this file:")
print(module_object.__file__)

Transformers library version: 4.55.2

'Seq2SeqTrainingArguments' is being loaded from this file:
c:\Users\IMRAN\.conda\envs\nmt\lib\site-packages\transformers\training_args_seq2seq.py


In [20]:
# 5. TRAINING SETUP: Configure the training arguments and trainer
# -----------------------------------------------------------------
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Check if a GPU is available and set fp16 accordingly
use_fp16 = torch.cuda.is_available()
print(f"FP16 training enabled: {use_fp16}")

# This code is CORRECT for modern versions of the transformers library
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # evaluation_strategy="epoch",  # This is not recognized
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    # optim="paged_adamw_8bit",  # only for CUDA based training
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # fp16=use_fp16,
    push_to_hub=False,
    no_cuda=True,      # Forcing to run on CPU
    fp16=False,        # For CPU based training
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

FP16 training enabled: True


  trainer = Seq2SeqTrainer(


In [21]:
import torch
print(f"Is CUDA available? {torch.cuda.is_available()}")

Is CUDA available? True


In [None]:

# 6. START TRAINING: Fine-tune the model
# -----------------------------------------------------------------
print("🚀 Starting fine-tuning...")
trainer.train()

print(f"Fine-tuning complete. Saving model to '{output_dir}'")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print("✅ Model and tokenizer saved successfully!")


🚀 Starting fine-tuning...


Step,Training Loss


In [None]:
if __name__ == '__main__':
    # Run the Flask app in debug mode
    app.run(debug=True, port=3000)