## Fine-tune MarianMT model for English-Hindi translation

In [5]:
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import os
import warnings
warnings.filterwarnings("ignore")

def prepare_translation_data():
    """Prepare translation data from samanantar dataset."""
    print("Loading samanantar dataset...")
    dataset = load_dataset("ai4bharat/samanantar", "hi", split='train')
    
    # Filter valid pairs and create smaller dataset for testing
    valid_pairs = []
    for i, example in enumerate(dataset):
        if i >= 5000:  # Use smaller dataset for debugging
            break
        if example['src'] and example['tgt'] and len(example['src'].strip()) > 0 and len(example['tgt'].strip()) > 0:
            valid_pairs.append({
                'english': example['src'].strip(),
                'hindi': example['tgt'].strip()
            })
    
    print(f"Found {len(valid_pairs)} valid translation pairs")
    
    # Split into train/val
    split_idx = int(0.9 * len(valid_pairs))
    train_pairs = valid_pairs[:split_idx]
    val_pairs = valid_pairs[split_idx:]
    
    return train_pairs, val_pairs

def fine_tune_marian_model():
    """Fine-tune MarianMT model for English-Hindi translation."""
    print("--- Starting Fine-Tuning with MarianMT ---")
    
    # Use Helsinki-NLP's multilingual model as base
    model_name = "Helsinki-NLP/opus-mt-en-hi"
    output_dir = "results/marian_en_hi_finetuned"
    
    try:
        # Load model and tokenizer
        print(f"Loading {model_name}...")
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        
        # Prepare data
        train_pairs, val_pairs = prepare_translation_data()
        
        def preprocess_function(examples):
            inputs = examples["english"]
            targets = examples["hindi"] 
            model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=False)
            
            # Setup the tokenizer for targets
            labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding=False)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs
        
        # Convert to datasets
        train_dataset = Dataset.from_list(train_pairs)
        val_dataset = Dataset.from_list(val_pairs)
        
        print("Preprocessing datasets...")
        train_dataset = train_dataset.map(preprocess_function, batched=True)
        val_dataset = val_dataset.map(preprocess_function, batched=True)
        
        # Data collator
        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
        
        # Training arguments
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="steps",
            eval_steps=200,
            logging_steps=50,
            save_steps=200,
            save_total_limit=2,
            learning_rate=3e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            warmup_steps=200,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
        )
        
        # Trainer
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        
        print("Starting training...")
        trainer.train()
        
        print(f"Saving model to {output_dir}")
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
        
        return output_dir
        
    except Exception as e:
        print(f"MarianMT failed: {e}")
        print("Falling back to T5 model...")
        return fine_tune_t5_model()

def fine_tune_t5_model():
    """Fallback: Fine-tune T5 model for translation."""
    from transformers import T5ForConditionalGeneration, T5Tokenizer
    
    model_name = "t5-small"
    output_dir = "results/t5_en_hi_finetuned"
    
    print(f"Loading {model_name}...")
    tokenizer = T5Tokenizer.from_pretrained(model_name)
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    
    # Prepare data
    train_pairs, val_pairs = prepare_translation_data()
    
    # Create training data with T5 format
    train_data = []
    val_data = []
    
    for pair in train_pairs:
        train_data.append({
            'input_text': f"translate English to Hindi: {pair['english']}",
            'target_text': pair['hindi']
        })
    
    for pair in val_pairs:
        val_data.append({
            'input_text': f"translate English to Hindi: {pair['english']}",
            'target_text': pair['hindi']
        })
    
    def preprocess_function(examples):
        inputs = examples["input_text"]
        targets = examples["target_text"]
        model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=False)
        labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding=False)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
    
    # Convert to datasets
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)
    
    print("Preprocessing datasets...")
    train_dataset = train_dataset.map(preprocess_function, batched=True)
    val_dataset = val_dataset.map(preprocess_function, batched=True)
    
    # Data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    
    # Training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="steps",
        eval_steps=200,
        logging_steps=50,
        save_steps=200,
        save_total_limit=2,
        learning_rate=3e-4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        warmup_steps=200,
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )
    
    # Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    
    print("Starting T5 training...")
    trainer.train()
    
    print(f"Saving T5 model to {output_dir}")
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    return output_dir

class UniversalTranslator:
    """Universal translator that works with different model types."""
    
    def __init__(self, model_path):
        print(f"Loading translator from: {model_path}")
        
        # Detect model type
        config_path = os.path.join(model_path, "config.json")
        self.model_type = "unknown"
        
        if os.path.exists(config_path):
            import json
            with open(config_path, 'r') as f:
                config = json.load(f)
                if "marian" in config.get("architectures", [""])[0].lower():
                    self.model_type = "marian"
                elif "t5" in config.get("architectures", [""])[0].lower():
                    self.model_type = "t5"
        
        print(f"Detected model type: {self.model_type}")
        
        if self.model_type == "marian":
            self.tokenizer = MarianTokenizer.from_pretrained(model_path)
            self.model = MarianMTModel.from_pretrained(model_path)
        else:  # Default to T5
            from transformers import T5ForConditionalGeneration, T5Tokenizer
            self.tokenizer = T5Tokenizer.from_pretrained(model_path)
            self.model = T5ForConditionalGeneration.from_pretrained(model_path)
            self.model_type = "t5"
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()
        
        print("Translator ready!")

    def translate(self, text, src_lang='en', tgt_lang='hi'):
        """Translate text based on model type."""
        if self.model_type == "marian":
            # MarianMT expects plain text
            input_text = text
        else:  # T5
            input_text = f"translate English to Hindi: {text}"
        
        # Tokenize
        inputs = self.tokenizer(
            input_text,
            return_tensors="pt",
            max_length=128,
            truncation=True,
            padding=True
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=128,
                num_beams=4,
                length_penalty=0.6,
                early_stopping=True,
                do_sample=False,
            )
        
        # Decode
        translation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translation.strip()

class TranslationEvaluator:
    def __init__(self):
        self.smoothing = SmoothingFunction().method1
    
    def calculate_bleu(self, reference, candidate):
        if not reference.strip() or not candidate.strip():
            return 0.0
        ref_tokens = nltk.word_tokenize(reference.lower())
        cand_tokens = nltk.word_tokenize(candidate.lower())
        return round(sentence_bleu([ref_tokens], cand_tokens, smoothing_function=self.smoothing) * 100, 2)
    
    def calculate_meteor(self, reference, candidate):
        if not reference.strip() or not candidate.strip():
            return 0.0
        ref_tokens = nltk.word_tokenize(reference.lower())
        cand_tokens = nltk.word_tokenize(candidate.lower())
        return round(meteor_score([ref_tokens], cand_tokens) * 100, 2)

def run_evaluation(model_path):
    """Evaluate the trained model."""
    print("\n--- Starting Evaluation ---")
    
    # Download NLTK data
    for corpus in ['punkt', 'wordnet', 'omw-1.4']:
        nltk.download(corpus, quiet=True)

    translator = UniversalTranslator(model_path)
    evaluator = TranslationEvaluator()
    
    test_cases = [
        {'source': 'Hello', 'reference': 'नमस्ते'},
        {'source': 'How are you?', 'reference': 'आप कैसे हैं?'},
        {'source': 'Good morning', 'reference': 'सुप्रभात'},
        {'source': 'Thank you', 'reference': 'धन्यवाद'},
        {'source': 'I am fine', 'reference': 'मैं ठीक हूं'},
        {'source': 'What is your name?', 'reference': 'आपका नाम क्या है?'},
        {'source': 'Nice to meet you', 'reference': 'आपसे मिलकर खुशी हुई'},
        {'source': 'How much does this cost?', 'reference': 'इसकी कीमत कितनी है?'},
    ]
    
    results = []
    print("Evaluating translations...")
    
    for i, case in enumerate(test_cases, 1):
        print(f"{i:2d}. '{case['source']}'")
        
        try:
            prediction = translator.translate(case['source'])
            bleu = evaluator.calculate_bleu(case['reference'], prediction)
            meteor = evaluator.calculate_meteor(case['reference'], prediction)
            
            results.append({
                'Source': case['source'],
                'Reference': case['reference'], 
                'Prediction': prediction,
                'BLEU': bleu,
                'METEOR': meteor
            })
            
            print(f"    → {prediction}")
            
        except Exception as e:
            print(f"    → ERROR: {str(e)}")
            results.append({
                'Source': case['source'],
                'Reference': case['reference'],
                'Prediction': f"ERROR: {str(e)}",
                'BLEU': 0.0,
                'METEOR': 0.0
            })
    
    # Results summary
    df = pd.DataFrame(results)
    print(f"\n{'='*80}")
    print("EVALUATION RESULTS")
    print(f"{'='*80}")
    print(df.to_string(index=False, max_colwidth=40))
    
    valid_results = df[df['BLEU'] > 0]
    if len(valid_results) > 0:
        avg_bleu = valid_results['BLEU'].mean()
        avg_meteor = valid_results['METEOR'].mean()
        print(f"\nAverage BLEU: {avg_bleu:.2f}")
        print(f"Average METEOR: {avg_meteor:.2f}")
        print(f"Success rate: {len(valid_results)}/{len(results)} ({100*len(valid_results)/len(results):.1f}%)")
    else:
        print("\nNo successful translations generated.")

def main():
    """Main execution function."""
    os.makedirs("results", exist_ok=True)
    
    # Train model (try MarianMT first, fallback to T5)
    model_path = fine_tune_marian_model()
    
    # Evaluate model
    run_evaluation(model_path)
    
    return model_path

if __name__ == "__main__":
    main()

--- Starting Fine-Tuning with MarianMT ---
Loading Helsinki-NLP/opus-mt-en-hi...


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Loading samanantar dataset...
Found 5000 valid translation pairs
Preprocessing datasets...


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Starting training...


Step,Training Loss,Validation Loss
200,4.1681,3.757415
400,3.6922,3.588024
600,3.3591,3.52431
800,3.3135,3.505749


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}


Saving model to results/marian_en_hi_finetuned

--- Starting Evaluation ---
Loading translator from: results/marian_en_hi_finetuned
Detected model type: marian
Translator ready!
Evaluating translations...
 1. 'Hello'
    → सलाम
 2. 'How are you?'
    → आप कैसे हैं?
 3. 'Good morning'
    → सुप्रभात
 4. 'Thank you'
    → धन्यवाद
 5. 'I am fine'
    → मैं ठीक हूं
 6. 'What is your name?'
    → आपका GroupWise कूटशब्द क्या है?
 7. 'Nice to meet you'
    → आपसे मिलकर अच्छा लगा
 8. 'How much does this cost?'
    → यह खर्च कितना है?

EVALUATION RESULTS
                  Source           Reference                      Prediction   BLEU  METEOR
                   Hello              नमस्ते                            सलाम   0.00    0.00
            How are you?        आप कैसे हैं?                    आप कैसे हैं? 100.00   99.22
            Good morning            सुप्रभात                        सुप्रभात  17.78   50.00
               Thank you             धन्यवाद                         धन्यवाद  17

In [None]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import time
import threading

app = Flask(__name__)
CORS(app)

# Global translator instance
translator = None
LANGUAGES = {'en': 'English', 'hi': 'Hindi'}

def initialize_translator():
    global translator
    print("Initializing mBART translator...")
    try:
        # Try fine-tuned model first
        model_path = "results/mbart_en_hi_bidirectional"
        translator = MBartTranslator(model_path)
        print("Fine-tuned mBART translator initialized.")
    except Exception as e:
        print(f"Fine-tuned model not found: {e}")
        print("Using pre-trained mBART model...")
        try:
            # Fallback to pre-trained mBART
            class PretrainedMBart:
                def __init__(self):
                    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
                    model_name = "facebook/mbart-large-50-many-to-many-mmt"
                    self.tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
                    self.model = MBartForConditionalGeneration.from_pretrained(model_name)
                    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                    self.model.to(self.device)
                    self.model.eval()
                    
                def translate(self, text, src_lang='en', tgt_lang='hi'):
                    lang_codes = {'en': 'en_XX', 'hi': 'hi_IN'}
                    src_code = lang_codes[src_lang]
                    tgt_code = lang_codes[tgt_lang]
                    
                    self.tokenizer.src_lang = src_code
                    inputs = self.tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
                    
                    generated_tokens = self.model.generate(
                        **inputs,
                        forced_bos_token_id=self.tokenizer.lang_code_to_id[tgt_code],
                        max_length=128,
                        num_beams=5,
                    )
                    
                    return self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0].strip()
            
            translator = PretrainedMBart()
            print("Pre-trained mBART translator initialized.")
        except Exception as e2:
            print(f"Failed to initialize any translator: {e2}")
            translator = None

@app.route('/api/health', methods=['GET'])
def health_check():
    return jsonify({
        'status': 'healthy',
        'translator_ready': translator is not None,
        'supported_languages': LANGUAGES
    })

@app.route('/api/translate', methods=['POST'])
def translate_text():
    try:
        if translator is None:
            return jsonify({'error': 'Translator not initialized. Please check model path.'}), 503
        
        data = request.json
        if not data:
            return jsonify({'error': 'No JSON data provided.'}), 400
            
        text = data.get('text', '').strip()
        src_lang = data.get('src_lang', 'en')
        tgt_lang = data.get('tgt_lang', 'hi')

        if not text:
            return jsonify({'error': 'No text provided for translation.'}), 400
        if src_lang not in LANGUAGES or tgt_lang not in LANGUAGES:
            return jsonify({'error': 'Unsupported language selected.'}), 400
        if src_lang == tgt_lang:
            return jsonify({'error': 'Source and target languages are the same.'}), 400

        start_time = time.time()
        translation = translator.translate(text, src_lang, tgt_lang)
        end_time = time.time()

        return jsonify({
            'source_text': text,
            'source_language': src_lang,
            'target_language': tgt_lang,
            'translation': translation,
            'processing_time': round(end_time - start_time, 3)
        })

    except Exception as e:
        print(f"Translation error: {e}")
        return jsonify({'error': 'An internal server error occurred.'}), 500

@app.route('/api/batch-translate', methods=['POST'])
def batch_translate():
    try:
        if translator is None:
            return jsonify({'error': 'Translator not initialized.'}), 503
            
        data = request.json
        if not data:
            return jsonify({'error': 'No JSON data provided.'}), 400
            
        texts = data.get('texts', [])
        src_lang = data.get('src_lang', 'en')
        tgt_lang = data.get('tgt_lang', 'hi')
        
        if not texts or not isinstance(texts, list):
            return jsonify({'error': 'No texts array provided.'}), 400
        if len(texts) > 100:
            return jsonify({'error': 'Maximum 100 texts allowed per batch.'}), 400

        start_time = time.time()
        translations = []
        
        for text in texts:
            if isinstance(text, str) and text.strip():
                try:
                    translation = translator.translate(text.strip(), src_lang, tgt_lang)
                    translations.append({
                        'source': text.strip(),
                        'translation': translation,
                        'success': True
                    })
                except Exception as e:
                    translations.append({
                        'source': text.strip(),
                        'translation': None,
                        'success': False,
                        'error': str(e)
                    })
            else:
                translations.append({
                    'source': text,
                    'translation': None,
                    'success': False,
                    'error': 'Invalid text format'
                })
        
        end_time = time.time()
        
        return jsonify({
            'translations': translations,
            'total_count': len(translations),
            'success_count': sum(1 for t in translations if t['success']),
            'processing_time': round(end_time - start_time, 3)
        })
        
    except Exception as e:
        print(f"Batch translation error: {e}")
        return jsonify({'error': 'An internal server error occurred.'}), 500

@app.route('/api/languages', methods=['GET'])
def get_languages():
    return jsonify({
        'supported_languages': LANGUAGES,
        'translation_pairs': [
            {'source': 'en', 'target': 'hi'},
            {'source': 'hi', 'target': 'en'}
        ]
    })

def run_flask_app():
    initialize_translator()
    app.run(debug=False, use_reloader=False, host='0.0.0.0', port=5000, threaded=True)

def start_api_server():
    thread = threading.Thread(target=run_flask_app, daemon=True)
    thread.start()
    return thread

# For Jupyter notebook usage
def start_translation_api():
    print("Starting translation API server...")
    thread = start_api_server()
    print("API server started on http://localhost:5000")
    print("\nAvailable endpoints:")
    print("  GET  /api/health - Check server status")
    print("  GET  /api/languages - Get supported languages")
    print("  POST /api/translate - Translate single text")
    print("  POST /api/batch-translate - Translate multiple texts")
    print("\nExample usage:")
    print("curl -X POST http://localhost:5000/api/translate \\")
    print("  -H 'Content-Type: application/json' \\")
    print("  -d '{\"text\": \"Hello\", \"src_lang\": \"en\", \"tgt_lang\": \"hi\"}'")
    return thread

if __name__ == '__main__':
    run_flask_app()

Initializing mBART translator...
Loading mBART translator from: results/mbart_en_hi_bidirectional
mBART translator ready!
Fine-tuned mBART translator initialized.
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://192.168.202.209:5000
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [19/Aug/2025 09:05:02] "OPTIONS /api/translate HTTP/1.1" 200 -
127.0.0.1 - - [19/Aug/2025 09:05:03] "POST /api/translate HTTP/1.1" 200 -
127.0.0.1 - - [19/Aug/2025 09:05:05] "POST /api/translate HTTP/1.1" 200 -
127.0.0.1 - - [19/Aug/2025 09:05:16] "OPTIONS /api/translate HTTP/1.1" 200 -
127.0.0.1 - - [19/Aug/2025 09:05:16] "POST /api/translate HTTP/1.1" 200 -
127.0.0.1 - - [19/Aug/2025 09:05:23] "OPTIONS /api/translate HTTP/1.1" 200 -
127.0.0.1 - - [19/Aug/2025 09:05:23] "POST /api/translate HTTP/1.1" 200 -
127.0.0.1 - - [19/Aug/2025 09:05:26] "POST /api/translate HTTP/1.1" 200 -


: 