## Neural Machine Translation (NMT) application:


<div style="float: left; margin-right: 20px;">

**Group ID:** 8

**Group Members Name with Student ID:**

| Sl.              | Student Name  | ID |
| :---- | :------: | ----: |
| 1. |  Imran Khan   | 2023ac05619 |
| 2. |  Priya M   | 2023AC05056 |
| 3. |  Mandar Khollam   | 2023AC05073 |
| 4. |  Ketan Bharat Purohit   | 2023AD05062 |
| 5. |  Nilesh Narayan Sonwane | 2023AC05827 |

</div>


### Setup for Fine-Tunning 

In [1]:
# Import Libraries
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    T5ForConditionalGeneration,
    T5Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
import os
import warnings
import json
warnings.filterwarnings("ignore")

###  Prepare translation data from ai4bharat/samanantar datasets.

In [2]:
# Data Preparation Functions
def prepare_translation_data(language="hi"):
    print(f"Loading samanantar dataset for {language}...")
    dataset = load_dataset("ai4bharat/samanantar", language, split='train')
    
    valid_pairs = []
    for i, example in enumerate(dataset):
        if i >= 5000: # smaller dataset for debugging
            break
        # Filter valid pairs
        if example['src'] and example['tgt'] and len(example['src'].strip()) > 0 and len(example['tgt'].strip()) > 0:
            valid_pairs.append({
                'english': example['src'].strip(),
                'target': example['tgt'].strip()
            })
    
    print(f"Found {len(valid_pairs)} valid translation pairs")
    
    # Split into train and validation
    split_idx = int(0.9 * len(valid_pairs))
    train_pairs = valid_pairs[:split_idx]
    val_pairs = valid_pairs[split_idx:]
    
    return train_pairs, val_pairs


### Fine-tune MarianMT model for English-Hindi translation.

In [3]:
# Fine tuning MarianMT model for English-Hindi
def fine_tune_marian_en_hi():
    print("===== Starting Fine-Tuning MarianMT English-Hindi =====")
    # Load MarianMT model
    model_name = "Helsinki-NLP/opus-mt-en-hi"
    # Output directory
    output_dir = "results/marian_en_hi_finetuned"
    
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        # Prepare data
        train_pairs, val_pairs = prepare_translation_data("hi")
        
        def preprocess_function(examples):
            inputs = examples["english"]
            targets = examples["target"] 
            model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=False)
            labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding=False)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs
        
        train_dataset = Dataset.from_list(train_pairs)
        val_dataset = Dataset.from_list(val_pairs)
        # Preprocess
        train_dataset = train_dataset.map(preprocess_function, batched=True)
        val_dataset = val_dataset.map(preprocess_function, batched=True)
        # Data Collator
        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
        # Training Arguments
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="steps",
            eval_steps=200,
            logging_steps=50,
            save_steps=200,
            save_total_limit=2,
            learning_rate=3e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            warmup_steps=200,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
        )
        # Trainer
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        # Train
        trainer.train()
        # Save model
        trainer.save_model(output_dir)
        # Save tokenizer
        tokenizer.save_pretrained(output_dir)
        
        return output_dir
        
    except Exception as e:
        print(f"MarianMT English-Hindi failed: {e}")
        return None


### Fine-tune MarianMT model for Hindi-English translation.

In [4]:
# Fine tuning MarianMT model for Hindi-English
def fine_tune_marian_hi_en():
    print("===== Starting Fine-Tuning MarianMT Hindi-English =====")
    # Load MarianMT model
    model_name = "Helsinki-NLP/opus-mt-hi-en"
    output_dir = "results/marian_hi_en_finetuned"
    
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        
        train_pairs, val_pairs = prepare_translation_data("hi")
        
        train_pairs_reversed = [{'english': pair['target'], 'target': pair['english']} for pair in train_pairs]
        val_pairs_reversed = [{'english': pair['target'], 'target': pair['english']} for pair in val_pairs]
        
        def preprocess_function(examples):
            inputs = examples["english"]
            targets = examples["target"]
            model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=False)
            labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding=False)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs
        
        train_dataset = Dataset.from_list(train_pairs_reversed)
        val_dataset = Dataset.from_list(val_pairs_reversed)
        
        train_dataset = train_dataset.map(preprocess_function, batched=True)
        val_dataset = val_dataset.map(preprocess_function, batched=True)
        
        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
        
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="steps",
            eval_steps=200,
            logging_steps=50,
            save_steps=200,
            save_total_limit=2,
            learning_rate=3e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            warmup_steps=200,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
        )
        
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        
        trainer.train()
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
        
        return output_dir
        
    except Exception as e:
        print(f"MarianMT Hindi-English failed: {e}")
        return None


### Fine-tune MarianMT model for English-Kannada translation.

In [5]:
# Fine tuning MarianMT model for English-Kannada
def fine_tune_marian_en_kn():
    
    print("===== Starting Fine-Tuning MarianMT English-Kannada =====")
    
    model_name = "Helsinki-NLP/opus-mt-en-mul"
    output_dir = "results/marian_en_kn_finetuned"
    
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        
        train_pairs, val_pairs = prepare_translation_data("kn")
        
        def preprocess_function(examples):
            inputs = examples["english"]
            targets = examples["target"] 
            model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=False)
            labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding=False)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs
        
        train_dataset = Dataset.from_list(train_pairs)
        val_dataset = Dataset.from_list(val_pairs)
        
        train_dataset = train_dataset.map(preprocess_function, batched=True)
        val_dataset = val_dataset.map(preprocess_function, batched=True)
        
        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
        
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="steps",
            eval_steps=200,
            logging_steps=50,
            save_steps=200,
            save_total_limit=2,
            learning_rate=3e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            warmup_steps=200,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
        )
        
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        
        trainer.train()
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
        
        return output_dir
        
    except Exception as e:
        print(f"MarianMT English-Kannada failed: {e}")
        return None


### Fine-tune MarianMT model for Kannada-English translation.

In [6]:
# Fine tuning MarianMT model for Kannada-English
def fine_tune_marian_kn_en():
    print("===== Starting Fine-Tuning MarianMT Kannada-English =====")
    
    model_name = "Helsinki-NLP/opus-mt-mul-en"
    output_dir = "results/marian_kn_en_finetuned"
    
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        
        train_pairs, val_pairs = prepare_translation_data("kn")
        
        train_pairs_reversed = [{'english': pair['target'], 'target': pair['english']} for pair in train_pairs]
        val_pairs_reversed = [{'english': pair['target'], 'target': pair['english']} for pair in val_pairs]
        
        def preprocess_function(examples):
            inputs = examples["english"]
            targets = examples["target"]
            model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=False)
            labels = tokenizer(text_target=targets, max_length=128, truncation=True, padding=False)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs
        
        train_dataset = Dataset.from_list(train_pairs_reversed)
        val_dataset = Dataset.from_list(val_pairs_reversed)
        
        train_dataset = train_dataset.map(preprocess_function, batched=True)
        val_dataset = val_dataset.map(preprocess_function, batched=True)
        
        data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
        
        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="steps",
            eval_steps=200,
            logging_steps=50,
            save_steps=200,
            save_total_limit=2,
            learning_rate=3e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            warmup_steps=200,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
        )
        
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        
        trainer.train()
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
        
        return output_dir
        
    except Exception as e:
        print(f"MarianMT Kannada-English failed: {e}")
        return None



### Define a Universal Translator Class

In [7]:
# Universal Translator Class for all model types
class UniversalTranslator:
    # Initialize the translator
    def __init__(self, model_path):
        self.model_path = model_path
        self.model_type = self._detect_model_type()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._load_model()
    # Detect the model type
    def _detect_model_type(self):
        config_path = os.path.join(self.model_path, "config.json")
        
        if os.path.exists(config_path):
            try:
                with open(config_path, 'r') as f:
                    config = json.load(f)
                    architecture = config.get("architectures", [""])[0].lower()
                    if "marian" in architecture:
                        return "marian"
                    elif "t5" in architecture:
                        return "t5"
            except:
                pass # Ignore errors
            
        # If config file doesn't exist, check model name
        if "marian" in self.model_path.lower():
            return "marian"
        elif "t5" in self.model_path.lower():
            return "t5"
        # Default to T5
        return "t5"
    
    # Load the model
    def _load_model(self):
        if self.model_type == "marian":
            self.tokenizer = MarianTokenizer.from_pretrained(self.model_path)
            self.model = MarianMTModel.from_pretrained(self.model_path)
        else:
            self.tokenizer = T5Tokenizer.from_pretrained(self.model_path)
            self.model = T5ForConditionalGeneration.from_pretrained(self.model_path)
            self.model_type = "t5"
        
        self.model.to(self.device)
        self.model.eval()
    
    # Translate text
    def translate(self, text, src_lang='en', tgt_lang='hi'):
        if self.model_type == "marian":
            input_text = text
        else:
            lang_map = {'en': 'English', 'hi': 'Hindi', 'kn': 'Kannada'}
            input_text = f"translate {lang_map[src_lang]} to {lang_map[tgt_lang]}: {text}"
        
        # Tokenize and generate
        inputs = self.tokenizer(
            input_text,
            return_tensors="pt",
            max_length=128,
            truncation=True,
            padding=True
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=128,
                num_beams=4,
                length_penalty=0.6,
                early_stopping=True,
                do_sample=False,
            )
        
        # Decode and return
        translation = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translation.strip()

### Translation Evaluation with METEOR and BLUE

In [8]:
# Evaluator Class
class TranslationEvaluator:
    def __init__(self):
        self.smoothing = SmoothingFunction().method1
    # Calculate BLEU
    def calculate_bleu(self, reference, candidate):
        if not reference.strip() or not candidate.strip():
            return 0.0
        ref_tokens = nltk.word_tokenize(reference.lower())
        cand_tokens = nltk.word_tokenize(candidate.lower())
        return round(sentence_bleu([ref_tokens], cand_tokens, smoothing_function=self.smoothing) * 100, 2)
    # Calculate METEOR
    def calculate_meteor(self, reference, candidate):
        if not reference.strip() or not candidate.strip():
            return 0.0
        ref_tokens = nltk.word_tokenize(reference.lower())
        cand_tokens = nltk.word_tokenize(candidate.lower())
        return round(meteor_score([ref_tokens], cand_tokens) * 100, 2)
    
# Run Evaluation
def run_evaluation(model_paths):
    """Evaluate the trained models."""
    for corpus in ['punkt', 'wordnet', 'omw-1.4']:
        nltk.download(corpus, quiet=True)
    # Initialize evaluator    
    evaluator = TranslationEvaluator()
    
    # Test cases for both directions
    test_cases = {
        'en_hi': [
            {'source': 'Hello', 'reference': 'नमस्ते'},
            {'source': 'How are you?', 'reference': 'आप कैसे हैं?'},
            {'source': 'Good morning', 'reference': 'सुप्रभात'},
            {'source': 'Thank you', 'reference': 'धन्यवाद'},
            {'source': 'I am fine', 'reference': 'मैं ठीक हूं'},
        ],
        'hi_en': [
            {'source': 'नमस्ते', 'reference': 'Hello'},
            {'source': 'आप कैसे हैं?', 'reference': 'How are you?'},
            {'source': 'सुप्रभात', 'reference': 'Good morning'},
            {'source': 'धन्यवाद', 'reference': 'Thank you'},
            {'source': 'मैं ठीक हूं', 'reference': 'I am fine'},
        ],
        'en_kn': [
            {'source': 'Hello', 'reference': 'ನಮಸ್ಕಾರ'},
            {'source': 'How are you?', 'reference': 'ಹೇಗಿದ್ದೀರಾ?'},
            {'source': 'Good morning', 'reference': 'ಶುಭೋದಯ'},
            {'source': 'Thank you', 'reference': 'ಧನ್ಯವಾದಗಳು'},
            {'source': 'I am fine', 'reference': 'ನಾನು ಚೆನ್ನಾಗಿದ್ದೇನೆ'},
        ],
        'kn_en': [
            {'source': 'ನಮಸ್ಕಾರ', 'reference': 'Hello'},
            {'source': 'ಹೇಗಿದ್ದೀರಾ?', 'reference': 'How are you?'},
            {'source': 'ಶುಭೋದಯ', 'reference': 'Good morning'},
            {'source': 'ಧನ್ಯವಾದಗಳು', 'reference': 'Thank you'},
            {'source': 'ನಾನು ಚೆನ್ನಾಗಿದ್ದೇನೆ', 'reference': 'I am fine'},
        ]
    }
    
    for direction, cases in test_cases.items():
        model_path = model_paths.get(direction)
        if not model_path or not os.path.exists(model_path):
            continue
            
        print(f"\n===== Evaluating {direction.upper()} Translation =====")
        translator = UniversalTranslator(model_path)
        results = []
        
        src_lang, tgt_lang = direction.split('_')
        for i, case in enumerate(cases, 1):
            try:
                prediction = translator.translate(case['source'], src_lang, tgt_lang)
                bleu = evaluator.calculate_bleu(case['reference'], prediction)
                meteor = evaluator.calculate_meteor(case['reference'], prediction)
                
                results.append({
                    'Source': case['source'],
                    'Reference': case['reference'], 
                    'Prediction': prediction,
                    'BLEU': bleu,
                    'METEOR': meteor
                })
                
            except Exception as e:
                results.append({
                    'Source': case['source'],
                    'Reference': case['reference'],
                    'Prediction': f"ERROR: {str(e)}",
                    'BLEU': 0.0,
                    'METEOR': 0.0
                })
        
        # Data Frame to format the results
        df = pd.DataFrame(results)
        print(df.to_string(index=False, max_colwidth=40))
        
        valid_results = df[df['BLEU'] > 0]
        if len(valid_results) > 0:
            avg_bleu = valid_results['BLEU'].mean()
            avg_meteor = valid_results['METEOR'].mean()
            print(f"\nAverage BLEU: {avg_bleu:.2f}")
            print(f"Average METEOR: {avg_meteor:.2f}")

### Start Fine-tunning processes

In [9]:
# Train all four models
def train_all_models():
    """Train all four models."""
    os.makedirs("results", exist_ok=True)
    
    model_paths = {}
    
    # Train English-Hindi
    en_hi_path = fine_tune_marian_en_hi()
    if en_hi_path:
        model_paths['en_hi'] = en_hi_path
    
    # Train Hindi-English
    hi_en_path = fine_tune_marian_hi_en()
    if hi_en_path:
        model_paths['hi_en'] = hi_en_path
    
    # Train English-Kannada
    en_kn_path = fine_tune_marian_en_kn()
    if en_kn_path:
        model_paths['en_kn'] = en_kn_path
    
    # Train Kannada-English
    kn_en_path = fine_tune_marian_kn_en()
    if kn_en_path:
        model_paths['kn_en'] = kn_en_path
    
    # Evaluate models
    if model_paths:
        run_evaluation(model_paths)
    
    return model_paths
# Capture the trained models paths
trained_models = train_all_models()

===== Starting Fine-Tuning MarianMT English-Hindi =====
Loading samanantar dataset for hi...
Found 5000 valid translation pairs


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
200,4.1681,3.757415
400,3.6922,3.588024
600,3.3591,3.52431
800,3.3135,3.505749


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61949]], 'forced_eos_token_id': 0}


===== Starting Fine-Tuning MarianMT Hindi-English =====
Loading samanantar dataset for hi...
Found 5000 valid translation pairs


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
200,4.1523,3.755752
400,3.6103,3.590309
600,3.2306,3.539832
800,3.2081,3.515731


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
Checkpoint destination directory results/marian_hi_en_finetuned/checkpoint-800 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[61126]], 'forc

===== Starting Fine-Tuning MarianMT English-Kannada =====
Loading samanantar dataset for kn...
Found 5000 valid translation pairs


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
200,2.4617,2.21324
400,2.1583,2.07892
600,1.9569,2.021572
800,1.9608,1.994814


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
Checkpoint destination directory results/marian_en_kn_finetuned/checkpoint-800 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[64109]], 'forc

===== Starting Fine-Tuning MarianMT Kannada-English =====
Loading samanantar dataset for kn...
Found 5000 valid translation pairs


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
200,3.0744,2.816344
400,2.5334,2.70301
600,2.1646,2.672754
800,2.1095,2.651908


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}
Checkpoint destination directory results/marian_kn_en_finetuned/checkpoint-800 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forced_eos_token_id': 0}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[64171]], 'forc


===== Evaluating EN_HI Translation =====
      Source    Reference   Prediction   BLEU  METEOR
       Hello       नमस्ते         सलाम   0.00    0.00
How are you? आप कैसे हैं? आप कैसे हैं? 100.00   99.22
Good morning     सुप्रभात     सुप्रभात  17.78   50.00
   Thank you      धन्यवाद      धन्यवाद  17.78   50.00
   I am fine  मैं ठीक हूं  मैं ठीक हूं  56.23   98.15

Average BLEU: 47.95
Average METEOR: 74.34

===== Evaluating HI_EN Translation =====
      Source    Reference   Prediction   BLEU  METEOR
      नमस्ते        Hello          Hi.   0.00   45.45
आप कैसे हैं? How are you? How are you? 100.00   99.22
    सुप्रभात Good morning Good morning  31.62   93.75
     धन्यवाद    Thank you    Thank you  31.62   93.75
 मैं ठीक हूं    I am fine     I'm fine  13.51   33.33

Average BLEU: 44.19
Average METEOR: 80.01

===== Evaluating EN_KN Translation =====
      Source           Reference     Prediction  BLEU  METEOR
       Hello             ನಮಸ್ಕಾರ           ಹಾಲೊ  0.00    0.00
How are you?    

In [11]:
# Import Libraries
import time
import os
from flask import Flask, request, jsonify
from flask_cors import CORS
import threading

# Flask API Setup
app = Flask(__name__)
CORS(app)

# Global Variables
translator_cache = {}
LANGUAGES = {'en': 'English', 'hi': 'Hindi', 'kn': 'Kannada'}
MODEL_PATHS = {
    'en_hi': 'results/marian_en_hi_finetuned',
    'hi_en': 'results/marian_hi_en_finetuned',
    'en_kn': 'results/marian_en_kn_finetuned',
    'kn_en': 'results/marian_kn_en_finetuned'
}

# Function to get model path
def get_model_path(src_lang, tgt_lang):
    language_pair = f"{src_lang}_{tgt_lang}"
    
    if language_pair in MODEL_PATHS and os.path.exists(MODEL_PATHS[language_pair]):
        return MODEL_PATHS[language_pair]
    
    for path in MODEL_PATHS.values():
        if os.path.exists(path):
            return path
        
    return None

# Function to get translator
def get_translator(src_lang, tgt_lang):
    cache_key = f"{src_lang}_{tgt_lang}"
    
    if cache_key in translator_cache:
        return translator_cache[cache_key]
    
    model_path = get_model_path(src_lang, tgt_lang)
    
    if not model_path:
        raise Exception(f"No model available for {src_lang} -> {tgt_lang} translation")
    
    try:
        translator = UniversalTranslator(model_path)
        translator_cache[cache_key] = translator
        return translator
    except Exception as e:
        raise Exception(f"Failed to load translator: {str(e)}")

# API Endpoints
@app.route('/api/health', methods=['GET'])
def health_check():
    model_status = {}
    # Check model paths
    for pair, path in MODEL_PATHS.items():
        model_status[pair] = {
            'path': path,
            'exists': os.path.exists(path),
            'cached': pair in translator_cache
        }
    # Check translator cache
    return jsonify({
        'status': 'healthy',
        'supported_languages': LANGUAGES,
        'model_status': model_status,
        'cached_translators': list(translator_cache.keys()),
        'device': 'cuda' if torch.cuda.is_available() else 'cpu'
    })

@app.route('/api/translate', methods=['POST'])
def translate_text():
    try:
        data = request.json
        if not data:
            return jsonify({'error': 'No JSON data provided.'}), 400
            
        text = data.get('text', '').strip()
        src_lang = data.get('src_lang', 'en')
        tgt_lang = data.get('tgt_lang', 'hi')

        if not text:
            return jsonify({'error': 'No text provided for translation.'}), 400
        if src_lang not in LANGUAGES or tgt_lang not in LANGUAGES:
            return jsonify({'error': 'Unsupported language selected.'}), 400
        if src_lang == tgt_lang:
            return jsonify({'error': 'Source and target languages are the same.'}), 400

        start_time = time.time()
        
        try:
            translator = get_translator(src_lang, tgt_lang)
            translation = translator.translate(text, src_lang, tgt_lang)
        except Exception as e:
            return jsonify({'error': f'Translation failed: {str(e)}'}), 503
        
        end_time = time.time()

        return jsonify({
            'source_text': text,
            'source_language': src_lang,
            'source_language_name': LANGUAGES[src_lang],
            'target_language': tgt_lang,
            'target_language_name': LANGUAGES[tgt_lang],
            'translation': translation,
            'model_used': get_model_path(src_lang, tgt_lang),
            'processing_time': round(end_time - start_time, 3)
        })

    except Exception as e:
        return jsonify({'error': 'An internal server error occurred.'}), 500

@app.route('/api/batch-translate', methods=['POST'])
def batch_translate():
    try:
        data = request.json
        if not data:
            return jsonify({'error': 'No JSON data provided.'}), 400
            
        texts = data.get('texts', [])
        src_lang = data.get('src_lang', 'en')
        tgt_lang = data.get('tgt_lang', 'hi')
        
        if not texts or not isinstance(texts, list):
            return jsonify({'error': 'No texts array provided.'}), 400
        if len(texts) > 100:
            return jsonify({'error': 'Maximum 100 texts allowed per batch.'}), 400
        if src_lang not in LANGUAGES or tgt_lang not in LANGUAGES:
            return jsonify({'error': 'Unsupported language selected.'}), 400
        if src_lang == tgt_lang:
            return jsonify({'error': 'Source and target languages are the same.'}), 400

        start_time = time.time()
        
        try:
            translator = get_translator(src_lang, tgt_lang)
        except Exception as e:
            return jsonify({'error': f'Failed to load translator: {str(e)}'}), 503
        
        translations = []
        
        for text in texts:
            if isinstance(text, str) and text.strip():
                try:
                    translation = translator.translate(text.strip(), src_lang, tgt_lang)
                    translations.append({
                        'source': text.strip(),
                        'translation': translation,
                        'success': True
                    })
                except Exception as e:
                    translations.append({
                        'source': text.strip(),
                        'translation': None,
                        'success': False,
                        'error': str(e)
                    })
            else:
                translations.append({
                    'source': text,
                    'translation': None,
                    'success': False,
                    'error': 'Invalid text format'
                })
        
        end_time = time.time()
        
        return jsonify({
            'translations': translations,
            'source_language': src_lang,
            'target_language': tgt_lang,
            'model_used': get_model_path(src_lang, tgt_lang),
            'total_count': len(translations),
            'success_count': sum(1 for t in translations if t['success']),
            'processing_time': round(end_time - start_time, 3)
        })
        
    except Exception as e:
        return jsonify({'error': 'An internal server error occurred.'}), 500

@app.route('/api/languages', methods=['GET'])
def get_languages():
    available_pairs = []
    
    for src in LANGUAGES.keys():
        for tgt in LANGUAGES.keys():
            if src != tgt:
                model_path = get_model_path(src, tgt)
                if model_path:
                    available_pairs.append({
                        'source': src,
                        'target': tgt,
                        'source_name': LANGUAGES[src],
                        'target_name': LANGUAGES[tgt],
                        'model_path': model_path
                    })
    
    return jsonify({
        'supported_languages': LANGUAGES,
        'available_translation_pairs': available_pairs,
        'total_pairs': len(available_pairs)
    })

def initialize_translators():
    print("Initializing translators...")
    
    common_pairs = [('en', 'hi'), ('hi', 'en'), ('en', 'kn'), ('kn', 'en')]
    
    for src_lang, tgt_lang in common_pairs:
        try:
            get_translator(src_lang, tgt_lang)
            print(f"✓ Initialized {src_lang} -> {tgt_lang} translator")
        except Exception as e:
            print(f"✗ Failed to initialize {src_lang} -> {tgt_lang}: {e}")

def start_api_server():
    initialize_translators()
    app.run(debug=False, use_reloader=False, host='0.0.0.0', port=5005, threaded=True)

def start_translation_api():
    thread = threading.Thread(target=start_api_server, daemon=True)
    thread.start()
    print("API server started on http://localhost:5005")
    return thread

# Execute training and start API
if __name__ == "__main__":
    # Train all models
    # trained_models = train_all_models()
    
    # Start API server
    start_translation_api()

Initializing translators...
API server started on http://localhost:5005


✓ Initialized en -> hi translator
✓ Initialized hi -> en translator
✓ Initialized en -> kn translator
✓ Initialized kn -> en translator
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5005
 * Running on http://192.168.202.209:5005
[33mPress CTRL+C to quit[0m
