Applying a Transformer Model to a Machine Translation Task

In [13]:
import torch
from transformers import MarianMTModel, MarianTokenizer
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')



In [14]:
class TransformerTranslator:
    """Thin wrapper around a pretrained Marian MT model and its tokenizer.

    The tokenizer and model are loaded once in ``__init__`` and moved to the
    GPU when one is available. ``translate_text`` handles one string and
    ``translate_batch`` handles a collection; both go through the same
    tokenize -> generate -> decode helper so their settings cannot drift apart.
    """

    # Token limit used both for truncating inputs and for capping generated output.
    MAX_LENGTH = 512
    # Beam width passed to ``generate``.
    NUM_BEAMS = 4

    def __init__(self, model_name="Helsinki-NLP/opus-mt-en-fr"):
        """
        Initialize transformer model for translation
        Default: English to French translation

        Args:
            model_name: Identifier handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        print(f"Loading model: {model_name}")
        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        print(f"Model loaded on device: {self.device}")

    def _translate(self, texts):
        """Tokenize, generate and decode ``texts``; return one string per output sequence.

        ``texts`` is passed to the tokenizer unchanged, so it may be a single
        string or a list of strings.
        """
        # Inputs longer than MAX_LENGTH tokens are truncated; shorter ones are padded
        # to the longest item so the batch forms a rectangular tensor.
        encoded = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.MAX_LENGTH,
        )
        # The input tensors must live on the same device as the model.
        encoded = {name: tensor.to(self.device) for name, tensor in encoded.items()}

        # Inference only: skip gradient bookkeeping.
        with torch.no_grad():
            generated = self.model.generate(
                **encoded,
                max_length=self.MAX_LENGTH,
                num_beams=self.NUM_BEAMS,
                early_stopping=True,
            )

        return [self.tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated]

    def translate_text(self, text):
        """Translate a single text and return the translated string."""
        return self._translate(text)[0]

    def translate_batch(self, texts, batch_size=8):
        """Translate multiple texts efficiently.

        Args:
            texts: Iterable of source strings. A bare ``str`` is treated as a
                one-element batch instead of being sliced into character chunks.
            batch_size: Number of texts sent through the model per step; must
                be at least 1.

        Returns:
            List of translated strings, in the same order as ``texts``.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Materialise so that len() and slicing work for any iterable.
        texts = [texts] if isinstance(texts, str) else list(texts)

        translations = []
        for start in tqdm(range(0, len(texts), batch_size), desc="Translating"):
            translations.extend(self._translate(texts[start:start + batch_size]))

        return translations



In [15]:
def evaluate_translations(original, translated, reference=None):
    """Compute simple length statistics and, optionally, a unigram-overlap score.

    Args:
        original: Sequence of source sentences.
        translated: Sequence of translated sentences.
        reference: Optional sequence of reference translations. When it is
            ``None`` or empty, no overlap score is computed.

    Returns:
        dict with:
            'total_sentences': ``len(original)``.
            'avg_original_length': mean whitespace-token count of ``original``.
            'avg_translated_length': mean whitespace-token count of ``translated``.
            'avg_bleu_1': only when ``reference`` is given; mean clipped
                unigram precision of each translation against its reference.
                This is a simplification of BLEU-1: text is lower-cased and
                split on whitespace, and no brevity penalty is applied.

        Every average is ``0.0`` when there is nothing to average, so empty
        inputs no longer raise ``ZeroDivisionError``.
    """
    from collections import Counter

    def _mean(values):
        # Average that tolerates an empty collection.
        values = list(values)
        return sum(values) / len(values) if values else 0.0

    results = {
        'total_sentences': len(original),
        'avg_original_length': _mean(len(text.split()) for text in original),
        'avg_translated_length': _mean(len(text.split()) for text in translated),
    }

    if reference:
        precisions = []
        # zip() stops at the shorter input, so unmatched sentences are not scored.
        for trans, ref in zip(translated, reference):
            trans_words = trans.lower().split()
            ref_words = ref.lower().split()

            # Counter intersection keeps min(count) per word, i.e. each candidate
            # word is credited at most as often as it occurs in the reference.
            overlap = sum((Counter(trans_words) & Counter(ref_words)).values())
            precisions.append(overlap / len(trans_words) if trans_words else 0.0)

        results['avg_bleu_1'] = _mean(precisions)

    return results



In [16]:
def _load_demo_sentences():
    """Return ``(english_texts, french_references)`` for the demo.

    Tries the first 100 test pairs of the WMT14 fr-en dataset; if anything in
    that attempt fails, falls back to ten hard-coded English sentences, in
    which case ``french_references`` is ``None``.
    """
    print("\nLoading sample dataset...")
    try:
        # Using a small subset of WMT14 English-French dataset
        dataset = load_dataset("wmt14", "fr-en", split="test[:100]")  # Small subset for demo

        english_texts = [item['translation']['en'] for item in dataset]
        french_references = [item['translation']['fr'] for item in dataset]

        print(f"Loaded {len(english_texts)} sentence pairs from WMT14 dataset")
        return english_texts, french_references

    except Exception as e:
        # Deliberate best-effort: the demo must still run offline or when the
        # dataset cannot be fetched, so the failure is reported and bypassed.
        print(f"Could not load WMT14 dataset: {e}")
        print("Using manual sample sentences instead...")

        english_texts = [
            "Hello, how are you today?",
            "The weather is beautiful outside.",
            "I love learning about artificial intelligence.",
            "Machine translation has improved significantly with transformers.",
            "This is a demonstration of neural machine translation.",
            "Python is a powerful programming language.",
            "Deep learning models require large datasets.",
            "Natural language processing is fascinating.",
            "Transformers revolutionized machine translation.",
            "Thank you for your attention."
        ]
        return english_texts, None


def _run_interactive_loop(translator):
    """Read English sentences from stdin and print their translation until the user quits."""
    print("\n=== Interactive Translation ===")
    print("Enter English sentences to translate (type 'quit' to exit):")

    while True:
        try:
            user_input = input("\nEN: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin closed/unavailable or the user interrupted: leave the loop
            # cleanly instead of ending the demo with a traceback.
            print()
            break
        if user_input.lower() in ['quit', 'exit', 'q']:
            break
        if user_input:
            print(f"FR: {translator.translate_text(user_input)}")


def main(num_samples=10, interactive=True):
    """Run the end-to-end English -> French translation demo.

    Steps: load the model, load sample sentences, translate one sentence,
    translate the first ``num_samples`` sentences as a batch, print them next
    to the references (when available), print evaluation metrics, and finally
    start an interactive prompt.

    Args:
        num_samples: How many sentences to translate and evaluate in the batch
            part of the demo; must be at least 1.
        interactive: Set to ``False`` to skip the stdin prompt, e.g. for an
            unattended run of the whole notebook.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be >= 1, got {num_samples}")

    print("=== Transformer Machine Translation Demo ===\n")

    # Initialize translator (English to French)
    translator = TransformerTranslator("Helsinki-NLP/opus-mt-en-fr")

    english_texts, french_references = _load_demo_sentences()

    # Demonstrate single translation
    print("\n=== Single Translation Example ===")
    sample_text = english_texts[0]
    print(f"Original (EN): {sample_text}")
    print(f"Translation (FR): {translator.translate_text(sample_text)}")

    # Slice once so the count that is printed, translated and evaluated is the same.
    sample_texts = english_texts[:num_samples]
    sample_references = french_references[:num_samples] if french_references else None

    # Demonstrate batch translation
    print("\n=== Batch Translation ===")
    # Report the number actually translated, not the size of the loaded dataset.
    print(f"Translating {len(sample_texts)} sentences...")

    translations = translator.translate_batch(sample_texts)

    # Display results
    print("\n=== Translation Results ===")
    for i, (orig, trans) in enumerate(zip(sample_texts, translations)):
        print(f"\n{i+1}.")
        print(f"EN: {orig}")
        print(f"FR: {trans}")
        if sample_references and i < len(sample_references):
            print(f"REF: {sample_references[i]}")

    # Evaluate translations
    print("\n=== Evaluation ===")
    eval_results = evaluate_translations(sample_texts, translations, sample_references)

    for metric, value in eval_results.items():
        print(f"{metric}: {value:.3f}" if isinstance(value, float) else f"{metric}: {value}")

    if interactive:
        _run_interactive_loop(translator)

    print("\nTranslation demo completed!")

if __name__ == "__main__":
    # Pre-flight check: report missing third-party packages with an install
    # hint before starting the demo.
    try:
        import transformers
        import datasets
        print("All required packages found!")
    except ImportError as e:
        print(f"Missing package: {e}")
        print("Please install required packages:")
        print("pip install transformers datasets torch tqdm pandas")
        # Raise SystemExit directly instead of calling exit(): exit() is a helper
        # injected by the `site` module for interactive use, so it is not
        # guaranteed to exist or to stop execution the same way in every front end.
        raise SystemExit(1) from None

    main()

All required packages found!
=== Transformer Machine Translation Demo ===

Loading model: Helsinki-NLP/opus-mt-en-fr


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Model loaded on device: cpu

Loading sample dataset...


Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Could not load WMT14 dataset: Invalid pattern: '**' can only be an entire path component
Using manual sample sentences instead...

=== Single Translation Example ===
Original (EN): Hello, how are you today?
Translation (FR): Bonjour, comment ça va aujourd'hui ?

=== Batch Translation ===
Translating 10 sentences...


Translating: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]



=== Translation Results ===

1.
EN: Hello, how are you today?
FR: Bonjour, comment ça va aujourd'hui ?

2.
EN: The weather is beautiful outside.
FR: Le temps est beau dehors.

3.
EN: I love learning about artificial intelligence.
FR: J'adore apprendre sur l'intelligence artificielle.

4.
EN: Machine translation has improved significantly with transformers.
FR: La traduction automatique s'est considérablement améliorée avec les transformateurs.

5.
EN: This is a demonstration of neural machine translation.
FR: Il s'agit d'une démonstration de traduction automatique neuronale.

6.
EN: Python is a powerful programming language.
FR: Python est un langage de programmation puissant.

7.
EN: Deep learning models require large datasets.
FR: Les modèles d'apprentissage profond nécessitent de gros ensembles de données.

8.
EN: Natural language processing is fascinating.
FR: Le traitement naturel du langage est fascinant.

9.
EN: Transformers revolutionized machine translation.
FR: Transformateu

In [17]:
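# Setup notes: uncomment and run ONE of the two torch installs below in a fresh
# cell before the import cell, then run the final line for the other packages.

# For CPU only (lighter)
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# For GPU support (if you have CUDA)
# %pip install torch torchvision torchaudio

# Then install the other dependencies
# %pip install transformers datasets tqdm pandas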