# Multilingual Content Delivery with Translation Models on AWS and Hugging Face
This notebook demonstrates a full pipeline for multilingual content delivery using machine translation.
We cover:
- Detecting the source language
- Translating content using pre-trained models
- Evaluating translation quality
- Storing translations
- Using LLM agent logic to decide translation strategies

We use Hugging Face Transformers for translation and `langdetect` for language detection, and we simulate AWS integrations by writing results to local files.

In [None]:
# Install necessary libraries (if not already installed)
!pip install transformers sentencepiece langdetect --quiet


In [None]:
# Import required libraries
import json
import os
import random
from typing import List, Tuple

import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM


In [None]:
# Sample content in different languages, written as (id, title, body) rows
_sample_rows = [
    (1, "Bienvenue à Paris", "Paris est la capitale de la France, connue pour sa culture et son histoire."),
    (2, "Willkommen in Berlin", "Berlin ist die Hauptstadt von Deutschland und eine Stadt voller Kunst."),
    (3, "Welcome to New York", "New York is a bustling city in the United States, famous for its skyline."),
    (4, "स्वागत है मुंबई में", "मुंबई भारत का एक प्रमुख शहर है, जो अपने सिनेमा और व्यापार के लिए प्रसिद्ध है।"),
    (5, "Bem-vindo ao Rio de Janeiro", "Rio é uma cidade maravilhosa no Brasil, famosa pelo carnaval."),
]

# Expand each row into the record shape used throughout the notebook.
content_items = [
    {"id": item_id, "title": title, "body": body}
    for item_id, title, body in _sample_rows
]

# One DataFrame row per content item; later cells add derived columns to it.
df = pd.DataFrame(content_items)
df


In [None]:
# Detect language using langdetect
def detect_language(text: str) -> str:
    """Return the language code that langdetect reports for ``text``.

    Detection is best-effort: a failure yields the sentinel ``"unknown"``
    instead of raising, so a single bad row does not abort a DataFrame-wide
    ``apply``.

    Args:
        text: The text whose language should be identified.

    Returns:
        The code returned by ``langdetect.detect`` (for example ``"fr"``), or
        ``"unknown"`` when the input is blank, not a string, or detection fails.
    """
    # Blank or non-string input has nothing to detect; skip the library call.
    if not isinstance(text, str) or not text.strip():
        return "unknown"
    try:
        return detect(text)
    except Exception:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate and the cell can be interrupted.
        return "unknown"

# langdetect's algorithm is randomized and can label short or ambiguous text
# differently between runs; fixing the seed keeps Restart & Run All reproducible.
DetectorFactory.seed = 0

df["language_detected"] = df["body"].apply(detect_language)
df


In [None]:
# Load translation models: English is always the target language.
# The template is filled with the source-language code for each checkpoint.
model_name = "Helsinki-NLP/opus-mt-{}-en"

# Available source languages: French, German, Hindi, Portuguese
supported_langs = ["fr", "de", "hi", "pt"]

# Maps source-language code -> loaded translation pipeline.
# A language whose model fails to load is reported and left out of the mapping.
# NOTE(review): confirm each formatted checkpoint name actually exists on the Hub.
translator_pipelines = {}

for lang in supported_langs:
    checkpoint = model_name.format(lang)
    try:
        translator_pipelines[lang] = pipeline("translation", model=checkpoint)
    except Exception as e:
        print(f"Model for '{lang}' could not be loaded: {e}")


In [None]:
# Translate content to English
def translate_to_english(text: str, lang: str) -> str:
    """Translate ``text`` from ``lang`` into English.

    Args:
        text: The source text.
        lang: Source-language code, used as the key into ``translator_pipelines``.

    Returns:
        The English translation; ``text`` unchanged when ``lang`` is ``"en"``;
        or the sentinel ``"Translation not available"`` when no pipeline is
        loaded for ``lang``.
    """
    # English is the target language, so English input needs no model.
    if lang == "en":
        return text

    translator = translator_pipelines.get(lang)
    if translator is None:
        return "Translation not available"

    # The pipeline returns a list of result dicts; take the first result's text.
    return translator(text)[0]["translation_text"]

# Translate every body, pairing it with the language detected for that row.
df["translated_body"] = [
    translate_to_english(body, lang_code)
    for body, lang_code in zip(df["body"], df["language_detected"])
]
df[["id", "language_detected", "translated_body"]]


In [None]:
# Simulated evaluation: BLEU/ROUGE need reference translations, which we do not have.
# Character-length similarity between source and translation stands in as a confidence score.

def estimate_quality(original: str, translated: str) -> float:
    """Score a translation by how close its length is to the source length.

    Args:
        original: The source text.
        translated: The translated text, or the sentinel
            ``"Translation not available"``.

    Returns:
        A value in ``[0.0, 1.0]`` rounded to two decimals. Equal lengths score
        about 1.0, and the score falls as the lengths diverge. The sentinel
        always scores 0.0.
    """
    if translated != "Translation not available":
        # The small epsilon keeps the division defined for an empty source.
        length_ratio = len(translated) / (len(original) + 1e-6)
        closeness = 1.0 - abs(1 - length_ratio)
        # Clamp into [0, 1].
        if closeness < 0.0:
            closeness = 0.0
        elif closeness > 1.0:
            closeness = 1.0
        return round(closeness, 2)
    return 0.0

# Score each translation against the body it was produced from.
df["quality_score"] = [
    estimate_quality(source_text, english_text)
    for source_text, english_text in zip(df["body"], df["translated_body"])
]
df


In [None]:
# Save translated content to a JSON file
export_columns = ["id", "title", "translated_body", "language_detected", "quality_score"]
output_data = df[export_columns].to_dict(orient="records")

with open("translated_content.json", "w", encoding="utf-8") as f:
    # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
    f.write(json.dumps(output_data, ensure_ascii=False, indent=2))

print("Translated content saved to translated_content.json")


In [None]:
# Simulate an LLM agent-like translation strategy
def agent_translate(content: str, lang: str, threshold: float = 0.6) -> Tuple[str, str]:
    """Translate ``content`` and choose a handling strategy for the result.

    Args:
        content: The source text.
        lang: Detected source-language code.
        threshold: Minimum ``estimate_quality`` score for the machine
            translation to be accepted without review.

    Returns:
        A ``(translation, strategy)`` pair, where ``strategy`` is one of:

        * ``"no_translation_needed"``: ``lang`` is ``"en"``; ``content`` is
          returned unchanged.
        * ``"fallback"``: ``lang`` is not in ``supported_langs``; the
          translation is the placeholder ``"Unsupported language"``.
        * ``"high_quality_model"``: the quality score is at or above ``threshold``.
        * ``"human_review_required"``: the quality score is below ``threshold``.
    """
    # English is already the target language, so it is not an "unsupported" case.
    if lang == "en":
        return content, "no_translation_needed"

    if lang not in supported_langs:
        return "Unsupported language", "fallback"

    translation = translate_to_english(content, lang)
    quality = estimate_quality(content, translation)
    if quality >= threshold:
        return translation, "high_quality_model"
    return translation, "human_review_required"

# Run the agent on every row and collect its (translation, strategy) pairs.
agent_results = [
    agent_translate(body, lang_code)
    for body, lang_code in zip(df["body"], df["language_detected"])
]

# Split the pairs into two columns, aligned on the original row index.
df[["agent_translation", "strategy"]] = pd.DataFrame(agent_results, index=df.index)

df[["id", "language_detected", "strategy", "agent_translation"]]


In [None]:
# Summary report: number of content items assigned to each strategy
strategy_sizes = df.groupby("strategy").size()
summary = strategy_sizes.reset_index(name="count")
summary


In [None]:
# Persist the full DataFrame (original and derived columns) as CSV for further analysis.
# index=False omits the row index from the file.
report_path = "multilingual_translation_report.csv"
df.to_csv(report_path, index=False)


## Conclusion
In this notebook, we demonstrated a multilingual content delivery pipeline using Hugging Face translation models, language detection, heuristic quality estimation, and LLM agent-style logic that chooses a handling strategy for each content item. It can be extended to call Amazon Translate, store results in S3, or serve models via SageMaker endpoints.