In [35]:
import re
import unicodedata

import torch
from transformers import pipeline
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

In [36]:
# Pick the device once: use the GPU when CUDA is available and fall back to the
# CPU otherwise, so the notebook still runs on machines without a GPU.
DISPOSITIVO = "cuda" if torch.cuda.is_available() else "cpu"

# Zero-shot classification pipeline.
clasificador = pipeline(
    "zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    device=DISPOSITIVO,
)

# Language-identification pipeline; `traducir` uses its top label as the
# source language of the text to translate.
detector = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
    device=DISPOSITIVO,
)

# M2M100 translation model and its tokenizer. The model is moved to the same
# device as the pipelines; `traducir` sends its inputs to `traductor.device`.
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
traductor = M2M100ForConditionalGeneration.from_pretrained(model_name).to(DISPOSITIVO)

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda
Device set to use cuda


In [43]:
# Maps every accepted spelling of a language name (Spanish, English, French and
# German spellings, plus the bare two-letter code) to its two-letter code.
IDIOMAS = {
    # Spanish
    "español": "es",
    "spanish": "es",
    "espagnol": "es",
    "spanisch": "es",
    "spanische": "es",
    "es": "es",

    # English
    "inglés": "en",
    "english": "en",
    "anglais": "en",
    "englisch": "en",
    "englische": "en",
    "en": "en",

    # French
    "francés": "fr",
    "french": "fr",
    "français": "fr",
    "französisch": "fr",
    "französische": "fr",
    "fr": "fr",

    # German
    "alemán": "de",
    "german": "de",
    "deutsch": "de",
    "deutsche": "de",
    "allemand": "de",
    "de": "de",
}

# Words that introduce the *target* language of a request
# ("a francés", "to english", "en français", "ins deutsche", ...).
PREPOSICIONES_DESTINO = ("a", "al", "to", "into", "en", "in", "ins", "auf")


def _normalizar(texto: str) -> str:
    """Lowercase `texto` and drop diacritics so accented, unaccented and
    decomposed-Unicode spellings of the same word compare equal."""
    descompuesto = unicodedata.normalize("NFD", texto.lower())
    return "".join(c for c in descompuesto if not unicodedata.combining(c))


def detectar_idioma(texto: str) -> str:
    """
    Detect the target language of a translation request.

    The instruction part of the request (the text before the first ':') is
    searched first, so language names that merely appear in the content to
    translate do not decide the result; the whole text is only searched when
    the instruction names no language. Among the names found, the winner is
    chosen by, in order:

    1. being introduced by a target preposition ('a francés', 'to english',
       'ins deutsche', ...),
    2. being a full language name rather than a bare two-letter code (the
       codes 'en', 'de' and 'es' are ordinary words in the supported
       languages),
    3. appearing later in the text.

    Matching ignores case and accents.

    Args:
        texto: The full request, e.g. "translate this to german: hello".

    Returns:
        The language code ('es', 'en', 'fr' or 'de'); 'en' when no known
        language is mentioned.
    """
    nombres = {_normalizar(nombre): codigo for nombre, codigo in IDIOMAS.items()}
    if not nombres:
        return "en"

    # Longest alternatives first so a longer spelling is always tried before
    # any shorter one that happens to be its prefix.
    alternativas = "|".join(
        re.escape(nombre) for nombre in sorted(nombres, key=len, reverse=True)
    )
    preposiciones = "|".join(
        re.escape(prep) for prep in sorted(PREPOSICIONES_DESTINO, key=len, reverse=True)
    )
    patron = re.compile(
        rf"(?:\b(?P<prep>{preposiciones})\s+)?\b(?P<nombre>{alternativas})\b"
    )

    normalizado = _normalizar(texto)
    instruccion = normalizado.split(":", 1)[0]

    # dict.fromkeys removes the duplicate zone when the text has no colon.
    for zona in dict.fromkeys((instruccion, normalizado)):
        candidatos = [
            (
                m.group("prep") is not None,                    # introduced by a target preposition
                m.group("nombre") != nombres[m.group("nombre")],  # full name, not a bare code
                m.start(),                                      # later mentions win ties
                nombres[m.group("nombre")],
            )
            for m in patron.finditer(zona)
        ]
        if candidatos:
            return max(candidatos)[3]

    # Fallback when no known language is mentioned.
    return "en"


In [40]:
def traducir(texto: str) -> str:
    """Translate the content of a request such as "translate this to german: hello".

    The text after the first ':' is the content to translate; when there is no
    colon, or nothing follows it, the whole input is used. The target language
    comes from `detectar_idioma(texto)` and the source language from the top
    label `detector` assigns to the content.

    Args:
        texto: Instruction and content, normally separated by a colon.

    Returns:
        The translated content, or "" when the input is blank.
    """
    # 1. Extract the content to translate. str.partition keeps multi-line
    #    content intact; a single-line regex anchored with `$` does not match
    #    it and would send the instruction itself to the translator.
    _, _, resto = texto.partition(":")
    contenido = resto.strip() or texto.strip()
    if not contenido:
        # Nothing to translate: skip the models entirely.
        return ""

    # 2. Detect the target language from the request.
    idioma_destino = detectar_idioma(texto)

    # 3. Detect the source language of the content. `truncation=True` lets the
    #    pipeline's tokenizer cut over-long content instead of passing a
    #    sequence longer than the detector model accepts.
    idioma_origen = detector(contenido, top_k=1, truncation=True)[0]["label"]

    # 4. Translate only the content.
    tokenizer.src_lang = idioma_origen
    inputs = tokenizer(contenido, return_tensors="pt").to(traductor.device)

    generated_tokens = traductor.generate(
        **inputs,
        # The forced first generated token selects the output language.
        forced_bos_token_id=tokenizer.get_lang_id(idioma_destino),
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [49]:
# Smoke test: English content with German requested as the target language.
print(
    traducir("translate this to german: hello how are you")
)

Hallo wie bist du
