In [1]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import torch

In [None]:
!unzip mbart_lora_es_pt.zip -d mbart_lora_es_pt

In [45]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

In [46]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)
model = get_peft_model(model, peft_config)

In [2]:
!pip install -U datasets



In [3]:
import huggingface_hub
huggingface_hub.login() # now you will be prompted to enter your token; enter it.

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
ds_es = load_dataset("openlanguagedata/flores_plus", "spa_Latn", split="dev")
ds_pt = load_dataset("openlanguagedata/flores_plus", "por_Latn", split="dev")
parallel_pt = [{"translation": {"es": e["text"], "pt": p["text"]}} for e, p in zip(ds_es, ds_pt)]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/219 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/219 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

In [50]:
dataset_pt = Dataset.from_list(parallel_pt).train_test_split(test_size=0.1, seed=42)
train_pt = dataset_pt["train"]
eval_pt = dataset_pt["test"]

In [51]:
tokenizer.src_lang = "es_XX"
tokenizer.tgt_lang = "pt_XX"

def tokenize_pt(batch):
    src = [x["es"] for x in batch["translation"]]
    tgt = [x["pt"] for x in batch["translation"]]
    inputs = tokenizer(src, truncation=True, padding="max_length", max_length=128)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt, truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

train_tokenized_pt = train_pt.map(tokenize_pt, batched=True)
eval_tokenized_pt = eval_pt.map(tokenize_pt, batched=True)

Map:   0%|          | 0/897 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [52]:
training_args_pt = Seq2SeqTrainingArguments(
    output_dir="./mbart_lora_es_pt",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-4,
    num_train_epochs=15,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs_pt",
    predict_with_generate=True,  # Importante para traducción
    fp16=torch.cuda.is_available(),  # Entrenamiento más rápido si tienes GPU
    save_total_limit=1,
    report_to="none",
    label_names=["labels"]
)

trainer_pt = Seq2SeqTrainer(
    model=model,
    args=training_args_pt,
    train_dataset=train_tokenized_pt,
    eval_dataset=eval_tokenized_pt,
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

In [53]:
trainer_pt.train()

Epoch,Training Loss,Validation Loss
1,No log,8.235757
2,No log,8.224859
3,8.276700,8.226983
4,8.276700,8.234759
5,8.091900,8.241047
6,8.091900,8.258754
7,8.028200,8.266699
8,8.028200,8.279073
9,7.962200,8.283938
10,7.962200,8.295945


TrainOutput(global_step=3375, training_loss=8.013915364583333, metrics={'train_runtime': 667.1738, 'train_samples_per_second': 20.167, 'train_steps_per_second': 5.059, 'total_flos': 3657031200276480.0, 'train_loss': 8.013915364583333, 'epoch': 15.0})

In [54]:
model.save_pretrained("./mbart_lora_es_pt")
tokenizer.save_pretrained("./mbart_lora_es_pt")
print("✅ Adaptadores LoRA para Español → Portugués guardados.")

✅ Adaptadores LoRA para Español → Portugués guardados.


In [8]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
base_model = MBartForConditionalGeneration.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, "./mbart_lora_es_pt/mbart_lora_es_pt")

In [9]:
model.train()
for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True

model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 612,059,136 || trainable%: 0.1927


In [10]:
src_lang = "es_XX"
tgt_lang = "pt_XX"
tokenizer.src_lang = src_lang

# Función de traducción
def traducir(texto):
    inputs = tokenizer(texto, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Indicamos que la secuencia de salida debe comenzar en portugués
    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang],
        max_length=128
    )

    traduccion = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return traduccion

In [65]:
frases = [
    "Hola mundo",
    "¿Cómo estás?",
    "Me gusta aprender cosas nuevas.",
    "Estoy entrenando un modelo de traducción.",
    "La inteligencia artificial es fascinante."
]

for f in frases:
    print(f"ES: {f}")
    print(f"PT: {traducir(f)}\n")

ES: Hola mundo
PT: Olá ao mundo!

ES: ¿Cómo estás?


KeyboardInterrupt: 

In [11]:
ds_gl = load_dataset("openlanguagedata/flores_plus", "glg_Latn", split="dev")
parallel_gl = [{"translation": {"es": e["text"], "gl": g["text"]}} for e, g in zip(ds_es, ds_gl)]

dataset_gl = Dataset.from_list(parallel_gl).train_test_split(test_size=0.1, seed=42)
train_gl = dataset_gl["train"]
eval_gl = dataset_gl["test"]

Resolving data files:   0%|          | 0/219 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

glg_Latn.parquet:   0%|          | 0.00/123k [00:00<?, ?B/s]

glg_Latn.parquet:   0%|          | 0.00/128k [00:00<?, ?B/s]

Generating dev split:   0%|          | 0/997 [00:00<?, ? examples/s]

Generating devtest split:   0%|          | 0/1012 [00:00<?, ? examples/s]

In [12]:
tokenizer.src_lang = "es_XX"
tokenizer.tgt_lang = "gl_ES"

def tokenize_gl(batch):
    src = [x["es"] for x in batch["translation"]]
    tgt = [x["gl"] for x in batch["translation"]]
    inputs = tokenizer(src, truncation=True, padding="max_length", max_length=128)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt, truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

train_tokenized_gl = train_gl.map(tokenize_gl, batched=True)
eval_tokenized_gl = eval_gl.map(tokenize_gl, batched=True)

Map:   0%|          | 0/897 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [13]:
training_args_gl = Seq2SeqTrainingArguments(
    output_dir="./mbart_lora_es_gl",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-4,
    num_train_epochs=15,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs_gl",
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    save_total_limit=1,
    report_to="none",
    label_names=["labels"]
)

trainer_gl = Seq2SeqTrainer(
    model=model,
    args=training_args_gl,
    train_dataset=train_tokenized_gl,
    eval_dataset=eval_tokenized_gl,
    processing_class=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

In [14]:
trainer_gl.train()
model.save_pretrained("./mbart_lora_es_gl")
tokenizer.save_pretrained("./mbart_lora_es_gl")

Epoch,Training Loss,Validation Loss
1,No log,8.279472
2,No log,8.275163
3,8.177100,8.288431
4,8.177100,8.29384
5,8.029600,8.303224
6,8.029600,8.321163
7,7.972700,8.342236
8,7.972700,8.353626
9,7.903800,8.365123
10,7.903800,8.373678


('./mbart_lora_es_gl/tokenizer_config.json',
 './mbart_lora_es_gl/special_tokens_map.json',
 './mbart_lora_es_gl/sentencepiece.bpe.model',
 './mbart_lora_es_gl/added_tokens.json',
 './mbart_lora_es_gl/tokenizer.json')

In [13]:
pip install evaluate sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.5.1


In [14]:
from evaluate import load

In [29]:
def test_translation_batch(sentences_es, references_gl, model_path="./mbart_lora_es_gl"):
    try:
        # Cargar modelo base + adaptadores LoRA
        base_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
        model = PeftModel.from_pretrained(base_model, model_path)
        model.eval()
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model.to(device)

        # Cargar tokenizer
        tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
        inputs = tokenizer(sentences_es, return_tensors="pt", padding=True, truncation=True).to(device)

        # Generar traducciones
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=128,
                num_beams=4,
                early_stopping=True,
                do_sample=False
            )

        translations = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]

        # Mostrar resultados
        for src, pred, ref in zip(sentences_es, translations, references_gl):
            print(f"EN: {src}")
            print(f"GL (pred): {pred}")
            print(f"GL (ref):  {ref}")
            print("-" * 60)

        # Calcular métricas BLEU y chrF en lote
        print("\n📊 Métricas globales:")
        bleu = load("bleu")
        chrf = load("chrf")

        bleu_score = bleu.compute(predictions=translations, references=[[r] for r in references_gl])
        chrf_score = chrf.compute(predictions=translations, references=references_gl)

        print(f"BLEU: {bleu_score['bleu']:.4f}")
        print(f"chrF: {chrf_score['score']:.2f}")

    except Exception as e:
        print(f"❌ Error en traducción: {e}")

In [30]:
ds_es = load_dataset("openlanguagedata/flores_plus", "spa_Latn", split="devtest[:50]")
ds_gl = load_dataset("openlanguagedata/flores_plus", "glg_Latn", split="devtest[:50]")

# Tomar una muestra de 50 ejemplos para visualización
sample_es = ds_es["text"][:50]
sample_gl = ds_gl["text"][:50]

Resolving data files:   0%|          | 0/219 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/219 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

In [31]:
test_translation_batch(sample_es, sample_gl)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


EN: «Actualmente, tenemos ratones de cuatro meses de edad que antes solían ser diabéticos y que ya no lo son», agregó.
GL (pred): «Agora temos ratos de catro meses de idade que antes sufrían diabetes e xa non o están», añadiu.
GL (ref):  "Agora temos ratos de 4 meses que xa non son diabéticos, pero que no seu momento si que o foron", engadiu.
------------------------------------------------------------
EN: La investigación todavía se ubica en su etapa inicial, conforme indicara el Dr. Ehud Ur, docente en la carrera de medicina de la Universidad de Dalhousie, en Halifax, Nueva Escocia, y director del departamento clínico y científico de la Asociación Canadiense de Diabetes.
GL (pred): A investigación aínda está na súa etapa inicial, afirma o Dr. Ehud Ur, profesor de medicina de carrera na Universidade de Dalhousie en Halifax ( Nova Escocia) e director do departamento clínico e científico da Diabetes Association Canada.
GL (ref):  O Dr. Ehud Ur, profesor de medicina na Universidade Dalho