In [3]:
import torch
from google.colab import drive
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from peft import get_peft_model, LoraConfig, TaskType
import os

In [None]:

drive.mount('/content/drive')
RUTE_DRIVE = "/content/drive/MyDrive/translate/"

if not os.path.exists(RUTE_DRIVE + "train_clean.es"):
    print("No está")
else:
    print(f"Si está en: {RUTE_DRIVE}")

In [None]:

MODEL_ID = "facebook/nllb-200-distilled-600M"
SRC_FILE = RUTE_DRIVE + "train_clean.es"
TGT_FILE = RUTE_DRIVE + "train_clean.it"
SRC_LANG = "spa_Latn"
TGT_LANG = "ita_Latn"

dataset_src = load_dataset("text", data_files={"train": SRC_FILE})
dataset_tgt = load_dataset("text", data_files={"train": TGT_FILE})

full_dataset = dataset_src["train"].add_column("target_text", dataset_tgt["train"]["text"])
full_dataset = full_dataset.rename_column("text", "source_text")
dataset_splits = full_dataset.train_test_split(test_size=0.01)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, src_lang=SRC_LANG)

def preprocess_function(examples):
    inputs = examples["source_text"]
    tokenizer.src_lang = SRC_LANG
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    tokenizer.tgt_lang = TGT_LANG
    labels = tokenizer(text_target=examples["target_text"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset_splits.map(preprocess_function, batched=True)

In [6]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=32, lora_alpha=32, lora_dropout=0.1,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

trainable params: 7,077,888 || all params: 622,151,680 || trainable%: 1.1376


In [None]:

import os
os.environ["WANDB_DISABLED"] = "true"

training_args = Seq2SeqTrainingArguments(
    output_dir="./nllb-traductor-es-it",
    learning_rate=2e-4,

    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    fp16=True,
    gradient_checkpointing=False,

    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=1000,
    logging_steps=50,
    push_to_hub=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss


In [None]:

output_path = RUTE_DRIVE + "SPtoIT"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)
print(f"Modelo guardado en: {output_path}")