# Importaciones

In [1]:
%pip install mlflow
from google.colab import files
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer



# Uso de Modelo y Red Neuronal Pre-entrenada

In [2]:
modelo_base = "LeoCordoba/beto2beto-mlsum"
tokenizador = AutoTokenizer.from_pretrained(modelo_base)
modelo = AutoModelForSeq2SeqLM.from_pretrained(modelo_base)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
try:
    # Se encarga de limpiar la salida para no stackear el modelo
    modelo.unload()
except:
    pass

lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=.3,
    target_modules=["query", "key", "value"]
)

modelo = get_peft_model(modelo, lora_config)
modelo.print_trainable_parameters()

trainable params: 1,327,104 || all params: 249,428,250 || trainable%: 0.5321


# Dataset para Entrenamiento

In [4]:
ds_base = load_dataset("ai4privacy/pii-masking-300k")
ds_base

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 177677
    })
    validation: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 47728
    })
})

### Selección y Filtro de Batches

In [5]:
ds = ds_base.filter(lambda x: x["language"] == "Spanish")
ds

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 28847
    })
    validation: Dataset({
        features: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set'],
        num_rows: 7816
    })
})

### Tokenización y Limpieza de Batches

In [6]:
def tokenizar(batch):
    inputs = tokenizador(
        batch["source_text"],
        truncation=True,
        max_length=256,
        padding="max_length",
    )

    with tokenizador.as_target_tokenizer():
        labels = tokenizador(
            batch["target_text"],
            truncation=True,
            max_length=128,
            padding="max_length",
        )

    inputs["labels"] = labels["input_ids"]
    return inputs

ds_tokenizado = ds.map(tokenizar, batched=True, remove_columns=ds["train"].column_names)#type:ignore

Map:   0%|          | 0/7816 [00:00<?, ? examples/s]



### Colección de Datos para Tabular

In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizador, model=modelo) #type:ignore

# Entrenamiento del Modelo con LoRA

In [11]:
args = Seq2SeqTrainingArguments(
    num_train_epochs=3,
    output_dir="./resultados_sum",
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    logging_strategy="steps",
    logging_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    predict_with_generate=True,
    report_to="mlflow",
    run_name="sum-lora"
)

In [12]:
trainer = Seq2SeqTrainer(
    model=modelo,
    args=args,
    train_dataset=ds_tokenizado["train"],#type:ignore
    eval_dataset=ds_tokenizado["validation"],#type:ignore
    tokenizer=tokenizador, #type:ignore
    data_collator=data_collator
)

  trainer = Seq2SeqTrainer(


In [None]:
"""
Se muestra un entrenamiento limpio, sin overtting y con buenas métricas,
tanto de entrenamiento como de validación, teniendo una pérdida ínfima
y un crecimiento estable
"""
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None}.
2025/10/22 13:47:48 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 5da1fbbdc7f44a138247192ce4ff2036: Failed to log run data: Exception: Changing param values is not allowed. Param with key='eval_steps' was already logged with value='100' for run ID='5da1fbbdc7f44a138247192ce4ff2036'. Attempted logging new value '500'.
2025/10/22 13:47:48 ERROR mlflow.utils.async_logging.async_logging_queue: Run Id 5da1fbbdc7f44a138247192ce4ff2036: Failed to log run data: Exception: Changing param values is not allowed. Param with key='logging_dir' was already logged with value='./resultados_sum/runs/Oct22_13-45-12_6de325f2340b' for run ID='5da1fbbdc7f44a138247192ce4ff2036'. Attempted logging new value './resultados_sum/runs/Oct22_13-47-36_6de325f2

Step,Training Loss,Validation Loss
500,2.3534,0.930618
1000,0.9447,0.550321
1500,0.6939,0.43484
2000,0.5781,0.369143
2500,0.5199,0.329101
3000,0.4798,0.293743
3500,0.4458,0.275223
4000,0.4238,0.264483
4500,0.4052,0.252649
5000,0.398,0.250157




TrainOutput(global_step=5409, training_loss=0.6993505330155397, metrics={'train_runtime': 7800.2921, 'train_samples_per_second': 11.095, 'train_steps_per_second': 0.693, 'total_flos': 2.6720937381325824e+16, 'train_loss': 0.6993505330155397, 'epoch': 3.0})

# Descarga de Resultados y Reporte 

In [None]:
!zip -r resultados_sum.zip /content/resultados_sum /content/mlruns

files.download("resultados_sum.zip")
