In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes

Librerías a utilizar

In [None]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Carga y configuración del modelo preentrenado (Llama3 + Unsloth)

In [None]:
# Sequence length (in tokens) shared by model loading and, later, by the trainer.
max_seq_length = 1024

# Options for loading the pre-quantized Llama-3 8B Instruct checkpoint in 4-bit.
base_model_options = dict(
    model_name = "unsloth/llama-3-8b-instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    load_in_4bit = True,
)

# from_pretrained returns the model together with its matching tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

==((====))==  Unsloth 2025.11.3: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Configuración de LoRA

In [None]:
# LoRA hyper-parameters: rank 16 / alpha 16 adapters on the four attention
# projections only (no MLP projections are listed).
lora_options = dict(
    r = 16,
    lora_alpha = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    use_gradient_checkpointing = "unsloth",
)

# NOTE: this rebinds `model`, so the cell is not meant to be run twice on the
# same kernel without reloading the base model first.
model = FastLanguageModel.get_peft_model(model, **lora_options)

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.11.3 patched 32 layers with 32 QKV layers, 32 O layers and 0 MLP layers.


Carga del dataset

In [None]:
# Load the dataset; only its "train" split is used from here on.
ds_raw = load_dataset("somosnlp/lenguaje-claro-dataset")

# Hold out 10% of the rows for validation. The fixed seed makes the split repeatable.
full_train = ds_raw["train"]
ds_split = full_train.train_test_split(test_size=0.1, seed=42)

# Report how many rows ended up in each partition.
for summary_line in (
    f"Entrenamiento: {len(ds_split['train'])} filas",
    f"Validación: {len(ds_split['test'])} filas",
):
    print(summary_line)

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4094 [00:00<?, ? examples/s]

Entrenamiento: 3684 filas
Validación: 410 filas


Formateo de los datos

In [None]:
def formatting_prompts_func(examples):
    """Render a batch of question/answer pairs as Llama-3 chat-style training texts.

    Args:
        examples: Batched mapping holding parallel sequences under the
            "question" and "answer" keys.

    Returns:
        A dict with a single "text" key containing one formatted string per
        pair. Pairs are built with ``zip``, so any surplus items in the longer
        of the two sequences are ignored.
    """
    instructions = "Eres un asistente legal. Simplifica el siguiente texto complejo a lenguaje claro."

    def render(question, answer):
        # One system turn, one user turn and one assistant turn, each closed by <|eot_id|>.
        return f"""<|start_header_id|>system<|end_header_id|>\n\n{instructions}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{answer}<|eot_id|>"""

    questions = examples["question"]
    answers = examples["answer"]
    return {"text": [render(q, a) for q, a in zip(questions, answers)]}

# Add the formatted "text" column to both partitions (train first, then validation),
# processing the rows in batches.
train_dataset, eval_dataset = (
    ds_split[split_name].map(formatting_prompts_func, batched = True)
    for split_name in ("train", "test")
)

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

Map:   0%|          | 0/410 [00:00<?, ? examples/s]

Entrenamiento

In [None]:
# Use bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimisation, logging, evaluation and checkpointing settings for a single epoch.
training_args = TrainingArguments(
    output_dir = "outputs",
    # 8 sequences per device, accumulated over 2 steps before each optimizer update.
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 2,
    num_train_epochs = 1,
    learning_rate = 2e-4,
    warmup_steps = 5,
    optim = "adamw_8bit",
    fp16 = not use_bf16,
    bf16 = use_bf16,
    # Log every 10 steps, evaluate every 50 and write a checkpoint every 30.
    logging_steps = 10,
    eval_strategy = "steps",
    eval_steps = 50,
    save_strategy = "steps",
    save_steps = 30,
    report_to="none",
)

# Supervised fine-tuning on the "text" column produced by the formatting step.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = training_args,
)

# Left as the last expression so the resulting TrainOutput is displayed by the cell.
trainer.train()

Map:   0%|          | 0/3684 [00:00<?, ? examples/s]

Map:   0%|          | 0/410 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,684 | Num Epochs = 1 | Total steps = 231
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 13,631,488 of 8,043,892,736 (0.17% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,1.1792,1.175477
100,1.1314,1.137828
150,1.1231,1.119518
200,1.107,1.111323


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


TrainOutput(global_step=231, training_loss=1.2101010837596216, metrics={'train_runtime': 4736.3498, 'train_samples_per_second': 0.778, 'train_steps_per_second': 0.049, 'total_flos': 6.697842599264256e+16, 'train_loss': 1.2101010837596216, 'epoch': 1.0})

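Prueba del modelo con un texto de ejemplo (el modelo se carga desde la ruta de Drive en la que se guardan los adaptadores en la última sección)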

In [None]:
from unsloth import FastLanguageModel
from google.colab import drive
import torch

# Mount Google Drive so the directory with the saved model is reachable.
drive.mount('/content/drive')

max_seq_length = 1024
dtype = None
load_in_4bit = True

ruta_modelo_drive = "/content/drive/MyDrive/Modelos/Llama3-Legal-Final"

print(f"Cargando modelo desde: {ruta_modelo_drive} ...")

try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = ruta_modelo_drive,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    print("Modelo cargado exitosamente")
except Exception as e:
    print("Error al cargar")
    print(f"Detalle del error: {e}")
    # The rest of the cell needs a model and a tokenizer. Carry on only when
    # earlier cells already left both in the kernel (e.g. right after training);
    # otherwise surface the real loading error instead of failing later with an
    # unrelated NameError.
    if "model" not in globals() or "tokenizer" not in globals():
        raise
    print("Se continuará con el modelo que ya está en memoria")

# Switch the model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

texto_legal = """
El arrendatario responderá de los daños y perjuicios que se ocasionen por su negligencia o la de las personas que convivan en el domicilio.
"""

# Same chat template as the one used to build the training texts. The sample is
# stripped so the user turn does not carry the newlines that surround it in the
# triple-quoted literal above (the training template inserts the text directly).
prompt = f"""<|start_header_id|>system<|end_header_id|>

Eres un asistente legal. Simplifica el siguiente texto complejo a lenguaje claro.<|eot_id|><|start_header_id|>user<|end_header_id|>

{texto_legal.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens = 128,
    use_cache = True,
    temperature = 0.1,
)

# The decoded text contains the prompt as well: keep only what follows the last
# assistant header and drop the end-of-turn marker.
resultado = tokenizer.batch_decode(outputs)
limpio = resultado[0].split("<|start_header_id|>assistant<|end_header_id|>")[-1].replace("<|eot_id|>", "")

print("\n" + "="*30)
print("RESULTADO SIMPLIFICADO:")
print("="*30)
print(limpio)

Guardamos el modelo (adaptadores) en Drive

In [None]:
from google.colab import drive
# Make Google Drive available as the destination for the saved files.
drive.mount('/content/drive')

save_path = "/content/drive/MyDrive/Modelos/Llama3-Legal-Final"

# Write the model first and then the tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Modelo guardado exitosamente en: {save_path}")