# Generación de cartas de presentación con LoRA en dos LLMs (Qwen2.5-3B vs Granite/Watson 3B)

Este notebook entrena con LoRA dos modelos ~3B en el dataset de cover letters y compara su rendimiento.


In [None]:
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split

ds = load_dataset("dhruvvaidh/cover-letter-dataset-llama3")
# The dataset exposes fields such as "Instruction", "Prompt" and "Output".
# They are merged into a single conditioning prompt plus the target text to generate.

def build_example(ex):
    """Build the prompt/target pair for one dataset row.

    Args:
        ex: Mapping for a single row. The "Instruction", "Prompt" and
            "Output" keys are read; a missing key or a ``None`` value is
            treated as an empty string.

    Returns:
        A dict with two keys: "text_input", the instruct-style prompt that
        ends at the "### Respuesta:" marker, and "text_target", the text the
        model should generate after it.
    """
    # `or ""` also covers keys that are present but hold None; a plain
    # .get(key, "") would let None through and the f-string would render it
    # as the literal text "None".
    instr = ex.get("Instruction") or ""
    user = ex.get("Prompt") or ""
    target = ex.get("Output") or ""
    # Simple instruct-style template.
    merged_prompt = (
        f"### Instrucción:\n{instr}\n\n"
        f"### Datos del candidato y puesto:\n{user}\n\n"
        "### Respuesta:"
    )
    return {"text_input": merged_prompt, "text_target": target}

# Add the "text_input"/"text_target" columns to every row of the raw train split.
ds_proc = ds["train"].map(build_example)

# Hold out 15% of the row indices for validation; the fixed seed keeps the
# partition reproducible between runs.
train_idx, valid_idx = train_test_split(
    range(len(ds_proc)),
    test_size=0.15,
    random_state=42,
)
train_ds, valid_ds = ds_proc.select(train_idx), ds_proc.select(valid_idx)

dataset = DatasetDict({"train": train_ds, "validation": valid_ds})
# Notebook display: sizes of the two splits.
len(dataset["train"]), len(dataset["validation"])


# Utilidades de tokenización y data collator


In [None]:
from transformers import AutoTokenizer
import torch
MAX_LEN = 1024  # token cap applied separately to the prompt and to the target


def make_tokenize_fn(tokenizer):
    """Return a per-example function that builds causal-LM training features.

    The returned function concatenates prompt and target token ids, appends
    EOS, and masks the prompt positions in ``labels`` with -100 so that only
    the target (and the closing EOS) contribute to the loss.

    Args:
        tokenizer: Callable tokenizer returning a mapping with "input_ids";
            must expose ``eos_token_id``.

    Returns:
        A function mapping a row with "text_input" and "text_target" to a
        dict with "input_ids", "labels" and "attention_mask" (plain lists).

    Raises:
        ValueError: If the tokenizer has no ``eos_token_id``.
    """
    eos_id = tokenizer.eos_token_id
    if eos_id is None:
        # Without this check a None would be appended to the id lists and
        # only fail later, when the batch is turned into a tensor.
        raise ValueError("tokenizer.eos_token_id is required to terminate targets")

    def tok_fn(ex):
        # A newline after the prompt delimits where the target starts.
        prompt = ex["text_input"] + "\n"
        target = ex["text_target"]
        # Prompt and target are tokenized separately so the prompt length
        # (the label-mask boundary) is known exactly.
        prompt_ids = tokenizer(prompt, truncation=True, max_length=MAX_LEN)["input_ids"]
        # The target continues the same sequence, so it must not get its own
        # BOS or other special tokens; EOS is appended explicitly below.
        target_ids = tokenizer(
            target,
            add_special_tokens=False,
            truncation=True,
            max_length=MAX_LEN,
        )["input_ids"]
        completion = list(target_ids) + [eos_id]
        input_ids = list(prompt_ids) + completion
        # -100 on prompt positions; real ids on target + EOS.
        labels = [-100] * len(prompt_ids) + completion
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": [1] * len(input_ids),
        }

    return tok_fn

class DataCollatorForCausalLM:
    """Pad a batch of tokenized examples to a common length and stack them.

    Each example is a dict with list-valued "input_ids", "labels" and
    "attention_mask". Sequences are right-padded to the longest example in
    the batch: input ids with the pad id, labels with -100, and the attention
    mask with 0.
    """

    def __init__(self, tokenizer):
        """Store the tokenizer and resolve the padding id.

        Raises:
            ValueError: If the tokenizer defines neither ``pad_token_id``
                nor ``eos_token_id``.
        """
        self.tokenizer = tokenizer
        # Compare with None explicitly: a pad id of 0 is valid but falsy, so
        # `pad_token_id or eos_token_id` would wrongly discard it.
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.eos_token_id
        if pad_id is None:
            raise ValueError("tokenizer defines neither pad_token_id nor eos_token_id")
        self.pad_id = pad_id

    def __call__(self, batch):
        """Return a dict of tensors, one row per example in ``batch``."""
        maxlen = max(len(x["input_ids"]) for x in batch)

        def pad(seq, val):
            return list(seq) + [val] * (maxlen - len(seq))

        input_ids = torch.tensor([pad(x["input_ids"], self.pad_id) for x in batch])
        # -100 on padded positions leaves them out of the loss.
        labels = torch.tensor([pad(x["labels"], -100) for x in batch])
        attn = torch.tensor([pad(x["attention_mask"], 0) for x in batch])
        return {"input_ids": input_ids, "labels": labels, "attention_mask": attn}


# Configuración LoRA común

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA settings used for both fine-tuning runs in this notebook.
# NOTE(review): the module names below follow the Llama/Qwen naming scheme;
# confirm that the MLP names (gate_proj/up_proj/down_proj) exist in every
# architecture this config is applied to, otherwise those layers presumably
# get no adapter and the two models are not tuned on equal terms.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],  # generic choice for LLMs
    bias="none"
)


# Entrenamiento del modelo 1: Qwen2.5-3B (base)


In [None]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

import copy

qwen_model_id = "Qwen/Qwen2.5-3B"  # base (non-instruct) checkpoint
qwen_tok = AutoTokenizer.from_pretrained(qwen_model_id)
# Fall back to EOS for padding when the tokenizer ships without a pad token.
if qwen_tok.pad_token is None:
    qwen_tok.pad_token = qwen_tok.eos_token

# Tokenize both splits one example at a time (tok_fn works on single rows).
tok_qwen = dataset.map(make_tokenize_fn(qwen_tok), batched=False)
collator_qwen = DataCollatorForCausalLM(qwen_tok)

qwen_base = AutoModelForCausalLM.from_pretrained(
    qwen_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# Give this model its own copy of the LoRA config: the config object handed
# to get_peft_model is kept by the resulting model, so sharing one instance
# between the two models would let one run's settings leak into the other's
# saved adapter config.
qwen_lora = get_peft_model(qwen_base, copy.deepcopy(lora_config))
qwen_lora.print_trainable_parameters()

args_qwen = TrainingArguments(
    output_dir="./qwen3b-lora",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 sequences x 4 accumulation steps = 16 sequences per optimizer step per device.
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.05,
    logging_steps=50,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    fp16=False,
    bf16=True,
    dataloader_num_workers=2,
    report_to=[],  # no external experiment trackers
)
trainer_qwen = Trainer(
    model=qwen_lora,
    args=args_qwen,
    train_dataset=tok_qwen["train"],
    eval_dataset=tok_qwen["validation"],
    data_collator=collator_qwen,
)
trainer_qwen.train()



# Entrenamiento del modelo 2: Granite/Watson 3B (instruct)


In [None]:
import copy

granite_model_id = "ibm-granite/granite-3.1-3b-a800m-instruct"

granite_tok = AutoTokenizer.from_pretrained(granite_model_id)
# Fall back to EOS for padding when the tokenizer ships without a pad token.
if granite_tok.pad_token is None and granite_tok.eos_token:
    granite_tok.pad_token = granite_tok.eos_token

# Tokenize both splits one example at a time (tok_fn works on single rows).
tok_granite = dataset.map(make_tokenize_fn(granite_tok), batched=False)
collator_granite = DataCollatorForCausalLM(granite_tok)

granite_base = AutoModelForCausalLM.from_pretrained(
    granite_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# Give this model its own copy of the LoRA config: the config object handed
# to get_peft_model is kept by the resulting model, so sharing one instance
# between the two models would let one run's settings leak into the other's
# saved adapter config.
granite_lora = get_peft_model(granite_base, copy.deepcopy(lora_config))
granite_lora.print_trainable_parameters()

args_granite = TrainingArguments(
    output_dir="./granite3b-lora",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 sequences x 4 accumulation steps = 16 sequences per optimizer step per device.
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_ratio=0.05,
    logging_steps=50,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    fp16=False,
    bf16=True,
    dataloader_num_workers=2,
    report_to=[],  # no external experiment trackers
)
trainer_granite = Trainer(
    model=granite_lora,
    args=args_granite,
    train_dataset=tok_granite["train"],
    eval_dataset=tok_granite["validation"],
    data_collator=collator_granite,
)
trainer_granite.train()


# Evaluación: Perplexity (a partir de la pérdida), ROUGE y BLEU en validación


In [None]:
import math
from evaluate import load as load_metric

# Metric objects loaded through evaluate.load (imported here as load_metric);
# eval_model calls their compute() methods.
rouge = load_metric("rouge")
bleu = load_metric("bleu")

def eval_model(trainer, tokenizer, dataset, n_samples=64, max_new_tokens=400):
    """Score a fine-tuned model with perplexity, ROUGE-L and BLEU.

    Args:
        trainer: Trainer whose ``evaluate()`` provides ``eval_loss`` and whose
            ``model`` is used for generation.
        tokenizer: Tokenizer that matches ``trainer.model``.
        dataset: Dataset with "text_input" and "text_target" columns; only
            its first ``n_samples`` rows are used for generation.
        n_samples: Maximum number of rows to generate for.
        max_new_tokens: Generation budget per example.

    Returns:
        A dict with "perplexity" (exp of the evaluation loss, NaN when the
        loss is unavailable), "rougeL" and "bleu".
    """
    # 1) Perplexity from the evaluation loss.
    metrics = trainer.evaluate()
    ppl = math.exp(metrics["eval_loss"]) if "eval_loss" in metrics else float("nan")

    # 2) Sampled generations scored against the reference letters.
    subset = dataset.select(range(min(n_samples, len(dataset))))
    model = trainer.model
    model.eval()
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    preds, refs = [], []
    for ex in subset:
        # Training tokenizes the prompt as text_input + "\n", so generation
        # starts from the same prefix.
        enc = tokenizer(
            ex["text_input"] + "\n",
            return_tensors="pt",
            truncation=True,
            max_length=1024,
        ).to(model.device)
        prompt_len = enc["input_ids"].shape[1]
        with torch.no_grad():
            gen = model.generate(
                **enc,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                top_p=0.9,
                temperature=0.7,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=pad_id,
            )
        # Decode only the newly generated tokens. Splitting the decoded text
        # on "### Respuesta:" breaks when the prompt was truncated or when
        # the model emits the marker again.
        text = tokenizer.decode(gen[0][prompt_len:], skip_special_tokens=True).strip()
        preds.append(text)
        refs.append(ex["text_target"])

    rouge_res = rouge.compute(predictions=preds, references=refs)
    # The `evaluate` BLEU metric tokenizes internally: it takes raw strings
    # as predictions and a list of reference strings per prediction.
    bleu_res = bleu.compute(predictions=preds, references=[[r] for r in refs])
    return {"perplexity": ppl, "rougeL": rouge_res.get("rougeL"), "bleu": bleu_res.get("bleu")}

# Evaluate both fine-tuned models on the same validation split.
res_qwen = eval_model(trainer_qwen, qwen_tok, dataset["validation"])
res_granite = eval_model(trainer_granite, granite_tok, dataset["validation"])
# Notebook display of the two metric dicts.
res_qwen, res_granite


# Comparación y ejemplos cualitativos

In [None]:
import pandas as pd

# One row per fine-tuned model: a display label plus its evaluation metrics.
df_metrics = pd.DataFrame(
    [
        {"modelo": label, **scores}
        for label, scores in (
            ("Qwen2.5-3B LoRA", res_qwen),
            ("Granite/Watson 3B LoRA", res_granite),
        )
    ]
)
df_metrics


# Muestreo de generaciones para inspección manual

In [None]:
def sample_generations(trainer, tokenizer, dataset, k=5):
    """Generate answers for the first ``k`` rows for manual inspection.

    Args:
        trainer: Trainer whose ``model`` is used for generation.
        tokenizer: Tokenizer that matches ``trainer.model``.
        dataset: Dataset with "text_input" and "text_target" columns.
        k: Maximum number of rows to sample from the start of ``dataset``.

    Returns:
        A pandas DataFrame with columns "prompt", "target" and "pred".
    """
    samples = dataset.select(range(min(k, len(dataset))))
    model = trainer.model
    # Turn dropout off; this function should not rely on a previous call
    # having left the model in eval mode.
    model.eval()
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    outs = []
    for ex in samples:
        # Training tokenizes the prompt as text_input + "\n", so generation
        # starts from the same prefix.
        enc = tokenizer(
            ex["text_input"] + "\n",
            return_tensors="pt",
            truncation=True,
            max_length=1024,
        ).to(model.device)
        prompt_len = enc["input_ids"].shape[1]
        with torch.no_grad():
            gen = model.generate(
                **enc,
                max_new_tokens=400,
                do_sample=True,
                top_p=0.9,
                temperature=0.7,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=pad_id,
            )
        # Decode only the newly generated tokens instead of splitting the
        # full text on the "### Respuesta:" marker.
        pred = tokenizer.decode(gen[0][prompt_len:], skip_special_tokens=True).strip()
        outs.append({"prompt": ex["text_input"], "target": ex["text_target"], "pred": pred})
    # Explicit columns keep the frame's schema stable when there are no rows.
    return pd.DataFrame(outs, columns=["prompt", "target", "pred"])

# Generate a few validation examples with each fine-tuned model.
qwen_samples = sample_generations(trainer_qwen, qwen_tok, dataset["validation"])
granite_samples = sample_generations(trainer_granite, granite_tok, dataset["validation"])

# Notebook display: first rows of both sample tables.
qwen_samples.head(), granite_samples.head()


# Conclusión
# Interpretamos las métricas y las muestras para decidir el modelo con mejor desempeño.
# Consideramos: menor perplexity, mayor ROUGE-L/BLEU, y mejor alineación con requisitos del puesto y CV en ejemplos.

# Guardado de adaptadores LoRA


In [None]:
# Save only the LoRA adapters of each fine-tuned model to their own directory.
# (A stray "Ç" before qwen_lora turned the name into an undefined identifier.)
qwen_lora.save_pretrained("./qwen3b-lora-adapter")
granite_lora.save_pretrained("./granite3b-lora-adapter")
