In [1]:
import os
os.environ["BNB_CUDA_VERSION"] = "124"  # force using CUDA 12.4 binary

!pip install -U transformers accelerate evaluate
!pip install rouge_score sacrebleu
!pip install -U peft
!pip uninstall -y bitsandbytes
!pip install bitsandbytes


Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting pyarrow>=21.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-n

# Generación de cartas de presentación con LoRA en dos LLMs (Qwen2.5-3B vs Granite/Watson 3B)

Este notebook entrena con LoRA dos modelos ~3B en el dataset de cover letters y compara su rendimiento.


In [2]:
from datasets import load_dataset, DatasetDict
from sklearn.model_selection import train_test_split

ds = load_dataset("dhruvvaidh/cover-letter-dataset-llama3")
# El dataset expone campos como "Instruction", "Prompt" y "Output"
# Unimos en un único prompt-condición y target a generar.

def build_example(ex):
    # Take the Instruction and Output
    instr = ex.get("Instruction", "")
    target = ex.get("Output", "")

    # Custom system prompt (plain text, no special tokens)
    system_prompt = (
        "You are an assistant that writes professional cover letters. "
        "Given the job description and candidate information, "
        "generate a tailored cover letter that highlights relevant experience, "
        "skills, and enthusiasm for the company."
    )

    # Merge into a simple instruct-style input
    merged_prompt = (
        f"SYSTEM PROMPT:\n{system_prompt}\n\n"
        f"INSTRUCTION:\n{instr}\n\n"
        f"RESPONSE:"
    )
    return {"text_input": merged_prompt, "text_target": target}


ds_proc = ds["train"].map(
    build_example,
    remove_columns=ds["train"].column_names
)

# split train/valid
train_idx, valid_idx = train_test_split(range(len(ds_proc)), test_size=0.1, random_state=42)
train_ds = ds_proc.select(train_idx)
valid_ds = ds_proc.select(valid_idx)

dataset = DatasetDict({"train": train_ds, "validation": valid_ds})
len(dataset["train"]), len(dataset["validation"])


README.md:   0%|          | 0.00/448 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/841k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/367k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/813 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/349 [00:00<?, ? examples/s]

Map:   0%|          | 0/813 [00:00<?, ? examples/s]

(731, 82)

In [3]:
lengths = [len(example['text_target']) for example in train_ds]
max_len = max(lengths)
min_len = min(lengths)
mean_len = sum(lengths) / len(lengths)

print(f"Maximum length of text_target: {max_len}")
print(f"Minimum length of text_target: {min_len}")
print(f"Mean length of text_target: {mean_len:.2f}")

Maximum length of text_target: 1832
Minimum length of text_target: 170
Mean length of text_target: 875.09


In [None]:
ds_proc[0]

{'text_input': 'SYSTEM PROMPT:\nYou are an assistant that writes professional cover letters. Given the job description and candidate information, generate a tailored cover letter that highlights relevant experience, skills, and enthusiasm for the company.\n\nINSTRUCTION:\nJob Title: Senior Java Developer\nPreferred Qualifications: \n5+ years of experience in Java Development\nHiring Company: \nGoogle\nApplicant Name: \nJohn Doe\nWorking Experience: Java Developer at XYZ for 3 years \nSenior Java Developer at ABC for 2 years\nQualifications: BSc in Computer Science\nSkillsets: \nJava, Spring Boot, Hibernate, SQL\n\nRESPONSE:',
 'text_target': 'I am writing to express my interest in the Senior Java Developer position at Google. With over 5 years of experience in Java development, I am confident in my ability to contribute effectively to your team. My professional experience includes designing and implementing Java applications, managing the full software development lifecycle, and troubl

# Utilidades de tokenización y data collator


In [3]:
from transformers import AutoTokenizer
import torch
MAX_LEN = 600  # suficiente para prompts del dataset

def make_tokenize_fn(tokenizer):
    def tok_fn(ex):
        # Entrenamos causal LM con input+target concatenados; calculamos labels en la parte target
        inp = ex["text_input"]
        tgt = ex["text_target"]
        # Separador claro para delimitar target
        full = inp + "\n"
        # Tokenizamos por separado para localizar offset
        in_ids = tokenizer(full, truncation=True, max_length=MAX_LEN)["input_ids"]
        tgt_ids = tokenizer(tgt, truncation=True, max_length=MAX_LEN)["input_ids"]
        input_ids = in_ids + tgt_ids + [tokenizer.eos_token_id]
        # labels: -100 en la parte del prompt, y etiquetas en la parte target + eos
        labels = [-100] * len(in_ids) + tgt_ids + [tokenizer.eos_token_id]
        attn = [1] * len(input_ids)
        return {"input_ids": input_ids, "labels": labels, "attention_mask": attn}
    return tok_fn

class DataCollatorForCausalLM:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
    def __call__(self, batch):
        maxlen = max(len(x["input_ids"]) for x in batch)
        def pad(seq, val):
            return seq + [val] * (maxlen - len(seq))
        input_ids = torch.tensor([pad(x["input_ids"], self.pad_id) for x in batch])
        labels = torch.tensor([pad(x["labels"], -100) for x in batch])
        attn = torch.tensor([pad(x["attention_mask"], 0) for x in batch])
        return {"input_ids": input_ids, "labels": labels, "attention_mask": attn}


# Configuración LoRA común

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],  # genérico para LLMs
    bias="none"
)


# Entrenamiento del modelo 1: Qwen2.5-3B (base)


In [None]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
import torch

qwen_model_id = "Qwen/Qwen2.5-3B"  # base
qwen_tok = AutoTokenizer.from_pretrained(qwen_model_id)
if qwen_tok.pad_token is None:
    qwen_tok.pad_token = qwen_tok.eos_token

tok_qwen = dataset.map(make_tokenize_fn(qwen_tok), batched=False)
collator_qwen = DataCollatorForCausalLM(qwen_tok)

quantization_config = BitsAndBytesConfig(load_in_4bit=True)

qwen_base = AutoModelForCausalLM.from_pretrained(
    qwen_model_id,
    device_map="auto",
    quantization_config=quantization_config
)
qwen_lora = get_peft_model(qwen_base, lora_config)
qwen_lora.print_trainable_parameters()

args_qwen = TrainingArguments(
    output_dir="./qwen3b-lora",
    num_train_epochs=3,
    per_device_train_batch_size=1,   # Batch size para entrenamiento
    gradient_accumulation_steps=4,   # Acumular gradientes para simular un batch size de 4*1=4
    per_device_eval_batch_size=1,    # Batch size para evaluación
    learning_rate=2e-4,              # Learning rate (típicamente más alto con LoRA: 1e-4 a 3e-4)
    weight_decay=0.01,               # Regularización L2
    eval_strategy="epoch",           # Evaluar al final de cada época
    save_strategy="no",              # No guardar checkpoints (para rapidez)
    logging_steps=50,                # Log cada 50 steps
    report_to="none",                # No reportar a wandb/tensorboard
    # bf16=True,                       # Ya usando bnb_4bit_compute_dtype=torch.bfloat16
    fp16=torch.cuda.is_available(),  # Mixed precision training si hay GPU
)

trainer_qwen = Trainer(
    model=qwen_lora,
    args=args_qwen,
    train_dataset=tok_qwen["train"],
    eval_dataset=tok_qwen["validation"],
    data_collator=collator_qwen,
)
trainer_qwen.train()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/731 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

trainable params: 29,933,568 || all params: 3,115,872,256 || trainable%: 0.9607


Epoch,Training Loss,Validation Loss
1,0.6829,0.546626
2,0.5435,0.470549
3,0.4189,0.464479


TrainOutput(global_step=549, training_loss=0.5901047684021552, metrics={'train_runtime': 1579.0501, 'train_samples_per_second': 1.389, 'train_steps_per_second': 0.348, 'total_flos': 1.1581752830828544e+16, 'train_loss': 0.5901047684021552, 'epoch': 3.0})

In [None]:
qwen_lora.cpu()
# Guardar solo los adaptadores LoRA
qwen_lora.save_pretrained("./qwen3b-lora-adapter")


('./qwen3b-lora-tok/tokenizer_config.json',
 './qwen3b-lora-tok/special_tokens_map.json',
 './qwen3b-lora-tok/chat_template.jinja',
 './qwen3b-lora-tok/vocab.json',
 './qwen3b-lora-tok/merges.txt',
 './qwen3b-lora-tok/added_tokens.json',
 './qwen3b-lora-tok/tokenizer.json')


# Entrenamiento del modelo 2: Granite/Watson 3B (base)


In [None]:
granite_model_id = "ibm-granite/granite-3.1-1b-a400m-instruct"

granite_tok = AutoTokenizer.from_pretrained(granite_model_id)
if granite_tok.pad_token is None and granite_tok.eos_token:
    granite_tok.pad_token = granite_tok.eos_token

tok_granite = dataset.map(make_tokenize_fn(granite_tok), batched=False)
collator_granite = DataCollatorForCausalLM(granite_tok)

granite_base = AutoModelForCausalLM.from_pretrained(
    granite_model_id,
    device_map="auto",
)


granite_lora = get_peft_model(granite_base, lora_config)
granite_lora.print_trainable_parameters()

args_granite = TrainingArguments(
    output_dir="./granite3b-lora",
    num_train_epochs=3,
    per_device_train_batch_size=1,   # Reducido de 8 a 4
    per_device_eval_batch_size=1,   # Reducido de 16 a 8
    learning_rate=2e-4,              # Learning rate (típicamente más alto con LoRA: 1e-4 a 3e-4)
    weight_decay=0.01,               # Regularización L2
    eval_strategy="epoch",           # Evaluar al final de cada época
    save_strategy="no",              # No guardar checkpoints (para rapidez)
    logging_steps=50,                # Log cada 50 steps
    report_to="none",                # No reportar a wandb/tensorboard
    fp16=torch.cuda.is_available(),  # Mixed precision training si hay GPU
)

trainer_granite = Trainer(
    model=granite_lora,
    args=args_granite,
    train_dataset=tok_granite["train"],
    eval_dataset=tok_granite["validation"],
    data_collator=collator_granite,
)
trainer_granite.train()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

Map:   0%|          | 0/731 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


trainable params: 2,752,512 || all params: 1,337,380,864 || trainable%: 0.2058


Epoch,Training Loss,Validation Loss
1,0.591,0.489053
2,0.4657,0.431401
3,0.3513,0.418237


TrainOutput(global_step=2193, training_loss=0.46292017585764833, metrics={'train_runtime': 1028.0304, 'train_samples_per_second': 2.133, 'train_steps_per_second': 2.133, 'total_flos': 6157674070861824.0, 'train_loss': 0.46292017585764833, 'epoch': 3.0})

In [None]:
granite_lora.cpu()

granite_lora.save_pretrained("./granite3b-lora-adapter")

In [None]:
import zipfile
import os

def zip_folders(folder_paths, output_filename):
    """Creates a zip archive from a list of folder paths.

    Args:
        folder_paths (list): A list of paths to the folders to be zipped.
        output_filename (str): The name of the output zip file.
    """
    with zipfile.ZipFile(output_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for folder_path in folder_paths:
            for root, _, files in os.walk(folder_path):
                for file in files:
                    file_path = os.path.join(root, file)
                    # Calculate the archive name by making it relative to the base folder_path
                    archive_name = os.path.relpath(file_path, start=os.path.dirname(folder_path))
                    zipf.write(file_path, archive_name)
    print(f"Successfully created {output_filename} containing: {', '.join(folder_paths)}")

# Specify the folders to zip
folders_to_zip = ["./qwen3b-lora-adapter", "./granite3b-lora-adapter"]

# Specify the output zip filename
output_zip_file = "lora_adapters.zip"

# Call the function to create the zip archive
zip_folders(folders_to_zip, output_zip_file)

# Evaluación: Perplexity (a partir de pérdida) y ROUGE en validación


In [None]:
import math
import torch

# Ensure the model is on the correct device before evaluation
if torch.cuda.is_available():
    qwen_lora.to("cuda")

eval_results_qwen = trainer_qwen.evaluate(eval_dataset=tok_qwen["validation"])
eval_loss_qwen = eval_results_qwen["eval_loss"]
perplexity_qwen = math.exp(eval_loss_qwen)

res_qwen = {"eval_loss": eval_loss_qwen, "perplexity": perplexity_qwen}
print(f"Qwen2.5-3B LoRA Evaluation Loss: {eval_loss_qwen:.4f}")
print(f"Qwen2.5-3B LoRA Perplexity: {perplexity_qwen:.4f}")

Qwen2.5-3B LoRA Evaluation Loss: 0.4645
Qwen2.5-3B LoRA Perplexity: 1.5912


In [None]:
if torch.cuda.is_available():
    granite_lora.to("cuda")

eval_results_granite = trainer_granite.evaluate(eval_dataset=tok_granite["validation"])
eval_loss_granite = eval_results_granite["eval_loss"]
perplexity_granite = math.exp(eval_loss_granite)

res_granite = {"eval_loss": eval_loss_granite, "perplexity": perplexity_granite}
print(f"Granite/Watson 3B LoRA Evaluation Loss: {eval_loss_granite:.4f}")
print(f"Granite/Watson 3B LoRA Perplexity: {perplexity_granite:.4f}")

Granite/Watson 3B LoRA Evaluation Loss: 0.4182
Granite/Watson 3B LoRA Perplexity: 1.5192


In [7]:
import evaluate

rouge = evaluate.load("rouge")
sacrebleu = evaluate.load("sacrebleu")

print("ROUGE and SacreBLEU metrics loaded.")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE and SacreBLEU metrics loaded.


In [8]:
def compute_metrics(predictions, references):
    # ROUGE
    rouge_results = rouge.compute(predictions=predictions, references=references)

    # SacreBLEU
    # SacreBLEU expects references as a list of lists, where each inner list contains one reference string.
    # For multiple references per prediction, it would be [[ref1, ref2], [ref3, ref4]], etc.
    # Here, we assume a single reference per prediction.
    sacrebleu_references = [[ref] for ref in references]
    sacrebleu_results = sacrebleu.compute(predictions=predictions, references=sacrebleu_references)

    return {
        "rouge_l_fmeasure": rouge_results["rougeLsum"],
        "sacrebleu": sacrebleu_results["score"]
    }

def generate_and_evaluate(model, tokenizer, dataset, device):
    predictions = []
    references = []

    # Ensure model is in evaluation mode and on the correct device
    model.eval()
    model.to(device)

    for i, example in enumerate(dataset):
        input_text = example["text_input"]
        target_text = example["text_target"]

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=MAX_LEN).to(device)

        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                max_new_tokens=400, # Increased max_new_tokens for longer cover letters
                do_sample=True,
                top_p=0.9,
                temperature=0.7,
                eos_token_id=tokenizer.eos_token_id
            )

        # Decode generated text and remove prompt if present
        generated_text = tokenizer.decode(generated_tokens[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

        # Clean up generated text (remove potential extra SYSTEM PROMPT or INSTRUCTION parts if model hallucinates)
        if "SYSTEM PROMPT:" in generated_text:
            generated_text = generated_text.split("SYSTEM PROMPT:")[0].strip()
        if "INSTRUCTION:" in generated_text:
            generated_text = generated_text.split("INSTRUCTION:")[0].strip()
        if "RESPONSE:" in generated_text:
            generated_text = generated_text.split("RESPONSE:")[-1].strip()

        predictions.append(generated_text)
        references.append(target_text)

    metrics = compute_metrics(predictions, references)
    return metrics

# Assuming device is available and set as 'cuda' if applicable
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Generating and evaluating for Qwen2.5-3B Base...")
qwen_metrics_base = generate_and_evaluate(qwen_base_loaded, qwen_tok_loaded, tok_qwen["validation"], device)
res_qwen_base = {}
res_qwen_base.update(qwen_metrics_base)
print("Qwen2.5-3B Base ROUGE-L F-measure:", res_qwen_base["rouge_l_fmeasure"])
print("Qwen2.5-3B Base SacreBLEU:", res_qwen_base["sacrebleu"])

print("\nGenerating and evaluating for Qwen2.5-3B LoRA...")
qwen_metrics_lora = generate_and_evaluate(qwen_lora_loaded, qwen_tok_loaded, tok_qwen["validation"], device)
res_qwen_lora = {}
res_qwen_lora.update(qwen_metrics_lora)
print("Qwen2.5-3B LoRA ROUGE-L F-measure:", res_qwen_lora["rouge_l_fmeasure"])
print("Qwen2.5-3B LoRA SacreBLEU:", res_qwen_lora["sacrebleu"])


print("\nGenerating and evaluating for Granite/Watson 1B Base...")
granite_metrics_base = generate_and_evaluate(granite_base_loaded, granite_tok_loaded, tok_granite["validation"], device)
res_granite_base = {}
res_granite_base.update(granite_metrics_base)
print("Granite/Watson 3B Base ROUGE-L F-measure:", res_granite_base["rouge_l_fmeasure"])
print("Granite/Watson 3B Base SacreBLEU:", res_granite_base["sacrebleu"])

print("\nGenerating and evaluating for Granite/Watson 1B LoRA...")
granite_metrics_lora = generate_and_evaluate(granite_lora_loaded, granite_tok_loaded, tok_granite["validation"], device)
res_granite_lora = {}
res_granite_lora.update(granite_metrics_lora)
print("Granite/Watson 3B LoRA ROUGE-L F-measure:", res_granite_lora["rouge_l_fmeasure"])
print("Granite/Watson 3B LoRA SacreBLEU:", res_granite_lora["sacrebleu"])

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Generating and evaluating for Qwen2.5-3B Base...


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for

Qwen2.5-3B Base ROUGE-L F-measure: 0.6184562331407417
Qwen2.5-3B Base SacreBLEU: 46.16392860996896

Generating and evaluating for Qwen2.5-3B LoRA...


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for

Qwen2.5-3B LoRA ROUGE-L F-measure: 0.6265485298408235
Qwen2.5-3B LoRA SacreBLEU: 47.006842602866776

Generating and evaluating for Granite/Watson 1B Base...
Granite/Watson 3B Base ROUGE-L F-measure: 0.6229845864331012
Granite/Watson 3B Base SacreBLEU: 46.80988128584835

Generating and evaluating for Granite/Watson 1B LoRA...
Granite/Watson 3B LoRA ROUGE-L F-measure: 0.6370141040864109
Granite/Watson 3B LoRA SacreBLEU: 46.89581584193709


# Muestreo de generaciones para inspección manual

In [3]:
import zipfile
import os

# Path to your zip file
zip_path = "lora_adapters.zip"

# Open and extract
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(".")

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
import pandas as pd # Ensure pandas is imported for DataFrame

# Reuse the quantization config from earlier for Qwen
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# 1. Load Qwen Tokenizer and Model with LoRA
print("Loading Qwen2.5-3B model and tokenizer with LoRA adapters...")
qwen_model_id = "Qwen/Qwen2.5-3B"
qwen_tok_loaded =AutoTokenizer.from_pretrained(qwen_model_id)
if qwen_tok_loaded.pad_token is None:
    qwen_tok_loaded.pad_token = qwen_tok_loaded.eos_token
tok_qwen = dataset.map(make_tokenize_fn(qwen_tok_loaded), batched=False)

qwen_base_loaded = AutoModelForCausalLM.from_pretrained(
    qwen_model_id,
    device_map="auto",
    quantization_config=quantization_config
)
qwen_lora_loaded = PeftModel.from_pretrained(qwen_base_loaded, "/kaggle/input/lora-adapters/qwen3b-lora-adapter")
qwen_lora_loaded.eval() # Set to evaluation mode
print("Qwen2.5-3B model loaded successfully.")


# 2. Load Granite Tokenizer and Model with LoRA
print("\nLoading Granite/Watson 3B model and tokenizer with LoRA adapters...")
granite_model_id = "ibm-granite/granite-3.1-1b-a400m-instruct"
granite_tok_loaded = AutoTokenizer.from_pretrained(granite_model_id) # Load from HF as tokenizer was not explicitly saved for granite
if granite_tok_loaded.pad_token is None and granite_tok_loaded.eos_token:
    granite_tok_loaded.pad_token = granite_tok_loaded.eos_token
tok_granite = dataset.map(make_tokenize_fn(granite_tok_loaded), batched=False)

granite_base_loaded = AutoModelForCausalLM.from_pretrained(
    granite_model_id,
    device_map="auto",
    offload_folder="./offload"
)
granite_lora_loaded = PeftModel.from_pretrained(granite_base_loaded, "/kaggle/input/lora-adapters/granite3b-lora-adapter")
granite_lora_loaded.eval() # Set to evaluation mode
print("Granite/Watson 3B model loaded successfully.")

# 3. Adapt and Execute sample_generations Function
print("\nGenerating sample outputs for manual review...")

def sample_generations_adapted(model, tokenizer, dataset, k=5):
    samples = dataset.select(range(min(k, len(dataset))))
    outs = []
    device = model.device # Get the device where the model resides
    for ex in samples:
        input_text = ex["text_input"]
        in_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=MAX_LEN).to(device)
        with torch.no_grad():
            gen = model.generate(
                **in_ids,
                max_new_tokens=400,
                do_sample=True,
                top_p=0.9,
                temperature=0.7,
                eos_token_id=tokenizer.eos_token_id
            )
        # Decode generated text and remove prompt if present
        generated_text = tokenizer.decode(gen[0][in_ids["input_ids"].shape[1]:], skip_special_tokens=True)

        # Clean up generated text (remove potential extra SYSTEM PROMPT or INSTRUCTION parts if model hallucinates)
        if "SYSTEM PROMPT:" in generated_text:
            generated_text = generated_text.split("SYSTEM PROMPT:")[0].strip()
        if "INSTRUCTION:" in generated_text:
            generated_text = generated_text.split("INSTRUCTION:")[0].strip()
        if "RESPONSE:" in generated_text:
            generated_text = generated_text.split("RESPONSE:")[-1].strip()

        outs.append({"prompt": ex["text_input"], "target": ex["text_target"], "pred": generated_text})
    return pd.DataFrame(outs)

print("Con LoRA:")
# Generar con LoRA
qwen_samples_lora = sample_generations_adapted(qwen_lora_loaded, qwen_tok_loaded, tok_qwen["validation"])
granite_samples_lora = sample_generations_adapted(granite_lora_loaded, granite_tok_loaded, tok_granite["validation"])

print("Sin LoRA:")
# Generar sin LoRA
qwen_samples_base = sample_generations_adapted(qwen_base_loaded, qwen_tok_loaded, tok_qwen["validation"])
granite_samples_base = sample_generations_adapted(granite_base_loaded, granite_tok_loaded, tok_granite["validation"])

2025-11-22 15:34:55.014520: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763825695.172328      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763825695.220031      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loading Qwen2.5-3B model and tokenizer with LoRA adapters...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/731 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

This can be used to load a bitsandbytes version built with a CUDA version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Qwen2.5-3B model loaded successfully.

Loading Granite/Watson 3B model and tokenizer with LoRA adapters...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

Map:   0%|          | 0/731 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Granite/Watson 3B model loaded successfully.

Generating sample outputs for manual review...
Con LoRA:


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Sin LoRA:


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [5]:
# Print first sample for Qwen (LoRA vs Base)
print("Qwen2.5-3B Base Target:")
print(qwen_samples_base.loc[0]["target"])
print()
print("Qwen2.5-3B Base Prediction:")
print(qwen_samples_base.loc[0]["pred"])
print("\n---\n")
print("Qwen2.5-3B LoRA Target:")
print(qwen_samples_lora.loc[0]["target"])
print()
print("Qwen2.5-3B LoRA Prediction:")
print(qwen_samples_lora.loc[0]["pred"])

# Print first sample for Granite (LoRA vs Base)
print("\nGranite/Watson 3B Base Target:")
print(granite_samples_base.loc[0]["target"])
print()
print("Granite/Watson 3B Base Prediction:")
print(granite_samples_base.loc[0]["pred"])
print("\n---\n")
print("Granite/Watson 3B LoRA Target:")
print(granite_samples_lora.loc[0]["target"])
print()
print("Granite/Watson 3B LoRA Prediction:")
print(granite_samples_lora.loc[0]["pred"])


Qwen2.5-3B Base Target:
Dear Hiring Manager,

I am writing to express my interest in the Data Scientist position at XYZ Data Solutions. With my strong background in data analysis and visualization, I believe I am wellsuited for this role.

In my current position as a Data Scientist at XYZ Data Solutions, I have successfully led a project team in creating cuttingedge visualizations to meet our clients' needs. I have also automated build and deployment processes, resulting in more efficient data management. Additionally, I have been responsible for investigating new product features and identifying new opportunities for our clients.

I have a deep knowledge and experience within Azure, including Databricks, Data Factory, SSAS, Synapse, and Function. I am proficient in Power BI and SQL, and I have a strong understanding of data visualization and management. My strong communication skills and ability to work well within a team make me an ideal candidate for this position.

I hold a Bachelo

In [9]:
qwen_samples_base.to_csv("qwen_samples_base.csv")
qwen_samples_lora.to_csv("qwen_samples_lora.csv")
granite_samples_lora.to_csv("granite_samples_lora.csv")
granite_samples_base.to_csv("granite_samples_base.csv")