In [None]:
# Upgrade the Hugging Face `datasets` package in the runtime
# (the log below shows datasets 2.14.4 being replaced by 3.6.0).
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency re

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub.
# NOTE(review): the argument is a literal `...` in this copy of the notebook —
# presumably the token was elided; read it from a secret / environment variable
# rather than hardcoding it here.
login(...)

# New Section

In [None]:
import os
# Ask the PyTorch CUDA caching allocator to use expandable segments.
# NOTE(review): presumably this has to run before torch initialises CUDA to
# take effect — confirm the cell order after a kernel restart.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

Fine-tune the model with full-parameter training (no adapters)

In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the training split of the QA dataset.
# NOTE(review): this comment used to say "already 300 samples", but later logs
# in this notebook show 349 and 600 rows being mapped for the same split —
# do not rely on a fixed size.
dataset = load_dataset("OnnieNLP/InformationExtractionQA", split="train")

# Load tokenizer
model_id = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token so batches can be padded by the collator
# (presumably the tokenizer has no pad token of its own — confirm).
tokenizer.pad_token = tokenizer.eos_token

# Load model with memory optimization:
# bf16 weights when CUDA is available (fp32 otherwise), automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True
)
# Turn off the generation KV cache for training; gradient checkpointing is
# enabled in the TrainingArguments defined later in this cell.
model.config.use_cache = False

# Format dataset
def format_example(example):
    """Render one QA record as a single training string.

    Reads the ``text``, ``question`` and ``answer`` keys of ``example`` and
    returns ``{"text": <instruction + context + question + answer>}``, with the
    four sections separated by blank lines.
    """
    sections = [
        "You are a helpful assistant that answers questions based on provided context. Keep answers short.",
        f"Context: {example['text']}",
        f"Question: {example['question']}",
        f"Answer: {example['answer']}",
    ]
    return {"text": "\n\n".join(sections)}

# Overwrite the "text" column of every row with the fully formatted prompt + answer.
formatted_dataset = dataset.map(format_example)

# Tokenize with truncation only (no padding); padding is left to the collator.
# All original columns are dropped so only the tokenizer outputs remain.
MAX_LENGTH = 512
tokenized_dataset = formatted_dataset.map(
    lambda e: tokenizer(e["text"], truncation=True, max_length=MAX_LENGTH, padding=False),
    batched=True,
    remove_columns=dataset.column_names
)

# mlm=False selects the causal-LM collator; the whole sequence (prompt and
# answer) is used as the training target.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments optimized for A100
training_args = TrainingArguments(
    output_dir="./llama3-qa",
    logging_dir=None,
    report_to="none",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,   # effective batch size of 2 per device
    num_train_epochs=3,
    save_strategy="no",              # no intermediate checkpoints; the model is saved explicitly in the next cell
    # NOTE(review): 2e-4 is a common adapter learning rate; for full-parameter
    # fine-tuning it looks unusually high — confirm it is intended.
    learning_rate=2e-4,
    weight_decay=0.01,
    fp16=False,          # Disable fp16
    bf16=True,           # Enable bf16 (A100 optimized)
    gradient_checkpointing=True,
    dataloader_pin_memory=True,
)

# NOTE(review): newer transformers releases presumably prefer `processing_class`
# over the `tokenizer` argument — check the deprecation notice for the pinned version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


Using device: cuda


PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded with from_pretrained in the inference cell.
trainer.save_model("./llama3-qa")
tokenizer.save_pretrained("./llama3-qa")

('./llama3-qa/tokenizer_config.json',
 './llama3-qa/special_tokens_map.json',
 './llama3-qa/tokenizer.json')

Generate predictions from the finetuned model

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
from tqdm import tqdm

# Set device
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and tokenizer from local path
# NOTE(review): the training cell saved to the relative path "./llama3-qa";
# this absolute path only matches when the working directory is /content.
model_dir = "/content/llama3-qa"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto", torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32)
# Switch to inference mode (disables dropout).
model.eval()

# Load test split of dataset
test_dataset = load_dataset("OnnieNLP/InformationExtractionQA", split="test")

# Format function
def format_prompt(example):
    """Build the inference prompt for one QA record.

    Uses the ``text`` and ``question`` keys of ``example`` and ends with a bare
    ``Answer:`` cue so the model continues with the answer.
    """
    sections = [
        "You are a helpful assistant that answers questions based on provided context. Keep answers short.",
        f"Context: {example['text']}",
        f"Question: {example['question']}",
        "Answer:",
    ]
    return "\n\n".join(sections)

# Inference
# Generate one greedy answer per test example and collect it next to the gold answer.
results = []
for example in tqdm(test_dataset, desc="Generating answers"):
    prompt = format_prompt(example)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    # Number of prompt tokens actually fed to the model (after truncation).
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding: `temperature` only applies when sampling, so it is
        # not passed together with do_sample=False.
        output = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the full text on
    # "Answer:" and keeping the last piece returns the wrong span whenever the
    # model itself emits another "Answer:" in its continuation.
    completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    predicted_answer = completion.strip()

    results.append({
        "question": example["question"],
        "context": example["text"],
        "true_answer": example["answer"],
        "predicted_answer": predicted_answer
    })

# Save to JSON
output_path = "predicted_answers.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved predictions to {output_path}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating answers: 100%|██████████| 51/51 [01:57<00:00,  2.30s/it]

Saved predictions to predicted_answers.json





In [None]:
from google.colab import drive
# Mount the user's Google Drive under /content/drive so files can be copied to it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Back up the first weight shard to Google Drive.
# NOTE(review): only the two .safetensors shards are copied; the config, shard
# index and tokenizer files in the same directory are not — presumably they are
# needed to reload the model from Drive. Consider copying the whole directory.
!cp /content/llama3-qa/model-00001-of-00002.safetensors /content/drive/MyDrive/

In [None]:
# Back up the second weight shard to Google Drive.
!cp /content/llama3-qa/model-00002-of-00002.safetensors /content/drive/MyDrive/

In [None]:
from google.colab import files
# Trigger a browser download of the first weight shard.
# NOTE(review): only shard 1 of 2 is downloaded here.
files.download('/content/llama3-qa/model-00001-of-00002.safetensors')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Finetune model with LoRA

In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the training split of the QA dataset
dataset = load_dataset("OnnieNLP/InformationExtractionQA", split="train")

# Load tokenizer
model_id = "meta-llama/Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token so the collator can pad batches.
tokenizer.pad_token = tokenizer.eos_token

# Configure 4-bit quantization for memory efficiency:
# NF4 weights with double quantization, bf16 used for the matmul compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# Load model with quantization (requires the bitsandbytes package to be installed)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=False
)

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Configure LoRA
peft_config = LoraConfig(
    r=8,                   # Rank
    lora_alpha=32,         # Scaling factor
    lora_dropout=0.05,     # Dropout
    target_modules=["q_proj", "v_proj"],  # Target modules
    task_type="CAUSAL_LM",
    bias="none"            # No bias for LoRA
)

# Apply LoRA to model; only the adapter weights remain trainable
# (the recorded output reports 2,293,760 trainable parameters, 0.0713%).
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Show trainable parameters count

# Format dataset
def format_example(example):
    """Render one QA record as a single training string.

    Reads the ``text``, ``question`` and ``answer`` keys of ``example`` and
    returns ``{"text": <instruction + context + question + answer>}``, with the
    four sections separated by blank lines.
    """
    sections = [
        "You are a helpful assistant that answers questions based on provided context. Keep answers short.",
        f"Context: {example['text']}",
        f"Question: {example['question']}",
        f"Answer: {example['answer']}",
    ]
    return {"text": "\n\n".join(sections)}

# Overwrite the "text" column of every row with the fully formatted prompt + answer.
formatted_dataset = dataset.map(format_example)

# Tokenize dataset (truncate to MAX_LENGTH, no padding — the collator pads each batch)
MAX_LENGTH = 512
tokenized_dataset = formatted_dataset.map(
    lambda e: tokenizer(
        e["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False
    ),
    batched=True,
    remove_columns=formatted_dataset.column_names
)

# mlm=False selects the causal-LM collator; prompt and answer are both trained on.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Optimized training arguments
training_args = TrainingArguments(
    output_dir="./llama3-qa-lora",
    per_device_train_batch_size=2,          # Increased due to LoRA memory efficiency
    gradient_accumulation_steps=4,           # Adjust based on GPU memory
    num_train_epochs=3,
    learning_rate=1e-4,                     # Lower than the 2e-4 used in the full fine-tune cell
    weight_decay=0.01,
    bf16=True,                              # Use bfloat16
    logging_steps=10,
    optim="paged_adamw_8bit",               # Paged AdamW with 8-bit optimizer state (the base model itself is loaded in 4-bit)
    save_strategy="no",
    report_to="none",
    gradient_checkpointing=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

# Save adapter weights (relative path; the inference cell reads
# "/content/llama3-qa-lora-adapter", which matches when the cwd is /content)
model.save_pretrained("llama3-qa-lora-adapter")

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 2,293,760 || all params: 3,215,043,584 || trainable%: 0.0713


Map:   0%|          | 0/349 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,2.475
20,1.9355
30,1.4444
40,1.1682
50,1.0858
60,1.0805
70,1.0436
80,1.0477
90,1.0154
100,1.0054


Generate predictions from the LoRA-fine-tuned model

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import json
from tqdm import tqdm

# Set device
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load adapter configuration; it records which base model the adapter was trained on.
adapter_dir = "/content/llama3-qa-lora-adapter"  # Path to saved LoRA adapter
config = PeftConfig.from_pretrained(adapter_dir)

# Load base model with quantization (must match training setup)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=False
)

# Attach LoRA adapter to base model
model = PeftModel.from_pretrained(base_model, adapter_dir)
# Switch to inference mode (disables dropout, including LoRA dropout).
model.eval()

# Load tokenizer from the base model repo (the adapter directory is not used for it)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Load test dataset
test_dataset = load_dataset("OnnieNLP/InformationExtractionQA", split="test")

# Format function
def format_prompt(example):
    """Build the inference prompt for one QA record.

    Uses the ``text`` and ``question`` keys of ``example`` and ends with a bare
    ``Answer:`` cue so the model continues with the answer.
    """
    sections = [
        "You are a helpful assistant that answers questions based on provided context. Keep answers short.",
        f"Context: {example['text']}",
        f"Question: {example['question']}",
        "Answer:",
    ]
    return "\n\n".join(sections)

# Inference
# Generate one greedy answer per test example and collect it next to the gold answer.
results = []
for example in tqdm(test_dataset, desc="Generating answers"):
    prompt = format_prompt(example)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    # Number of prompt tokens actually fed to the model (after truncation).
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding: `temperature` only applies when sampling, so it is
        # not passed together with do_sample=False.
        output = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the full text on
    # "Answer:" and keeping the last piece returns the wrong span whenever the
    # model itself emits another "Answer:" in its continuation.
    completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    predicted_answer = completion.strip()

    results.append({
        "question": example["question"],
        "context": example["text"],
        "true_answer": example["answer"],
        "predicted_answer": predicted_answer
    })

# Save to JSON. A LoRA-specific file name is used so these predictions do not
# overwrite the "predicted_answers.json" written by the full fine-tune cell.
output_path = "predicted_answers_lora.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved predictions to {output_path}")

Using device: cuda


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating answers:   0%|          | 0/51 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating answers:   2%|▏         | 1/51 [00:06<05:03,  6.07s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating answers:   4%|▍         | 2/51 [00:11<04:48,  5.89s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating answers:   6%|▌         | 3/51 [00:17<04:40,  5.84s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating answers:   8%|▊         | 4/51 [00:23<04:33,  5.82s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORME

Saved predictions to predicted_answers.json





In [None]:
# bitsandbytes backs the BitsAndBytesConfig 4-bit loading used by the LoRA cells.
# NOTE(review): this install cell sits after those cells in the notebook (and an
# earlier recorded output shows "No package metadata was found for bitsandbytes"),
# so a fresh Restart & Run All would reach them first — move installs to the top.
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [None]:
# Upgrade transformers (the log shows 4.53.0 -> 4.53.1).
# NOTE(review): upgrading after transformers has been imported presumably needs
# a runtime restart to take effect — confirm, and move installs to the top.
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.53.1-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.53.0
    Uninstalling transformers-4.53.0:
      Successfully uninstalled transformers-4.53.0
Successfully installed transformers-4.53.1


In [None]:
# Install the libraries used by the LoRA cells (unpinned).
# NOTE(review): peft is imported by cells that appear earlier in the notebook.
!pip install transformers peft accelerate




In [None]:
# Same command as the first cell of the notebook (upgrade `datasets`);
# repeated here, presumably after a runtime restart.
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

Second adapter training

In [None]:
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftConfig
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
model_id = "meta-llama/Llama-3.2-3B"


# Training split used to fit the second adapter.
dataset = load_dataset("OnnieNLP/InformationExtractionQA", split="train")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# First adapter
# NOTE(review): this points at the /content root, so the adapter files are
# expected to sit directly in /content — whereas the LoRA training cell saved
# its adapter under "llama3-qa-lora-adapter". Presumably the files were
# uploaded by hand; confirm the path.
path_adapter_1 = "/content/"

# Load adapter configuration
config = PeftConfig.from_pretrained(path_adapter_1)

# Load base model with quantization (must match training setup)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)
# Load tokenizer (from `model_id`; the base model below is resolved from the
# adapter config instead — the two are expected to name the same checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=False
)


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [None]:
# Load first adapter and freeze it
# The value bound to `model` on the next line is replaced immediately by the
# PeftModel built from `base_model`.
# NOTE(review): presumably prepare_model_for_kbit_training modifies `base_model`
# in place, so the preparation is not lost — confirm.
model = prepare_model_for_kbit_training(base_model)
model = PeftModel.from_pretrained(base_model, path_adapter_1)
model.print_trainable_parameters()  # recorded output: 0 trainable params, i.e. the loaded adapter is frozen

# # Freeze the weights of this adapter
# for param in model.parameters():
#     param.requires_grad = False

trainable params: 0 || all params: 3,215,043,584 || trainable%: 0.0000


In [None]:
from peft import prepare_model_for_kbit_training  # if using QLoRA
from peft import LoraConfig

# Name PEFT gives to an adapter loaded without an explicit adapter_name,
# i.e. the first (frozen) adapter attached by PeftModel.from_pretrained.
FIRST_ADAPTER = "default"
SECOND_ADAPTER = "lora_v2"

# Define new LoRA config for second adapter
lora_config_v2 = LoraConfig(
    r=4,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # depends on model architecture
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Register the second adapter next to the first one (skipped when the cell is
# re-run and the adapter already exists).
if SECOND_ADAPTER not in model.peft_config:
    model.add_adapter(SECOND_ADAPTER, lora_config_v2)

# PeftModel.set_adapter(name) activates exactly ONE adapter, so calling it with
# "lora_v2" would switch the first adapter off and train the new one against
# the bare base model. To really train "on top" of the first adapter, both
# must be active in the forward pass; the underlying LoRA model accepts a list.
model.base_model.set_adapter([FIRST_ADAPTER, SECOND_ADAPTER])

# Activating adapters marks their weights trainable, so re-freeze everything
# except the second adapter: the first adapter contributes to the forward pass
# but receives no updates.
for name, param in model.named_parameters():
    param.requires_grad = SECOND_ADAPTER in name


In [None]:
model.print_trainable_parameters()  # should show only LoRA trainable


trainable params: 1,146,880 || all params: 3,216,190,464 || trainable%: 0.0357


In [None]:
from transformers import default_data_collator

MAX_LENGTH = 512


def _build_prompt(example):
    """Return the prompt (instruction, context, question, ``Answer:`` cue) for one record.

    This is the same layout the inference cells feed to the model, so the
    adapter is trained on exactly the prompt it will later see.
    """
    return (
        "You are a helpful assistant that answers questions based on provided context. Keep answers short.\n\n"
        f"Context: {example['text']}\n\nQuestion: {example['question']}\n\nAnswer:"
    )


# Format dataset
def format_example(example):
    """Return ``{"text": prompt + " " + answer}`` for one QA record."""
    return {"text": _build_prompt(example) + " " + example["answer"]}


def preprocess(example):
    """Tokenize one QA record and build labels that score only the answer tokens.

    Returns the tokenizer output (``input_ids``, ``attention_mask``) padded to
    ``MAX_LENGTH`` plus a ``labels`` list of the same length in which prompt
    and padding positions are set to -100 (ignored by the loss).
    """
    prompt = _build_prompt(example)
    full_text = prompt + " " + example["answer"]

    # Tokenize full text
    tokenized = tokenizer(full_text, truncation=True, padding="max_length", max_length=MAX_LENGTH)
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    # Index of the first real (non-padding) token: 0 with right padding, the
    # number of pad tokens with left padding.
    first_real = attention_mask.index(1) if 1 in attention_mask else len(input_ids)
    # Index where the answer starts; may exceed the sequence length when the
    # prompt alone is truncated, in which case nothing is scored.
    answer_start = first_real + len(tokenizer(prompt)["input_ids"])

    # Score only answer tokens; prompt and padding positions are ignored.
    # Building the list position-by-position keeps len(labels) == len(input_ids)
    # even when truncation cuts into the prompt.
    tokenized["labels"] = [
        token if (position >= answer_start and mask == 1) else -100
        for position, (token, mask) in enumerate(zip(input_ids, attention_mask))
    ]
    return tokenized

tokenized_dataset = dataset.map(preprocess)

# Every example is already padded to MAX_LENGTH and carries its own labels, so
# a plain tensor-stacking collator is enough. DataCollatorForLanguageModeling
# must NOT be used here: with mlm=False it rebuilds `labels` from `input_ids`,
# which silently discards the prompt masking computed in preprocess().
data_collator = default_data_collator

# Optimized training arguments
training_args = TrainingArguments(
    output_dir="/content/lora_adapter_v2",
    per_device_train_batch_size=2,          # Increased due to LoRA memory efficiency
    gradient_accumulation_steps=4,           # Adjust based on GPU memory
    num_train_epochs=2,
    learning_rate=3e-4,                     # Higher than the 1e-4 used for the first adapter
    weight_decay=0.01,
    bf16=True,                              # Use bfloat16
    logging_steps=10,
    optim="paged_adamw_8bit",               # Paged AdamW with 8-bit optimizer state
    save_strategy="no",
    report_to="none",
    gradient_checkpointing=True,
    )




Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset[0].keys()


dict_keys(['text', 'question', 'answer', 'source', 'input_ids', 'attention_mask', 'labels'])

In [None]:
# Train with the arguments, dataset and collator defined in the preceding cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # tokenizer=tokenizer,
    data_collator=data_collator
    )

# Start training
trainer.train()

# Save adapter weights
# NOTE(review): this is a relative path, while TrainingArguments.output_dir is
# "/content/lora_adapter_v2" — they coincide only when the cwd is /content.
model.save_pretrained("lora_adapter_v2")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,1.833
20,1.5495
30,1.3277
40,1.4001
50,1.2852
60,1.3577
70,1.3649


In [None]:
# Switch the freshly trained model to inference mode and load the held-out split.
model.eval()
test_dataset = load_dataset("OnnieNLP/InformationExtractionQA", split="test")


In [None]:
# Format function
def format_prompt(example):
    """Build the inference prompt for one QA record.

    Uses the ``text`` and ``question`` keys of ``example`` and ends with a bare
    ``Answer:`` cue so the model continues with the answer.
    """
    sections = [
        "You are a helpful assistant that answers questions based on provided context. Keep answers short.",
        f"Context: {example['text']}",
        f"Question: {example['question']}",
        "Answer:",
    ]
    return "\n\n".join(sections)

In [None]:

# Spot-check a single test example. The index is negative, i.e. counted from
# the end of the split; -14 is NOT the last sample, so the banner below reports
# the index actually used instead of claiming "last".
SAMPLE_INDEX = -14
sample = test_dataset[SAMPLE_INDEX]

print(f"\nPrediction for the test data sample at index {SAMPLE_INDEX}:")
prompt = format_prompt(sample)
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
# Number of prompt tokens actually fed to the model (after truncation).
prompt_length = inputs["input_ids"].shape[1]

with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode only the newly generated tokens: splitting the full text on "Answer:"
# and keeping the last piece picks the wrong span if the model emits another
# "Answer:" in its continuation. Keep the first line as the answer.
completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
predicted_answer = completion.strip().split("\n")[0]

print(f"context:{sample['text']}")
print(f"Question: {sample['question']}")
print(f"True Answer: {sample['answer']}")
print(f"Predicted Answer: {predicted_answer}")


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Prediction for the last test data sample:
context:Bucovina este traversată de râuri importante precum Siretul și Moldava.
Question: Care este un râu important care traversează Bucovina, pe lângă Moldova?
True Answer: Siret
Predicted Answer: Lemne și lemn


In [None]:
# Unpack the uploaded adapter archive.
# NOTE(review): per the unzip log the files land in "lora_v2_3e-4_1 epoch/"
# (with a space) and the nested config is named "adapter_config (1).json",
# while the loading cell reads "/content/lora_v2_3e-4_1_epoch" and its "lora"
# subfolder — presumably the folder and file were renamed by hand between
# these cells; confirm, or script the rename so Run All works.
!unzip "/content/lora_v2_3e-4_1 epoch.zip"

Archive:  /content/lora_v2_3e-4_1 epoch.zip
   creating: lora_v2_3e-4_1 epoch/
  inflating: lora_v2_3e-4_1 epoch/adapter_model.safetensors  
  inflating: __MACOSX/lora_v2_3e-4_1 epoch/._adapter_model.safetensors  
  inflating: lora_v2_3e-4_1 epoch/.DS_Store  
  inflating: __MACOSX/lora_v2_3e-4_1 epoch/._.DS_Store  
  inflating: lora_v2_3e-4_1 epoch/README.md  
  inflating: __MACOSX/lora_v2_3e-4_1 epoch/._README.md  
  inflating: lora_v2_3e-4_1 epoch/adapter_config.json  
  inflating: __MACOSX/lora_v2_3e-4_1 epoch/._adapter_config.json  
   creating: lora_v2_3e-4_1 epoch/lora/
  inflating: lora_v2_3e-4_1 epoch/lora/adapter_model.safetensors  
  inflating: __MACOSX/lora_v2_3e-4_1 epoch/lora/._adapter_model.safetensors  
  inflating: lora_v2_3e-4_1 epoch/lora/adapter_config (1).json  
  inflating: __MACOSX/lora_v2_3e-4_1 epoch/lora/._adapter_config (1).json  


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch # Import torch for torch.float16

base_model_name = "meta-llama/Llama-3.2-3B"

# List of paths to your adapters
adapter_paths = [
    "/content/lora_v2_3e-4_1_epoch", # path to adapter 1
    "/content/lora_v2_3e-4_1_epoch/lora", # path to adapter 2
    # Add more adapter paths here if needed
]

# Configure 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, # Use bfloat16 for better performance on supported GPUs
    bnb_4bit_use_double_quant=True
)

# Load base model with appropriate dtype, quantization config, and device mapping
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto",
    offload_folder="offload" # Optional: for offloading to CPU
)

# Attach every adapter to the same base model, remembering the name each one
# is registered under.
adapter_names = []
for i, adapter_path in enumerate(adapter_paths):
    if i == 0:
        # The first adapter wraps the base model; PEFT registers it as "default".
        model = PeftModel.from_pretrained(model, adapter_path)
        adapter_names.append("default")
    else:
        # load_adapter only registers the weights — it does not activate them.
        adapter_name = f"adapter_{i}"
        model.load_adapter(adapter_path, adapter_name=adapter_name)
        adapter_names.append(adapter_name)

# Without this step only the first adapter takes part in the forward pass.
# PeftModel.set_adapter accepts a single name, so the list of adapters is
# activated on the underlying LoRA model instead.
if adapter_names:
    model.base_model.set_adapter(adapter_names)
    # Activation marks adapter weights as trainable; this model is only used
    # for inference, so freeze them again.
    for param in model.parameters():
        param.requires_grad = False

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("Model loaded with stacked adapters and quantization.")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded with stacked adapters and quantization.


In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import json
from tqdm import tqdm

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# `model` and `tokenizer` are not created in this cell: they are reused from
# the preceding cell that loads the quantized base model and attaches the
# adapters, so that cell must have been run first.
model.eval()

# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load test dataset
test_dataset = load_dataset("OnnieNLP/InformationExtractionQA", split="test")

# Format function
def format_prompt(example):
    """Build the inference prompt for one QA record.

    Uses the ``text`` and ``question`` keys of ``example`` and ends with a bare
    ``Answer:`` cue so the model continues with the answer.
    """
    sections = [
        "You are a helpful assistant that answers questions based on provided context. Keep answers short.",
        f"Context: {example['text']}",
        f"Question: {example['question']}",
        "Answer:",
    ]
    return "\n\n".join(sections)

# Inference
# Generate one greedy answer per test example and collect it next to the gold answer.
results = []
for example in tqdm(test_dataset, desc="Generating answers"):
    prompt = format_prompt(example)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)
    # Number of prompt tokens actually fed to the model (after truncation).
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding: `temperature` only applies when sampling, so it is
        # not passed together with do_sample=False.
        output = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. Splitting the full text on
    # "Answer:" and keeping the last piece returns the wrong span whenever the
    # model itself emits another "Answer:" in its continuation.
    completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    predicted_answer = completion.strip()

    results.append({
        "question": example["question"],
        "context": example["text"],
        "true_answer": example["answer"],
        "predicted_answer": predicted_answer
    })

# Save to JSON
output_path = "/content/predicted_answers_2_adapters.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"Saved predictions to {output_path}")

Using device: cuda


Generating answers:   0%|          | 0/200 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating answers:   0%|          | 1/200 [00:09<31:19,  9.44s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating answers:   1%|          | 2/200 [00:15<24:17,  7.36s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating answers:   2%|▏         | 3/200 [00:21<22:58,  7.00s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Generating answers:   2%|▏         | 4/200 [00:27<21:30,  6.58s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANS

Saved predictions to /content/predicted_answers_2_adapters.json



