# Challenge B: FSDP2 + QLoRA on 2xT4 GPUs

This notebook demonstrates fine-tuning a pre-quantized 4-bit Llama 3 8B model using QLoRA with FSDP2 on Kaggle's 2x Tesla T4 GPUs.

In [None]:
!pip install torch>=2.4 transformers peft trl accelerate bitsandbytes -U --quiet

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

# Must be 2xT4: expose exactly GPUs 0 and 1 and enable expandable allocator segments.
# NOTE(review): these are set after `import torch`; they are presumably only honored
# if CUDA has not been initialized yet at this point — confirm.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0,1",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)

# --- Quantized base model -------------------------------------------------
# bfloat16 is only native on compute capability >= 8.0 (Ampere and newer). The
# T4 targeted by this notebook is 7.5, so fall back to float16 there instead of
# hard-coding bf16.
has_native_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if has_native_bf16 else torch.float16

# Under a distributed launcher each rank must place the model on its own GPU.
# LOCAL_RANK is unset in a plain single-process kernel, where this stays GPU 0.
local_rank = int(os.environ.get("LOCAL_RANK", "0"))

model_name = "unsloth/llama-3-8b-bnb-4bit"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    # FSDP-QLoRA guidance: store the packed 4-bit weights in the same float dtype
    # as the rest of the model so FSDP sees a uniform dtype when sharding.
    # NOTE(review): this checkpoint is already quantized; its stored quantization
    # config may take precedence over the values passed here — verify.
    bnb_4bit_quant_storage=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": local_rank},
    torch_dtype=compute_dtype,
    attn_implementation="sdpa",
)

# Run PEFT's k-bit preparation step on the quantized model before adding adapters.
model = prepare_model_for_kbit_training(model)

# LoRA settings: rank 64, alpha 32, applied to the seven "*_proj" module names.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=32,
    target_modules=[
        f"{prefix}_proj" for prefix in ("q", "k", "v", "o", "gate", "up", "down")
    ],
)

# Wrap the prepared model so that the LoRA adapters are attached.
model = get_peft_model(model, lora_config)

# Dataset: the "train" split of the Hub dataset named below.
dataset = load_dataset(
    path="philschmid/dolly-15k-llama-3-format",
    split="train",
)

# --- Training configuration -----------------------------------------------
# Mixed precision must match the hardware: bf16 is native only on compute
# capability >= 8.0; the T4 (7.5) this notebook targets needs fp16 instead.
cuda_ready = torch.cuda.is_available()
ampere_or_newer = cuda_ready and torch.cuda.get_device_capability()[0] >= 8

training_args = SFTConfig(
    output_dir="./outputs",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective per-device batch of 2 * 4 = 8 sequences
    learning_rate=2e-4,
    bf16=ampere_or_newer,
    fp16=cuda_ready and not ampere_or_newer,
    logging_steps=1,
    max_steps=10,  # short demo run
    dataset_text_field="text",
    # NOTE(review): newer TRL releases appear to rename this to `max_length`;
    # verify against the installed TRL version.
    max_seq_length=512,
    # FSDP config
    # CPU offload is requested through the `offload` token of the `fsdp` option
    # string; an "offload_params" key inside `fsdp_config` is not a documented
    # key and was not taking effect, so it has been moved here.
    # NOTE(review): whether this selects FSDP2 rather than FSDP1 depends on the
    # installed transformers/accelerate versions — confirm.
    fsdp="full_shard auto_wrap offload",
    fsdp_config={
        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
        "activation_checkpointing": True,
        "forward_prefetch": True,
    },
)

# Hand the adapter-wrapped model, the dataset and the config to TRL's SFT trainer.
# NOTE(review): the `fsdp` options in `training_args` normally require a
# multi-process launch; confirm how this cell is meant to be executed.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
)

# Run the training loop, then print a completion marker.
trainer.train()
print("FSDP2 + QLoRA Training completed!")