#### Install required packages

In [None]:
!pip install -q -U torch transformers peft bitsandbytes datasets trl accelerate

In [None]:
!pip install bitsandbytes

#### Auto-detect dtype based on GPU capability

In [2]:
import torch
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_capability = torch.cuda.get_device_capability()[0]

    print(f"GPU: {gpu_name}")
    print(f"Compute Capability: {gpu_capability}.x")

    # Ampere (RTX 30xx, A100) and newer (capability >= 8) support bf16 efficiently
    # Older GPUs (T4, V100, RTX 20xx) should use fp16
    if gpu_capability >= 8:
        torch_dtype = torch.bfloat16
        use_bf16 = True
        use_fp16 = False
        attn_implementation = "flash_attention_2"
        print("Using bfloat16 (Ampere+ GPU detected)")
    else:
        torch_dtype = torch.float16
        use_bf16 = False
        use_fp16 = True
        attn_implementation = "eager"
        print("Using float16 (Pre-Ampere GPU detected)")
else:
    raise RuntimeError("No GPU available!")

GPU: NVIDIA GeForce RTX 4070 Laptop GPU
Compute Capability: 8.x
Using bfloat16 (Ampere+ GPU detected)


#### Part 1: Loading model in 8-bit (8-bit Quantization)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# 1. Define the Quantization configuration
# This loads the weights in 8-bit integers
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)

In [4]:
# 2. Load Tokenizer
model_id = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer loaded: {tokenizer.__class__.__name__}")
print(f"Vocab size: {tokenizer.vocab_size:,}")

Tokenizer loaded: CodeGenTokenizerFast
Vocab size: 50,257


In [5]:
# 3. Load the model with Quantization
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",          # Automatically maps layers to GPU/CPU
    trust_remote_code=True
)

print(f"Model loaded. Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"Model parameters: {model.num_parameters() / 1e9:.1f}B")

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.45s/it]


Model loaded. Memory footprint: 3.04 GB
Model parameters: 2.8B


In [7]:
print(model)

PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
          (dense): Linear8bitLt(in_features=2560, out_features=2560, bias=True)
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear8bitLt(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear8bitLt(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (rotary_emb): PhiRotaryEmbedding()
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (final_

#### Part 2: 8-bit model fine-tuning (Training) using LoRA

An 8-bit model can't be trained directly (the weights are frozen). Therefore using PEFT (Parameter-Efficient Fine-Tuning) to attach small, trainable adapters (LoRA) on top of the frozen 8-bit weights is needed in order to train the quantized model.




In [8]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset

# 1. Prepare model for k-bit training (8-bit training in our case)
model = prepare_model_for_kbit_training(model)

# 2. Configure LoRA (Low-Rank Adaptation)
# This adds trainable adapters to the model
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[                         # Phi-2 specific modules
        "q_proj",
        "k_proj",
        "v_proj",
        "dense",
        "fc1",
        "fc2",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)


model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 23,592,960 || all params: 2,803,276,800 || trainable%: 0.8416


In [9]:
from trl import SFTConfig
# 3. Load a dataset
dataset = load_dataset("tatsu-lab/alpaca", split="train[:100]") # First 100 rows

# 4. Set Training Arguments
training_arguments = SFTConfig(
output_dir = "./phi2-8bit-finetuned",
dataset_text_field="text",
num_train_epochs=2,
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
# optimizer
optim="adamw_8bit",
# Learning rate settings
learning_rate=2e-4,
lr_scheduler_type="cosine",
# logging
logging_steps=10,
report_to="none",
# Saving
save_strategy="epoch",
# Precision settings (Auto detection)
bf16=use_bf16,
fp16=use_fp16,
max_grad_norm=0.3, # max gradient norm based on QLoRA paper
warmup_steps=100, # warmup_steps=0.03 used before based on QLoRA paper (now deprecated)
# Memory optimization
gradient_checkpointing=True
)

In [10]:
# 5. Initialize Trainer (SFTTrainer handles prompt formatting automatically)
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    #peft_config=peft_config,
    processing_class=tokenizer,
    args=training_arguments,
)

# 6. Start Training
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Step,Training Loss
10,1.9309
20,1.764




TrainOutput(global_step=26, training_loss=1.8225610439593976, metrics={'train_runtime': 93.6382, 'train_samples_per_second': 2.136, 'train_steps_per_second': 0.278, 'total_flos': 485357902233600.0, 'train_loss': 1.8225610439593976, 'entropy': 1.7890512347221375, 'num_tokens': 23722.0, 'mean_token_accuracy': 0.586806442249905, 'epoch': 2.0})

In [11]:
# 7. save the adapters (the LoRA weights)
output_dir = "./phi2-8bit-finetuned"
# Save the adapter weights
trainer.model.save_pretrained(output_dir)
# Save the tokenizer (for exact padding/EOS settings)
tokenizer.save_pretrained(output_dir)

('./phi2-8bit-finetuned/tokenizer_config.json',
 './phi2-8bit-finetuned/special_tokens_map.json',
 './phi2-8bit-finetuned/vocab.json',
 './phi2-8bit-finetuned/merges.txt',
 './phi2-8bit-finetuned/added_tokens.json',
 './phi2-8bit-finetuned/tokenizer.json')

#### Part 3: Reload the model in 8-bit, then attach the saved adapters for inference (after training)

In [12]:
from peft import PeftModel

model_id = "microsoft/phi-2"
adapter_path = "./phi2-8bit-finetuned"

# Load the weights in 8-bit integers again (not necessary already done in part I)
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)
# 1. Load the base model again (Frozen, 8-bit)
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# 2. Load and attach the saved adapters
model = PeftModel.from_pretrained(base_model, adapter_path)
tokenizer = AutoTokenizer.from_pretrained(adapter_path)

Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.67s/it]


In [13]:
# 3. Prepare the input
prompt = "Instruct: Write a Python function to calculate the Fibonacci sequence.\nOutput:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# 4. Generate
# We use torch.no_grad() to save memory during inference
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id
    )

# 5. Decode output
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

Instruct: Write a Python function to calculate the Fibonacci sequence.
Output: def fibonacci(n):
    if n == 0:
        return 0
    elif n == 1:
        return 1
    else:
        return fibonacci(n-1) + fibonacci(n-2)

