### Part 1: Install required packages

1.1 Install official libraries

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
# Restart runtime after installation (for colab only)

In [6]:
!pip install -q -U git+https://github.com/huggingface/trl.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for trl (pyproject.toml) ... [?25l[?25hdone


In [2]:
import transformers
import trl
import peft
import accelerate
import datasets
import bitsandbytes
import torch

# Record the exact library versions used for this run, one "<name> <version>" line each.
print("\n".join(
    f"{lib.__name__} {lib.__version__}"
    for lib in (torch, transformers, trl, peft, accelerate, datasets, bitsandbytes)
))

torch 2.9.0+cu126
transformers 5.0.0.dev0
trl 0.27.0.dev0
peft 0.18.1.dev0
accelerate 1.13.0.dev0
datasets 4.0.0
bitsandbytes 0.49.0


In [17]:
# Restart the runtime without deleting files, so the freshly installed packages are picked up.
# NOTE(review): exiting ends the current kernel session, so a "Run all" will presumably
# stop at this cell — continue from the next cell after the restart.
exit()

1.2 Get an estimation of memory usage for loading the model

In [16]:
import os
os.environ["MODEL_NAME"] = "mistralai/Mistral-7B-v0.1"
!accelerate estimate-memory {MODEL_NAME} --library_name transformers

Loading pretrained config for `mistralai/Mistral-7B-v0.1` from `transformers`...
`torch_dtype` is deprecated! Use `dtype` instead!
┌────────────────────────────────────────────────────────┐
│  Memory Usage for loading `mistralai/Mistral-7B-v0.1`  │
├───────┬─────────────┬──────────┬───────────────────────┤
│ dtype │Largest Layer│Total Size│  Training using Adam  │
├───────┼─────────────┼──────────┼───────────────────────┤
│float32│  832.03 MB  │ 26.49 GB │       105.96 GB       │
│float16│  416.02 MB  │ 13.24 GB │        52.98 GB       │
│  int8 │  208.01 MB  │ 6.62 GB  │          N/A          │
│  int4 │   104.0 MB  │ 3.31 GB  │          N/A          │
└───────┴─────────────┴──────────┴───────────────────────┘


1.3 Auto-detect dtype based on GPU capability

In [7]:
import torch

# Fail fast: the rest of the notebook requires a CUDA device.
if not torch.cuda.is_available():
    raise RuntimeError("No GPU available!")

gpu_name = torch.cuda.get_device_name(0)
gpu_capability = torch.cuda.get_device_capability()[0]  # major compute-capability number

print(f"GPU: {gpu_name}")
print(f"Compute Capability: {gpu_capability}.x")

# Ampere (RTX 30xx, A100) and newer (capability >= 8) support bf16 efficiently;
# older GPUs (T4, V100, RTX 20xx) should use fp16.
use_bf16 = gpu_capability >= 8
use_fp16 = not use_bf16

if use_bf16:
    torch_dtype, attn_implementation = torch.bfloat16, "flash_attention_2"
    print("Using bfloat16 (Ampere+ GPU detected)")
else:
    torch_dtype, attn_implementation = torch.float16, "eager"
    print("Using float16 (Pre-Ampere GPU detected)")

GPU: NVIDIA A100-SXM4-80GB
Compute Capability: 8.x
Using bfloat16 (Ampere+ GPU detected)


1.4 Import libraries

In [40]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

### Part 2: Configuration & loading

2.1 Model configuration

In [4]:
# Hugging Face Hub ID of the base model; used when loading the tokenizer and the model.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

2.2 Configure 4-bit quantization

In [8]:
# 4-bit quantization settings used to load the base model (QLoRA setup)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # 4-bit NormalFloat quantization
    bnb_4bit_quant_type="nf4",
    # Double quantization for extra memory savings
    bnb_4bit_use_double_quant=True,
    # Compute dtype chosen by the GPU auto-detection cell
    bnb_4bit_compute_dtype=torch_dtype,
)
print("Compute dtype:", torch_dtype)

Compute dtype: torch.bfloat16


2.3 Load and configure tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Quick sanity report of the tokenizer setup
print(f"Vocab size: {len(tokenizer)}\nPad token: {tokenizer.pad_token}")

Vocab size: 32000
Pad token: </s>


2.4 Load model with 4-bit quantization

In [17]:
# Show which attention backend was selected. If it is "flash_attention_2", the
# flash-attn package must be installed (next cell); otherwise that install can be skipped.
print("Attention:", attn_implementation)

Attention: flash_attention_2


In [None]:
# install Flash Attention 2
!pip install ninja packaging wheel
!pip install flash-attn --no-build-isolation

In [None]:
# Load the base model with the 4-bit quantization config; device_map="auto" leaves
# device placement to the library.
# trust_remote_code is deliberately NOT enabled: the checkpoint resolves to the
# built-in MistralForCausalLM class, so there is no reason to allow execution of
# code shipped with the Hub repository.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch_dtype,
    attn_implementation=attn_implementation,
)
print(f"Model loaded. Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"Model parameters: {model.num_parameters() / 1e9:.1f}B")

In [18]:
# Print the module hierarchy to check which layers were loaded as quantized linear layers.
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

2.5 Prepare model for training

In [19]:
# Disable cache (to save memory during training)
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a tensor-parallel setting carried over from
# other fine-tuning recipes — confirm that this model's config actually reads it.
model.config.pretraining_tp = 1

In [20]:
# Enable gradient checkpointing on the model before the k-bit preparation step
model.gradient_checkpointing_enable()
# Prepare the quantized model for k-bit training; the returned model replaces `model`
# and is the one the LoRA adapters are attached to afterwards.
model = prepare_model_for_kbit_training(model)

### Part 3: LoRA adapter configuration (following QLoRA paper recommendations)

3.1 Define LoRA hyperparameters

In [46]:
# LoRA adapter hyperparameters
peft_config = LoraConfig(
    task_type="CAUSAL_LM",      # causal language modeling
    r=16,                       # adapter rank (4-64; higher = more capacity)
    lora_alpha=32,              # scaling factor (typically 2x the rank)
    lora_dropout=0.05,          # dropout for regularization
    bias="none",                # bias terms are not tuned
    # Adapt every linear projection of each decoder layer:
    # the attention projections first, then the MLP projections.
    target_modules=[
        "q_proj", "v_proj", "k_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

3.2 Apply LoRA adapters to the model

In [None]:
# Apply the LoRA configuration. The returned PEFT wrapper must be bound back to
# `model` (the original assigned it to a misspelled name), because the next line
# and the trainer both expect `model` to be the PEFT-wrapped model.
model = get_peft_model(model, peft_config)

# Print trainable parameters
model.print_trainable_parameters()

### Part 4: Dataset preparation

4.1 Load training data (Alpaca dataset)

In [26]:
# Full training split of the cleaned Alpaca dataset.
dataset = load_dataset("yahma/alpaca-cleaned", split="train")
# For a quick experiment, use a random 1,000-example subset instead:
# dataset = load_dataset("yahma/alpaca-cleaned", split="train").shuffle(seed=42).select(range(1000))
# (or simply take the first 1,000 rows with split="train[:1000]")

print("Samples:", len(dataset))

Samples: 51760


In [76]:
# Print a sample to see the structure. The "text" column only exists once the
# formatting step has been run, so fall back to the raw record when it is missing
# instead of failing with a KeyError on a fresh top-to-bottom run.
sample = dataset[0]
print(sample["text"][:1000] if "text" in sample else sample)

### Instruction:
Give three tips for staying healthy.

### Response:
1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.

3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.


4.2 Data formatting

In [None]:
def format_instruction(s):
    """Render one Alpaca-style record as a single prompt/response string.

    Args:
        s: Mapping with "instruction" and "output" keys and an optional "input" key.

    Returns:
        The "### Instruction:", optional "### Input:" and "### Response:" sections,
        separated by blank lines. The input section is omitted when "input" is
        missing or empty.
    """
    sections = [f"### Instruction:\n{s['instruction']}"]
    extra_input = s.get("input")
    if extra_input:
        sections.append(f"### Input:\n{extra_input}")
    sections.append(f"### Response:\n{s['output']}")
    return "\n\n".join(sections)

# Collapse every record into a single "text" column for supervised fine-tuning.
# Guarded so that re-running the cell is a no-op: after the first run the source
# columns are gone and mapping again would fail with a KeyError.
if "text" not in dataset.column_names:
    dataset = dataset.map(lambda x: {"text": format_instruction(x)}, remove_columns=dataset.column_names)

### Part 5: Training configuration

5.1 Define training arguments

In [51]:
training_arguments = SFTConfig(
    # --- Output & checkpointing ---
    output_dir="./qlora-mistral",       # where checkpoints and the final adapter are written
    save_steps=100,                     # checkpoint every 100 steps
    save_total_limit=2,                 # keep only the 2 most recent checkpoints
    # save_strategy="epoch",

    # --- Data ---
    dataset_text_field="text",          # column holding the formatted prompt/response
    # max_length=512,
    # packing=True

    # --- Schedule & batch size ---
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,      # 4 x 4 = effective batch size of 16 per device
    warmup_steps=50,

    # --- Optimization ---
    learning_rate=2e-4,                 # standard QLoRA learning rate
    weight_decay=0.01,                  # regularization
    max_grad_norm=0.3,                  # gradient clipping threshold
    optim="paged_adamw_8bit",           # memory-efficient optimizer

    # --- Precision (from the GPU auto-detection cell) ---
    bf16=use_bf16,
    fp16=use_fp16,

    # --- Memory / distributed ---
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,   # for distributed training

    # --- Logging & evaluation ---
    logging_steps=10,                   # log every 10 steps
    report_to="none",                   # set to "wandb" or "tensorboard" to enable tracking
    # NOTE(review): no eval dataset or eval strategy is configured in this notebook,
    # so eval_steps presumably has no effect — confirm before relying on it.
    eval_steps=25,
)

5.2 Initialize SFTTrainer

In [None]:
from trl import SFTTrainer

# Build the supervised fine-tuning trainer.
# `peft_config` is deliberately not passed: the LoRA adapters are expected to have
# been attached beforehand with get_peft_model(), and SFTTrainer does not want to add
# another adapter on top of a model that is already a PeftModel. Left commented out
# below in case you run into the same issue.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=dataset,
    processing_class=tokenizer,
    # peft_config=peft_config,
)

### Part 6: Training execution

6.1 Monitor GPU memory before training

In [55]:
# Optionally release cached blocks first:
# torch.cuda.empty_cache()

# Memory status of GPU 0, in GiB
allocated = torch.cuda.memory_allocated(0) / 1024**3
reserved = torch.cuda.memory_reserved(0) / 1024**3
free = torch.cuda.get_device_properties(0).total_memory / 1024**3 - reserved

print(f"Allocated: {allocated:.2f} GB\nReserved: {reserved:.2f} GB\nFree: {free:.2f} GB")

Allocated: 4.41 GB
Reserved: 26.16 GB
Free: 53.15 GB


6.2 Start training

In [56]:
# Train the model. The call returns a TrainOutput summary (global step, average
# training loss and runtime metrics), which is displayed as the cell result.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.
Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility.


Step,Training Loss
10,1.279801
20,0.999431
30,0.964201
40,0.949522
50,0.990408
60,0.985018
70,0.975307
80,0.964525
90,0.894629
100,0.959349


TrainOutput(global_step=9705, training_loss=0.7542305653762228, metrics={'train_runtime': 29914.8954, 'train_samples_per_second': 5.191, 'train_steps_per_second': 0.324, 'total_flos': 2.3318862384203366e+18, 'train_loss': 0.7542305653762228})

6.3 Save LoRA adapters and tokenizer

In [60]:
# Directory for the final artifacts; same path as the trainer's output_dir, and
# reused by the inference section to reload the adapters and tokenizer.
output_dir="./qlora-mistral"
trainer.save_model(output_dir)
# Another alternative
#trainer.model.save_pretrained(output_dir)
#tokenizer.save_pretrained(output_dir)

6.4 Download your folders/files (If running on Colab)

In [63]:
# In case you intend to keep your work or possibly upload it again.
from google.colab import files
import zipfile
import os

model_path = "./qlora-mistral"
zip_path = "qlora-mistral.zip"

print(f"Zipping to {zip_path}...")
!zip -r {zip_path} {model_path}

print(f"Download ready: {zip_path} (size: {os.path.getsize(zip_path) / 1024**2:.1f} MB)")
files.download(zip_path)

Zipping to qlora-mistral.zip...
  adding: qlora-mistral/ (stored 0%)
  adding: qlora-mistral/adapter_model.safetensors (deflated 22%)
  adding: qlora-mistral/adapter_config.json (deflated 58%)
  adding: qlora-mistral/tokenizer_config.json (deflated 47%)
  adding: qlora-mistral/checkpoint-9700/ (stored 0%)
  adding: qlora-mistral/checkpoint-9700/adapter_model.safetensors (deflated 22%)
  adding: qlora-mistral/checkpoint-9700/adapter_config.json (deflated 58%)
  adding: qlora-mistral/checkpoint-9700/tokenizer_config.json (deflated 47%)
  adding: qlora-mistral/checkpoint-9700/training_args.bin (deflated 53%)
  adding: qlora-mistral/checkpoint-9700/scheduler.pt (deflated 61%)
  adding: qlora-mistral/checkpoint-9700/optimizer.pt (deflated 13%)
  adding: qlora-mistral/checkpoint-9700/README.md (deflated 65%)
  adding: qlora-mistral/checkpoint-9700/trainer_state.json (deflated 80%)
  adding: qlora-mistral/checkpoint-9700/rng_state.pth (deflated 26%)
  adding: qlora-mistral/checkpoint-9700/tok

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Part 7: Inference

7.1 Reload model for inference

In [None]:
from peft import PeftModel

# Reload the quantized base model with the same settings used for training.
# trust_remote_code is deliberately NOT enabled: the checkpoint resolves to the
# built-in MistralForCausalLM class, so there is no reason to allow execution of
# code shipped with the Hub repository.
inference_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    dtype=torch_dtype,
    attn_implementation=attn_implementation,
)

# Attach the trained LoRA adapters and load the tokenizer saved next to them
inference_model = PeftModel.from_pretrained(inference_model, output_dir)
inference_tokenizer = AutoTokenizer.from_pretrained(output_dir)

7.2 Create an inference function

In [74]:
# Generate response from fine-tuned model
def generate(prompt, max_new_tokens=256, temperature=0.7):
    """Sample a response to `prompt` from the fine-tuned model.

    The prompt is wrapped in the same "### Instruction / ### Response" template
    used for training (the variant without an "### Input" section).

    Args:
        prompt: Instruction text to send to the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature (sampling is always enabled).

    Returns:
        The generated continuation only, decoded without special tokens and
        stripped of surrounding whitespace.
    """
    # Tokenize
    text = f"### Instruction:\n{prompt}\n\n### Response:\n"
    inputs = inference_tokenizer(text, return_tensors="pt").to(inference_model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # Generate. pad_token_id is passed explicitly (the EOS id, which generate()
    # would fall back to anyway) so the "Setting `pad_token_id`" warning is not emitted.
    outputs = inference_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
        pad_token_id=inference_tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. Splitting the full decoded text on
    # "### Response:" returns the wrong segment whenever the prompt or the
    # generated text contains that marker again.
    completion_ids = outputs[0][prompt_len:]
    return inference_tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Smoke-test the fine-tuned model with one prompt (sampling is enabled, so the text varies between runs).
print(generate("Explain machine learning in simple terms."))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


1. Data Preparation: Gather data from different sources and prepare it for analysis.
2. Exploratory Data Analysis (EDA): Use techniques such as visualization to understand the data and identify patterns.
3. Data Transformation: Transform the data into a format that is suitable for the model, such as normalization or dimensionality reduction.
4. Model Selection: Choose the appropriate machine learning algorithm for the data and the task at hand.
5. Model Training: Train the model using the prepared data and evaluate its


7.3 GPU memory cleanup

In [None]:
# Clean up GPU memory when done (if needed)
#del model, inference_model, trainer
#torch.cuda.empty_cache()