In [1]:
# -------------------------------------------------------------------------------------------------------------
# Project: Open Source Institute-Cognitive System of Machine Intelligent Computing (OpenSI-CoSMIC)
# Contributors:
#     Muntasir Adnan <adnan.adnan@canberra.edu.au>
# 
# Copyright (c) 2025 Open Source Institute
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without
# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
# conditions:
# 
# The above copyright notice and this permission notice shall be included in all copies or substantial
# portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# -------------------------------------------------------------------------------------------------------------

In [2]:
def print_title(section_num: int, description: str) ->  None:
    print("-"*80)
    print(f"Section {section_num}: {description}")
    print("-"*80)

# SECTION 1: VERIFY ENVIRONMENT

In [3]:
print_title(1, "Environment")

import sys
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")

import torch
print(f"\nPyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"Current GPU memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")
else:
    print("\nWARNING: CUDA not available! This tutorial requires a GPU.")

--------------------------------------------------------------------------------
Section 1: Environment
--------------------------------------------------------------------------------
Python version: 3.10.19 (main, Oct 21 2025, 16:43:05) [GCC 11.2.0]
Python executable: /home/adnana/miniconda3/envs/llm/bin/python3.10

PyTorch version: 2.1.0+cu121
CUDA available: True
CUDA version: 12.1
GPU device: NVIDIA A100 80GB PCIe
GPU memory: 85.09 GB
Current GPU memory allocated: 0.00 GB


# SECTION 2: IMPORTS

In [4]:
print_title(2, "Importing Libraries")

import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import gc
import matplotlib.pyplot as plt
from datetime import datetime

print("All imports successful!\n")

--------------------------------------------------------------------------------
Section 2: Importing Libraries
--------------------------------------------------------------------------------
All imports successful!



# SECTION 3: LOAD DATASET

In [5]:
print_title(3, "Dataset")

synthetic_data = {
    'q': [
        'Write a function to add two numbers',
        'Create a function to check if a number is even',
        'Write a function to reverse a string',
        'Create a function to find the maximum in a list',
        'Write a function to calculate factorial',
        'Create a function to check if a number is prime',
        'Write a function to find the sum of a list',
        'Create a function to sort a dictionary by values',
        'Write a function to remove duplicates from a list',
        'Create a function to check if a string is a palindrome',
    ] * 10,
    'a': [
        'def add_numbers(a, b):\n    return a + b',
        'def is_even(n):\n    return n % 2 == 0',
        'def reverse_string(s):\n    return s[::-1]',
        'def find_max(lst):\n    return max(lst)',
        'def factorial(n):\n    if n <= 1:\n        return 1\n    return n * factorial(n-1)',
        'def is_prime(n):\n    if n < 2:\n        return False\n    for i in range(2, int(n**0.5) + 1):\n        if n % i == 0:\n            return False\n    return True',
        'def sum_list(lst):\n    return sum(lst)',
        'def sort_dict_by_value(d):\n    return dict(sorted(d.items(), key=lambda x: x[1]))',
        'def remove_duplicates(lst):\n    return list(set(lst))',
        'def is_palindrome(s):\n    return s == s[::-1]',
    ] * 10
}

df = pd.DataFrame(synthetic_data)

print(f"Dataset loaded: {len(df)} examples")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 3 examples:")
for idx in range(3):
    print(f"\n--- Example {idx+1} ---")
    print(f"Question: {df.iloc[idx]['q']}")
    print(f"Answer: {df.iloc[idx]['a']}")

print("\nDataset loaded successfully!\n")

--------------------------------------------------------------------------------
Section 3: Dataset
--------------------------------------------------------------------------------
Dataset loaded: 100 examples
Columns: ['q', 'a']

First 3 examples:

--- Example 1 ---
Question: Write a function to add two numbers
Answer: def add_numbers(a, b):
    return a + b

--- Example 2 ---
Question: Create a function to check if a number is even
Answer: def is_even(n):
    return n % 2 == 0

--- Example 3 ---
Question: Write a function to reverse a string
Answer: def reverse_string(s):
    return s[::-1]

Dataset loaded successfully!



# SECTION 4: TOKENIZER LOADING

In [14]:
print_title(4, "Tokenizer Loading")

model_name = "./qwen"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code=True
)

if tokenizer.pad_token is None:
    print(f"PAD token is not dest by default for model: {model_name}")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print("Tokenizer loaded!")
print(f"Vocabulary size: {len(tokenizer):,}")
# https://huggingface.co/transformers/v3.2.0/main_classes/tokenizer.html#:~:text=model_max_length%20(%20int%20%2C%20optional%20)%20%E2%80%93,inputs%20to%20the%20transformer%20model.
print(f"Model max length: {tokenizer.model_max_length}")
print(f"BOS token: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
print(f"EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
print(f"PAD token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")

--------------------------------------------------------------------------------
Section 4: Tokenizer Loading
--------------------------------------------------------------------------------
Tokenizer loaded!
Vocabulary size: 151,665
Model max length: 131072
BOS token: 'None' (ID: None)
EOS token: '<|endoftext|>' (ID: 151643)
PAD token: '<|endoftext|>' (ID: 151643)


In [15]:
tokenizer

Qwen2TokenizerFast(name_or_path='./qwen', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=Fal

# SECTION 5: QUANTIZATION CONFIGURATION

In [8]:
print_title(5, "Quantization Configuration")

print("Configuring 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Enable 4-bit loading
    bnb_4bit_quant_type="nf4",              # Use NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute in bfloat16 for stability
    bnb_4bit_use_double_quant=True,         # Double quantization for more memory savings
)

--------------------------------------------------------------------------------
Section 5: Quantization Configuration
--------------------------------------------------------------------------------
Configuring 4-bit quantization...


# SECTION 6: MODEL

In [9]:
# !wget https://huggingface.co/Qwen/Qwen2.5-0.5B/resolve/main/model.safetensors?download=true .

In [12]:
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
#     trust_remote_code=True,
#     torch_dtype=torch.bfloat16,
#     cache_dir="./qwen"
# )

# Shards would divide the tensor files into multiple files.

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    local_files_only=True  # Add this to prevent any download attempts
)

print("\nModel loaded successfully!")


Model loaded successfully!


In [20]:
print("\n" + "-" * 80)
print("MODEL ARCHITECTURE OVERVIEW")
print("-" * 80)
print(model)


--------------------------------------------------------------------------------
MODEL ARCHITECTURE OVERVIEW
--------------------------------------------------------------------------------
Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=896, out_features=896, bias=True)
          (k_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (v_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (o_proj): Linear4bit(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear4bit(in_features=4864, out_features=896, b

1. TOP LEVEL STRUCTURE:  
   • Qwen2ForCausalLM - The complete model for causal language modelling. Predicts the next token autoregressively (one at a time) given previous tokens: P(x_n | x_<n)  
   • model - The main transformer model (decoder only)  
   • lm_head - Final linear layer that produces vocabulary logits, raw probability score from the hidden layers.  
  
2. EMBEDDING LAYER:  
   • embed_tokens: Embedding(151936, 896)  
   • 151936 = vocabulary size (number of unique tokens)  
   • 896 = embedding dimension (d_model)  
   • Converts token IDs → dense vectors  
  
3. TRANSFORMER LAYERS:  
   • layers: ModuleList (0-23) = 24 decoder layers  
   • Each layer has the SAME structure, stacked 24 times  
  
4. FOR DECODER_LAYER in "TRANSFORMER LAYERS":  
  
   A. SELF-ATTENTION (Qwen2Attention):  
      • q_proj: Query projection [896 → 896] "what am I looking for?" (learnt)  
      • k_proj: Key projection [896 → 128] "what do I contain?" (learnt)  
      • v_proj: Value projection [896 → 128] "what information do I carry?" (learnt)  
      • o_proj: Output projection [896 → 896] Combines attention outputs back to model dimension (concat).
   k_proj and v_proj are smaller (128) - this is called Multi-Query Attention (MQA) or Grouped-Query Attention (GQA). (shared k and v)  
      • rotary_emb: Rotary Position Embedding (RoPE): Encodes position information directly in the attention mechanism  

   B. FEED-FORWARD NETWORK (Qwen2MLP):
      • gate_proj: [896 → 4864] - "Gating" pathway  
      • up_proj: [896 → 4864] - "Up" projection  
      • down_proj: [4864 → 896] - "Down" projection back to d_model  
      • act_fn: SiLU() - Activation function (Sigmoid Linear Unit)  

   C. LAYER NORMALIZATION:
      • input_layernorm: Qwen2RMSNorm: Applied BEFORE self-attention (Pre-LN)  
      • post_attention_layernorm: Qwen2RMSNorm: Applied BEFORE FFN  
  
6. FINAL LAYERS:
   • norm: Qwen2RMSNorm - Final normalization before output  
   • lm_head: Linear [896 → 151936]  
     - Projects back to vocabulary size  
     - Each position gets a score for every possible token (logit)  

7. THE "Linear4bit":  
   • These are quantized layers (from bitsandbytes)  
   • Store weights in 4-bit instead of 16-bit  
   • Normal: Linear(...)  
   • Quantized: Linear4bit(...)  
  
8. INFORMATION FLOW:  
   Token IDs > Embeddings > Decoder Layer 1 > Layer 2 > ... > Layer 24 > Norm > LM Head > Logits  
   
9. WHERE LoRA WILL BE APPLIED:  
   When we add LoRA adapters, we'll target:  
   • q_proj, k_proj, v_proj (attention projections)  
   • o_proj (attention output)  
   • gate_proj, up_proj, down_proj (FFN layers)  

In [18]:
total_params = sum(p.numel() for p in model.parameters())
print(f"\nTotal parameters: {total_params:,} ({total_params/1e9:.2f}B)")

# Memory footprint
memory_footprint = model.get_memory_footprint() / 1e9
print(f"Model memory footprint: {memory_footprint:.2f} GB")
available_gpu = round(torch.cuda.get_device_properties(0).total_memory / 1e9, 4)
print(f"Available GPU memory: {available_gpu} GB")
print(f"Memory usage: {memory_footprint / available_gpu * 100:.1f}% of {available_gpu}GB")


Total parameters: 315,119,488 (0.32B)
Model memory footprint: 0.45 GB
Available GPU memory: 85.0948 GB
Memory usage: 0.5% of 85.0948GB


# SECTION 7: TOKENIZATION HANDS-ON

In [22]:
print_title(7, "Understanding Tokenization")

example_question = "Write a function to calculate the sum of a list"
example_answer = "def sum_list(lst):\n    return sum(lst)"

print(f"Question: {example_question}")
print(f"Answer: {example_answer}")

q_tokens = tokenizer.tokenize(example_question)
q_ids = tokenizer.encode(example_question, add_special_tokens=False)

print("\n" + "-" * 80)
print("QUESTION TOKENIZATION:")
print("-" * 80)
print(f"Tokens: {q_tokens}")
print(f"Token IDs: {q_ids}")
print(f"Number of tokens: {len(q_ids)}")

a_tokens = tokenizer.tokenize(example_answer)
a_ids = tokenizer.encode(example_answer, add_special_tokens=False)

print("\n" + "-" * 80)
print("ANSWER TOKENIZATION:")
print("-" * 80)
print(f"Tokens: {a_tokens}")
print(f"Token IDs: {a_ids}")
print(f"Number of tokens: {len(a_ids)}")

# bytre pair encoding: BPE builds its vocabulary by merging frequently co-occurring byte sequences.
full_text = f"Question: {example_question}\nAnswer: {example_answer}"
full_encoding = tokenizer(full_text, return_tensors="pt", add_special_tokens=True)

print("\n" + "-" * 80)
print("FULL SEQUENCE WITH SPECIAL TOKENS:")
print("-" * 80)
print(f"Input IDs shape: {full_encoding['input_ids'].shape}")
print(f"Input IDs: {full_encoding['input_ids'][0].tolist()}")

decoded = tokenizer.decode(full_encoding['input_ids'][0])
print(f"\nDecoded text: {decoded}")

--------------------------------------------------------------------------------
Section 7: Understanding Tokenization
--------------------------------------------------------------------------------
Question: Write a function to calculate the sum of a list
Answer: def sum_list(lst):
    return sum(lst)

--------------------------------------------------------------------------------
QUESTION TOKENIZATION:
--------------------------------------------------------------------------------
Tokens: ['Write', 'Ġa', 'Ġfunction', 'Ġto', 'Ġcalculate', 'Ġthe', 'Ġsum', 'Ġof', 'Ġa', 'Ġlist']
Token IDs: [7985, 264, 729, 311, 11047, 279, 2629, 315, 264, 1140]
Number of tokens: 10

--------------------------------------------------------------------------------
ANSWER TOKENIZATION:
--------------------------------------------------------------------------------
Tokens: ['def', 'Ġsum', '_list', '(lst', '):Ċ', 'ĠĠĠ', 'Ġreturn', 'Ġsum', '(lst', ')']
Token IDs: [750, 2629, 2019, 46046, 982, 262, 470, 262