# -------------------------------------------------------------------------------------------------------------
# Project: Open Source Institute-Cognitive System of Machine Intelligent Computing (OpenSI-CoSMIC)
# Contributors:
#     Muntasir Adnan <adnan.adnan@canberra.edu.au>
# 
# Copyright (c) 2025 Open Source Institute
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without
# limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so, subject to the following
# conditions:
# 
# The above copyright notice and this permission notice shall be included in all copies or substantial
# portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
# LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# -------------------------------------------------------------------------------------------------------------

In [2]:
def print_title(section_num: int, description: str) -> None:
    """Print a section banner framed by two 80-character dashed rules.

    Args:
        section_num: Number shown after the word "Section".
        description: Title text shown after the colon.
    """
    rule = "-" * 80
    banner = (rule, f"Section {section_num}: {description}", rule)
    print("\n".join(banner))

# SECTION 1: VERIFY ENVIRONMENT

In [3]:
print_title(1, "Environment")

# Interpreter details: confirms which Python build/kernel the notebook runs on.
import sys
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")

# PyTorch build and CUDA visibility.
import torch
print(f"\nPyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")

if not cuda_ready:
    print("\nWARNING: CUDA not available! This tutorial requires a GPU.")
else:
    # Device index 0 is queried throughout; sizes are reported in units of 1e9 bytes.
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU device: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU memory: {gpu_props.total_memory / 1e9:.2f} GB")
    print(f"Current GPU memory allocated: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB")

--------------------------------------------------------------------------------
Section 1: Environment
--------------------------------------------------------------------------------
Python version: 3.10.19 (main, Oct 21 2025, 16:43:05) [GCC 11.2.0]
Python executable: /home/adnana/miniconda3/envs/llm/bin/python3.10

PyTorch version: 2.1.0+cu121
CUDA available: True
CUDA version: 12.1
GPU device: NVIDIA A100 80GB PCIe
GPU memory: 85.09 GB
Current GPU memory allocated: 0.00 GB


# SECTION 2: IMPORTS

In [4]:
print_title(2, "Importing Libraries")

import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import gc
import matplotlib.pyplot as plt
from datetime import datetime
from huggingface_hub import login
from jinja2 import Template

print("All imports successful!\n")

--------------------------------------------------------------------------------
Section 2: Importing Libraries
--------------------------------------------------------------------------------
All imports successful!



In [5]:
# Hugging Face Hub login.
# Load variables from the local ".env" file, read the "hf_token" entry and
# pass it to huggingface_hub.login().
load_dotenv(".env")
hf_token = os.getenv("hf_token")  # None when the variable is not defined
# NOTE(review): how login() behaves when hf_token is None is not shown here — confirm.
login(hf_token)

# SECTION 3: LOAD DATASET

In [6]:
print_title(3, "Dataset")

# Ten hand-written (question, answer) pairs; each pair is repeated 10 times
# below so the toy dataset has 100 rows.
qa_pairs = [
    ('Write a function to add two numbers',
     'def add_numbers(a, b):\n    return a + b'),
    ('Create a function to check if a number is even',
     'def is_even(n):\n    return n % 2 == 0'),
    ('Write a function to reverse a string',
     'def reverse_string(s):\n    return s[::-1]'),
    ('Create a function to find the maximum in a list',
     'def find_max(lst):\n    return max(lst)'),
    ('Write a function to calculate factorial',
     'def factorial(n):\n    if n <= 1:\n        return 1\n    return n * factorial(n-1)'),
    ('Create a function to check if a number is prime',
     'def is_prime(n):\n    if n < 2:\n        return False\n    for i in range(2, int(n**0.5) + 1):\n        if n % i == 0:\n            return False\n    return True'),
    ('Write a function to find the sum of a list',
     'def sum_list(lst):\n    return sum(lst)'),
    ('Create a function to sort a dictionary by values',
     'def sort_dict_by_value(d):\n    return dict(sorted(d.items(), key=lambda x: x[1]))'),
    ('Write a function to remove duplicates from a list',
     'def remove_duplicates(lst):\n    return list(set(lst))'),
    ('Create a function to check if a string is a palindrome',
     'def is_palindrome(s):\n    return s == s[::-1]'),
]

# Column 'q' holds the prompts and column 'a' the reference solutions.
synthetic_data = {
    'q': [question for question, _ in qa_pairs] * 10,
    'a': [answer for _, answer in qa_pairs] * 10,
}

df = pd.DataFrame(synthetic_data)

print(f"Dataset loaded: {len(df)} examples")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 3 examples:")
preview = df.head(3)
for idx, (question, answer) in enumerate(zip(preview["q"], preview["a"])):
    print(f"\n--- Example {idx+1} ---")
    print(f"Question: {question}")
    print(f"Answer: {answer}")

print("\nDataset loaded successfully!\n")

--------------------------------------------------------------------------------
Section 3: Dataset
--------------------------------------------------------------------------------
Dataset loaded: 100 examples
Columns: ['q', 'a']

First 3 examples:

--- Example 1 ---
Question: Write a function to add two numbers
Answer: def add_numbers(a, b):
    return a + b

--- Example 2 ---
Question: Create a function to check if a number is even
Answer: def is_even(n):
    return n % 2 == 0

--- Example 3 ---
Question: Write a function to reverse a string
Answer: def reverse_string(s):
    return s[::-1]

Dataset loaded successfully!



# SECTION 4: TOKENIZER LOADING

In [7]:
print_title(4, "Tokenizer Loading")

model_name = "Qwen/Qwen2.5-0.5B"
# model_name = "./qwen" # offline
# Load the tokenizer that matches the model checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# Padding is needed later when sequences are batched; fall back to the EOS
# token when the tokenizer does not define a PAD token of its own.
if tokenizer.pad_token is None:
    print(f"PAD token is not set by default for model: {model_name}")
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print("Tokenizer loaded!")
print(f"Vocabulary size: {len(tokenizer):,}")
# model_max_length reference:
# https://huggingface.co/transformers/v3.2.0/main_classes/tokenizer.html#:~:text=model_max_length%20(%20int%20%2C%20optional%20)%20%E2%80%93,inputs%20to%20the%20transformer%20model.
print(f"Model max length: {tokenizer.model_max_length}")
print(f"BOS token: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
print(f"EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
print(f"PAD token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")

--------------------------------------------------------------------------------
Section 4: Tokenizer Loading
--------------------------------------------------------------------------------
Tokenizer loaded!
Vocabulary size: 151,665
Model max length: 131072
BOS token: 'None' (ID: None)
EOS token: '<|endoftext|>' (ID: 151643)
PAD token: '<|endoftext|>' (ID: 151643)


In [8]:
tokenizer

Qwen2TokenizerFast(name_or_path='Qwen/Qwen2.5-0.5B', vocab_size=151643, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, nor

# SECTION 5: TOKENIZATION HANDS-ON

In [9]:
print_title(5, "Understanding Tokenization")


def _print_heading(text: str) -> None:
    """Print a blank line followed by *text* framed by two 80-dash rules."""
    rule = "-" * 80
    print("\n" + rule)
    print(text)
    print(rule)


example_question = "Write a function to calculate the sum of a list"
example_answer = "def sum_list(lst):\n    return sum(lst)"

print(f"Question: {example_question}")
print(f"Answer: {example_answer}")

# Question: the token strings and the vocabulary IDs they map to.
q_tokens = tokenizer.tokenize(example_question)
q_ids = tokenizer.encode(example_question, add_special_tokens=False)

_print_heading("QUESTION TOKENIZATION:")
print(f"Tokens: {q_tokens}")
print(f"Token IDs: {q_ids}")
print(f"Number of tokens: {len(q_ids)}")

# Answer: same inspection for the code snippet.
a_tokens = tokenizer.tokenize(example_answer)
a_ids = tokenizer.encode(example_answer, add_special_tokens=False)

_print_heading("ANSWER TOKENIZATION:")
print(f"Tokens: {a_tokens}")
print(f"Token IDs: {a_ids}")
print(f"Number of tokens: {len(a_ids)}")

# Byte-pair encoding (BPE) builds its vocabulary by merging frequently
# co-occurring byte sequences.
full_text = f"Question: {example_question}\nAnswer: {example_answer}"
full_encoding = tokenizer(full_text, return_tensors="pt", add_special_tokens=True)

_print_heading("FULL SEQUENCE WITH SPECIAL TOKENS:")
full_ids = full_encoding['input_ids']
print(f"Input IDs shape: {full_ids.shape}")
print(f"Input IDs: {full_ids[0].tolist()}")

# Round-trip the IDs back to text as a sanity check.
decoded = tokenizer.decode(full_ids[0])
print(f"\nDecoded text: {decoded}")

--------------------------------------------------------------------------------
Section 5: Understanding Tokenization
--------------------------------------------------------------------------------
Question: Write a function to calculate the sum of a list
Answer: def sum_list(lst):
    return sum(lst)

--------------------------------------------------------------------------------
QUESTION TOKENIZATION:
--------------------------------------------------------------------------------
Tokens: ['Write', 'Ġa', 'Ġfunction', 'Ġto', 'Ġcalculate', 'Ġthe', 'Ġsum', 'Ġof', 'Ġa', 'Ġlist']
Token IDs: [7985, 264, 729, 311, 11047, 279, 2629, 315, 264, 1140]
Number of tokens: 10

--------------------------------------------------------------------------------
ANSWER TOKENIZATION:
--------------------------------------------------------------------------------
Tokens: ['def', 'Ġsum', '_list', '(lst', '):Ċ', 'ĠĠĠ', 'Ġreturn', 'Ġsum', '(lst', ')']
Token IDs: [750, 2629, 2019, 46046, 982, 262, 470, 262

# SECTION 6: QUANTIZATION CONFIGURATION

In [10]:
print_title(6, "Quantization Configuration")

# Build the bitsandbytes settings object; it is handed to the model loader
# through its `quantization_config` argument.
print("Configuring 4-bit quantization...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Enable 4-bit loading
    bnb_4bit_quant_type="nf4",              # Use NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute in bfloat16 for stability
    bnb_4bit_use_double_quant=True,         # Double quantization for more memory savings
)

--------------------------------------------------------------------------------
Section 6: Quantization Configuration
--------------------------------------------------------------------------------
Configuring 4-bit quantization...


# SECTION 7: MODEL

In [11]:
# !wget https://huggingface.co/Qwen/Qwen2.5-0.5B/resolve/main/model.safetensors?download=true .

In [12]:
# Known issue: this cell does not work with ipykernel 7.0.1 (model loading
# gets stuck); downgrading to ipykernel 6.30.1 works. See:
# https://discuss.huggingface.co/t/model-loading-gets-stuck-when-calling-from-pretrained/112807/11

print_title(7, "Model Loading")
# Load the causal LM for `model_name`, applying the 4-bit settings in
# `bnb_config`. Files are cached under ./qwen_cache.
# NOTE(review): device_map="auto" presumably lets the library choose device
# placement — confirm against the transformers/accelerate docs.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    cache_dir="./qwen_cache"
)

# Shards would divide the tensor files into multiple files.

# Offline alternative (kept for reference): same call, but restricted to
# files that are already available locally.
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
#     trust_remote_code=True,
#     torch_dtype=torch.bfloat16,
#     local_files_only=True  # Add this to prevent any download attempts
# )

# print("\nModel loaded successfully!")

--------------------------------------------------------------------------------
Section 7: Model Loading
--------------------------------------------------------------------------------


In [13]:
print("\n" + "-" * 80)
print("MODEL ARCHITECTURE OVERVIEW")
print("-" * 80)
print(model)


--------------------------------------------------------------------------------
MODEL ARCHITECTURE OVERVIEW
--------------------------------------------------------------------------------
Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=896, out_features=896, bias=True)
          (k_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (v_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (o_proj): Linear4bit(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear4bit(in_features=4864, out_features=896, b

## Understanding the Model Architecture  
1. TOP LEVEL STRUCTURE:  
   • Qwen2ForCausalLM - The complete model for causal language modelling. Predicts the next token autoregressively (one at a time) given previous tokens: P(x_n | x_<n)  
   • model - The main transformer model (decoder only)  
   • lm_head - Final linear layer that maps the last hidden states to vocabulary logits (raw, unnormalized scores that softmax turns into probabilities).  
  
2. EMBEDDING LAYER:  
   • embed_tokens: Embedding(151936, 896)  
   • 151936 = vocabulary size (number of unique tokens)  
   • 896 = embedding dimension (d_model)  
   • Converts token IDs → dense vectors  
  
3. TRANSFORMER LAYERS:  
   • layers: ModuleList (0-23) = 24 decoder layers  
   • Each layer has the SAME structure, stacked 24 times  
  
4. FOR DECODER_LAYER in "TRANSFORMER LAYERS":  
  
   A. SELF-ATTENTION (Qwen2Attention):  
      • q_proj: Query projection [896 → 896] "what am I looking for?" (learnt)  
      • k_proj: Key projection [896 → 128] "what do I contain?" (learnt)  
      • v_proj: Value projection [896 → 128] "what information do I carry?" (learnt)  
      • o_proj: Output projection [896 → 896] Combines attention outputs back to model dimension (concat).
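   k_proj and v_proj are smaller (128) because there are fewer key/value heads than query heads, and each key/value head is shared by a group of query heads - this is called Grouped-Query Attention (GQA). Multi-Query Attention (MQA) is the special case where all query heads share a single key/value head.  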
      • rotary_emb: Rotary Position Embedding (RoPE): Encodes position information directly in the attention mechanism  

   B. FEED-FORWARD NETWORK (Qwen2MLP):  
      • gate_proj: [896 → 4864] - "Gating" pathway  
      • up_proj: [896 → 4864] - "Up" projection  
      • down_proj: [4864 → 896] - "Down" projection back to d_model  
      • act_fn: SiLU() - Activation function (Sigmoid Linear Unit)  

   C. LAYER NORMALIZATION:  
      • input_layernorm: Qwen2RMSNorm: Applied BEFORE self-attention (Pre-LN)  
      • post_attention_layernorm: Qwen2RMSNorm: Applied BEFORE FFN  
  
6. FINAL LAYERS:
   • norm: Qwen2RMSNorm - Final normalization before output  
   • lm_head: Linear [896 → 151936]  
     - Projects back to vocabulary size  
     - Each position gets a score for every possible token (logit)  

7. THE "Linear4bit":  
   • These are quantized layers (from bitsandbytes)  
   • Store weights in 4-bit instead of 16-bit  
   • Normal: Linear(...)  
   • Quantized: Linear4bit(...)  
  
8. INFORMATION FLOW:  
   Token IDs > Embeddings > Decoder Layer 1 > Layer 2 > ... > Layer 24 > Norm > LM Head > Logits  
   
9. WHERE LoRA WILL BE APPLIED:  
   When we add LoRA adapters, we'll target:  
   • q_proj, k_proj, v_proj (attention projections)  
   • o_proj (attention output)  
   • gate_proj, up_proj, down_proj (FFN layers)  

In [14]:
# Sum the element counts of every parameter tensor exposed by the model.
# NOTE(review): with 4-bit quantized layers this count may reflect packed
# storage rather than the logical number of weights — confirm.
total_params = sum(param.numel() for param in model.parameters())
print(f"\nTotal parameters: {total_params:,} ({total_params/1e9:.2f}B)")

# Memory footprint
memory_footprint = model.get_memory_footprint() / 1e9
print(f"Model memory footprint: {memory_footprint:.2f} GB")

# `available_gpu` is derived from the device's total_memory (device index 0),
# i.e. the card's capacity, in units of 1e9 bytes.
gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
available_gpu = round(gpu_total_bytes / 1e9, 4)
print(f"Available GPU memory: {available_gpu} GB")

usage_pct = memory_footprint / available_gpu * 100
print(f"Memory usage: {usage_pct:.1f}% of {available_gpu}GB")


Total parameters: 315,119,488 (0.32B)
Model memory footprint: 0.45 GB
Available GPU memory: 85.0948 GB
Memory usage: 0.5% of 85.0948GB


# SECTION 7: DATASET FORMATTING

In [15]:
print_title(7, "Dataset Formatting")

# chat template
def load_chat_template() -> None:
    """Make sure the notebook-level ``tokenizer`` has a chat template.

    If ``tokenizer.chat_template`` is ``None``, the contents of
    ``chat_template.jinja2`` (resolved against the current working directory)
    are installed as the template. A tokenizer that already carries a
    template is left untouched. A status line is printed in both cases.

    Raises:
        OSError: If the fallback template file cannot be opened.
    """
    if tokenizer.chat_template is None:
        print("Chat template does not exist, loading default chat template.")
        # Decode explicitly as UTF-8 so the template reads the same on every
        # platform instead of depending on the locale's default encoding.
        with open("chat_template.jinja2", "r", encoding="utf-8") as f:
            template_string = f.read()
        tokenizer.chat_template = template_string
    else:
        print("Tokenizer has built-in chat template!")


load_chat_template()

--------------------------------------------------------------------------------
Section 7: Dataset Formatting
--------------------------------------------------------------------------------
Tokenizer has built-in chat template!


In [18]:
print(f"\nTemplate preview:")
print(tokenizer.chat_template)


Template preview:
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
    {%- en

In [30]:
# Automatic, industry standard, portable, flexible
# Automatic, industry standard, portable, flexible
def format_with_chat_template(
    question: str,
    answer: str | None = None,
    tokenize: bool = False,
    add_generation_prompt: bool = False,
) -> str | list[int]:
    """Render a question (and optionally its answer) with the tokenizer's chat template.

    Demonstration helper for single question/answer pairs; the training
    pipeline uses a separate formatting function that takes a dataset row.

    Args:
        question: Content of the user turn.
        answer: Content of the assistant turn. Pass ``None`` to omit the
            assistant turn, e.g. when building a prompt for inference
            together with ``add_generation_prompt=True``.
        tokenize: Forwarded to ``tokenizer.apply_chat_template``. ``False``
            yields the rendered string, ``True`` yields token IDs.
        add_generation_prompt: Forwarded to ``tokenizer.apply_chat_template``.
            ``True`` appends the header that opens a new assistant turn.

    Returns:
        The rendered conversation: a string when ``tokenize`` is ``False``,
        otherwise the token IDs produced by the tokenizer.
    """
    messages = [
        {"role": "system", "content": "You are a helpful Python coding assistant."},
        {"role": "user", "content": question},
    ]
    # Without an answer the conversation ends on the user turn, so a
    # generation prompt leaves the assistant turn open for the model.
    if answer is not None:
        messages.append({"role": "assistant", "content": answer})

    return tokenizer.apply_chat_template(
        messages,
        tokenize=tokenize,
        add_generation_prompt=add_generation_prompt,
    )

In [20]:
# formatted example
print(format_with_chat_template(df.iloc[0]["q"], df.iloc[0]["a"]))

<|im_start|>system
You are a helpful Python coding assistant.<|im_end|>
<|im_start|>user
Write a function to add two numbers<|im_end|>
<|im_start|>assistant
def add_numbers(a, b):
    return a + b<|im_end|>



In [21]:
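# Formatted example with the generation prompt appended.
# NOTE: at inference time the assistant answer would be left out, so the prompt
# ends with the open `<|im_start|>assistant` turn for the model to complete.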
print(format_with_chat_template(df.iloc[0]["q"], df.iloc[0]["a"], add_generation_prompt=True))

<|im_start|>system
You are a helpful Python coding assistant.<|im_end|>
<|im_start|>user
Write a function to add two numbers<|im_end|>
<|im_start|>assistant
def add_numbers(a, b):
    return a + b<|im_end|>
<|im_start|>assistant



In [25]:
# BOS token: 'None' (ID: None)
# EOS token: '<|endoftext|>' (ID: 151643)
# PAD token: '<|endoftext|>' (ID: 151643)
# ========================================

# Note: Qwen's default chat template delimits turns with <|im_start|> / <|im_end|>
# rather than with the tokenizer's configured EOS/PAD token (<|endoftext|>).
print(tokenizer.decode(151644), tokenizer.decode(151645))

<|im_start|> <|im_end|>


In [22]:
# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df)

print(f"\nDataset ready: {len(dataset)} formatted examples")


Dataset ready: 100 formatted examples


In [23]:
dataset[:5]

{'q': ['Write a function to add two numbers',
  'Create a function to check if a number is even',
  'Write a function to reverse a string',
  'Create a function to find the maximum in a list',
  'Write a function to calculate factorial'],
 'a': ['def add_numbers(a, b):\n    return a + b',
  'def is_even(n):\n    return n % 2 == 0',
  'def reverse_string(s):\n    return s[::-1]',
  'def find_max(lst):\n    return max(lst)',
  'def factorial(n):\n    if n <= 1:\n        return 1\n    return n * factorial(n-1)']}

# SECTION 8: MASKING

In [47]:
print_title(8, "Masking")

# SFT trainer takes care of this automatically, using roles from the chat template but we will demonstrate the process here.
# https://huggingface.co/docs/trl/sft_trainer

def formatting_func(example: dict) -> str:
    """Render one dataset row as a chat-formatted string.

    Args:
        example: Row holding the question under ``"q"`` and the reference
            answer under ``"a"``.

    Returns:
        The system/user/assistant conversation rendered as text through the
        tokenizer's chat template, with no generation prompt appended.
    """
    system_turn = {"role": "system", "content": "You are a helpful Python coding assistant."}
    user_turn = {"role": "user", "content": example["q"]}
    assistant_turn = {"role": "assistant", "content": example["a"]}

    return tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn],
        tokenize=False,
        add_generation_prompt=False,
    )

example_text = formatting_func(dataset[0])
print("\n--- FORMATTED EXAMPLE (String) ---")
print(example_text)
print("-" * 80)

--------------------------------------------------------------------------------
Section 8: Masking
--------------------------------------------------------------------------------

--- FORMATTED EXAMPLE (String) ---
<|im_start|>system
You are a helpful Python coding assistant.<|im_end|>
<|im_start|>user
Write a function to add two numbers<|im_end|>
<|im_start|>assistant
def add_numbers(a, b):
    return a + b<|im_end|>

--------------------------------------------------------------------------------


## Labels are used to compute the loss: tokens labelled -100 are exempted by SFTTrainer when the loss is calculated, so only the assistant's response is learned. This is called training on completions.

In [48]:
# The "signpost" is the marker that tells the SFT trainer where the assistant
# response starts. It is recovered as the difference between the same
# conversation rendered with and without add_generation_prompt=True.

dummy_history = [{"role": "user", "content": "test"}]


def _render_dummy(with_prompt: bool) -> str:
    """Render ``dummy_history`` as text, optionally with the generation prompt."""
    return tokenizer.apply_chat_template(
        dummy_history,
        tokenize=False,
        add_generation_prompt=with_prompt,
    )


templated_without_prompt = _render_dummy(False)
print("Without target signpost")
print("-"*50)
print(templated_without_prompt)

templated_with_prompt = _render_dummy(True)
print("With target signpost")
print("-"*50)
print(templated_with_prompt)

# Whatever follows the shared prefix is the text the generation prompt added.
prefix_len = len(templated_without_prompt)
signpost_text = templated_with_prompt[prefix_len:]
print(f"Signpost text: {signpost_text}")
signpost_id = tokenizer(signpost_text, return_tensors="pt", add_special_tokens=False)
print(f"Signpost id: {signpost_id}")

# Decode the IDs again to check they reproduce the signpost text.
signpost_token_ids = signpost_id["input_ids"][0].tolist()
decoded_signpost = tokenizer.decode(signpost_token_ids)
print(f"Decoded signpost: {decoded_signpost}")

Without target signpost
--------------------------------------------------
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
test<|im_end|>

With target signpost
--------------------------------------------------
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
test<|im_end|>
<|im_start|>assistant

Signpost text: <|im_start|>assistant

Signpost id: {'input_ids': tensor([[151644,  77091,    198]]), 'attention_mask': tensor([[1, 1, 1]])}
Decoded signpost: <|im_start|>assistant



In [52]:
from trl import DataCollatorForCompletionOnlyLM

# The collator receives the signpost token IDs as its response template.
response_template_ids = signpost_id["input_ids"][0].tolist()
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_template_ids,
    tokenizer=tokenizer,
)

tokenized_example = tokenizer(example_text, return_tensors="pt")

# A batch of one: strip the leading batch dimension from each tensor.
batch = [
    {key: tokenized_example[key][0] for key in ("input_ids", "attention_mask")}
]

print("Applying the DataCollator...")
processed_batch = collator(batch)

# print(processed_batch)

print("\n--- MASKING DEMONSTRATION ---")

# One table row per token: the token text, its ID, the label the collator
# assigned, and the attention-mask value.
first_ids = processed_batch["input_ids"][0]
first_labels = processed_batch["labels"][0]
first_mask = processed_batch["attention_mask"][0]

df_comparison = pd.DataFrame({
    "Token": tokenizer.convert_ids_to_tokens(first_ids),
    "Input ID": first_ids.tolist(),
    "Label": first_labels.tolist(),
    "Attention Mask": first_mask.tolist()
})

print(df_comparison.to_string())

Applying the DataCollator...

--- MASKING DEMONSTRATION ---
           Token  Input ID   Label  Attention Mask
0   <|im_start|>    151644    -100               1
1         system      8948    -100               1
2              Ċ       198    -100               1
3            You      2610    -100               1
4           Ġare       525    -100               1
5             Ġa       264    -100               1
6       Ġhelpful     10950    -100               1
7        ĠPython     13027    -100               1
8        Ġcoding     10822    -100               1
9     Ġassistant     17847    -100               1
10             .        13    -100               1
11    <|im_end|>    151645    -100               1
12             Ċ       198    -100               1
13  <|im_start|>    151644    -100               1
14          user       872    -100               1
15             Ċ       198    -100               1
16         Write      7985    -100               1
17            Ġa      

# SECTION 9: ATTENTION MASK

## The 'attention_mask' tells the model what to LOOK at. This is used to handle 'padding', which is added when we batch sequences of different lengths so they all have the same length.

In [55]:
print_title(9, "Attention Mask")

# Two prompts of different length, so batching them later requires padding.
text1 = "Write a function to add two numbers"
text2 = "Write a function to check if a string is a palindrome. It should be efficient."

for example_no, text in enumerate((text1, text2), start=1):
    print(f"\n--- Example {example_no} ---\n{text}")
    print(f"Length: {len(tokenizer.encode(text))} tokens")

--------------------------------------------------------------------------------
Section 9: Attention Mask
--------------------------------------------------------------------------------

--- Example 1 ---
Write a function to add two numbers
Length: 7 tokens

--- Example 2 ---
Write a function to check if a string is a palindrome. It should be efficient.
Length: 17 tokens


In [60]:
# Tokenize both prompts together. With padding="longest" the shorter prompt
# is padded to the length of the longer one, and the result is returned as
# PyTorch tensors.
batch_inputs = tokenizer(
    [text1, text2], 
    padding="longest", # padding will be added because of this
    return_tensors="pt"
)
# PAD token: '<|endoftext|>' (ID: 151643)
# Bare expression: the notebook displays the resulting input_ids and attention_mask.
batch_inputs

{'input_ids': tensor([[  7985,    264,    729,    311,    912,   1378,   5109, 151643, 151643,
         151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643],
        [  7985,    264,    729,    311,   1779,    421,    264,    914,    374,
            264,  73097,     13,   1084,   1265,    387,  11050,     13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [64]:
print(f"PAD Token ID: {tokenizer.pad_token_id}")

# Decode each row of the padded batch so the appended PAD tokens are visible.
print("\nDecoded Input IDs:")
for row in (0, 1):
    print(f"Example {row + 1}:", tokenizer.decode(batch_inputs["input_ids"][row]))
print("-" * 80)

# ---------------------------------------------------------------------------
# Show the attention_mask (rows line up with the rows decoded above)
# ---------------------------------------------------------------------------
print("\nAttention Mask:")
print(batch_inputs["attention_mask"])

PAD Token ID: 151643

Decoded Input IDs:
Example 1: Write a function to add two numbers<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>
Example 2: Write a function to check if a string is a palindrome. It should be efficient.
--------------------------------------------------------------------------------

Attention Mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
