Learning finetuning

In [3]:
import torch
import gc
import time
from transformers import AutoModelForCausalLM, AutoTokenizer # using hugging face transformers library
from torch.utils.data import Dataset, DataLoader
from IPython.display import clear_output




In [6]:
print(torch.cuda.memory_allocated())
print(torch.cuda.memory_reserved())
print(torch.cuda.is_available())


0
0
True


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def print_gpu_utilization():
    """Prints the current GPU memory usage (allocated and reserved)."""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / (1024 ** 2)
        reserved = torch.cuda.memory_reserved() / (1024 ** 2)
        print(f"GPU Memory Usage: Allocated: {allocated:.2f} MB || Reserved: {reserved:.2f} MB")
    else:
        print("CUDA not available. Running on CPU.")

def flush_memory():
    """Flushes unused memory from GPU and runs garbage collection."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [11]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

- Download a 1.2 billion parameter llama 3 instruction tuned model from hugging face.
- It's a casual langauge model.
- Brain float 16 talks about the precision, half precision in this case.Reduces memory use and increases speed without losing much of the accuracy.


Model is cached to ~/.cache/huggingface/hub/

In [12]:
DEFAULT_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    device_map=device,
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [13]:
print(model.get_memory_footprint()/(1024*1024))

2357.1290283203125


In [14]:
print(model.hf_device_map) # you can split layers across devcies
print(model.dtype)
print_gpu_utilization()

{'': device(type='cuda')}
torch.bfloat16
GPU Memory Usage: Allocated: 2357.13 MB || Reserved: 2862.00 MB


In [15]:
print(model)  # architecture
print(model.config)  # model hyperparameters

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [17]:
print(tokenizer.vocab_size)

128000


In [18]:
model.device

device(type='cuda', index=0)

In [19]:
prompt = "Explain how transformers work in simple terms:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [21]:
inputs

{'input_ids': tensor([[128000,    849,  21435,   1268,  87970,    990,    304,   4382,   3878,
             25]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [22]:
outputs

tensor([[128000,    849,  21435,   1268,  87970,    990,    304,   4382,   3878,
             25,  38891,    264,   1841,   4817,     11,    719,   4619,    315,
           3339,    433,    733,     11,    433,   3727,    279,   1841,    330,
          36341,      1,    311,    279,   4994,   1917,    382,   4897,    596,
          17715,    279,   4623,   4920,    264,  43678,     13,    362,  43678,
            374,    264,   3756,    430,   5829,  66669,   8603,    311,   2349,
            279,   5216,    315,  20314,   1510,     13,   5810,    596,   1268,
            433,   4375,   1473,    334,    791,  14967,  82086,  57277,     32,
          43678,  17610,    315,   1403,  71860,    315,   9244,     11,   2663,
           6156,    323,  14580,  71860,     13,   3277,    459,  73462,   1510,
            320,   1741,      8,  28555,   1555,    279,   6156,  40760,     11,
            433,  90974,    264,  22465,    304,    279,  14580,  40760,     13,
           1115,    374]], d

In [23]:
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)


Explain how transformers work in simple terms: Imagine a car engine, but instead of making it go, it makes the car "talk" to the outside world.

That's roughly the idea behind a transformer. A transformer is a device that uses electromagnetic forces to change the direction of electrical current. Here's how it works:

**The Basic Principle**

A transformer consists of two coils of wire, called primary and secondary coils. When an alternating current (AC) flows through the primary coil, it induces a voltage in the secondary coil. This is


In [24]:
messages = [
    {"role": "user", "content": "Who is the author of Circe?"}
]

In [25]:
tokenized_text = tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)
print(tokenized_text)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 13 Jul 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Who is the author of Circe?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




absolute start of a conversaiton, it helps reset the context. the beginning and end of a role header, and what was said. Then you have an end of turn id.
And you notice the beginning and end role header for assitant because of add_generation prompt

In [26]:
from transformers import pipeline

llama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# generate upto 20 tokens and stop early if <eos> detected
generated_text = llama_pipeline(tokenized_text, max_new_tokens=20, early_stopping=True)

print(generated_text[0]['generated_text'])

Device set to use cuda
The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 13 Jul 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Who is the author of Circe?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The author of the novel "Circe" is Madeline Miller.


In [27]:
generated_text

[{'generated_text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 13 Jul 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho is the author of Circe?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nThe author of the novel "Circe" is Madeline Miller.'}]

In [29]:
len(generated_text)

1

In [31]:
tokenized_text

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 13 Jul 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho is the author of Circe?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [32]:
inputs = tokenizer(tokenized_text, return_tensors="pt").to(device)
inputs

{'input_ids': tensor([[128000, 128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,
           2696,     25,   6790,    220,   2366,     18,    198,  15724,   2696,
             25,    220,   1032,  10263,    220,   2366,     20,    271, 128009,
         128006,    882, 128007,    271,  15546,    374,    279,   3229,    315,
          42009,    346,     30, 128009, 128006,  78191, 128007,    271]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}