Step 1. Load LLaMA 3.2 and its tokenizer

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "unsloth/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


Step 2. Configure model and Freeze Lower Layers

In [2]:
len = 0
for name, param in model.named_parameters():
    len += 1
    print(f"{name} - requires_grad: {param.requires_grad}")
print(f"total layers - {len}")

model.embed_tokens.weight - requires_grad: True
model.layers.0.self_attn.q_proj.weight - requires_grad: True
model.layers.0.self_attn.k_proj.weight - requires_grad: True
model.layers.0.self_attn.v_proj.weight - requires_grad: True
model.layers.0.self_attn.o_proj.weight - requires_grad: True
model.layers.0.mlp.gate_proj.weight - requires_grad: True
model.layers.0.mlp.up_proj.weight - requires_grad: True
model.layers.0.mlp.down_proj.weight - requires_grad: True
model.layers.0.input_layernorm.weight - requires_grad: True
model.layers.0.post_attention_layernorm.weight - requires_grad: True
model.layers.1.self_attn.q_proj.weight - requires_grad: True
model.layers.1.self_attn.k_proj.weight - requires_grad: True
model.layers.1.self_attn.v_proj.weight - requires_grad: True
model.layers.1.self_attn.o_proj.weight - requires_grad: True
model.layers.1.mlp.gate_proj.weight - requires_grad: True
model.layers.1.mlp.up_proj.weight - requires_grad: True
model.layers.1.mlp.down_proj.weight - requires_gr

In [3]:
# Freeze all layers except the top 2 (totally 15 known layers from above step)
for name, parameter in model.named_parameters():
    if "model.norm.weight" not in name and "layers.15" not in name:
        parameter.requires_grad = False

In [4]:
# Verify configuration
for name, param in model.named_parameters():
    print(f"{name} - requires_grad: {param.requires_grad}")

model.embed_tokens.weight - requires_grad: False
model.layers.0.self_attn.q_proj.weight - requires_grad: False
model.layers.0.self_attn.k_proj.weight - requires_grad: False
model.layers.0.self_attn.v_proj.weight - requires_grad: False
model.layers.0.self_attn.o_proj.weight - requires_grad: False
model.layers.0.mlp.gate_proj.weight - requires_grad: False
model.layers.0.mlp.up_proj.weight - requires_grad: False
model.layers.0.mlp.down_proj.weight - requires_grad: False
model.layers.0.input_layernorm.weight - requires_grad: False
model.layers.0.post_attention_layernorm.weight - requires_grad: False
model.layers.1.self_attn.q_proj.weight - requires_grad: False
model.layers.1.self_attn.k_proj.weight - requires_grad: False
model.layers.1.self_attn.v_proj.weight - requires_grad: False
model.layers.1.self_attn.o_proj.weight - requires_grad: False
model.layers.1.mlp.gate_proj.weight - requires_grad: False
model.layers.1.mlp.up_proj.weight - requires_grad: False
model.layers.1.mlp.down_proj.weig

Step 2. Prepare the Dataset

In [5]:
from datasets import load_dataset

dataset = load_dataset("pacovaldez/pandas-questions")
def tokenize_and_format(examples):
    # Concatenate question and context as input if context exists; otherwise, use only the question
    if 'context' in examples:
        inputs = [q + " " + c for q, c in zip(examples['question'], examples['context'])]
    else:
        inputs = examples['question']
    
    # Tokenize the inputs and the outputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['answer_body'], max_length=512, truncation=True, padding="max_length")
    
    # Set the labels in the tokenized output
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = dataset.map(tokenize_and_format, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Step 3. Define Loss Function

In [6]:
import torch

def weighted_cross_entropy_loss(outputs, labels, weights=None):
    logits = outputs.logits
    loss_fct = torch.nn.CrossEntropyLoss(weight=weights, ignore_index=tokenizer.pad_token_id)
    
    active_loss = labels.view(-1) != tokenizer.pad_token_id
    active_logits = logits.view(-1, logits.size(-1))[active_loss]
    active_labels = labels.view(-1)[active_loss]

    loss = loss_fct(active_logits, active_labels)
    return loss

Step 4. Train

In [7]:
from torch.utils.data import DataLoader
from torch.amp import autocast, GradScaler

# pre train summary
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# train
num_epochs = 5
batchSize = 4 # we need better gpu memory to increase batch size
accumulation_steps = 8  # Defines how many steps to accumulate gradients before stepping optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

scaler = GradScaler(device)
train_loader = DataLoader(dataset['train'].select(range(3000)), batch_size=batchSize, shuffle=True)
model.to(device)
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0
    for batch in train_loader:
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        with autocast(str(device)): # mixed precision - temporarily casts operations to float16
            outputs = model(input_ids=inputs, labels=labels)
            loss = weighted_cross_entropy_loss(outputs, labels)
        
        scaler.scale(loss).backward()
        if (num_batches + 1) % accumulation_steps == 0: #Gradient Accumulation
            scaler.step(optimizer) 
            scaler.update()
            optimizer.zero_grad()
        total_loss += loss.item()
        num_batches += 1
    average_loss = total_loss / num_batches
    print(f'Epoch {epoch+1}, Average Loss: {average_loss:.4f}')

# post train summary
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


GPU = Tesla T4. Max memory = 14.741 GB.
0.0 GB of memory reserved.
Epoch 1, Average Loss: 7.4451
Epoch 2, Average Loss: 6.6252
Epoch 3, Average Loss: 6.5154
Epoch 4, Average Loss: 6.3859
Epoch 5, Average Loss: 6.2578
Peak reserved memory = 12.006 GB.
Peak reserved memory for training = 12.006 GB.
Peak reserved memory % of max memory = 81.446 %.
Peak reserved memory for training % of max memory = 81.446 %.


Step 5. Evaluation and Iteration

In [8]:
validation_loader = DataLoader(dataset['validation'], batch_size=batchSize, shuffle=True)
model.eval()
total_loss = 0
num_batches = 0
with torch.no_grad():
    for batch in validation_loader:
        inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device)
        with autocast(str(device)): # mixed precision - temporarily casts operations to float16
            outputs = model(input_ids=inputs, labels=labels)
            loss = weighted_cross_entropy_loss(outputs, labels)
        total_loss += loss.item()
        num_batches += 1
    average_loss = total_loss / num_batches
    print(f'Average Loss: {average_loss:.4f}')

Average Loss: 6.4421


Step 6. Save Model and Tokenize

In [9]:
# Save the Entire Model
torch.save(model, 'llama3_2_batch_4_tuned_model.pth')
# Save Only the Model's State Dict
torch.save(model.state_dict(), 'llama3_2_batch_4_tuned_model_weights.pth')
# Saving the Tokenizer
tokenizer.save_pretrained('llama3_2_batch_4_tuned_model_tokenizer/')

('llama3_2_batch_4_tuned_model_tokenizer/tokenizer_config.json',
 'llama3_2_batch_4_tuned_model_tokenizer/special_tokens_map.json',
 'llama3_2_batch_4_tuned_model_tokenizer/tokenizer.json')