Step 1. Load LLaMA 3.2 and its tokenizer

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint identifier shared by the tokenizer and model loaders below.
model_name = "unsloth/Llama-3.2-1B-Instruct"

# Load the tokenizer and the causal-LM weights for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Step 2. Inspect the Model and Freeze the Lower Layers

In [None]:
# Preview the model: list every named parameter tensor with its trainable flag.
# The counter is deliberately NOT called `len`, which would shadow the builtin
# for every cell that runs afterwards.
param_count = 0
for param_count, (name, param) in enumerate(model.named_parameters(), start=1):
    print(f"{name} - requires_grad: {param.requires_grad}")
# This counts parameter tensors (weights, norms, ...), not decoder layers.
print(f"total parameter tensors - {param_count}")

In [3]:
# Freeze every parameter except those whose name contains one of the markers
# below. Presumably these are the final norm and the last decoder block
# (index 15) -- confirm against the parameter listing printed above.
trainable_markers = ("model.norm.weight", "layers.15")
for name, parameter in model.named_parameters():
    if any(marker in name for marker in trainable_markers):
        continue  # leave this parameter's requires_grad untouched
    parameter.requires_grad = False

In [None]:
# Re-list every parameter to confirm which ones are still trainable.
for param_name, tensor in model.named_parameters():
    is_trainable = tensor.requires_grad
    print(f"{param_name} - requires_grad: {is_trainable}")

Step 3. Prepare the Dataset

In [None]:
from datasets import load_dataset

# Load the "pacovaldez/pandas-questions" dataset. The code below indexes it by
# split name ('train', 'validation') and reads the 'question', 'answer_body'
# and (optionally) 'context' columns -- TODO confirm these exist in the dataset.
dataset = load_dataset("pacovaldez/pandas-questions")
def tokenize_and_format(examples, max_length=512):
    """Tokenize a batch of examples into model inputs and labels.

    Intended for ``dataset.map(..., batched=True)``: ``examples`` maps column
    names to lists of values.

    Args:
        examples: Batch with a 'question' column, an 'answer_body' column and
            optionally a 'context' column.
        max_length: Length every input and label sequence is truncated or
            padded to. Defaults to 512.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the token ids of 'answer_body'.

    NOTE(review): inputs and labels are tokenized independently and both padded
    to ``max_length``, so label position i is not derived from the input
    sequence. Confirm this is the intended objective for a causal LM.
    NOTE(review): ``padding="max_length"`` needs the tokenizer to have a pad
    token -- confirm this checkpoint's tokenizer defines one.
    """
    # Concatenate question and context when a context column exists;
    # otherwise use the question alone.
    if 'context' in examples:
        inputs = [q + " " + c for q, c in zip(examples['question'], examples['context'])]
    else:
        inputs = examples['question']

    encode_kwargs = dict(max_length=max_length, truncation=True, padding="max_length")

    model_inputs = tokenizer(inputs, **encode_kwargs)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding targets.
    labels = tokenizer(text_target=examples['answer_body'], **encode_kwargs)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches, then expose only the tensor columns the
# training and validation loops read.
tensor_columns = ['input_ids', 'attention_mask', 'labels']
dataset = dataset.map(tokenize_and_format, batched=True)
dataset.set_format(type='torch', columns=tensor_columns)

Step 4. Define Loss Function

In [None]:
import torch

def weighted_cross_entropy_loss(outputs, labels, weights=None, pad_token_id=None):
    """Cross-entropy over all label positions that are not padding.

    Args:
        outputs: Object exposing ``.logits``; the last logits dimension is the
            class (vocabulary) dimension.
        labels: Integer tensor with one label per logits position.
        weights: Optional per-class weight tensor of shape (num_labels,).
        pad_token_id: Label value to exclude from the loss. Defaults to
            ``tokenizer.pad_token_id`` from the enclosing notebook.

    Returns:
        Scalar loss tensor. When every label is padding the result is a zero
        that is still attached to the graph, instead of the NaN a mean over
        zero targets would give.

    Raises:
        ValueError: If no pad token id is given and the tokenizer has none.

    NOTE(review): logits at position t are scored against labels at position
    t; no next-token shift is applied here. Confirm that matches how the
    labels were built.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        raise ValueError(
            "pad_token_id is not set; pass pad_token_id explicitly or "
            "configure tokenizer.pad_token"
        )

    logits = outputs.logits
    flat_logits = logits.reshape(-1, logits.size(-1))
    flat_labels = labels.reshape(-1)

    # Keep only positions whose label is a real token.
    active = flat_labels != pad_token_id
    if not active.any():
        # Sum over an empty selection: exactly 0, yet differentiable.
        return flat_logits[active].sum()

    if weights is not None:
        # The class weights must live on the same device as the logits.
        weights = weights.to(device=flat_logits.device)

    return torch.nn.functional.cross_entropy(
        flat_logits[active], flat_labels[active], weight=weights
    )

Step 5. Train

In [None]:
from torch.utils.data import DataLoader
from torch.amp import autocast, GradScaler

# Pre-training GPU snapshot: clear the allocator cache, then record the
# device's total memory and the peak memory reserved so far.
bytes_per_gib = 1024 ** 3
torch.cuda.empty_cache()
# The report string returned by memory_summary is not used here.
torch.cuda.memory_summary(device=None, abbreviated=False)
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

# Training configuration and objects used by the training loop.
num_epochs = 1
batchSize = 2  # micro-batch size; kept small to fit in limited GPU memory
accumulation_steps = 8  # micro-batches accumulated per optimizer step
learning_rate = 5e-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Train on at most the first 3000 rows; clamping to the split size keeps
# `select` from failing on a smaller split.
train_split = dataset['train']
dataset_train = train_split.select(range(min(3000, train_split.num_rows)))

# GradScaler is documented to take a device-type string ("cuda" / "cpu"),
# not a torch.device object.
scaler = GradScaler(device.type)
train_loader = DataLoader(dataset_train, batch_size=batchSize, shuffle=True)
model.to(device)
# Hand the optimizer only the parameters that still require gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)

# Training loop with mixed precision and gradient accumulation.
model.train()
# autocast expects the bare device type ("cuda" / "cpu"); str(device) would
# carry an index such as "cuda:0" if `device` ever had one.
amp_device_type = device.type
for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0
    optimizer.zero_grad()  # never start an epoch with stale gradients
    for batch in train_loader:
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with autocast(amp_device_type):  # mixed precision for eligible ops
            # Pass the attention mask so padding positions are masked out.
            # `labels` is not forwarded to the model: its built-in loss would
            # be computed and thrown away, since the custom loss is used.
            outputs = model(input_ids=inputs, attention_mask=attention_mask)
            loss = weighted_cross_entropy_loss(outputs, labels)

        # Divide so the accumulated gradient is the mean over the
        # micro-batches rather than their sum.
        scaler.scale(loss / accumulation_steps).backward()
        total_loss += loss.item()  # log the undivided per-batch loss
        num_batches += 1
        if num_batches % accumulation_steps == 0:  # Gradient Accumulation
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
    # Apply gradients left over when the number of batches is not a multiple
    # of accumulation_steps; otherwise those batches never update the model.
    if num_batches % accumulation_steps != 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
    average_loss = total_loss / num_batches
    print(f'Epoch {epoch+1}, Average Loss: {average_loss:.4f}')

# Post-training GPU report: peak reserved memory overall and the part
# reserved since the pre-training snapshot, in GB and as a share of the
# device total.
def _percent_of_max(gigabytes):
    # Share of the device's total memory, rounded to three decimals.
    return round(gigabytes / max_memory * 100, 3)

used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = _percent_of_max(used_memory)
lora_percentage = _percent_of_max(used_memory_for_lora)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


Step 6. Evaluation and Iteration

In [None]:
# Evaluate on the validation split (assumes the dataset provides one -- TODO
# confirm). Order does not affect the loss, so the loader is not shuffled.
validation_loader = DataLoader(dataset['validation'], batch_size=batchSize, shuffle=False)
model.eval()
val_loss_sum = 0.0
val_batches = 0
with torch.no_grad():
    for batch in validation_loader:
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # device.type is the bare "cuda" / "cpu" string autocast expects.
        with autocast(device.type):  # mixed precision for eligible ops
            # Pass the attention mask so padding positions are masked out;
            # the model's built-in loss is not needed, so labels stay out.
            outputs = model(input_ids=inputs, attention_mask=attention_mask)
            loss = weighted_cross_entropy_loss(outputs, labels)
        print(f"Validation Loss: {loss.item()}")
        val_loss_sum += loss.item()
        val_batches += 1
# One aggregate figure is easier to compare across runs than per-batch lines.
if val_batches:
    print(f"Average Validation Loss: {val_loss_sum / val_batches:.4f}")