In [1]:
import torch

In [2]:

if torch.backends.mps.is_available():
    torch_device = torch.device("mps:0")
    x = torch.ones(1, device=torch_device)
    print (x)
elif torch.cuda.is_available():
    torch_device = torch.device("cuda")
    print ("MPS device not found. Using CUDA.")
else:
    torch_device = torch.device("cpu")
    print ("No accelerator device found. Using CPU.")

MPS device not found. Using CUDA.


## Preload

In [3]:
from transformers import GPT2LMHeadModel

# Load the model
gpt2 = GPT2LMHeadModel.from_pretrained(
    "openai-community/gpt2-medium", 
).to(torch_device)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
gpt2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [5]:
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L458
gpt2.transformer.h[0].attn

GPT2SdpaAttention(
  (c_attn): Conv1D(nf=3072, nx=1024)
  (c_proj): Conv1D(nf=1024, nx=1024)
  (attn_dropout): Dropout(p=0.1, inplace=False)
  (resid_dropout): Dropout(p=0.1, inplace=False)
)

## Dataset

In [6]:
import urllib
import json
import os

file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json"
)
# url = (
#     "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
#     "/main/ch07/01_main-chapter-code/instruction-data.json"
# )
def download_and_load_data(
    file_path, url, force_download=False
):
    if not os.path.exists(file_path) or force_download:
        with urllib.request.urlopen(url) as response:
            with open(file_path, "w", encoding="utf-8") as out_file:
                out_file.write(response.read().decode("utf-8"))
    with open(file_path, "r") as f:
        data = json.loads(f.read())
    return data


In [7]:
data = download_and_load_data(file_path, url, force_download=True)

In [8]:
data

[{'instruction': 'Give three tips for staying healthy.',
  'input': '',
  'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'},
 {'instruction': 'What are the three primary colors?',
  'input': '',
  'output': 'The three primary colors are red, blue, and yellow.'},
 {'instruction': 'Describe the structure of an atom.',
  'input': '',
  'output': 'An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.'},
 {'instruction': 'How can we reduce air pollution?',
  'input': '',
  'output': 'There are a number of ways to reduce air pollution, such

In [9]:
def format_instructions(entry):
    # Alpaca
    instruction = entry.get("instruction", None)
    input = entry.get("input", None)
    output = entry.get("output", None)
    formatted = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction\n{instruction}"
    )
    if input:
        formatted += f"\n\n### Input\n{input}"
    if output:
        formatted += f"\n\n### Output\n{output}"
    return formatted

formatted_data = [format_instructions(entry) for entry in data]
formatted_data


['Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction\nGive three tips for staying healthy.\n\n### Output\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction\nWhat are the three primary colors?\n\n### Output\nThe three primary colors are red, blue, and yellow.',
 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction\nDescribe the structure of an atom.\n\n### Output\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electron

In [10]:
import random

train_n = int(len(data) * 0.85)
test_n = int(len(data) * 0.1)
val_n = len(data) - train_n - test_n

shuffled_data = random.sample(data, len(data))
train_data = shuffled_data[:train_n]
test_data = shuffled_data[train_n:train_n+test_n]
val_data = shuffled_data[train_n+test_n:]

print(f"Train data: {len(train_data)}, Test data: {len(test_data)}, Validation data: {len(val_data)}")

Train data: 44201, Test data: 5200, Validation data: 2601


### Dataloader

In [11]:
PAD_TOKEN_ID = 50256
PLACEHOLDER_TOKEN_ID = -100

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class InstructDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=None, pad_token_id=PAD_TOKEN_ID):
        self.data = data
        formatted_data = [format_instructions(entry) for entry in data]
        self.encoded_data = [tokenizer.encode(entry) for entry in formatted_data]

        if max_length is None:
            self.max_length = max(len(entry) for entry in self.encoded_data)
        else:
            self.max_length = max_length
            self.encoded_data = [
                encoded_text[:self.max_length]
                for encoded_text in self.encoded_data
            ]

        # # Padding Handled in custom_collate_fn instead
        # self.encoded_data = [
        #     entry + [pad_token_id] * (self.max_length - len(entry))
        #     for entry in self.encoded_data
        # ]

    def __getitem__(self, idx):
        return self.encoded_data[idx]

    def __len__(self):
        return len(self.encoded_data)


In [13]:
def batch_collate_fn(
    batch, 
    pad_token_id=PAD_TOKEN_ID, 
    placeholder_token_id=PLACEHOLDER_TOKEN_ID,
    allowed_max_length=None, 
    device=torch_device
):
    batch_max_length = max(len(item) for item in batch)
    inputs = []
    targets = []
    for item in batch:
        padding_length = batch_max_length - len(item)
        input_item = item + [pad_token_id] * padding_length
        target_item = item[1:] + [pad_token_id] + [placeholder_token_id] * padding_length
        if allowed_max_length is not None and batch_max_length > allowed_max_length:
            input_item = input_item[:allowed_max_length]
            target_item = target_item[:allowed_max_length]
        inputs.append(torch.tensor(input_item).to(device))
        targets.append(torch.tensor(target_item).to(device))

    inputs_tensor = torch.stack(inputs).to(device)
    targets_tensor = torch.stack(targets).to(device)
    return inputs_tensor, targets_tensor


In [14]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"})

[50256]

In [15]:
dataset = InstructDataset(train_data, tokenizer, max_length=1024, pad_token_id=PAD_TOKEN_ID)

In [16]:
import itertools

# Get the first 10 items of the InstructDataset iterator
first_10_items = list(itertools.islice(iter(dataset), 10))
for item in first_10_items:
    print(len(item))


100
117
95
75
72
120
110
48
112
124


In [17]:
batch_collate_fn(first_10_items)

(tensor([[21106,   318,   281,  ..., 50256, 50256, 50256],
         [21106,   318,   281,  ..., 50256, 50256, 50256],
         [21106,   318,   281,  ..., 50256, 50256, 50256],
         ...,
         [21106,   318,   281,  ..., 50256, 50256, 50256],
         [21106,   318,   281,  ..., 50256, 50256, 50256],
         [21106,   318,   281,  ...,   607, 27146,    13]], device='cuda:0'),
 tensor([[  318,   281, 12064,  ...,  -100,  -100,  -100],
         [  318,   281, 12064,  ...,  -100,  -100,  -100],
         [  318,   281, 12064,  ...,  -100,  -100,  -100],
         ...,
         [  318,   281, 12064,  ...,  -100,  -100,  -100],
         [  318,   281, 12064,  ...,  -100,  -100,  -100],
         [  318,   281, 12064,  ..., 27146,    13, 50256]], device='cuda:0'))

In [18]:
from functools import partial
NUM_WORKERS = 0
BATCH_SIZE = 4
MAX_LENGTH = 512

collate_fn = partial(
    batch_collate_fn, 
    pad_token_id=PAD_TOKEN_ID, 
    placeholder_token_id=PLACEHOLDER_TOKEN_ID, 
    allowed_max_length=MAX_LENGTH, 
    device=torch_device
)

train_dataset = InstructDataset(
    train_data, 
    tokenizer, 
    max_length=MAX_LENGTH,
    pad_token_id=PAD_TOKEN_ID,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
    num_workers=NUM_WORKERS,
)

test_dataset = InstructDataset(
    test_data, 
    tokenizer, 
    max_length=MAX_LENGTH,
    pad_token_id=PAD_TOKEN_ID,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    shuffle=False,
    drop_last=True,
    num_workers=NUM_WORKERS,
)

validation_dataset = InstructDataset(
    val_data, 
    tokenizer, 
    max_length=512,
    pad_token_id=PAD_TOKEN_ID,
)
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
    shuffle=False,
    drop_last=True,
    num_workers=NUM_WORKERS,
)


In [19]:
for inputs, targets in train_dataloader:
    print(inputs.shape, targets.shape)

torch.Size([4, 169]) torch.Size([4, 169])
torch.Size([4, 142]) torch.Size([4, 142])
torch.Size([4, 164]) torch.Size([4, 164])
torch.Size([4, 131]) torch.Size([4, 131])
torch.Size([4, 100]) torch.Size([4, 100])
torch.Size([4, 159]) torch.Size([4, 159])
torch.Size([4, 133]) torch.Size([4, 133])
torch.Size([4, 183]) torch.Size([4, 183])
torch.Size([4, 150]) torch.Size([4, 150])
torch.Size([4, 239]) torch.Size([4, 239])
torch.Size([4, 151]) torch.Size([4, 151])
torch.Size([4, 187]) torch.Size([4, 187])
torch.Size([4, 168]) torch.Size([4, 168])
torch.Size([4, 111]) torch.Size([4, 111])
torch.Size([4, 145]) torch.Size([4, 145])
torch.Size([4, 167]) torch.Size([4, 167])
torch.Size([4, 125]) torch.Size([4, 125])
torch.Size([4, 146]) torch.Size([4, 146])
torch.Size([4, 112]) torch.Size([4, 112])
torch.Size([4, 111]) torch.Size([4, 111])
torch.Size([4, 157]) torch.Size([4, 157])
torch.Size([4, 127]) torch.Size([4, 127])
torch.Size([4, 147]) torch.Size([4, 147])
torch.Size([4, 153]) torch.Size([4

## Training Utilities

In [20]:
def generate_text_simple(model, token_ids, max_new_tokens, context_size):
    logits = None
    for i in range(max_new_tokens):
        context_token_ids = token_ids[:, -context_size:]
        with torch.no_grad():
            logits = model(context_token_ids)
        if not isinstance(logits, torch.Tensor):
            logits = logits.logits

        logits = logits[:, -1, :]
        probas = torch.softmax(logits, dim=-1)
        token_id_next = torch.argmax(probas, dim=1, keepdim=True)  # Pure Greed
        token_ids = torch.cat((token_ids, token_id_next), dim=1) 

    return token_ids

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special=set(['<|endoftext|>']))
    encoded_tensor = torch.tensor(encoded).unsqueeze(0).to(torch_device)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    decoded = tokenizer.decode(token_ids[0].tolist())
    return decoded


In [21]:
from torch import nn
def loss_fn(logits, targets):
    vocab_size = logits.shape[-1]
    loss = nn.CrossEntropyLoss()(logits.view(-1, vocab_size), targets.view(-1))
    return loss

In [22]:
def calc_loss_batch(input_batch, target_batch, model, device=torch_device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch).logits
    loss = torch.nn.functional.cross_entropy(
        logits.view(-1, logits.shape[-1]), 
        target_batch.view(-1),
    )
    return loss

In [23]:
def calc_loss_loader(dataloader, model, device, num_batches=None):
    total_loss = 0
    if len(dataloader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches, len(dataloader))
    for i, (input_batch, target_batch) in enumerate(dataloader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

### Preloaded model testing

In [24]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False,
}
GPT_CONFIG_355M = {
    "vocab_size": 50257,
    "context_length": 1024,
    "emb_dim": 1024,
    "n_heads": 16,
    "n_layers": 24,
    "drop_rate": 0.1,
    "qkv_bias": False,
}

GPT_CONFIG = GPT_CONFIG_355M

In [25]:
sample_input = "Every effort moves you"
token_ids = generate_text_simple(
    gpt2, 
    text_to_token_ids(sample_input, tokenizer), 
    max_new_tokens=15, 
    context_size=GPT_CONFIG["context_length"]
)

In [26]:
print(token_ids_to_text(token_ids, tokenizer))

Every effort moves you forward, but you must be careful. You must not let your guard down


In [27]:
sample_spam_q = (
    "Is the following text 'spam'? Answer with 'yes' or 'no':"
    " 'You are a winner and you have been specially"
    " selected to receive $1000 cash or a $2000 award.'"
)
token_ids = generate_text_simple(
    gpt2, 
    text_to_token_ids(sample_spam_q, tokenizer), 
    max_new_tokens=15, 
    context_size=GPT_CONFIG["context_length"]
)
print(token_ids_to_text(token_ids, tokenizer))


Is the following text 'spam'? Answer with 'yes' or 'no': 'You are a winner and you have been specially selected to receive $1000 cash or a $2000 award.'

The following text 'spam'? Answer with 'yes' or


In [28]:
print(formatted_data[1])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction
What are the three primary colors?

### Output
The three primary colors are red, blue, and yellow.


In [29]:
sample_instruction = """'Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction
Edit the following sentence for grammar.

### Input
He go to the park every day.

### Output"""
sample_answer = """He goes to the park every day."""
token_ids = generate_text_simple(
    gpt2, 
    text_to_token_ids(sample_instruction, tokenizer), 
    max_new_tokens=15, 
    context_size=GPT_CONFIG["context_length"]
)
print(token_ids_to_text(token_ids, tokenizer))


'Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction
Edit the following sentence for grammar.

### Input
He go to the park every day.

### Output

He go to the park every day.

### Output



## Fine-tuning (same as training recipe)

In [30]:
model = gpt2.to(torch_device)

In [31]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=3072, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=1024)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=4096, nx=1024)
          (c_proj): Conv1D(nf=1024, nx=4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [32]:
from torch import nn
torch.manual_seed(123)


<torch._C.Generator at 0x72672e1af570>

In [33]:
len(train_dataloader), len(validation_dataloader), len(test_dataloader)

(11050, 650, 1300)

In [34]:
train_loss = calc_loss_loader(train_dataloader, model, torch_device, num_batches=10)
validation_loss = calc_loss_loader(validation_dataloader, model, torch_device, num_batches=10)
test_loss = calc_loss_loader(test_dataloader, model, torch_device, num_batches=10)
print(f"Train loss: {train_loss:.4f}, Validation loss: {validation_loss:.4f}, Test loss: {test_loss:.4f}")

Train loss: 3.2824, Validation loss: 3.1368, Test loss: 3.2574


## Train

In [35]:
def evaluate_model(model, train_dataloader, validation_dataloader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_dataloader, model, device, num_batches=eval_iter)
        validation_loss = calc_loss_loader(validation_dataloader, model, device, num_batches=eval_iter)
    model.train()
    print(f"Train loss: {train_loss:.4f}, validation loss: {validation_loss:.4f}")
    return train_loss, validation_loss

def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = GPT_CONFIG["context_length"]
    token_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(model, token_ids, 50, context_size)
    decoded_text = tokenizer.decode(token_ids[0].tolist())
    print(decoded_text.replace("\n", " "))
    model.train()

In [36]:
def train_model_simple(
    model, train_dataloader, test_dataloader,
    optimizer, device, num_epochs,
    eval_freq, eval_iter, start_context, tokenizer
):
    train_losses, test_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_dataloader:
            optimizer.zero_grad()
            # print(input_batch)
            # print(target_batch)
            loss = calc_loss_batch(
                input_batch, target_batch, model, device
            )
            loss.backward()
            # grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            # print(f"Gradient norm: {grad_norm:.4f}")
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step %eval_freq == 0:
                train_loss, test_loss = evaluate_model(
                    model, train_dataloader, test_dataloader, device, eval_iter
                )
                train_losses.append(train_loss)
                test_losses.append(test_loss)
                track_tokens_seen.append(tokens_seen)
                print(
                    f"Epoch {epoch + 1} (Step {global_step:06d}): "
                    f"Train Loss: {train_loss:.3f}, "
                    f"Val Loss: {test_loss:.3f}"
                )

        generate_and_print_sample(
            model, tokenizer, device, start_context
        )
    return train_losses, test_losses, track_tokens_seen


In [37]:
import time
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 0.00005, weight_decay = 0.1
)
num_epochs = 2
train_losses, test_losses, track_tokens_seen = train_model_simple(
    gpt2, train_dataloader, test_dataloader, optimizer, torch_device,
    num_epochs, eval_freq=5, eval_iter=5, start_context=sample_instruction, tokenizer=tokenizer,
)
end_time = time.time()
print(f"Training time: {end_time - start_time:.2f} seconds")

Train loss: 2.4060, validation loss: 2.6876
Epoch 1 (Step 000000): Train Loss: 2.406, Val Loss: 2.688
Train loss: 1.8223, validation loss: 1.8991
Epoch 1 (Step 000005): Train Loss: 1.822, Val Loss: 1.899
Train loss: 1.6765, validation loss: 1.7581
Epoch 1 (Step 000010): Train Loss: 1.676, Val Loss: 1.758
Train loss: 1.7293, validation loss: 1.7074
Epoch 1 (Step 000015): Train Loss: 1.729, Val Loss: 1.707
Train loss: 1.7899, validation loss: 1.6726
Epoch 1 (Step 000020): Train Loss: 1.790, Val Loss: 1.673
Train loss: 1.5373, validation loss: 1.6532
Epoch 1 (Step 000025): Train Loss: 1.537, Val Loss: 1.653
Train loss: 1.4878, validation loss: 1.6425
Epoch 1 (Step 000030): Train Loss: 1.488, Val Loss: 1.643
Train loss: 1.6702, validation loss: 1.6273
Epoch 1 (Step 000035): Train Loss: 1.670, Val Loss: 1.627
Train loss: 1.5748, validation loss: 1.6167
Epoch 1 (Step 000040): Train Loss: 1.575, Val Loss: 1.617
Train loss: 1.5834, validation loss: 1.6127
Epoch 1 (Step 000045): Train Loss: 1.5

In [38]:
train_loss = calc_loss_loader(train_dataloader, model, torch_device, num_batches=10)
validation_loss = calc_loss_loader(validation_dataloader, model, torch_device, num_batches=10)
test_loss = calc_loss_loader(test_dataloader, model, torch_device, num_batches=10)
print(f"Train loss: {train_loss:.4f}, Validation loss: {validation_loss:.4f}, Test loss: {test_loss:.4f}")

Train loss: 1.0387, Validation loss: 1.4368, Test loss: 1.4859
