In [1]:
import json
import os
import urllib
import tiktoken
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [2]:

def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()
    with open(file_path, "r") as file:
        data = json.load(file)
    return data

file_path = "../data/alpaca_data.json"
# url = (
# "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
# "/main/ch07/01_main-chapter-code/instruction-data.json"
# )
url = ("https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json")
data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 52002


In [3]:
print("Example entry:\n", data[1])

Example entry:
 {'instruction': 'What are the three primary colors?', 'input': '', 'output': 'The three primary colors are red, blue, and yellow.'}


In [4]:
print("Another example entry:\n", data[999])

Another example entry:
 {'instruction': 'Describe the flavor profile of the following type of cuisine', 'input': 'Japanese', 'output': 'Japanese cuisine is characterized by its subtle and delicate flavors, featuring a combination of salty, sweet, sour, and umami flavors. It also utilizes fresh ingredients with a focus on preserving their natural flavors.'}


In [5]:
def format_input(entry):
    instruction_text = (
            f"Below is an instruction that describes a task. "
            f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
        )
    input_text = (
    f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
    )
    return instruction_text + input_text

In [6]:
model_input = format_input(data[1])
desired_response = f"\n\n### Response:\n{data[1]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What are the three primary colors?

### Response:
The three primary colors are red, blue, and yellow.


In [7]:
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]
print("Training set length: 85%", len(train_data))
print("Validation set length: 5%", len(val_data))
print("Test set length: 10%", len(test_data))

Training set length: 85% 44201
Validation set length: 5% 2601
Test set length: 10% 5200


In [8]:

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
            tokenizer.encode(full_text)
            )
    def __getitem__(self, index):
        return self.encoded_texts[index]
    def __len__(self):
        return len(self.data)

In [9]:
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [10]:
def custom_collate_draft_1(batch, pad_token_id=50256, device="cpu"):
    
    batch_max_length = max(len(item)+1 for item in batch)
    # print(batch_max_length)
    inputs_lst = []
    for item in batch:
        
        new_item = item.copy()
        new_item += [pad_token_id]

        padded = (
        new_item + [pad_token_id] *
        (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])
        # print("inputs",inputs)
        inputs_lst.append(inputs)
    inputs_tensor = torch.stack(inputs_lst).to(device)
    return inputs_tensor

In [11]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
inputs_1,
inputs_2,
inputs_3
)
print(custom_collate_draft_1(batch))

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


In [12]:
print(batch)

([0, 1, 2, 3, 4], [5, 6], [7, 8, 9])


In [13]:
def custom_collate_draft_2(batch, pad_token_id=50256, device="cpu"):
    
    batch_max_length = max(len(item)+1 for item in batch)
    inputs_lst, targets_lst = [], []
    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id]
        padded = (
            new_item + [pad_token_id] *
            (batch_max_length - len(new_item))
                )
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])
        inputs_lst.append(inputs)
        targets_lst.append(targets)
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor

    
inputs, targets = custom_collate_draft_2(batch)
print("inputs: ",inputs, end="\n\n\n")
print("targets: ",targets)

inputs:  tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


targets:  tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


In [14]:
torch.cuda.is_available()

True

In [15]:
batch

([0, 1, 2, 3, 4], [5, 6], [7, 8, 9])

In [16]:
max(len(item)+1 for item in batch)

6

In [17]:
a =(batch[1] + [456] *2)

In [18]:
a[:-1]

[5, 6, 456]

In [19]:
def custom_collate_fn(batch, pad_token_id=50256, ignore_index=-100, allowed_max_length=None, device="cpu"):
    batch_max_length = max(len(item)+1 for item in batch)
    inputs_lst, targets_lst = [], []
    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id]
        padded = (new_item + [pad_token_id] * (batch_max_length - len(new_item)))
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_index
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]
        inputs_lst.append(inputs)
        targets_lst.append(targets)
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor

In [20]:
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])


In [21]:
logits_1 = torch.tensor(
   [ [-1.0, 1.0],
    [-0.5, 1.5]]
)

In [22]:
targets_1 = torch.tensor([0, 1]) # Correct token indices to generate
loss_1 = torch.nn.functional.cross_entropy(logits_1, targets_1)
print(loss_1)

tensor(1.1269)


In [23]:
logits_1.shape

torch.Size([2, 2])

In [24]:
logits_2 = torch.tensor(
        [[-1.0, 1.0],
        [-0.5, 1.5],
        [-0.5, 1.5]]
)
targets_2 = torch.tensor([0, 1, 1])
loss_2 = torch.nn.functional.cross_entropy(logits_2, targets_2)
print(loss_2)

tensor(0.7936)


In [25]:
targets_3 = torch.tensor([0, 1, -100])
loss_3 = torch.nn.functional.cross_entropy(logits_2, targets_3)
print(loss_3)
print("loss_1 == loss_3:", loss_1 == loss_3)

tensor(1.1269)
loss_1 == loss_3: tensor(True)


In [26]:
x = torch.FloatTensor([[1.,0.,0.],
                       [0.,1.,0.],
                       [0.,0.,1.]])
y = torch.LongTensor([0,1,2])

print(torch.nn.functional.cross_entropy(x, y))

print(F.softmax(x, 1).log())
print(F.log_softmax(x, 1))

print(F.nll_loss(F.log_softmax(x, 1), y))

tensor(0.5514)
tensor([[-0.5514, -1.5514, -1.5514],
        [-1.5514, -0.5514, -1.5514],
        [-1.5514, -1.5514, -0.5514]])
tensor([[-0.5514, -1.5514, -1.5514],
        [-1.5514, -0.5514, -1.5514],
        [-1.5514, -1.5514, -0.5514]])
tensor(0.5514)


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")
print("Device:", device)

Device: cuda


In [28]:
from functools import partial
customized_collate_fn = partial(
custom_collate_fn,
device=device,
allowed_max_length=1024
)

In [29]:
num_workers = 0
batch_size = 1
torch.manual_seed(123)

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
train_dataset,
batch_size=batch_size,
collate_fn=customized_collate_fn,
shuffle=True,
drop_last=True,
num_workers=num_workers
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
val_dataset,
batch_size=batch_size,
collate_fn=customized_collate_fn,
shuffle=False,
drop_last=False,
num_workers=num_workers
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
test_dataset,
batch_size=batch_size,
collate_fn=customized_collate_fn,
shuffle=False,
drop_last=False,
num_workers=num_workers
)

In [30]:
train_dataset

<__main__.InstructionDataset at 0x72b6b99e4df0>

In [31]:
print("Train loader:")
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)
    break

Train loader:
torch.Size([1, 88]) torch.Size([1, 88])


In [32]:
inputs[0].shape

torch.Size([88])

In [33]:
inputs[0]

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326, 20431, 32543,   262,  2581,    13,   198,   198,
        21017, 46486,    25,   198, 15056,   257,  1351,   286,  4736,    11,
        36509,  1123,  1748,   355,  2035,   257,  4166,   393,  5922,  1748,
           13,   198,   198, 21017, 23412,    25,   198,    50,  5173,  1681,
           11, 15338,   390, 42799,    11, 23732,   198,   198, 21017, 18261,
           25,   198,    50,  5173,  1681,   318,   257,  4166,  1748,    13,
          198,    49,   952,   390, 42799,   318,   257,  5922,  1748,    13,
          198,    34, 18131,   318,   257,  5922,  1748,    13],
       device='cuda:0')

In [34]:
targets.shape

torch.Size([1, 88])

In [35]:
a = targets[0].tolist()

In [36]:
len(a)

88

In [37]:
tokenizer.decode(a[:60])

' is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGiven a list of cities, classify each city as either a developed or developing city.\n\n### Input:\nSydney, Rio de Janeiro, Cairo\n\n### Response:'

In [38]:
# !pip install tensorflow>=2.15.0

In [39]:
from gpt_download import download_and_load_gpt2

BASE_CONFIG = {
"vocab_size": 50257,  # Vocabulary size
"context_length": 1024, # Context length
"drop_rate": 0.0, # Dropout rate
"qkv_bias": True  # Query-key-value bias
}


GPT_CONFIG_124M = {
"vocab_size": 50257,
"context_length": 256,
"emb_dim": 768,
"n_heads": 12,
"n_layers": 12,
"drop_rate": 0.1,
"qkv_bias": False
}
model_configs = {
"gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
"gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
"gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
"gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
# CHOOSE_MODEL = "gpt2-medium (355M)"
# BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
# model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
# settings, params = download_and_load_gpt2(
# model_size=model_size,
# models_dir="gpt2"
# )
# model = GPTModel(BASE_CONFIG)
# load_weights_into_gpt(model, params)
# model.eval();

2025-05-07 06:46:50.521229: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-07 06:46:50.686164: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746580610.750111    5226 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746580610.766988    5226 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746580610.897448    5226 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [40]:
torch.manual_seed(123)
input_text = format_input(val_data[0])
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Explain how using transitional words help in writing

### Input:
"<noinput>"


In [41]:
# from data_preprocessing.ipynb import generate, text_to_token_ids, token_ids_to_text

def generate(model, idx, max_new_tokens, context_size,
             temperature=0.0, top_k=None, eos_id=None):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        
        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(
                logits < min_val,
                torch.tensor(float('-inf')).to(logits.device), logits)
        
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)
            
        if idx_next == eos_id:
            break
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor
def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

In [42]:
%cd ..

/home/aroop/llm


In [43]:
from src.model import GPTModel

In [44]:
%cd notebook

/home/aroop/llm/notebook


In [45]:
GPT_CONFIG_124M = {
"vocab_size": 50257,
"context_length": 1024,
"emb_dim": 768,
"n_heads": 12,
"n_layers": 12,
"drop_rate": 0.1,
"qkv_bias": False
}

In [46]:
# from ..src.model import GPTModel
model = GPTModel(GPT_CONFIG_124M)
model.eval()
token_ids = generate(
model=model,
idx=text_to_token_ids(input_text, tokenizer),
max_new_tokens=35,
context_size=BASE_CONFIG["context_length"],
eos_id=50256,
)
generated_text = token_ids_to_text(token_ids, tokenizer)

In [47]:
response_text = generated_text[len(input_text):].strip()
print(response_text)

vul fourth Arche provincialreditation Casting Swe module waive piljsthirst improperondo LV misunderstoodarticlebrowser celeopenshandler charter ReconERY skippingDiv Manor positive Av bows Wilmington Isis Estimatedَ Psal


In [48]:
# copy pasted

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(
        train_loader, model, device, num_batches=eval_iter
        )
        val_loss = calc_loss_loader(
        val_loader, model, device, num_batches=eval_iter
        )
        model.train()
    return train_loss, val_loss

def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(
    logits.flatten(0, 1), target_batch.flatten()
    )
    return loss

def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        
        if i < num_batches:
            loss = calc_loss_batch(
            input_batch, target_batch, model, device
            )
            total_loss += loss.item()
            
        else:
            break
        return total_loss / num_batches

def train_model_simple(model, train_loader, val_loader,
                        optimizer, device, num_epochs,
                            eval_freq, eval_iter, start_context, tokenizer):
    
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1
   
    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                    f"Train loss {train_loss:.3f}, "
                    f"Val loss {val_loss:.3f}"
                )
        generate_and_print_sample(model, tokenizer, device, start_context)
    return train_losses, val_losses, track_tokens_seen

def generate_text_simple(model, idx, max_new_tokens, context_size):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        print(idx_cond)
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        probas = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
        model=model, idx=encoded,
        max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))
    model.train()

In [49]:
model.to(device)
torch.manual_seed(123)
with torch.no_grad():
    train_loss = calc_loss_loader(
        train_loader, model, device, num_batches=5
    )
val_loss = calc_loss_loader(
val_loader, model, device, num_batches=5
)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 2.194048309326172
Validation loss: 2.189373779296875


In [50]:
import time
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(
model.parameters(), lr=0.00005, weight_decay=0.1
)
num_epochs = 2
train_losses, val_losses, tokens_seen = train_model_simple(
model, train_loader, val_loader, optimizer, device,
num_epochs=num_epochs, eval_freq=5, eval_iter=5,
start_context=format_input(val_data[0]), tokenizer=tokenizer
)
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 2.083, Val loss 2.122
Ep 1 (Step 000005): Train loss 1.845, Val loss 1.869
Ep 1 (Step 000010): Train loss 1.828, Val loss 1.746
Ep 1 (Step 000015): Train loss 1.198, Val loss 1.649
Ep 1 (Step 000020): Train loss 1.556, Val loss 1.580
Ep 1 (Step 000025): Train loss 1.420, Val loss 1.537
Ep 1 (Step 000030): Train loss 1.420, Val loss 1.504
Ep 1 (Step 000035): Train loss 1.408, Val loss 1.479
Ep 1 (Step 000040): Train loss 1.089, Val loss 1.459
Ep 1 (Step 000045): Train loss 0.918, Val loss 1.435
Ep 1 (Step 000050): Train loss 0.909, Val loss 1.420
Ep 1 (Step 000055): Train loss 1.551, Val loss 1.398
Ep 1 (Step 000060): Train loss 1.668, Val loss 1.381
Ep 1 (Step 000065): Train loss 1.354, Val loss 1.367
Ep 1 (Step 000070): Train loss 1.146, Val loss 1.357
Ep 1 (Step 000075): Train loss 0.877, Val loss 1.349
Ep 1 (Step 000080): Train loss 1.050, Val loss 1.337
Ep 1 (Step 000085): Train loss 0.827, Val loss 1.330
Ep 1 (Step 000090): Train loss 0.847, Val loss

KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    fig, ax1 = plt.subplots(figsize=(5, 3))
    ax1.plot(epochs_seen, train_losses, label="Training loss")
    ax1.plot(
    epochs_seen, val_losses, linestyle="-.", label="Validation loss"
    )
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Loss")
    ax1.legend(loc="upper right")
    ax2 = ax1.twiny()
    ax2.plot(tokens_seen, train_losses, alpha=0)
    ax2.set_xlabel("Tokens seen")
    fig.tight_layout()
    plt.show()

In [None]:
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

In [None]:
torch.manual_seed(123)

for entry in test_data[:5]:
    input_text = format_input(entry)
    token_ids = generate(model=model, idx=text_to_token_ids(input_text, tokenizer).to(device),
        max_new_tokens=256,
            context_size=BASE_CONFIG["context_length"],
        eos_id=50256
        )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = (
    generated_text[len(input_text):]
    .replace("### Response:", "")
    .strip()
    )
    print(input_text)
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")

In [None]:
torch.save({
"model_state_dict": model.state_dict(),
"optimizer_state_dict": optimizer.state_dict(),
},
"model_and_optimizer.pth"
)