In [1]:
import torch
from gptmodel import GPTModel

# Hyper-parameters handed to GPTModel. The comments below relate each key to
# what the printed module tree shows for the constructed model.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # rows of the token embedding table
    "context_length": 256,   # rows of the positional embedding table
    "emb_dim": 768,          # embedding width (in/out features of the attention projections)
    "n_heads": 12,           # presumably attention heads per block -- confirm in gptmodel
    "n_layers": 12,          # presumably number of TransformerBlock modules -- confirm in gptmodel
    "drop_rate": 0.1,        # p used by the Dropout modules
    "qkv_bias": False        # bias flag of the W_query / W_key / W_value projections
}

# Seed PyTorch's RNG before building the model so that the random weight
# initialisation -- and therefore every number printed by the cells below --
# is identical after a kernel restart.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
# Switch to eval mode (disables dropout). Kept as the last expression so the
# cell also displays the module tree.
model.eval()


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [2]:
# running untrained model
import tiktoken
from gptmodel import generate_text_simple

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` into a tensor of token IDs with a leading batch axis.

    Args:
        text: The string to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.

    Returns:
        A tensor of shape ``(1, num_tokens)``.
    """
    # '<|endoftext|>' is allowed through as a special token instead of being rejected.
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Insert the batch dimension in front of the token dimension.
    return torch.unsqueeze(torch.tensor(ids), 0)

def token_ids_to_text(token_ids, tokenizer):
    """Decode a tensor of token IDs back into text.

    Args:
        token_ids: Tensor of token IDs, either 1-D ``(num_tokens,)`` or with a
            leading batch axis of size one ``(1, num_tokens)``.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flat list of IDs.
    """
    flat = token_ids.squeeze(0)  # drop the batch axis when it has size 1
    if flat.dim() == 0:
        # A single-token 1-D input (shape (1,)) is squeezed down to a scalar,
        # whose .tolist() is a bare int rather than a list. Restore the token
        # axis so decode() always receives a list.
        flat = flat.unsqueeze(0)
    return tokenizer.decode(flat.tolist())

# Have the model (not trained at this point) extend a short prompt by ten tokens.
tokenizer = tiktoken.get_encoding("gpt2")
start_context = "Every effort moves you"
prompt_ids = text_to_token_ids(start_context, tokenizer)

token_ids = generate_text_simple(
    model=model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"],
)
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


Output text:
 Every effort moves you retrievalables-------------------------------- weddinggovernmental.",Master loved referees familiar


In [3]:
# Loss calculation walkthrough (this cell and the ones that follow).
# Two toy sequences of three token IDs each; every target row is its input
# row shifted one token to the left, with the next token appended.
inputs = torch.tensor([
    [16833, 3626, 6100],   # "every effort moves"
    [40, 1107, 588],       # "I really like"
])
targets = torch.tensor([
    [3626, 6100, 345],     # " effort moves you"
    [1107, 588, 11311],    # " really like chocolate"
])

# Forward pass without gradient tracking, then logits -> probabilities.
with torch.no_grad():
    logits = model(inputs)
probas = logits.softmax(dim=-1)
print(probas.shape) # batch size x number of tokens in each row x vocab size

# Most probable token at every position (keepdim leaves a trailing axis of size 1).
token_ids = probas.argmax(dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

# Compare the desired continuation with the model's pick for the first sequence.
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1:" f" {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

torch.Size([2, 3, 50257])
Token IDs:
 tensor([[[ 8604],
         [ 6423],
         [28079]],

        [[47922],
         [10285],
         [30649]]])
Targets batch 1:  effort moves you
Outputs batch 1:  landsThenongs


In [4]:
text_idx = 0 # first sequence: "every effort moves"
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]] # at positions 0, 1, 2, pick the probability assigned to that position's target token (" effort moves you")
print("Text 1:", target_probas_1)
text_idx = 1 # second sequence: "I really like"
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]] # same lookup for the targets " really like chocolate"
print("Text 2:", target_probas_2)

# each resulting tensor holds three probabilities (not indices): the probability the model gives to the correct next token at each of the three positions

Text 1: tensor([1.8930e-05, 4.9060e-06, 2.9921e-05])
Text 2: tensor([1.5949e-05, 1.4157e-05, 9.9239e-06])


In [5]:
# Log of the six target-token probabilities gathered in the previous cell
log_probas = torch.cat((target_probas_1, target_probas_2)).log()
print(log_probas)
# Mean over all six tokens
avg_log_probas = log_probas.mean()
print(avg_log_probas)
# Flip the sign to get a positive loss value (negative average log-probability)
neg_avg_log_probas = -avg_log_probas
print(neg_avg_log_probas)

tensor([-10.8748, -12.2250, -10.4170, -11.0461, -11.1653, -11.5206])
tensor(-11.2081)
tensor(11.2081)


In [6]:
# Compute the same quantity with PyTorch's built-in cross-entropy loss.
# cross_entropy takes the raw logits and the target token IDs and returns the
# negative average log-probability of the targets, i.e. the value derived by
# hand in the previous cell.
print("Logits shape:", logits.shape)
print("Targets shape:", targets.shape)
# Merge the batch and token axes: one row of logits / one target ID per token.
logits_flat = logits.reshape(-1, logits.shape[-1])
targets_flat = targets.reshape(-1)
print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)

Logits shape: torch.Size([2, 3, 50257])
Targets shape: torch.Size([2, 3])
Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])
tensor(11.2081)


In [7]:
# Next step: compute the loss on training and validation datasets.
# Read the whole text file into memory as one string.
file_path = "the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read()

# Report the corpus size in characters and in tokens (as encoded by `tokenizer`).
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5145


In [8]:
# 90:10 train/validation split of the raw text. The cut is made on characters
# (an index into the string), with the trailing 10% held out for validation.
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data, val_data = text_data[:split_idx], text_data[split_idx:]

In [12]:
# instantiate the dataloader from text-embedding.ipynb
from gptmodel import create_dataloader_v1
# Seed first: the training loader is created with shuffle=True.
torch.manual_seed(123)

# Arguments common to both loaders. stride equals max_length, so consecutive
# windows presumably do not overlap -- confirm in create_dataloader_v1.
context_length = GPT_CONFIG_124M["context_length"]
shared_loader_args = dict(
    batch_size=2,
    max_length=context_length,
    stride=context_length,
    num_workers=0,
)

# Training loader: shuffled, drop_last=True.
train_loader = create_dataloader_v1(
    train_data, drop_last=True, shuffle=True, **shared_loader_args
)
# Validation loader: fixed order, drop_last=False.
val_loader = create_dataloader_v1(
    val_data, drop_last=False, shuffle=False, **shared_loader_args
)

# check that the sizes are correct
for heading, loader in (
    ("Train loader:", train_loader),
    ("\nValidation loader:", val_loader),
):
    print(heading)
    for x, y in loader:
        print(x.shape, y.shape)

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [14]:
# function to calculate loss of a given batch returned by the data loaders
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    Args:
        input_batch: Tensor of input token IDs fed to ``model``.
        target_batch: Tensor of target token IDs, one per input position.
        model: Callable mapping the input batch to logits.
        device: Device both tensors are moved to before the forward pass.

    Returns:
        The scalar loss tensor produced by ``cross_entropy``.
    """
    inputs = input_batch.to(device)
    expected = target_batch.to(device)
    predictions = model(inputs)
    # Merge the first two axes of the logits and flatten the targets so that
    # cross_entropy sees one row of logits per target ID.
    return torch.nn.functional.cross_entropy(
        torch.flatten(predictions, 0, 1),
        torch.flatten(expected),
    )

In [30]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over (a prefix of) a data loader.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Upper bound on the number of batches to evaluate.
            ``None`` (default) evaluates every batch; values larger than
            ``len(data_loader)`` are clamped.

    Returns:
        The mean of the batch losses as a Python float, or ``nan`` when there
        is nothing to average (empty loader or ``num_batches <= 0``).
    """
    if len(data_loader) == 0:
        return float("nan")
    if num_batches is None:
        num_batches = len(data_loader)
    else:
        num_batches = min(num_batches, len(data_loader))
    if num_batches <= 0:
        # No batch requested: report "no data" the same way as an empty
        # loader instead of dividing by zero (or by a negative count).
        return float("nan")

    total_loss = 0.
    # range() comes first in zip() so iteration stops before an extra,
    # unused batch is pulled from the loader.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

In [None]:
# Finally, compute the average loss over the training and validation loaders.
# Each batch pairs an input window with its target window, so the loss measures
# how well the model predicts the target tokens of the text.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when one is available
model.to(device)
with torch.no_grad():  # evaluation only: no gradients are needed
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

tensor([[  286,  1762,    30,  2011, 29483,  2540,   284,   467,   257,  1310,
          4295,   438,    40,  2936, 10927,   290,  8627,    13,   198,   198,
             1,  7454,    11,   618,   314,  3114,   510,    11,   314,  3947,
           284,   766,   257,  8212,  2157,   465,  1969, 12768,   680, 21213,
           438,   292,   611,   339,   550,   262,  3200,    11,   290,   547,
         28297,  2241,   416,  4769,   340,   736,   422,   502,    13,  1320,
         41851,   515,   502,   991,   517,    13,   383,  3200,    30,  4162,
            11,   314,   550,   257,  3200,  2861,  8208,   286,   465,     0,
           314, 37901,   379,   262, 21978, 44896,    11,   290,  3088,   617,
           286,   616, 49025,  5330, 15910,    13,   887,   484,  4054,   502,
            11,   484,  1067, 11137,    13,   314,  2497,   326,   339,  2492,
           470,  4964,   262,   905,    88, 10340,   438,    40,  3521,   470,
         11786,   465,  3241,    26,   339,   655,  