# GPT Architecture – Implementation

#### Step 1: Start with the Skeleton

In [354]:
import torch
import math

# Model configuration
# All tunable values for the notebook live here; later cells read them by name.
batch_size = 1          # number of sentences processed together
sequence_length = 5     # number of token IDs kept per sentence (pad/truncate target)
embedding_dim = 16      # length of the vector that represents each token
num_heads = 4           # attention heads (not used by the dummy blocks below)
vocab_size = 50257      # number of rows in the token embedding table
dropout_rate = 0.1      # probability passed to dropout on the embeddings
seed = 42               # fixed seed so every run produces the same random tensors

# head_dim only makes sense when the embedding splits evenly across the heads;
# floor division would otherwise silently drop the remainder.
if embedding_dim % num_heads != 0:
    raise ValueError(
        f"embedding_dim ({embedding_dim}) must be divisible by num_heads ({num_heads})"
    )
head_dim = embedding_dim // num_heads

# Seed torch's default generator: the embedding tables, the dropout mask and the
# output projection are all drawn at random, so without this the printed
# prediction changes on every run.
torch.manual_seed(seed)

#### Step 2: Create a Pretend Sentence for Input
- Imagine a pretend input sentence where each word is replaced by a number (a token ID)
- The sentence "The cat sat on the" is tokenized with the GPT-2 tokenizer, then padded or truncated to exactly 5 token IDs (each ID is between 0 and vocab_size - 1)

In [356]:
# tiktoken is first used in this cell, so it is imported here; without this the
# cell raises NameError on a fresh kernel (Restart Kernel -> Run All).
import tiktoken

# Load the GPT-2 tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

# Here is the pretend sentence:
sentence = "The cat sat on the"

# Tokenize the sentence into token IDs
input_tokens = tokenizer.encode(sentence)

# Adjust length to exactly sequence_length.
# If your sentence is shorter, pad it. If longer, truncate it.
# NOTE(review): 0 is used as filler; confirm it is an acceptable pad ID for this tokenizer.
if len(input_tokens) < sequence_length:
    # Build a new list instead of extending in place, so re-running the cell
    # never depends on a previously mutated list.
    input_tokens = input_tokens + [0] * (sequence_length - len(input_tokens))  # Padding
else:
    input_tokens = input_tokens[:sequence_length]                              # Truncate

# Convert to tensor; the extra outer list adds the leading batch axis (one sentence)
x = torch.tensor([input_tokens])

print("Input Token IDs:", x)

Input Token IDs: tensor([[ 464, 3797, 3332,  319,  262]])


#### Step 3: Convert words into Numbers that GPT can understand (Token and Position Embedding)

In [358]:
# Two lookup tables filled with random values (untrained stand-ins):
#   token_embedding    -> one row of 16 numbers per vocabulary entry
#   position_embedding -> one row of 16 numbers per position in the sentence
token_embedding = torch.rand(vocab_size, embedding_dim)            # Shape: [50257, 16]
position_embedding = torch.rand(sequence_length, embedding_dim)    # Shape: [5, 16]

# Index the token table with the ID tensor: each ID is swapped for its row
tok_emb = token_embedding[x]                                       # Shape: [1, 5, 16]

# Give the position table a leading batch axis and copy it once per batch item
pos_emb = position_embedding[None, :, :].repeat(batch_size, 1, 1)  # Shape: [1, 5, 16]

# Add meaning and position together, then apply dropout, which randomly
# removes some of the information (train=True keeps dropout switched on)
x_emb = torch.dropout(tok_emb + pos_emb, p=dropout_rate, train=True)

print("Token + Position Embedding Shape:", x_emb.shape)

Token + Position Embedding Shape: torch.Size([1, 5, 16])


#### Step 3b: Pass through Dummy Transformer Blocks (12 Times)

In [360]:
# Stand-in for a stack of 12 Transformer blocks.
# A real block would do attention, thinking and balancing to refine the
# sentence representation; this dummy version passes the tensor through unchanged.

# Work on a copy so the embedding output itself is left untouched
x_trf = x_emb.clone()

for _ in range(12):
    # One "block" per iteration, stacked like LEGO bricks; each is an identity here
    x_trf = x_trf

print("Output after Transformer Stack:", x_trf.shape)

Output after Transformer Stack: torch.Size([1, 5, 16])


#### Step 4: Final Normalization 

In [362]:
# Placeholder for the final normalization step.
# Normally this step would clean and stabilize the numbers,
# but in this dummy model we skip it and pass the tensor on as is.
# x_norm is simply a second name for the same tensor as x_trf (no copy is made).
x_norm = x_trf  # No-op

#### Step 5: Output Prediction

In [364]:
# Random table that maps each 16-number feature vector back onto the vocabulary
# Shape: [16, 50257] — one column of weights per possible word
output_projection = torch.rand(embedding_dim, vocab_size)

# Matrix-multiply the features by the table to get one raw score (logit)
# per vocabulary entry at every position in the sentence
logits = torch.matmul(x_norm, output_projection)

# For each of the 5 words, the model gives scores for all 50257 words
print("Logits shape:", logits.shape)


Logits shape: torch.Size([1, 5, 50257])


#### Step 6: Convert logits into the next predicted word

In [366]:
import torch.nn.functional as F
import tiktoken

# Turn the raw scores into probabilities (softmax over the vocabulary axis)
probs = F.softmax(logits, dim=-1)

# Only the final position matters for guessing the next word:
# first (and only) sentence in the batch, last token, all vocabulary entries
last_token_probs = probs[0, -1, :]   # Shape: [vocab_size]

# Greedy choice: take the ID whose probability is the largest
predicted_token_id = torch.argmax(last_token_probs).item()

# GPT-2 tokenizer, used here to map the chosen ID back to text
tokenizer = tiktoken.get_encoding("gpt2")
predicted_word = tokenizer.decode([predicted_token_id])

print("Predicted Word:", predicted_word)


Predicted Word:  troubling
