In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn.functional as F

In [2]:
# Checkpoint id of the smallest GPT-2 variant (decoder-only Transformer, ~124M parameters)
model_name = "gpt2"
# Fetch the language-model weights and the matching tokenizer for that checkpoint
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [3]:
# Set model to evaluation mode (inference behaviour for layers such as the
# Dropout modules listed in the summary below). Because this is the cell's
# last expression, the returned module is echoed as the architecture summary.
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
# Example prompt: the text the model is asked to continue in the cells below
prompt = "ChatGPT explains things"

In [5]:
# Encode the prompt into token ids, returned as a PyTorch tensor
# (return_tensors="pt"). Later cells index it as input_ids[0], i.e. the
# first row holds the ids of this prompt.
input_ids = tokenizer.encode(prompt, return_tensors="pt")

In [6]:
# Sampling controls used by the generation cells
temperature = 0.8       # lower values -> more deterministic, higher values -> more random
top_k = 50              # keep only the 50 highest-scoring tokens as sampling candidates
top_p = 0.9             # nucleus sampling: smallest set of tokens whose cumulative prob >= 0.9

# Length budget and padding
max_new_tokens = 10                     # number of tokens to append to the prompt
pad_token_id = tokenizer.eos_token_id   # reuse the end-of-sequence id as the padding id

In [7]:
# Generate tokens autoregressively with pad_token_id explicitly set.
# Sampling is stochastic, so seed PyTorch's RNG first: without a seed the
# continuation changes on every run and the notebook cannot be reproduced
# with Restart Kernel -> Run All.
torch.manual_seed(42)
output_ids = model.generate(
    input_ids,
    attention_mask=torch.ones_like(input_ids),  # all-ones mask: every prompt position is attended to
    max_new_tokens=max_new_tokens,
    do_sample=True,         # Must enable sampling to use top-k/top-p/temperature
    top_k=top_k,
    top_p=top_p,
    temperature=temperature,
    pad_token_id=pad_token_id
)

In [8]:
# Convert the first sequence in the generated batch back into a string,
# leaving special tokens out of the text
first_sequence = output_ids[0]
generated_text = tokenizer.decode(first_sequence, skip_special_tokens=True)

# Report the prompt followed by the full decoded text
for label, value in (("Prompt:", prompt), ("Generated text:", generated_text)):
    print(label, value)

Prompt: ChatGPT explains things
Generated text: ChatGPT explains things about the program.

In this episode we


Example: step-by-step autoregressive generation

In [9]:
# Start the manual generation from an independent copy of the prompt ids,
# so appending tokens later does not touch input_ids
generated_ids = torch.clone(input_ids)

prompt_token_ids = generated_ids[0].tolist()
print("Initial prompt tokens:", prompt_token_ids)

Initial prompt tokens: [30820, 38, 11571, 6688, 1243]


In [10]:
# Show the prompt as sub-word token strings instead of numeric ids.
# NOTE(review): the 'Ġ' prefix in the output presumably marks a token that
# starts with a space -- confirm against the tokenizer documentation.
print("Prompt tokens:", tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))

Prompt tokens: ['Chat', 'G', 'PT', 'Ġexplains', 'Ġthings']


In [11]:
# Autoregressive generation step-by-step.
# NOTE: this manual loop applies temperature only; top_k and top_p are not
# applied here, so each token is sampled from the full vocabulary.

# Restart from the prompt and seed the RNG so that re-running this cell
# reproduces the same walk-through instead of extending the sequence left
# over from a previous run.
generated_ids = input_ids.clone()
torch.manual_seed(0)

for step in range(max_new_tokens):
    # Build the causal mask (upper triangular) for display only; it is not
    # passed to the model call below.
    seq_len = generated_ids.size(1)
    causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()

    print(f"\nStep {step+1}: sequence length = {seq_len}")
    print("Causal attention mask:")
    print(causal_mask.int())  # 1 = masked, 0 = attendable

    # Forward pass over the whole sequence generated so far. This is
    # inference only, so disable autograd to avoid building a graph each step.
    with torch.no_grad():
        outputs = model(generated_ids)
    logits = outputs.logits

    # Get logits for the last position (the scores for the next token)
    next_token_logits = logits[0, -1, :]

    # Apply temperature
    next_token_logits = next_token_logits / temperature

    # Compute probabilities
    probs = F.softmax(next_token_logits, dim=-1)

    # Sample next token (one-element tensor) and append it as a new column
    next_token_id = torch.multinomial(probs, num_samples=1)
    generated_ids = torch.cat([generated_ids, next_token_id.unsqueeze(0)], dim=1)

    # Decode last token to word and look up the probability it was sampled with
    next_token_word = tokenizer.decode(next_token_id)
    next_token_prob = probs[next_token_id].item()
    print(f"Step {step+1}: Generated token ID={next_token_id.item()}, word='{next_token_word}', prob={next_token_prob:.4f}")

# Decode full generated text
full_generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("\nFull generated text:\n", full_generated_text)


Step 1: sequence length = 5
Causal attention mask:
tensor([[0, 1, 1, 1, 1],
        [0, 0, 1, 1, 1],
        [0, 0, 0, 1, 1],
        [0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0]], dtype=torch.int32)
Step 1: Generated token ID=345, word=' you', prob=0.0195

Step 2: sequence length = 6
Causal attention mask:
tensor([[0, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1],
        [0, 0, 0, 1, 1, 1],
        [0, 0, 0, 0, 1, 1],
        [0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0]], dtype=torch.int32)
Step 2: Generated token ID=1244, word=' might', prob=0.0660

Step 3: sequence length = 7
Causal attention mask:
tensor([[0, 1, 1, 1, 1, 1, 1],
        [0, 0, 1, 1, 1, 1, 1],
        [0, 0, 0, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1],
        [0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 0, 0, 0, 0]], dtype=torch.int32)
Step 3: Generated token ID=407, word=' not', prob=0.7946

Step 4: sequence length = 8
Causal attention mask:
tensor([[0, 1, 1, 1, 1, 1, 1, 1],
        [0, 0,