In [None]:
from transformers import GPT2LMHeadModel

# Fetch the pretrained "gpt2" checkpoint (the 124M model) and keep its parameter table around
hf_model = GPT2LMHeadModel.from_pretrained("gpt2")
hf_state_dict = hf_model.state_dict()

# One line per entry: parameter name followed by the shape of its tensor
for param_name in hf_state_dict:
    print(param_name, hf_state_dict[param_name].shape)

# Cell output: the first 20 scalars of the positional-embedding matrix once flattened to 1-D
wpe_flat = hf_state_dict["transformer.wpe.weight"].view(-1)
wpe_flat[:20]


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Whole positional-embedding matrix rendered as a grayscale image
wpe_weight = hf_state_dict["transformer.wpe.weight"]
plt.imshow(wpe_weight, cmap="gray")
plt.show()

# Three individual columns of that matrix (indices 150, 200, 250) drawn on one set of axes
for col_idx in (150, 200, 250):
    plt.plot(wpe_weight[:, col_idx])
plt.show()

# Top-left 300x300 corner of the `h.1` attention projection (c_attn) weight, again in grayscale
c_attn_weight = hf_state_dict["transformer.h.1.attn.c_attn.weight"]
plt.imshow(c_attn_weight[:300, :300], cmap="gray")
plt.show()


In [None]:
from transformers import pipeline, set_seed

# Reference generation through the Hugging Face pipeline: build it on the "gpt2"
# checkpoint, fix the seed, then request 5 continuations capped at max_length=30.
prompt = "Hello, I'm a language model,"
text_generator = pipeline("text-generation", model="gpt2")
set_seed(42)
text_generator(prompt, max_length=30, num_return_sequences=5)


In [None]:
# Hand-written top-k sampling loop (equivalent to HF defaults, but with every step explicit)
import torch
from torch.nn import functional as F

# 124M checkpoint, switched to eval mode and moved to the GPU when one is available
gen_model = GPT2LMHeadModel.from_pretrained("gpt2")
gen_model.eval()

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
gen_model.to(DEVICE)

# Seed the CPU generator, and the CUDA one too when sampling happens on the GPU
torch.manual_seed(42)
if DEVICE == "cuda":
    torch.cuda.manual_seed(42)

# Token ids for the prompt "Hello, I'm a language model,", copied into 5 identical rows
prompt_tokens = [15496, 11, 314, 1101, 257, 3303, 2746, 11]
token_batch = torch.tensor([prompt_tokens] * 5, dtype=torch.long)  # (B=5, T=8)
x_tokens = token_batch.to(DEVICE)

max_length = 30
top_k = 50

# Each pass appends exactly one token per row, so the number of passes is known up front
steps_remaining = max_length - x_tokens.size(1)
with torch.no_grad():
    for _ in range(steps_remaining):
        step_logits = gen_model(x_tokens).logits[:, -1, :]            # (B, vocab_size), last position only
        step_probs = F.softmax(step_logits, dim=-1)                   # (B, vocab_size)
        topk_probs, topk_indices = step_probs.topk(top_k, dim=-1)     # (B, top_k) each
        # multinomial draws a position inside the top-k list; gather maps it back to a vocab id
        choice = torch.multinomial(topk_probs, num_samples=1)         # (B, 1)
        next_token = topk_indices.gather(-1, choice)                  # (B, 1)
        x_tokens = torch.cat([x_tokens, next_token], dim=1)

# Turn the first max_length tokens of every row back into text
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

for row in x_tokens:
    print(">", tokenizer.decode(row[:max_length].tolist()))


In [None]:
# Tiny Shakespeare sample: read input.txt and work with its first 1,000 characters
with open("input.txt", "r") as corpus_file:
    tiny_text = corpus_file.read()

data_slice = tiny_text[:1000]
print(data_slice[:100])

# Encode the sample with the GPT-2 tokenizer and show the first 24 token ids
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
tok_ids = tokenizer.encode(data_slice)
print(tok_ids[:24])

import torch
# A 4x6 grid of inputs plus the same grid shifted by one token for the targets
# needs 4*6 + 1 = 25 consecutive token ids.
n_rows, n_cols = 4, 6
buf = torch.tensor(tok_ids[: n_rows * n_cols + 1])
x_win = buf[:-1].view(n_rows, n_cols)  # inputs: everything but the last token
y_win = buf[1:].view(n_rows, n_cols)   # targets: everything but the first token
print(x_win)
print(y_win)


In [None]:
# Weight-tying check: compare lm_head.weight against transformer.wte.weight by
# shape, by value, and by the address of the underlying storage.
import torch

lm_head_weight = hf_state_dict["lm_head.weight"]
wte_weight = hf_state_dict["transformer.wte.weight"]

for weight in (lm_head_weight, wte_weight):
    print(weight.shape)

# True only when both tensors have the same size and identical elements
print(torch.equal(lm_head_weight, wte_weight))

# Identical pointers mean the two entries share one block of memory
for weight in (lm_head_weight, wte_weight):
    print(weight.data_ptr())


In [None]:
# Toy residual stream: add up num_layers standard-normal contributions of width 768.
# Unscaled, the std of the sum would grow like sqrt(num_layers); multiplying every
# contribution by num_layers ** -0.5 keeps the printed std close to 1.
import torch

num_layers = 100
layer_scale = num_layers ** -0.5
residual = torch.zeros(768)
for _layer in range(num_layers):
    residual += layer_scale * torch.randn(768)

print(residual.std())


In [None]:
# Simple MLP and gradient check: the gradient of one full-batch backward pass must
# match the gradient accumulated over per-example backward passes, provided each
# per-example loss is divided by the batch size.
import torch

# Seed BEFORE building the model. nn.Linear draws its initial weights from the
# global RNG, so seeding afterwards leaves the weights (and therefore both printed
# gradients) different on every kernel restart.
torch.manual_seed(42)

mlp = torch.nn.Sequential(
    torch.nn.Linear(16, 32),
    torch.nn.GELU(),
    torch.nn.Linear(32, 1),
)

x_in = torch.randn(4, 16)
y_true = torch.randn(4, 1)
n_examples = x_in.size(0)

# Reference: a single backward pass over the whole batch.
# mse_loss uses reduction="mean" by default, i.e. it averages over the examples.
mlp.zero_grad()
y_pred = mlp(x_in)
loss = torch.nn.functional.mse_loss(y_pred, y_true)
loss.backward()
# Clone: the zero_grad() call below resets or replaces .grad, so a plain reference would not survive.
grad_full_batch = mlp[0].weight.grad.detach().clone()
print(grad_full_batch.view(-1)[:10])

# Gradient accumulation demo: backward() adds into .grad, so one example at a time
# yields the SUM of per-example gradients. Dividing each loss by the number of
# examples restores the mean normalizer that the full-batch loss applied.
mlp.zero_grad()
for i in range(n_examples):
    y_pred_i = mlp(x_in[i])
    loss_i = torch.nn.functional.mse_loss(y_pred_i, y_true[i]) / n_examples
    loss_i.backward()
grad_accumulated = mlp[0].weight.grad.detach().clone()
print(grad_accumulated.view(-1)[:10])

# Make the demonstration self-checking instead of relying on eyeballing the two prints.
assert torch.allclose(grad_accumulated, grad_full_batch, atol=1e-6), \
    "accumulated gradient does not match the full-batch gradient"


In [None]:
# Parse and visualize a training log
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Model size whose training log is visualised; selects the matching reference numbers below.
size_label = "124M"

# Validation loss of the released OpenAI GPT-2 checkpoint, per model size.
# Only the 124M entry is available, so this table is queried with .get():
# other sizes resolve to None (which the plotting code checks for before
# drawing the baseline) instead of raising KeyError.
LOSS_BASELINES = {
    "124M": 3.2924,
}
loss_baseline = LOSS_BASELINES.get(size_label)

# HellaSwag accuracy of the GPT-2 checkpoints. This table covers every supported
# size, so strict indexing is kept: a mistyped size_label still fails fast here.
HELLA_GPT2_BASELINES = {
    "124M": 0.294463,
    "350M": 0.375224,
    "774M": 0.431986,
    "1558M": 0.488946,
}
hella_gpt2_baseline = HELLA_GPT2_BASELINES[size_label]

# HellaSwag accuracy of the GPT-3 checkpoints, same sizes and same strict lookup.
HELLA_GPT3_BASELINES = {
    "124M": 0.337,
    "350M": 0.436,
    "774M": 0.510,
    "1558M": 0.547,
}
hella_gpt3_baseline = HELLA_GPT3_BASELINES[size_label]

# Read the training log. Every record is three whitespace-separated fields:
#   <step> <stream name> <value>
with open("log124M_40B/log.txt", "r") as f:
    log_lines = f.readlines()

# Group metrics by stream (train, val, hella): {stream name: {step: value}}.
# If a (stream, step) pair is logged more than once, the last record wins.
streams = {}
for line in log_lines:
    fields = line.split()
    if not fields:
        # Blank or whitespace-only line (e.g. a stray newline at the end of the
        # file): nothing to parse, and unpacking it below would raise ValueError.
        continue
    # Lines that are non-empty but malformed still raise here on purpose.
    step_str, stream_name, value_str = fields
    streams.setdefault(stream_name, {})[int(step_str)] = float(value_str)

# Convert to step-sorted (steps, values) list pairs for plotting
streams_xy = {}
for stream_name, kv in streams.items():
    xy_sorted = sorted(kv.items())
    steps, vals = zip(*xy_sorted)
    streams_xy[stream_name] = (list(steps), list(vals))

fig = plt.figure(figsize=(16, 6))

# Left panel: train and validation loss curves on a log-scaled y axis
ax_loss = fig.add_subplot(1, 2, 1)
train_steps, train_loss = streams_xy["train"]
train_loss = np.asarray(train_loss)
ax_loss.plot(train_steps, train_loss, label=f"nanogpt ({size_label}) train loss")
print("Min Train Loss:", train_loss.min())

val_steps, val_loss = streams_xy["val"]
val_loss = np.asarray(val_loss)
ax_loss.plot(val_steps, val_loss, label=f"nanogpt ({size_label}) val loss")
# Dashed reference line, drawn only when a baseline value is available
if loss_baseline is not None:
    ax_loss.axhline(y=loss_baseline, linestyle="--", label=f"OpenAI GPT-2 ({size_label}) checkpoint val loss")
ax_loss.set_xlabel("steps")
ax_loss.set_ylabel("loss")
ax_loss.set_yscale("log")
ax_loss.set_ylim(top=4.0)  # clip the upper end of the loss axis at 4.0
ax_loss.legend()
ax_loss.set_title("Loss")

print("Min Validation Loss:", val_loss.min())

# Right panel: HellaSwag accuracy over training, with the reference checkpoints as dashed lines
ax_hella = fig.add_subplot(1, 2, 2)
hella_steps, hella_acc = streams_xy["hella"]
hella_acc = np.asarray(hella_acc)
ax_hella.plot(hella_steps, hella_acc, label=f"nanogpt ({size_label})")
if hella_gpt2_baseline:
    ax_hella.axhline(y=hella_gpt2_baseline, linestyle="--", label=f"OpenAI GPT-2 ({size_label}) checkpoint")
if hella_gpt3_baseline:
    ax_hella.axhline(y=hella_gpt3_baseline, linestyle="--", label=f"OpenAI GPT-3 ({size_label}) checkpoint")
ax_hella.set_xlabel("steps")
ax_hella.set_ylabel("accuracy")
ax_hella.legend()
ax_hella.set_title("HellaSwag eval")

print("Max Hellaswag eval:", hella_acc.max())
