## Andrej Karpathy Walktrouugh

In [None]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-03-19 00:56:06--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-03-19 00:56:07 (27.6 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# read the data
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

In [None]:
print(f"length of the dataset {len(text)}")

length of the dataset 1115394


In [None]:
# look the first 500 character
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


# Tokenization
(character language level)

In [None]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [None]:
# create a mapping from characters to integers
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

print(encode("hello there"))
print(decode(encode("hello there")))

[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43]
hello there


for tokenization, there's some libraries that can do that, for example:

  google use SentencePiece [https://github.com/google/sentencepiece](https://github.com/google/sentencepiece)
  
  openai use tiktoken [https://github.com/openai/tiktoken](https://github.com/openai/tiktoken)


In [None]:
# encode the entire dataset
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:500])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [None]:
print(decode([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,])) # the first row

First Citizen:
Bef


# Split the data

In [None]:
n = int(0.9*len(data)) # take 90% value based on the data
train_data = data[:n]
val_data = data[n:]

"batch"-ing the data!!

In [None]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
print(decode(train_data[:block_size+1].tolist())) # Converts the tensor to a Python list.

First Cit


In [None]:
x = train_data[:block_size]
y = train_data[:block_size+1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f"when input is {context} target: {target}")

when input is tensor([18]) target: 18
when input is tensor([18, 47]) target: 47
when input is tensor([18, 47, 56]) target: 56
when input is tensor([18, 47, 56, 57]) target: 57
when input is tensor([18, 47, 56, 57, 58]) target: 58
when input is tensor([18, 47, 56, 57, 58,  1]) target: 1
when input is tensor([18, 47, 56, 57, 58,  1, 15]) target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) target: 47


In [None]:
torch.manual_seed(1121)

batch_size = 4  # how many independent sequence will process in parallel
block_size = 8  # what is the maximum context length for prediction

def get_batches(split):
  if split == 'train':
    data = train_data
  else:
    data = val_data

  ix = torch.randint(len(data) - block_size, (batch_size, ))
  x = torch.stack([data[i:i+block_size] for i in ix ])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y

xb, yb = get_batches('train')
print("inputs:")
print(xb.shape)
print(xb)
print("targets:")
print(yb.shape)
print(yb)

print("--------------")

for b in range(batch_size):
  for t in range(block_size):
    context = xb[b, :t+1]
    target = yb[b, t]
    print(f"when input is {context.tolist()} the taget: {target}")

print("--------------------")
print(decode(context.tolist()))
print("--------------------")

for b in range(batch_size):
  for t in range(block_size):
    context = xb[b, :t+1]
    target = yb[b, t]
    print(f"when input is {decode(context.tolist())} the target: {decode([target.tolist()])}")

inputs:
torch.Size([4, 8])
tensor([[52, 52,  5, 42,  1, 58, 46, 43],
        [46, 43, 47, 56,  1, 42, 43, 39],
        [47, 57,  1, 51, 53, 57, 58,  1],
        [23, 21, 26, 19,  1, 17, 16, 35]])
targets:
torch.Size([4, 8])
tensor([[52,  5, 42,  1, 58, 46, 43, 51],
        [43, 47, 56,  1, 42, 43, 39, 58],
        [57,  1, 51, 53, 57, 58,  1, 53],
        [21, 26, 19,  1, 17, 16, 35, 13]])
--------------
when input is [52] the taget: 52
when input is [52, 52] the taget: 5
when input is [52, 52, 5] the taget: 42
when input is [52, 52, 5, 42] the taget: 1
when input is [52, 52, 5, 42, 1] the taget: 58
when input is [52, 52, 5, 42, 1, 58] the taget: 46
when input is [52, 52, 5, 42, 1, 58, 46] the taget: 43
when input is [52, 52, 5, 42, 1, 58, 46, 43] the taget: 51
when input is [46] the taget: 43
when input is [46, 43] the taget: 47
when input is [46, 43, 47] the taget: 56
when input is [46, 43, 47, 56] the taget: 1
when input is [46, 43, 47, 56, 1] the taget: 42
when input is [46, 43, 47

# BIGRAM!! (simplest language model)

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embeding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets):

    logits = self.token_embeding_table(idx)

    return logits

model = BigramLanguageModel(vocab_size)
out = model(xb, yb)  # xb = input, yb = target
print(out.shape)

torch.Size([4, 8, 65])


In [None]:
# adding loss and generate function!!

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embeding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):

    logits = self.token_embeding_table(idx) # (B, T, C)

    if targets == None:
      loss = None
    else:
      # fixing the shape for crossentropy loss
      B, T, C  = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      # get the prediction
      logits, loss = self(idx)
      # focus only on the last time step
      logits = logits[:, -1, :] # become (B, C)
      # apply softmax
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)  # xb = input, yb = target
print(out.shape)
print(loss)
print(decode(model.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([4, 8, 65])
tensor(4.7411, grad_fn=<NllLossBackward0>)

hbH

:CLP.A!fq'3ggt!O!T?X!!SA?W&TrpvYybSE3w&S BXUhmiKYyTmWMPhhmnHKj!!btgnwNNULuEzRuYyiWEQxPX!$3C'MBj


there you go, the bigram language model!! (without training lol)

explanation for bigram and loss:

[https://colab.research.google.com/drive/1gcGbNbQBIxTlz07mXZ2Df4b8f8zJkYh9](https://colab.research.google.com/drive/1gcGbNbQBIxTlz07mXZ2Df4b8f8zJkYh9)

the reason why we change the shape for the loss:

[https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)

## Training the bigram model

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [None]:
batch_size = 32
for steps in range(14000):

  # sample a batch of data
  xb, yb = get_batches('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())

2.4667112827301025


In [None]:
print(decode(model.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


BULURYothorshoolandine! t h

RY:
Wha-whe, heresety fousull, stholount;
AUKironongan tak t jot, wan, meriongry henowatsthepe ainceleand t, fig:
NELLYORComicecarindiss spoubloureroanrsth, wisshr ppprt nd,
Mart atht ld hathexcoute:
ppe

LOvere ing beeses, u biue p! nofllealllatho hathe thowil oute lf ur,

Insihe wan hin bo tllaighelat at; b n asir cove, tis, tond d mall amid s
CUSouss hatou t
BRTome,

I Yere t t totly ngut angustculexproour whis misorabround.


Bul bety
Yond sithe, sthigaist tsoss 


there you go, bigram language model for real (because we trained it)!!

Note:

B = independent sequences process in parallel
T = number of token in each sequence
C = number of feature per token

# Self-attention

###### THE MATHEMATICAL TRICK IN SELF-ATTENTION (using toy example)

In [None]:
# THE MATHEMATICAL TRICK IN SELF-ATTENTION

torch.manual_seed(1337)
B, T, C = 4, 8, 2 # batch, time, channel
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

we want a variable that contain an average value based on the previous time on every batch


small intuition: if I have [1, 2, 3, 4,] --> [mean of 1, mean of 1,2, mean of 1,2,3, mean of 1,2,3,4] ----> [1, 1.5, 2, 2.5]

In [None]:
# 1st option (not efficient)
xbow = torch.zeros((B, T, C))
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1] #(t, C)
    xbow[b, t] = torch.mean(xprev, 0)

In [None]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [None]:
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
# 2nd option (using matrix so it's more efficient)

torch.manual_seed(42)

a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)  # a / sum every value on each dimension of a
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print("a=")
print(a)
print('----')
print("b=")
print(b)
print('----')
print("c=")
print(c)
print('----')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
----
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
----
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])
----


In [None]:
wei = torch.tril(torch.ones(T, T))   # T = represent 8 character in this case
wei = wei / torch.sum(wei, 1, keepdim=True)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
xbow2 = wei @ x
xbow2[1]

tensor([[ 1.3488, -0.1396],
        [ 0.8173,  0.4127],
        [-0.1342,  0.4395],
        [ 0.2711,  0.4774],
        [ 0.2421,  0.0694],
        [ 0.0084,  0.0020],
        [ 0.0712, -0.1128],
        [ 0.2527,  0.2149]])

In [None]:
xbow[1]

tensor([[ 1.3488, -0.1396],
        [ 0.8173,  0.4127],
        [-0.1342,  0.4395],
        [ 0.2711,  0.4774],
        [ 0.2421,  0.0694],
        [ 0.0084,  0.0020],
        [ 0.0712, -0.1128],
        [ 0.2527,  0.2149]])

In [None]:
torch.allclose(xbow[0], xbow2[0])

True

and it's the same with the 1st option

In [None]:
# 3rd option (using softmax)
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x

why softmax?

exp of 0 is 1

and exp of -inf is 0

We create a mask with values -∞ (negative infinity) for positions we want to ignore.

and when applying softmax, any value close to -∞ becomes zero probability. basicly preventing information leakage.

In [None]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
wei = F.softmax(wei, dim=-1)  # exp of 0 = 1, and exp of -inf is 0
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [None]:
xbow3[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
torch.allclose(xbow[0], xbow3[0])

True

# THE CRUX OF SELF-ATTENTION

In [None]:
# 4th option (self-attention!!!)

torch.manual_seed(1337)
B, T, C = (4, 8, 32) # batch, time, channels
x = torch.randn(B, T, C) # input using random value for the purpose of study

# single head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)  # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) --> (B, T, T) because we want to preserve the batch

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei@ v

out.shape

torch.Size([4, 8, 16])

q = "What am I looking for?"

k = "What do I have?"

### Scaling self-attention

In [None]:
# 4th option (self-attention!!!)

torch.manual_seed(1337)
B, T, C = (4, 8, 32) # batch, time, channels
x = torch.randn(B, T, C) # input using random value for the purpose of study

# single head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)  # (B, T, 16)
q = query(x) # (B, T, 16)
wei = q @ k.transpose(-2, -1) * C**-0.5 # here's the scaling

tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei@ v

out.shape

torch.Size([4, 8, 16])

# Bring everything together !!!

the same with the code above

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameter
batch_size = 32
block_size = 8
max_iters = 2000
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

torch.manual_seed(1337)

# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

# tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters that occur in this text
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data input x and target y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ('train', 'val'):
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

# super simple bigram model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    self.token_embeding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):

    # idx and targets are both (B, T) tensor of integers
    logits = self.token_embeding_table(idx) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      logits, loss = self(idx)
      # foucus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index into running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

# create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


--2025-03-19 00:57:05--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2025-03-19 00:57:05 (28.3 MB/s) - ‘input.txt.1’ saved [1115394/1115394]

Step 0: train loss 4.7305, val loss 4.7241
Step 300: train loss 2.8110, val loss 2.8249
Step 600: train loss 2.5434, val loss 2.5682
Step 900: train loss 2.4932, val loss 2.5088
Step 1200: train loss 2.4863, val loss 2.5035
Step 1500: train loss 2.4665, val loss 2.4921
Step 1800: train loss 2.4683, val loss 2.4936

Anafyir tit, wrerto:
ICK:
M?
Coveay pbanPr Totil;
ABy sub fourintersod t he s t Boke

S:
II:
Antioucldeam.
Whof he taves tellan enourd geitouroucan yomistrsetho

## Upgrade

add token embedding

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameter
batch_size = 32
block_size = 8
max_iters = 200
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd=32

torch.manual_seed(1337)

# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

# tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters that occur in this text
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data input x and target y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ('train', 'val'):
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

# super simple bigram model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    self.token_embeding_table = nn.Embedding(vocab_size, n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):

    # idx and targets are both (B, T) tensor of integers
    tok_emb = self.token_embeding_table(idx) # (B, T, C)  --> C in here is n_emb
    logits = self.lm_head(tok_emb) # (B, T, C)  --> C in here is vocab_size ===> (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      logits, loss = self(idx)
      # foucus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index into running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

# create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


--2025-03-19 00:57:14--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2025-03-19 00:57:14 (22.9 MB/s) - ‘input.txt.2’ saved [1115394/1115394]

Step 0: train loss 4.3886, val loss 4.3734

INTEVyonge thy I hyy mirtom or.
F the ty win mt, hanow,
He t be th myouveelthe kesthee?
Thefthe hemalde n orcherizn th ghieHucelis
Su lder ph you IMWhameachigshal ORE:
Hrt 's!es f he uo bleangn beyoollis yowhy use, ng ndis'des; t, e n, tony ths gengen famolat gatout od t btatl m, qnce scurcheg bee lanofe bat my; LIfWAUCo teres.
I tovigworescy the the Ses bly ilyonomere, cenghe orkhaine lere omemathyy bllot my at ad athe lathealud

add positional embedding

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameter
batch_size = 32
block_size = 8
max_iters = 200
eval_interval = 300
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd=32

torch.manual_seed(1337)

# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

# tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters that occur in this text
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data input x and target y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ('train', 'val'):
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

# super simple bigram model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    # idx and targets are both (B, T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B, T, C)  --> C in here is n_emb
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
    x = tok_emb + pos_emb   #(B, T, C)
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      # foucus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index into running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

# create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


--2025-03-19 00:57:16--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.3’


2025-03-19 00:57:16 (25.5 MB/s) - ‘input.txt.3’ saved [1115394/1115394]

Step 0: train loss 4.4801, val loss 4.4801

F.
D:
Fout od t statl lt qnce scuraveg whe lanofe tat at--LIfWiloo teres.
I tovigworescy athe I o peron, ilyomo; per cend?
Horkhaine lere omemathyy splot my at ad at falsthealud bimeirmes ot, st ad heal the aROon--lwhotod,
OLalLe I lond!
Ma- gyo r, s e, h s quce
 s peavefe, pagorsom asiny tog

Fyourist lof s ce hear lon:
FENCUKAntas la; m yonthy.
II An baoou:
TI 's s he s;
Wh nd mres gomnd the hitha, thiserhice mat:
C--

BICDpad 

add self-attention

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameter
batch_size = 32
block_size = 8
max_iters = 200
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd=32

torch.manual_seed(1337)

# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

# tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters that occur in this text
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data input x and target y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ('train', 'val'):
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)

    wei = q @ k.transpose(-2, -1) * C**-0.5
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x)
    out = wei @ v
    return out

# super simple bigram model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.sa_head = Head(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    # idx and targets are both (B, T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B, T, C)  --> C in here is n_emb
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
    x = tok_emb + pos_emb   #(B, T, C)
    x = self.sa_head(x)
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      # foucus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index into running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

# create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


--2025-03-19 00:57:18--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.4’


2025-03-19 00:57:18 (28.3 MB/s) - ‘input.txt.4’ saved [1115394/1115394]

Step 0: train loss 4.2000, val loss 4.2047
Step 100: train loss 3.2338, val loss 3.2548

bsSttiosovkauk Ey h md hss gf,f ySsn'
.AT B htytrN sts g brw'eienoYNpot,id iee ttnc tIIhoitefrde ;ttmeeagGde
yKeTNfuv w
:ygOher,ll Ptinen hhsytotolipdRn l:

wet
H
WaiHhiabveo b h twWhehtet
GFdysonmCeu
v,uKnotgl.Z!r xcp
:T hiredoCu a
3rgNiK:Tnrnm
JV
T.'a'endfs Ie hs As' owmsimlae HymoGuysuoPyiaieH' t, ooc btew cfomweisradrka.
Tsn idRnenrmrsdiu,m,utsselke
a?vri Cign
Kcd  OMs rtoea!ngdti 

add multi-head self-attention

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameter
batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd=32

torch.manual_seed(1337)

# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

# tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters that occur in this text
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data input x and target y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ('train', 'val'):
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)

    wei = q @ k.transpose(-2, -1) * C**-0.5
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x)
    out = wei @ v
    return out

class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)

# super simple bigram model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.sa_head = MultiHeadAttention(4, n_embd//4)  # i.e. 4 heads of 8-dimensional self-attention
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    # idx and targets are both (B, T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B, T, C)  --> C in here is n_emb
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
    x = tok_emb + pos_emb   #(B, T, C)
    x = self.sa_head(x)
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      # foucus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index into running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

# create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


--2025-03-19 00:57:21--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.5’


2025-03-19 00:57:21 (22.8 MB/s) - ‘input.txt.5’ saved [1115394/1115394]

Step 0: train loss 4.2227, val loss 4.2226
Step 300: train loss 2.8402, val loss 2.8482
Step 600: train loss 2.6071, val loss 2.6175
Step 900: train loss 2.5209, val loss 2.5235
Step 1200: train loss 2.4699, val loss 2.4689
Step 1500: train loss 2.4201, val loss 2.4294
Step 1800: train loss 2.3949, val loss 2.4017
Step 2100: train loss 2.3656, val loss 2.3681
Step 2400: train loss 2.3427, val loss 2.3595
Step 2700: train loss 2.3246, val loss 2.3454

ABows theod davis ardi

add feed-forward layer

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameter
batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd=32

torch.manual_seed(1337)

# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

# tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters that occur in this text
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data input x and target y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ('train', 'val'):
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)

    wei = q @ k.transpose(-2, -1) * C**-0.5
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x)
    out = wei @ v
    return out

class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)

class FeedForward(nn.Module):

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, n_embd),
        nn.ReLU(),
    )

  def forward(self, x):
    return self.net(x)


# super simple bigram model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.sa_head = MultiHeadAttention(4, n_embd//4)  # i.e. 4 heads of 8-dimensional self-attention
    self.ffwd = FeedForward(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    # idx and targets are both (B, T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B, T, C)  --> C in here is n_emb
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
    x = tok_emb + pos_emb   #(B, T, C)
    x = self.sa_head(x)
    x = self.ffwd(x)  # (B, T, C)
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      # foucus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index into running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

# create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


--2025-03-19 01:07:58--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.6’


2025-03-19 01:07:59 (23.6 MB/s) - ‘input.txt.6’ saved [1115394/1115394]

Step 0: train loss 4.1996, val loss 4.1995
Step 300: train loss 2.7609, val loss 2.7737
Step 600: train loss 2.5634, val loss 2.5733
Step 900: train loss 2.4929, val loss 2.4792
Step 1200: train loss 2.4409, val loss 2.4348
Step 1500: train loss 2.3911, val loss 2.4002
Step 1800: train loss 2.3620, val loss 2.3672
Step 2100: train loss 2.3343, val loss 2.3430
Step 2400: train loss 2.3102, val loss 2.3326
Step 2700: train loss 2.2846, val loss 2.3050

Casth ollowsixs my hin

add residual connection

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameter
batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd=32

torch.manual_seed(1337)

# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

# tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters that occur in this text
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data input x and target y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ('train', 'val'):
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)

    wei = q @ k.transpose(-2, -1) * C**-0.5
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x)
    out = wei @ v
    return out

class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(out)
    return out

class FeedForward(nn.Module):

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
    )

  def forward(self, x):
    return self.net(x)


class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    # we want to intersperses communitation and then computation
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)  # communication
    self.ffwd = FeedForward(n_embd)                  # computation

  def forward(self, x):
    x = x + self.sa(x)
    x = x + self.ffwd(x)
    return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(
        Block(n_embd, n_head=4),
        Block(n_embd, n_head=4),
        Block(n_embd, n_head=4),
    )
    #self.sa_head = MultiHeadAttention(4, n_embd//4)  # i.e. 4 heads of 8-dimensional self-attention
    #self.ffwd = FeedForward(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    # idx and targets are both (B, T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B, T, C)  --> C in here is n_emb
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
    x = tok_emb + pos_emb   #(B, T, C)
    x = self.blocks(x) # (B, T, C)
    #x = self.sa_head(x)
    #x = self.ffwd(x)  # (B, T, C)
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      # foucus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index into running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

# create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


--2025-03-19 01:27:18--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.7’


2025-03-19 01:27:19 (23.6 MB/s) - ‘input.txt.7’ saved [1115394/1115394]

Step 0: train loss 4.6255, val loss 4.6233
Step 300: train loss 2.4845, val loss 2.4845
Step 600: train loss 2.3610, val loss 2.3650
Step 900: train loss 2.2848, val loss 2.2949
Step 1200: train loss 2.2160, val loss 2.2435
Step 1500: train loss 2.1850, val loss 2.2176
Step 1800: train loss 2.1544, val loss 2.1919
Step 2100: train loss 2.1177, val loss 2.1619
Step 2400: train loss 2.1080, val loss 2.1569
Step 2700: train loss 2.1012, val loss 2.1499

Put ows couls, my sive

add layernorm

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameter
batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd=32

torch.manual_seed(1337)

# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

# tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters that occur in this text
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data input x and target y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ('train', 'val'):
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)

    wei = q @ k.transpose(-2, -1) * C**-0.5
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)

    v = self.value(x)
    out = wei @ v
    return out

class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(out)
    return out

class FeedForward(nn.Module):

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
    )

  def forward(self, x):
    return self.net(x)


class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    # we want to intersperses communitation and then computation
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)  # communication
    self.ffwd = FeedForward(n_embd)                  # computation
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(
        Block(n_embd, n_head=4),
        Block(n_embd, n_head=4),
        Block(n_embd, n_head=4),
        nn.LayerNorm(n_embd),
    )
    #self.sa_head = MultiHeadAttention(4, n_embd//4)  # i.e. 4 heads of 8-dimensional self-attention
    #self.ffwd = FeedForward(n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    # idx and targets are both (B, T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B, T, C)  --> C in here is n_emb
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
    x = tok_emb + pos_emb   #(B, T, C)
    x = self.blocks(x) # (B, T, C)
    #x = self.sa_head(x)
    #x = self.ffwd(x)  # (B, T, C)
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      # foucus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index into running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

# create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


--2025-03-19 01:39:04--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.9’


2025-03-19 01:39:04 (23.6 MB/s) - ‘input.txt.9’ saved [1115394/1115394]

Step 0: train loss 4.3103, val loss 4.3097
Step 300: train loss 2.5220, val loss 2.5316
Step 600: train loss 2.3644, val loss 2.3668
Step 900: train loss 2.2732, val loss 2.2837
Step 1200: train loss 2.1984, val loss 2.2303
Step 1500: train loss 2.1678, val loss 2.2023
Step 1800: train loss 2.1357, val loss 2.1760
Step 2100: train loss 2.1031, val loss 2.1443
Step 2400: train loss 2.0926, val loss 2.1359
Step 2700: train loss 2.0806, val loss 2.1292

PERLIO:
Serms, my sity

## Scaling Up the Model!!

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameter
batch_size = 64
block_size = 256
max_iters = 3000
eval_interval = 300
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd= 384
n_head = 6
n_layer = 6
dropout = 0.2

torch.manual_seed(1337)

# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open("input.txt", "r", encoding="utf-8") as f:
  text = f.read()

# tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters that occur in this text
stoi = { ch:i for i, ch in enumerate(chars)}
itos = { i:ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data input x and target y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)

  return x, y

@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ('train', 'val'):
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

class Head(nn.Module):

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)

    wei = q @ k.transpose(-2, -1) * C**-0.5
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)

    v = self.value(x)
    out = wei @ v
    return out

class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

class FeedForward(nn.Module):

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4 * n_embd),
        nn.ReLU(),
        nn.Linear(4 * n_embd, n_embd),
        nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)


class Block(nn.Module):
  def __init__(self, n_embd, n_head):
    super().__init__()
    # we want to intersperses communitation and then computation
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)  # communication
    self.ffwd = FeedForward(n_embd)                  # computation
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

  def __init__(self):
    super().__init__()

    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential( * [Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd)  # final layernorm
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape
    # idx and targets are both (B, T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B, T, C)  --> C in here is n_emb
    pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
    x = tok_emb + pos_emb   #(B, T, C)
    x = self.blocks(x) # (B, T, C)
    x = self.ln_f(x)  # (B, T, C)
    logits = self.lm_head(x) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    for _ in range(max_new_tokens):
      idx_cond = idx[:, -block_size:]
      logits, loss = self(idx_cond)
      # foucus only on the last time step
      logits = logits[:, -1, :] # (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index into running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

model = BigramLanguageModel()
m = model.to(device)

# create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"Step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


--2025-03-19 02:16:38--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.12’


2025-03-19 02:16:38 (22.9 MB/s) - ‘input.txt.12’ saved [1115394/1115394]



KeyboardInterrupt: 