<a href="https://colab.research.google.com/github/Benteaux/karpathy-tutorials/blob/main/notebooks/makemore5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [None]:


# Download names.txt (the word list read by the next cell) into the working directory.
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2024-02-14 14:33:09--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2024-02-14 14:33:09 (8.54 MB/s) - ‘names.txt’ saved [228145/228145]



In [None]:
# Read the whole dataset into memory, one entry per line.
# The context manager closes the file handle instead of leaking it.
with open('names.txt', 'r') as f:
  words = f.read().splitlines()
print(len(words))

32033


In [None]:
# Character vocabulary: every distinct character that appears in the words,
# plus the '.' separator introduced by the join.
chars = sorted(set('.'.join(words)))
stoi = dict(zip(chars, range(len(chars)))) # character -> integer id
itos = dict(enumerate(chars)) # integer id -> character
vocab_size = len(itos)
print(itos, vocab_size)

{0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'} 27


In [None]:
import random
# Shuffle the words in place with a fixed seed so the train/dev/test split is reproducible.
# NOTE: re-running this cell shuffles the already-shuffled list again, which gives a
# different order than a single run from a fresh kernel.
random.seed(42)
random.shuffle(words)

In [None]:
block_size = 8 # context length: how many previous characters are used to predict the next one

def build_dataset(words):
  """Turn a list of words into (context, next-character) examples.

  Every word is terminated with '.', and its context window starts out as
  block_size zeros. Returns (x, y): x holds one context window per row and
  y holds the index of the character following each window. The shapes of
  both tensors are printed.
  """
  contexts, targets = [], []

  for word in words:
    window = [0] * block_size
    for ch in word + '.':
      target = stoi[ch]
      contexts.append(window)
      targets.append(target)
      window = window[1:] + [target] # slide the window one character forward

  x, y = torch.tensor(contexts), torch.tensor(targets)
  print(x.shape, y.shape)
  return x, y

# 80% / 10% / 10% split of the words into train / dev / test
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))
train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]
xtr, ytr = build_dataset(train_words)
xdev, ydev = build_dataset(dev_words)
xtest, ytest = build_dataset(test_words)

torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])


In [None]:
class Linear:
  """Affine layer computing x @ weight, plus bias when one is present."""

  def __init__(self, fan_in, fan_out, bias = True):
    # unit-gaussian weights scaled down by sqrt(fan_in) (kaiming/he init)
    scale = fan_in**0.5
    self.weight = torch.randn((fan_in, fan_out)) / scale
    self.bias = None
    if bias:
      self.bias = torch.zeros(fan_out)

  def __call__(self, x):
    result = x @ self.weight
    if self.bias is not None:
      result += self.bias
    # keep the activation around for later inspection
    self.out = result
    return result

  def parameters(self):
    """Return the trainable tensors: the weight, then the bias if there is one."""
    params = [self.weight]
    if self.bias is not None:
      params.append(self.bias)
    return params

# ------------------------------------------------------------------------
class BatchNorm1d:
  """Batch normalization over the last (channel) dimension.

  In training mode the input is normalized with the statistics of the current
  batch, and those statistics are folded into running estimates. In eval mode
  (training = False) the running estimates are used instead.
  """

  def __init__(self, dim, eps = 1e-5, momentum = 0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # parameters, trained w/ backprop
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # buffers (trained w/ a running momentum update)
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    """Normalize x and apply the learned scale (gamma) and shift (beta).

    Raises ValueError in training mode if x is not 2D or 3D.
    """
    if self.training:
      # reduce over every dimension except the last one
      if x.ndim == 2:
        dim = 0
      elif x.ndim == 3:
        dim = (0, 1)
      else:
        raise ValueError(f'BatchNorm1d expects a 2D or 3D input in training mode, got {x.ndim}D')
      xmean = x.mean(dim, keepdim = True) # batch mean
      xvar = x.var(dim, keepdim = True) # batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    # divide by the standard deviation (sqrt of the variance), not by the variance itself
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
    self.out = self.gamma * xhat + self.beta
    if self.training:
      # the running statistics are plain buffers, so keep them out of the autograd graph
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar

    return self.out

  def parameters(self):
    """Return the tensors trained by backprop: gamma and beta."""
    return [self.gamma, self.beta]

#----------------------------------------------------
class Tanh:
  """Parameter-free elementwise tanh non-linearity."""

  def __call__(self, x):
    activated = torch.tanh(x)
    # keep the activation around for later inspection
    self.out = activated
    return activated

  def parameters(self):
    # nothing to train
    return list()

#-------------
class Embedding:
  """Lookup table that maps integer indices to rows of a weight matrix."""

  def __init__(self, num_embeddings, embedding_dim):
    table_shape = (num_embeddings, embedding_dim)
    self.weight = torch.randn(table_shape)

  def __call__(self, IX):
    # index the table with the integer tensor; each index selects one row
    rows = self.weight[IX]
    self.out = rows
    return rows

  def parameters(self):
    """Return the single trainable tensor: the embedding table."""
    return [self.weight]

# ----------------------------
class FlattenConsecutive:
  """Concatenate every n consecutive positions of a (B, T, C) tensor along the last axis."""

  def __init__(self, n):
    self.n = n

  def __call__(self, x):
    batch, steps, channels = x.shape
    groups = steps // self.n
    merged = x.view(batch, groups, channels * self.n)
    # a single remaining group leaves a size-1 middle axis; drop it to get (B, C*n)
    self.out = merged.squeeze(1) if groups == 1 else merged
    return self.out

  def parameters(self):
    # nothing to train
    return list()

# ------------------
class Sequential:
  """Container that applies a list of layers one after another."""

  def __init__(self, layers):
    self.layers = layers

  def __call__(self, x):
    activation = x
    for module in self.layers:
      activation = module(activation)
    # keep the final activation around for later inspection
    self.out = activation
    return activation

  def parameters(self):
    """Return the parameters of all layers as one flat list, in layer order."""
    collected = []
    for module in self.layers:
      collected.extend(module.parameters())
    return collected

In [None]:
# Seed torch's global RNG so the parameter init and minibatch sampling are reproducible.
# (A trailing `;` would only suppress the Generator repr that the notebook echoes as cell output.)
torch.manual_seed(42)

<torch._C.Generator at 0x7b95cd7a96d0>

In [None]:
n_embd = 35 # size of each character embedding vector
n_hidden = 100 # width of every hidden layer

# Three stages, each fusing 2 neighbouring positions and then applying
# linear -> batchnorm -> tanh. Layers are created in the same order as they are
# applied, so the random initialisation consumes the RNG in that order.
layers = [Embedding(vocab_size, n_embd)]
fan_in = n_embd * 2
for _ in range(3):
  layers += [
      FlattenConsecutive(2),
      Linear(fan_in, n_hidden, bias = False),
      BatchNorm1d(n_hidden),
      Tanh(),
  ]
  fan_in = n_hidden * 2
# one more hidden layer, then the projection to one logit per character
layers += [
    Linear(n_hidden, n_hidden), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size),
]
model = Sequential(layers)

# parameter init
with torch.no_grad():
  model.layers[-1].weight *= 0.1 # make last layer less confident

parameters = model.parameters()
for p in parameters:
  p.requires_grad = True
print(sum(p.nelement() for p in parameters))

61572


In [None]:
max_steps = 200000
batch_size = 32
lossi = [] # log10 of the minibatch loss at every step

for i in range(max_steps + 1): # one extra iteration so the loss at step max_steps gets printed

  # draw a random minibatch of training rows
  ix = torch.randint(0, xtr.shape[0], (batch_size,))
  Xb = xtr[ix]
  Yb = ytr[ix]

  # forward pass
  logits = model(Xb)
  loss = F.cross_entropy(logits, Yb)

  # backward pass: clear stale gradients before computing fresh ones
  for p in parameters:
    p.grad = None
  loss.backward()

  # step-wise learning rate decay
  if i < 160000:
    lr = 0.1
  elif i < 180000:
    lr = 0.01
  else:
    lr = 0.001

  # plain SGD update
  for p in parameters:
    p.data += -lr * p.grad

  # stat tracking
  lossi.append(loss.log10().item())
  if i % 10000 == 0:
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')


      0/ 200000: 3.2996


In [None]:
# Drop the extra sample logged by the final (max_steps-th) iteration so the history
# holds exactly max_steps entries. Slicing to max_steps rather than [:-1] keeps this
# cell idempotent: re-running it no longer removes one more sample each time.
lossi = lossi[:max_steps]

In [None]:
# smooth the curve: average the logged log10 losses over consecutive windows of 1000 steps
smoothed = torch.tensor(lossi).view(-1, 1000).mean(1)
plt.plot(smoothed)
# what if we had used learning rate decay sooner, and then again?

In [None]:
# Switch to inference mode: BatchNorm1d reads `training` to choose between the statistics
# of the current batch and its running estimates. The flag is set on every layer,
# although among the layers defined here only BatchNorm1d reads it.
for layer in model.layers:
  layer.training = False
# so batchnorm doesn't do wonky things

In [None]:
@torch.no_grad()
def split_loss(split):
  """Print the cross-entropy loss of `model` on one dataset split.

  split: one of 'train', 'val' or 'test'; any other key raises KeyError.
  """
  # tuples, not sets: a set has no defined order, so unpacking one could hand back (y, x)
  x, y = {
      'train': (xtr, ytr),
      'val': (xdev, ydev),
      'test': (xtest, ytest)
  }[split]
  logits = model(x)
  loss = F.cross_entropy(logits, y)
  print(split, loss.item())

# report the loss on the training and validation splits
split_loss('train')
split_loss('val')

**experimentation log**


*   Karpathy's high score: val 1.993
*   **experiment 1**: learning rate decay. n_embd = 10, n_hidden = 68.
- lr = 0.1 if i < 100000 else 0.01 if i < 170000 else 0.001 ----> train: 1.93
  val: 2.015
  * noticeable drop in loss at 100k mark, nothing notable at 170k mark
- **experiment 2:** lr = 0.1 if i < 160000 else 0.01 if i < 180000 else 0.001
  * train 1.93, val 2.02
- **experiment 3:** increase n_embd from 10 to 20
 * train: 1.913 , val: 2.006
- **experiment 4:** increase n_embd from 20 to 50
 * train: 1.905, val: 2.008
- **experiment 5:** increase n_hidden from 68 to 200
 * train: 1.75, val: 1.98
- **experiment 6:** decrease n_embd from 50 to 35
 * train: 1.749, val: 1.975
- **experiment 7:** decrease n_hidden from 200 to 150
 * train: 1.784, val: 1.976
- **experiment 8:** decrease n_hidden from 150 to 100
 * train: 1.839, val: 1.980
- **experiment 9:** add another layer
 * train: 1.823, val: 1.978 # OH WAIT I GOT IT HERE












In [None]:
# sampling
for _ in range(20):

  out = []
  context = [0] * block_size
  while True:
    logits = model(torch.tensor([context]))
    probs = F.softmax(logits, dim = 1)
    ix = torch.multinomial(probs, num_samples = 1).item()
    context = context[1:] + [ix]
    out.append(ix)
    if ix == 0:
      break

  print(''.join(itos[i] for i in out))

ni.
aemonin.
alle.
kahniolee.
koahyan.
jana.
derqes.
dsielya.
zaxelec.
avdyitoneal.
urjanlar.
milia.
efarridei.
lynan.
jameria.
maik.
arriulai.
karanhaly.
viwa.
lin.
