## Batch norm and Initilization and activations

In [1]:
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
words = open('30k.txt', 'r').read().splitlines()

In [3]:
words[:5]

['the\t', 'of\t', 'and\t', 'to\t', 'a\t']

In [4]:
words = [''.join(ch for ch in word if ch != "\t") for word in words]

In [5]:
words[:5]

['the', 'of', 'and', 'to', 'a']

In [6]:
len(words)

30000

## Create the dataset

In [7]:
# The defaults are 
BLOCK_SIZE = 3

In [9]:
# create the mappings 

chars = sorted(list(set(''.join(word for word in words))))
stoi = {s: i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}

In [14]:
def build_dataset(data):
    X, Y = [], []

    for word in data:
        context = [0] * BLOCK_SIZE
        for ch in word + '.':
            X.append(context)
            Y.append(stoi[ch])
            context = context[1:] + [stoi[ch]]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(f"X.shape = {X.shape}   ||   Y.shape = {Y.shape}")

    return X, Y

In [15]:
Xtr, Ytr = build_dataset(words[:25000])
Xte, Yte = build_dataset(words[25000:27500])
Xdev, Ydev = build_dataset(words[27500:])

X.shape = torch.Size([195619, 3])   ||   Y.shape = torch.Size([195619])
X.shape = torch.Size([20005, 3])   ||   Y.shape = torch.Size([20005])
X.shape = torch.Size([19963, 3])   ||   Y.shape = torch.Size([19963])


In [17]:
Xtr[:5], Ytr[:5]

(tensor([[ 0,  0,  0],
         [ 0,  0, 20],
         [ 0, 20,  8],
         [20,  8,  5],
         [ 0,  0,  0]]),
 tensor([20,  8,  5,  0, 15]))

In [18]:
## Hard code the values of the hyperparameters

EMBEDDING_DIM = 10
NUM_NEURON = 100

In [26]:
## Define the default set of parameters and required initilization

g = torch.Generator().manual_seed(1234)

C = torch.rand((27, EMBEDDING_DIM), generator=g, requires_grad=True)  ## Embeddings

# Params for the 1st layer
W1 = torch.rand((EMBEDDING_DIM * BLOCK_SIZE, NUM_NEURON), generator=g, requires_grad=True)
b1 = torch.rand((NUM_NEURON, ), generator=g, requires_grad=True)

# params for the 2nd layer
W2 = torch.rand((NUM_NEURON, 27), generator=g, requires_grad=True)
b2 = torch.rand((27, ), generator=g, requires_grad=True)

params = [C, W1, b1, W2, b2]

In [27]:
print(f"C.shape: {C.shape}")
print(f"W1.shape: {W1.shape}")
print(f"b1.shape: {b1.shape}")
print(f"W2.shape: {W2.shape}")
print(f"b2.shape: {b2.shape}")

C.shape: torch.Size([27, 10])
W1.shape: torch.Size([30, 100])
b1.shape: torch.Size([100])
W2.shape: torch.Size([100, 27])
b2.shape: torch.Size([27])


In [25]:
# Hard Code the constants:

ITER = 200000
BATCH_SIZE = 32

In [62]:
## Create the model
lr = 1.0

for i in range(ITER):
    # Forward pass
    ix = torch.randint(0, Xtr.shape[0], (BATCH_SIZE, ), generator=g)
    emb = C[Xtr[ix]]               # Embedding
    h = torch.tanh(emb.view(-1, EMBEDDING_DIM * BLOCK_SIZE) @ W1 + b1)     # First layer
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])

    # Backward Pass
    for p in params:
        p.grad = None
    loss.backward()

    # Update
    if i % (ITER // 3) == 0:
        lr /= 10
        print(f"LR = {lr}")
    for p in params:
        p.data += -lr * p.grad

    # Print loss every 10 steps
    if i % (ITER / 10) == 0:
        print(f"Iter = {i} Loss = {loss.item()}")

LR = 0.1
Iter = 0 Loss = 2.516104221343994
Iter = 20000 Loss = 2.663620948791504
Iter = 40000 Loss = 2.430905818939209
Iter = 60000 Loss = 2.230560541152954
LR = 0.01
Iter = 80000 Loss = 2.132500410079956
Iter = 100000 Loss = 2.0449297428131104
Iter = 120000 Loss = 2.271138906478882
LR = 0.001
Iter = 140000 Loss = 2.362372636795044
Iter = 160000 Loss = 2.7325353622436523
Iter = 180000 Loss = 2.0289199352264404
LR = 0.0001


In [63]:
loss.data.item()

2.204268217086792

In [64]:
# sample from the model
g = torch.Generator().manual_seed(1234)

for _ in range(20):
    
    out = []
    context = [0] * BLOCK_SIZE # initialize with all ...
    while True:
      emb = C[torch.tensor([context])] # (1,block_size,d)
      h = torch.tanh(emb.view(1, -1) @ W1 + b1)
      logits = h @ W2 + b2
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out))

s.
chand.
dationie.
sverated.
catorin.
treure.
pose.
gch.
bouthocstrew.
refaol.
inter.
unche.
armeniogotionfuropones.
precauten.
lowels.
stlateegeg.
setianestli.
extare.
oratinas.
burrentatious.


In [65]:
# ix = torch.randint(0, Xtr.shape[0], (BATCH_SIZE, ), generator=g)
emb = C[Xdev]               # Embedding
h = torch.tanh(emb.view(-1, EMBEDDING_DIM * BLOCK_SIZE) @ W1 + b1)     # First layer
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.4717, grad_fn=<NllLossBackward0>)

In [66]:
emb = C[Xtr]               # Embedding
h = torch.tanh(emb.view(-1, EMBEDDING_DIM * BLOCK_SIZE) @ W1 + b1)     # First layer
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ytr)
loss

tensor(2.3218, grad_fn=<NllLossBackward0>)