In [3]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
%matplotlib inline

In [4]:
# Load the training corpus: one word per line, newline characters stripped.
# A `with` block guarantees the file handle is closed even if reading fails.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
# Alternative corpus (kept for reference):
# words = open('rwords.txt', 'r', encoding='utf-8').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
# Character vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(''.join(words)))

# Index -> character table. Real characters start at 1; index 0 is reserved
# for '.', the token used both as start-of-word padding and end-of-word marker.
itos = dict(enumerate(chars, start=1))
itos[0] = '.'

# Character -> index table (the inverse of itos).
stoi = {ch: idx for idx, ch in itos.items()}

vocab_size = len(itos)

In [6]:
block_size = 10  # number of preceding characters used to predict the next one


def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.', and for every character in the
    terminated word one example is produced: the `block_size` character
    indices that precede it (left-padded with 0) and the index of the
    character itself. Character indices are looked up in the module-level
    `stoi` table.

    Returns:
        (X, Y): `torch.tensor` of all contexts and `torch.tensor` of all
        target indices, in the order the words were given.
    """
    inputs, targets = [], []
    for word in words:
        ids = [stoi[ch] for ch in word + '.']
        # Left-pad with zeros so that the window ending just before position
        # `pos` of the word is simply a slice of the padded list.
        padded = [0] * block_size + ids
        for pos, target in enumerate(ids):
            inputs.append(padded[pos:pos + block_size])
            targets.append(target)

    return torch.tensor(inputs), torch.tensor(targets)

# Shuffle the corpus with a fixed seed so the split is the same on every run.
# Note: this shuffles `words` in place.
random.seed(33)
random.shuffle(words)

# Split boundaries: first 80% train, next 10% dev, final 10% test.
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))

Xtr, Ytr = build_dataset(words[:n1])      # training split
Xdev, Ydev = build_dataset(words[n1:n2])  # dev / validation split
Xte, Yte = build_dataset(words[n2:])      # test split

In [7]:
# Network dimensions.
n_embd = 10     # size of each character embedding
n_hidden = 500  # number of hidden units

# Width of the flattened context that is fed into the first linear layer.
fan_in = n_embd * block_size

# Embedding table and the two linear layers.
C = torch.randn(vocab_size, n_embd)
# tanh gain (5/3) divided by sqrt(fan_in) keeps pre-activations at a sane scale.
W1 = torch.randn(fan_in, n_hidden) * (5/3) / fan_in**0.5
b1 = torch.randn(n_hidden) * 0.01
# Small output weights and a zero bias keep the initial logits close to zero.
W2 = torch.randn(n_hidden, vocab_size) * 0.01
b2 = torch.randn(vocab_size) * 0

# Batch-norm scale/shift (trainable) and running statistics (not trainable).
bngain = torch.ones(1, n_hidden)
bnbias = torch.zeros(1, n_hidden)
bnmean_running = torch.zeros(1, n_hidden)
bnstd_running = torch.ones(1, n_hidden)

# Everything that receives gradient updates.
parameters = [C, W1, b1, W2, b2, bngain, bnbias]
for p in parameters:
    p.requires_grad = True

# Total number of trainable scalars (displayed as the cell output).
sum(p.nelement() for p in parameters)

65297

In [8]:
# Mini-batch SGD training loop for the single-hidden-layer MLP with a
# hand-written batch-norm step between the linear layer and the tanh.
max_steps = 10*10000
batch_size = 32
lossi = []  # log10 of the mini-batch loss, one entry per step

for i in range(max_steps):
    # minibatch: sample batch_size row indices of the training set (with replacement)
    ix= torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb,Yb = Xtr[ix], Ytr[ix]

    # forward pass: look up embeddings and flatten each context into one row
    emb = C[Xb]
    embcat = emb.view(emb.shape[0],-1)

    # Linear layer
    # NOTE: b1 is cancelled by the batch-mean subtraction below, so it has no
    # effect on the training output; bnbias provides the shift instead.
    hpreact = embcat @ W1 + b1

    # BatchNorm Layer
    #==========================================================================
    # Per-unit mean / standard deviation over the current mini-batch.
    bnmeani =  hpreact.mean(0,keepdim =True)
    bnstdi =  hpreact.std(0,keepdim =True)
    # Standardise, then apply the learned scale and shift; 1e-5 guards
    # against division by zero.
    hpreact = bngain * (hpreact - bnmeani) / (bnstdi + 1e-5) + bnbias
    # Exponential moving averages of the batch statistics, kept outside the
    # autograd graph; they replace the batch statistics at evaluation/sampling time.
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
    #==========================================================================

    # non-linearity
    h = torch.tanh(hpreact) # hidden layer
    logits = h @ W2 + b2 # output
    loss = F.cross_entropy(logits, Yb) # loss function

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: plain SGD, learning rate 0.1 for the first 1000 steps and 0.01 afterwards
    lr = 0.1 if i < 1000 else 0.01
    for p in parameters:
        p.data += -lr* p.grad

    # track stats: print the mini-batch loss every 10000 steps
    if i % 10000 == 0:
        print(f"{i}: {loss.item()}")

    lossi.append(loss.log10().item())

0: 3.343235969543457
10000: 2.4740166664123535
20000: 2.3188698291778564
30000: 2.253460168838501
40000: 1.9107609987258911
50000: 2.3613126277923584
60000: 2.2524850368499756
70000: 1.9750839471817017
80000: 2.2724897861480713
90000: 2.2335116863250732


In [9]:
# with torch.no_grad():
#     emb = C[Xtr]
#     embcat = emb.view(emb.shape[0],-1)
#     hpreact = embcat @ W1 + b1
#     bnmean = hpreact.mean(0,keepdim =True)
#     bnstd = hpreact.std(0,keepdim =True)

In [10]:
# plt.subplot(311)
# plt.hist(hpreact.view(-1).tolist(), 50)
# plt.title('Histogram of hpreact')


# plt.subplot(312)
# plt.hist(h.view(-1).tolist(), 50)
# plt.title('Histogram of h')


# plt.subplot(313)
# plt.imshow(h.abs() > 0.99, cmap='gray', interpolation='nearest')
# plt.title('Mask of |h| > 0.99')

# plt.subplots_adjust(hspace=0.5) 

In [11]:
# plt.plot(lossi)

In [12]:
@torch.no_grad()
def split_loss(split):
    """Print the cross-entropy loss of the trained MLP on one dataset split.

    `split` is one of 'train', 'val' or 'test'. The forward pass uses the
    running batch-norm statistics (`bnmean_running`, `bnstd_running`) rather
    than statistics of the evaluated data, and runs with gradients disabled.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte)
    }
    X, Y = datasets[split]

    # Embed and flatten each context into a single row.
    flat = C[X].view(X.shape[0], -1)
    pre = flat @ W1 + b1
    # Batch norm with the running statistics collected during training.
    pre = bngain * (pre - bnmean_running) / (bnstd_running + 1e-5) + bnbias
    logits = torch.tanh(pre) @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(f"{split}, loss = {loss.item()}")


for split_name in ('train', 'val'):
    split_loss(split_name)

train, loss = 2.0321037769317627
val, loss = 2.1050684452056885


In [13]:
# Sample 20 words from the trained model, one character at a time.
for _ in range(20):
    out = []
    context = [0] * block_size  # start from an all-padding context (index 0 is '.')
    while True:
        emb = C[torch.tensor([context])]  # Shape: (1, block_size, n_embd)
        embcat = emb.view(emb.shape[0], -1)  # Shape: (1, block_size * n_embd)
        hpreact = embcat @ W1 + b1  # Shape: (1, n_hidden)
        hpreact = bngain * (hpreact - bnmean_running) / (bnstd_running + 1e-5) + bnbias  # BatchNorm with running statistics (a batch of one has no usable batch statistics)
        h = torch.tanh(hpreact)  # Shape: (1, n_hidden)
        logits = h @ W2 + b2  # Shape: (1, vocab_size)
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, 1)  # draw the next character index from the predicted distribution
        context = context[1:] + [ix.item()]  # slide the context window forward by one character
        out.append(ix.item())
        if ix == 0:  # index 0 is the '.' end-of-word token
            break
    print(''.join(itos[i] for i in out))

laenel.
caristadis.
kinvy.
zero.
japhenn.
farler.
kenuyn.
esrus.
sonleigh.
akharh.
shrlyn.
charlun.
chamilighu.
windor.
namieri.
kimina.
yusu.
nuzoesh.
jaivian.
niakni.


In [494]:
# # # logits = torch.tensor([0.0, 0.0, 0.0, 0.0])
# # logits = torch.randn(4)
# # probs = torch.softmax(logits,dim=0)
# # loss = - probs[2].log()
# # probs , loss


# x = torch.randn(1000,10)
# w = torch.randn(10,200) / 10**0.5
# y = x @ w 

# print(x.mean(), x.std())
# print(y.mean(), y.std())

# plt.figure(figsize=(28,5))
# plt.subplot(121)
# plt.hist(x.view(-1).tolist(), 50);
# plt.subplot(122)
# plt.hist(y.view(-1).tolist(), 50);

In [495]:
# ### the initial loss
# to fix it we had to make the logits smaller,
# so that the initial loss is close to the expected loss of a uniform guess, -log(1/27) ≈ 3.30;
# we did this by making W2 and b2 smaller

# ### fix the saturated tanh
# we fix it by making the values in the pre-activation layer smaller,
# because if they are too large tanh will be 1 or -1;
# if it is 1 or -1 the gradient will start vanishing,
# and if you think about it, when you change the input the output won't really change
# because we are in the flat region of tanh ...

## batchNorm
# normalizes each hidden unit's pre-activations across the batch so they have an average of 0 and a spread (standard deviation) of 1.


In [None]:
# activations and gradients and their statistics
# distribution in the activation layer: if it is too confident
# you get this hockey stick ... and to
    

In [1]:
class Linear:
    """Fully connected layer computing ``x @ weight`` plus an optional bias.

    Args:
        fan_in: number of input features.
        fan_out: number of output features.
        bias: when True (default) a zero-initialised bias vector is added.

    Attributes:
        weight: tensor of shape (fan_in, fan_out), drawn from a standard
            normal and scaled by 1/sqrt(fan_in).
        bias: tensor of shape (fan_out,), or None when ``bias=False``.
        out: the result of the most recent call, kept for inspection.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn(fan_in, fan_out) / fan_in**0.5  # Kaiming-style init
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        """Apply the layer to ``x`` and return (and cache) the result."""
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        """Return the trainable tensors: the weight, plus the bias if present."""
        # The parentheses matter: without them the conditional expression
        # swallows the concatenation and the weight is dropped whenever a
        # bias exists.
        return [self.weight] + ([] if self.bias is None else [self.bias])


class BatchNorm1d:
    """Batch normalisation over the first (batch) dimension of a 2-D input.

    Args:
        dim: number of features (size of the second dimension of the input).
        eps: constant added to the variance before the square root, for
            numerical stability.
        momentum: weight given to the current batch statistics when updating
            the running mean and variance.

    Attributes:
        training: when True, batch statistics are used and the running
            buffers are updated; when False, the running buffers are used.
        gamma, beta: trainable scale and shift, each of shape (dim,).
        running_mean, running_var: buffers of shape (1, dim), not trained by
            gradient descent.
        out: the result of the most recent call, kept for inspection.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (updated with an exponential moving average)
        self.running_mean = torch.zeros((1, dim))
        self.running_var = torch.ones((1, dim))

    def __call__(self, x):
        """Normalise ``x`` and return (and cache) the scaled, shifted result."""
        if self.training:
            xmean = x.mean(0, keepdim=True)
            xvar = x.var(0, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        # update buffers (training mode only, outside the autograd graph)
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        # Returned in both modes; returning only while training would yield
        # None at evaluation time.
        return self.out

    def parameters(self):
        """Return the trainable tensors as a list, like the other layers."""
        return [self.gamma, self.beta]

class Tanh:
    """Element-wise tanh activation layer; it has no trainable parameters."""

    def __call__(self, x):
        """Apply tanh to ``x``, cache the result on ``self.out`` and return it."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list: there is nothing to train."""
        return list()

# Dimensions of the deeper MLP.
n_embd = 10     # size of each character embedding
n_hidden = 100  # units in every hidden layer

# The embedding table is drawn from a dedicated, seeded generator.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_embd), generator=g)

# Five Linear+Tanh stages followed by the output projection onto the vocabulary.
layers = [Linear(n_embd * block_size, n_hidden), Tanh()]
for _ in range(4):
    layers += [Linear(n_hidden, n_hidden), Tanh()]
layers.append(Linear(n_hidden, vocab_size))

with torch.no_grad():
    *hidden_layers, output_layer = layers
    # Shrink the output weights so the initial predictions are less confident.
    output_layer.weight *= 0.1
    # Apply the tanh gain (5/3) to every Linear layer that feeds a Tanh.
    for layer in hidden_layers:
        if not isinstance(layer, Linear):
            continue
        layer.weight *= 5/3

# Collect every trainable tensor and switch on gradient tracking.
parameters = [C, *(p for layer in layers for p in layer.parameters())]

for p in parameters:
    p.requires_grad = True

NameError: name 'torch' is not defined