In [23]:
# word feature vector:
# In natural language processing (NLP), a word embedding is a representation of a word. 
# The embedding is used in text analysis. Typically, the representation is a real-valued 
# vector that encodes the meaning of the word in such a way that words that are closer in 
# the vector space are expected to be similar in meaning. Word embeddings can be obtained 
# using language modeling and feature learning techniques, where words or phrases from the 
# vocabulary are mapped to vectors of real numbers. 
# https://en.wikipedia.org/wiki/Word_embedding
# https://www.youtube.com/watch?v=gQddtTdmG_8
# https://www.baeldung.com/cs/dimensionality-word-embeddings

In [24]:
# at first the words are initialized and spread out randomly
# then, we're gonna tune embeddings of these words using backprop
# by maximizing the log-likelihood

In [25]:
# first few things were already done in bigram_basic

In [26]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [27]:
# load ~32000 names from a file, one in each line
with open("names.txt", "r") as infile:
    words = infile.read().splitlines() # ignores \n

In [28]:
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [29]:
len(words)

32033

In [30]:
chars = sorted(set("".join(words)))

In [31]:
ctoi = {c: i+1 for i, c in enumerate(chars)}
ctoi["."] = 0

In [32]:
itoc = {i: c for c, i in ctoi.items()}

In [33]:
# now the new stuff

In [34]:
# building a neural network that takes three characters (a context)
# each character has an embedding in a lookup table
# the nn has 27 outputs corresponding to a letter that we expect to get from a given context
# we will check that using our Y tensor

In [35]:
# building the dataset (with contexts)

# training split -> ~80% -> optimize parameters of the model using gradient descent
# dev/validation split -> ~10% -> hyperparameter tuning
# test split -> ~10% -> evaluate model's final performance

# updated so that we can also create dev/test splits

block_size = 4 # context length - how many chars to take into account to predict the next one?

def build_dataset(words):
    X, Y = [], [] # X - input to the neural net, Y - expected labels for each example inside X
    
    for w in words:
        # print(w)
        context = [0] * block_size # ...
        for c in w + ".":
            idx = ctoi[c]
            X.append(context)
            Y.append(idx)
            # print("".join(itoc[i] for i in context), "--->", itoc[idx])
            context = context[1:] + [idx]
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xtest, Ytest = build_dataset(words[n2:])

torch.Size([182625, 4]) torch.Size([182625])
torch.Size([22655, 4]) torch.Size([22655])
torch.Size([22866, 4]) torch.Size([22866])


In [36]:
X.shape, X.dtype, Y.shape, Y.dtype

NameError: name 'X' is not defined

In [None]:
# embedding look-up table
# let's start with just 2 dimensions at first
C = torch.randn((27, 2)) # 27 because we need a 2 dim embedding for each letter

In [None]:
# some experiments to get a grasp of how that indexing works

In [None]:
C[:3]

In [None]:
C[0]

In [None]:
C[0].shape

In [None]:
C[[0, 1, 2]] # we can index with lists as well!

In [None]:
C[[0, 1, 2]].shape

In [None]:
C[[torch.tensor([0, 1, 2])]] # same with tensors

In [None]:
C[torch.tensor([[0, 1, 2], [1, 1, 1]])] # I finally get what that does now!

In [None]:
C[[torch.tensor([[0, 1, 2], [1, 1, 1]])]].shape
# we have two contexts, three letters each
# C is a lookup table that has 2 dim embeddings for each letter
# C[0] simply gives an embedding of a single letter -> so shape 2
# C[[0, 1, 2]] gives embedding of a context (3 letters) -> shape 3, 2, an embedding for each letter
# C[ [[0, 1, 2], [1, 1, 1]] ] gives embedding for these 2 contexts -> shape 2 (new!), 3, 2

In [None]:
X[:1]

In [None]:
C[X[:1]]

In [None]:
X[:2]

In [None]:
C[X[:2]]

In [None]:
C[X[:2]].shape

In [None]:
# end of experiments

In [None]:
# so that's why C[X] will contain embeddings for all letters in all contexts -> shape 32, 3, 2
# 32 contexts, 3 letters each and each letter has a 2 dim embedding

In [None]:
emb = C[X]
emb.shape

In [None]:
# tests

In [None]:
X[13, 2] # 13th context, 2nd letter

In [None]:
emb[13, 2] # embedding for that 13th context, 2nd letter

In [None]:
C[1] # embedding for letter with idx == 1, manually checked

In [None]:
# end of tests

In [None]:
# HIDDEN LAYER
# initialize weights randomly
W1 = torch.randn((3 * 2, 100)) # 3 * 2 cos 3 contexts times 2 dim embeddings give 6 numbers
# 100 is just a number of neurons in the hidden layer, might be different than 100 as well
b1 = torch.randn(100)

In [None]:
# we can't just do emb @ W1 + b1 because emb is [32, 2, 3] and W1 is [6, 100]
# that's why we need to flatten the emb matrix to [32, 6]
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # 100 activations for all 32 context embeddings ! ! !
h.shape
# + b1 requires broadcasting!
    # 32, 100
    #  1, 100 (1 - "fake" dimension)
# view is the most optimal, doesn't create anything additional in the memory
# check: tensor -> storage

In [None]:
# OUTPUT LAYER
W2 = torch.randn((100, 27)) # 100 input activations from prev layer, 27 letters as an output
b2 = torch.randn(27)

In [None]:
logits = h @ W2 + b2
logits.shape # good!

In [None]:
counts = logits.exp()
counts.shape

In [None]:
# broadcasting, 1 says we want sums of rows (a column vector, 27x1), every prob row gets divided by the row sum
prob = counts / counts.sum(1, keepdim=True)
prob.shape # 27 letter probs for all 32 context embeddings

In [None]:
prob[torch.arange(32), Y] # get all rows of prob and a probability of the correct letter (from Y)

In [None]:
 nll_loss = -prob[torch.arange(32), Y].log().mean()

In [None]:
nll_loss

In [None]:
# NOW EVERYTHING ORGANIZED:

In [None]:
Xtr.shape, Ytr.shape # the dataset

In [None]:
hidden_n = 200
emb_dim = 8

gen = torch.Generator().manual_seed(2147483647)
C = torch.randn((C.shape[0], emb_dim), generator=gen)
W1 = torch.randn((emb_dim * block_size, hidden_n), generator=gen)
b1 = torch.randn(hidden_n, generator=gen)
W2 = torch.randn((hidden_n, C.shape[0]), generator=gen)
b2 = torch.randn(C.shape[0], generator=gen)
parameters = [C, W1, b1, W2, b2]

In [None]:
sum(p.numel() for p in parameters) # numel <=> nelement

In [None]:
for p in parameters:
    p.requires_grad = True

In [None]:
epochs = 200_000
minibatch_size = 500

loss_i = []

for i in range(epochs):
    # create a minibatch
    idx = torch.randint(0, Xtr.shape[0], (minibatch_size,)) # get 32 random integers
    # between 0 and 228146 (number of contexts)
    # that will be our minibatch (of size 32)
    
    # forward pass
    emb = C[Xtr[idx]]
    h = torch.tanh(emb.view(-1, emb_dim * block_size) @ W1 + b1) # hidden layer, (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    # counts = logits.exp()
    # prob = counts / counts.sum(1, keepdim=True)
    # nll_loss = -prob[torch.arange(32), Y].log().mean()
    # nll_loss
    loss = F.cross_entropy(logits, Ytr[idx]) # more efficient, prevents inf and nan for big positive logits
    loss_i.append(loss.item())
    
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward() # it works but find out why later
    
    # update
    lr = 0.1 if i < epochs / 2 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

print(loss.item())
# impossible to reach loss 0 because: ... -> e but also ... -> a, etc.

In [None]:
plt.plot(range(epochs)[30000:], loss_i[30000:])

In [None]:
# find out the loss for a whole (TR!) data set, not just a minibatch
emb = C[Xtr]
h = torch.tanh(emb.view(-1, emb_dim * block_size) @ W1 + b1) # hidden layer, (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Ytr) # more efficient, prevents inf and nan for big positive logits
print(loss.item())

In [None]:
# tr loss ~= dev loss -> not overfitting (actually maybe underfitting!)
# <=> nn not able to memorize the values
# might mean that the network is too small (not enough parameters)

# find out the loss for a whole (DEV!) data set
emb = C[Xdev]
h = torch.tanh(emb.view(-1, emb_dim * block_size) @ W1 + b1) # hidden layer, (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Ydev) # more efficient, prevents inf and nan for big positive logits
print(loss.item())

In [None]:
# SAMPLING!!!
gen = torch.Generator().manual_seed(2147483647 + 19)

for _ in range(20):
    name = []
    context = [0] * block_size # initialize with all ...
    while True: # create one name
        emb = C[torch.tensor([context])] # like C[X] but this time only 1 context
        # block_size x emb_dim
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        idx = torch.multinomial(probs, num_samples=1, generator=gen).item()
        context = context[1:] + [idx]
        name.append(idx)
        if idx == 0:
            break
    print("".join(itoc[i] for i in name))
        

In [None]:
C[X].shape, X.shape

In [None]:
C[X[0]] # idx (here 0) gets 1 context of 3 letters (2 * 3 = 6 embeddings)

In [None]:
from torchviz import make_dot
from IPython.display import display
graph = make_dot(loss)
display(graph) # W O W ! ! ! ! ! ! !

In [None]:
# Save the graph as a PDF file
graph.render("computation_graph")

In [None]:
print(loss.grad_fn)

In [None]:
print(loss.grad_fn.next_functions)

In [None]:
print(loss.grad_fn.next_functions[0][0].next_functions)

In [None]:
print(loss.grad_fn.next_functions[0][0].next_functions[0][0].next_functions)
# etc. -> that's how we could build such graph!