In [35]:
###################################################################################################################
#   MAKEMORE 2 - https://www.youtube.com/watch?v=TCH_1BHY58I&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=3      #
###################################################################################################################

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [36]:
words = open('../names.txt', 'r').read().splitlines()
words = [word.lower() for word in words]
# show first few words
words[:8]

['emma', 'liam', 'olivia', 'noah', 'ava', 'ethan', 'sophia', 'mason']

In [37]:
# Map words to integers

# get unique set of chars in the names dataset
chars = sorted(list(set(''.join(words))))
NUM_CHARS = len(chars)+1 # number of chars in training set plus special char '.'
print(f'{NUM_CHARS=}')
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

NUM_CHARS=27
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [38]:
# build the dataset

# block size is the context length - how many characters we use to predict the next one
block_size = 3 # get the Y character given the previous {block_size} number of characters...
# X are inputs and Y are the labels for each example X
X,Y = [],[]
# print examples of x input with their possible labels
for w in words:
    print(w)
    context = [0] * block_size # [0,0,0] 
    for ch in w + '.': # loop chars in word - add . to the word 'emma.'
        ix = stoi[ch] # number repr. of the character
        X.append(context) # add the block of characters that map to the Y label next character # [0,0,0] -> [E_numrepr]
        Y.append(ix) # add number repr of character to the label list
        print(''.join(itos[num] for num in context), '--->', itos[ix]) # the context and the next prediction label
        context = context[1:] + [ix] # add the label char (what comes next in the training set) to the current context and move to the next step over in the context to get next window of 3 chars to get next three chars and prediction label


X = torch.tensor(X)
Y = torch.tensor(Y)

print('X shape: ', X.shape)
print('Y shape: ', Y.shape)
# print(Y)

assert X.shape[0] == Y.shape[0]
USE_THIS = X.shape[0]


emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
liam
... ---> l
..l ---> i
.li ---> a
lia ---> m
iam ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
noah
... ---> n
..n ---> o
.no ---> a
noa ---> h
oah ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
ethan
... ---> e
..e ---> t
.et ---> h
eth ---> a
tha ---> n
han ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .
mason
... ---> m
..m ---> a
.ma ---> s
mas ---> o
aso ---> n
son ---> .
matthew
... ---> m
..m ---> a
.ma ---> t
mat ---> t
att ---> h
tth ---> e
the ---> w
hew ---> .
modine
... ---> m
..m ---> o
.mo ---> d
mod ---> i
odi ---> n
din ---> e
ine ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
william
... ---> w
..w ---> i
.wi ---> l
wil ---> l
ill ---> i
lli ---> a
lia ---> m
iam ---> .
mia
... ---> m
..m ---> i
.mi ---> a
mia ---> .
james
... ---> j
..j ---> a


In [39]:
# Number of examples - each input to the neural net is 3 integers (or whatever the block size is), the type of data
print(f'{X.shape} - Number of examples, size of an input into the neural net (block size number of ints)')
print(f'type of data of X/Y: {X.dtype}')
print(f'{Y.shape} - number of labels')


print(X[:2])# input examples
print(Y[:2]) # the labels for the input examples

torch.Size([790, 3]) - Number of examples, size of an input into the neural net (block size number of ints)
type of data of X/Y: torch.int64
torch.Size([790]) - number of labels
tensor([[0, 0, 0],
        [0, 0, 5]])
tensor([ 5, 13])


In [40]:
# Building the Embedding lookup table 'C' (from paper at https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

NUM_CHARS = len(chars)+1 # add special char '.'

# embed our possible characters from the training set in a lower dimensional space

# for example, start with putting all into a 2 dimensional space
# The C embedding space is initialized randomly to start
embeddings_dims = 2
C = torch.randn((NUM_CHARS, embeddings_dims)) # ex: NUM CHARS rows and 2 cols - each character in the training set has a 2-dimensional embedding (i.e. a character has a coordinate of an x and y in the space...)
print('C embedding space matrix:')
print(C[:3])

### Example using C embedding space as a lookup for a character (i.e. the char repr. by 5)
# we could just index into C:
print(C[5]) # ex: the embedding for the char repr. by integer 5
# we could also one hot encode for the char int repr of 5 and multiply it by C (is the same result as indexing above)
F.one_hot(torch.tensor(5), num_classes=NUM_CHARS).float() @ C

# This means that besides thinking of C as a lookup table using the int repr. of the char, we can also think of it as a first layer of a neural net using the one hot encoded approach
# this layer is a linear layer with no tanh or activation function and the weights matrix is C
# so we one hot encode characters and feed them into a layer of a neural net

# NOTE: for the examples here we just index in because it's faster and we skip the one hot encoded approach

C embedding space matrix:
tensor([[ 1.4771,  0.6115],
        [ 0.0312, -0.6685],
        [ 1.0566, -0.1957]])
tensor([-0.3975,  0.6407])


tensor([-0.3975,  0.6407])

In [41]:

############################################################################################################################################################ 
#  indexing into C with X gets coordinates (a row in C) for each one of the characters in a block and puts that all into one row in the resulting Matrix 
#
#                       See torch_advanced_indexing.ipynb notebook for examples and playground
# ###########################################################################################################################################################
# print(f'C[X] = {C[X]}') # X is a list of blocks
print(X[:2])
C[[0,0,5]] # example - the second block of X is a list so we index into C with that to get coordinates for each char in the block (the index of C lines up with coordinates for the char int representation)

XX = X[:2]
print(XX)

print(C[XX])

C[torch.tensor([[0, 0, 0],
        [0, 0, 5]])]


tensor([[0, 0, 0],
        [0, 0, 5]])
tensor([[0, 0, 0],
        [0, 0, 5]])
tensor([[[ 1.4771,  0.6115],
         [ 1.4771,  0.6115],
         [ 1.4771,  0.6115]],

        [[ 1.4771,  0.6115],
         [ 1.4771,  0.6115],
         [-0.3975,  0.6407]]])


tensor([[[ 1.4771,  0.6115],
         [ 1.4771,  0.6115],
         [ 1.4771,  0.6115]],

        [[ 1.4771,  0.6115],
         [ 1.4771,  0.6115],
         [-0.3975,  0.6407]]])

In [42]:
# The embeddings for the characters in blocks
emb = C[X] # using torch advanced indexing as explored above

########## Construct the hidden layer "Hop Parameter" (timestamp 18:40)

# initialize weights randomly 
num_inputs = block_size * embeddings_dims # example 3 x 2 (number of chars in a block multiplied by the embedding dimension gives you the number of inputs to the layer)
num_neurons = 100 # variable number of neurons in the layer - you can choose and experiment with different numbers
W1 = torch.randn((num_inputs, num_neurons)) # initialize random weights

# initialize biases randomly as well
b1 = torch.randn(num_neurons) # one for each neuron in the layer


# The goal is to multiply the inputs by the weights and add the biases (as per normal in a NN), but the shapes of the embeddings matrix (emb) is not multipliable by the shape of W1 for matrix multiplication
print('The matrices of embeddings and Weights are not matrix multipliable (cols of embeddings matrix must match rows of Weights matrix):')
print(f'shape of embeddings matrix (emb): {emb.shape}')
print(f'shape of Weights matrix: {W1.shape}')

# we need to transform the embeddings matrix so that it's last dimension is equal to the first dimension of the Weights matrix: N1xM1 -> M1xN2
  # can use torch.cat to concatenate the 3 embeddings in a row of the emb matrix (which are 3x2 sets, so 6 when concatenated)
  # NOTE: should use views instead, though (see later cells)

# this is the embeddings for the first character in each block of the trainingset
first_char_emb = emb[:,0,:] # get all top lvl rows (blocks), get first embedding in each of those rows, get all elements in that embedding (the xy vector)
second_char_emb = emb[:,1,:] # 2nd char in each block embeddings
third_char_emb = emb[:,2,:] # third char in each block embeddings

# torch.cat concatenates along the 2nd dimension (index 1 - second argument) to concatentate the embeddings for each character into 6 values (2 coords for each char)
print(torch.cat([first_char_emb, second_char_emb, third_char_emb], 1).shape) # should have a cols number of 6 to make it matrix multipliable by the weights


### NOTE: using cat and unbind is an inefficient way of accomplishing this and we should use tensor views instead (see below cells)
# cat has to create a new tensor so new memory is used and needed where view does not (it accesses properties on the existing tensor)

# to make the code more dynamic in case the block size changes, you can use torch.unbind()
#print(torch.unbind(emb, 1))
# print(torch.cat(torch.unbind(emb, 1), 1).shape) # same shape as previous torch.cat
### See 23:25 for more explanation
unb = torch.unbind(emb, 1)
print('has len of 3: Unbind(emb, 1):')
print(len(unb))
torch.cat(torch.unbind(emb, 1), 1) # same thing as the previous torch.cat call except more dynamic based on block size (??)



The matrices of embeddings and Weights are not matrix multipliable (cols of embeddings matrix must match rows of Weights matrix):
shape of embeddings matrix (emb): torch.Size([790, 3, 2])
shape of Weights matrix: torch.Size([6, 100])
torch.Size([790, 6])
has len of 3: Unbind(emb, 1):
3


tensor([[ 1.4771,  0.6115,  1.4771,  0.6115,  1.4771,  0.6115],
        [ 1.4771,  0.6115,  1.4771,  0.6115, -0.3975,  0.6407],
        [ 1.4771,  0.6115, -0.3975,  0.6407,  0.4360,  0.8485],
        ...,
        [-2.4419,  2.1903, -0.2650, -0.2605, -0.9303, -0.6410],
        [-0.2650, -0.2605, -0.9303, -0.6410,  0.0684, -1.6289],
        [-0.9303, -0.6410,  0.0684, -1.6289,  0.0684, -1.6289]])

In [43]:
test = torch.tensor([[[1,  2],
         [3,  4],
         [5,  6]]])

print(test)
print('\n')
print(test[:,0,:])
# : = everything of the first dimension (all top level rows)
# :,1 = all top rows and then get index 1 (2nd el) in each of the rows
test[:,1]

test2 = torch.randn(2,5)
test2.shape
print('test2: ', test2)

test3 = torch.randn(1,5)
test3.shape
print('test3: ',test3)


tensor([[[1, 2],
         [3, 4],
         [5, 6]]])


tensor([[1, 2]])
test2:  tensor([[-1.2044, -0.4219,  0.2054, -1.3889, -0.6689],
        [ 0.5901,  0.9371,  0.6702,  0.4017,  0.9293]])
test3:  tensor([[-2.0624, -0.1915, -0.5579,  0.7462, -0.4422]])


In [44]:
# There is a better way than using unbind:

# timestamp 23:30

# array of els 0-17
a = torch.arange(18)
print('single vector of 18 before view: ', a)
# print('shape: ', a.shape) # single vector of 18 numbers

# You can represent the above as different size dimensional tensors
# Use a view which is very efficient:
a.view(2, 9) # change the single 18 sized vector to a 2 by 9 tensor
a.view(3,3,2) # or change it to a 3x3x2 tensor (The total number of elements must multiply to be the same as 18)

print(emb.shape) # 26x3x2
emb.view(USE_THIS,6) # 26 and 3x2 (6) = 26, 6 - we can reshape the embeddings matrix with the view

# print(emb.view(26,6) == torch.cat(torch.unbind(emb, 1), 1)) # using view is equivalent to the less efficient method with unbind and cat

# This now gives us the hidden states we're after (see 28:16 timestamp - hidden layer of activations?)
# Use shape[0] to not hard code, or You can also have pytorch derive the size for the view automatically using -1:     
#    emb.view(-1, 6)
h = emb.view(emb.shape[0],6) @ W1 + b1 # multiply embeddings vals by the weights and add the bias
# NOTE on the bias: make sure the dimensions match up for broadcasting always - see timestamp 28:27
print('h Shape: ', h.shape) # this is the 100 dimensional activations for our 27 examples

# use tanh to make the activations between -1 and 1 (squashing function)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
print(h)
print(h.shape) # 26x100 - hidden layer of activations shown at timestamp 28:16 for every one of the 27 examples

single vector of 18 before view:  tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])
torch.Size([790, 3, 2])
h Shape:  torch.Size([790, 100])
tensor([[-0.9715, -0.9957, -0.9990,  ...,  0.9912, -0.8431,  0.5019],
        [ 0.6854, -0.9968, -0.7634,  ..., -0.9470, -0.9959,  0.9388],
        [ 0.9979, -0.9220, -0.9910,  ..., -0.9506,  0.8473,  0.4729],
        ...,
        [ 0.9996,  0.5055, -0.3891,  ..., -0.8670, -1.0000,  1.0000],
        [ 0.9987,  0.9996, -0.2071,  ..., -0.8807, -0.9720,  0.0201],
        [-0.6256,  0.9991, -0.3339,  ..., -0.0425, -0.9890, -0.4152]])
torch.Size([790, 100])


In [45]:
# Create the final layer of the neural net (29:21 timetstamp)

# initialize weights and biases
W2 = torch.randn(100, USE_THIS) # 100 inputs and 27 possible characters that can come next
b2 = torch.randn(USE_THIS) # the biases need to be 27 as well (match the columns dimension of weights)

In [46]:
logits = h @ W2 + b2

print('logits shape: ', logits.shape)
# logits

logits shape:  torch.Size([790, 790])


In [47]:
# exponentiate the logits to get the fake counts
counts = logits.exp()
print('counts shape: ', counts.shape)

# normalize them into a probability
prob = counts / counts.sum(1, keepdim=True) # sum along the first dimension (the row)
print('prob shape: ', prob.shape)
print('every row of prob now sums to 1:')
print(prob[0].sum()) # sum up everything along the first dimension (by rows)

counts shape:  torch.Size([790, 790])
prob shape:  torch.Size([790, 790])
every row of prob now sums to 1:
tensor(1.0000)


In [48]:
# The next layer comes from Y which created during data separation - the next character in the sequence we want to predict
#   See timestamp 30:40
print(Y)
print(Y.shape)
print(prob.shape)

print(torch.arange(Y.shape[0]).shape)

# index into each row of probabilities and pluck out the probability assigned to the correct character
probabilities = prob[torch.arange(Y.shape[0]), Y] # before training a lot of these will be very close to 0
# After training ideally all the numbers should be 1 which means we're correctly predicting the next character
print('probs: ', probabilities)


tensor([ 5, 13, 13,  1,  0, 12,  9,  1, 13,  0, 15, 12,  9, 22,  9,  1,  0, 14,
        15,  1,  8,  0,  1, 22,  1,  0,  5, 20,  8,  1, 14,  0, 19, 15, 16,  8,
         9,  1,  0, 13,  1, 19, 15, 14,  0, 13,  1, 20, 20,  8,  5, 23,  0, 13,
        15,  4,  9, 14,  5,  0,  9, 19,  1,  2,  5, 12, 12,  1,  0, 23,  9, 12,
        12,  9,  1, 13,  0, 13,  9,  1,  0, 10,  1, 13,  5, 19,  0,  3,  8,  1,
        18, 12, 15, 20, 20,  5,  0,  2,  5, 14, 10,  1, 13,  9, 14,  0,  1, 13,
         5, 12,  9,  1,  0, 12, 21,  3,  1, 19,  0,  8,  1, 18, 16,  5, 18,  0,
         8,  5, 14, 18, 25,  0,  5, 22,  5, 12, 25, 14,  0,  1, 12,  5, 24,  1,
        14,  4,  5, 18,  0,  1,  2,  9,  7,  1,  9, 12,  0, 13,  9,  3,  8,  1,
         5, 12,  0,  5, 13,  9, 12, 25,  0,  4,  1, 14,  9,  5, 12,  0,  5, 12,
         9, 26,  1,  2,  5, 20,  8,  0, 10,  1,  3, 15,  2,  0, 13,  9, 12,  1,
         0, 12, 15,  7,  1, 14,  0,  5, 12, 12,  1,  0, 10,  1,  3, 11, 19, 15,
        14,  0,  1, 22,  5, 18, 25,  0, 

In [49]:
# Get the negative log likelihood
# take the log of the probability and get the average log probability and take the negative of it
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
print('loss: ', loss) # this is the loss we want to minimize

loss:  tensor(27.5185)


In [50]:
############# REWRITTEN FROM TIMESTAMP 32:25 ##################

# Dataset
X.shape, Y.shape

(torch.Size([790, 3]), torch.Size([790]))

In [51]:
g = torch.Generator().manual_seed(2147483647) # to make deterministic and reproducable
# parameters:
C = torch.randn((USE_THIS, 2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, USE_THIS), generator=g)
b2 = torch.randn(USE_THIS, generator=g)
parameters = [C, W1, b1, W2, b2]

In [52]:
print('Number of parameters: ', sum(p.nelement() for p in parameters)) # total parameters

Number of parameters:  82070


In [57]:
for p in parameters:
    p.requires_grad = True # needed to do with pytorch before starting gradient descent iterations

for _ in range(100):
    # contruct a mini-batch for efficiency:
    ix = torch.randint(0, X.shape[0], (32,)) # integers we want to optimize in this iteration. See timestamp 43:38

    # Forward Pass
    emb = C[X[ix]] # (NUM_EXAMPLES, 3, 2). Only grab the rows of the mini-batch
    # print(emb.shape)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (26, 100)
    logits = h @ W2 + b2 # (26, 26)
    # This is classification:
    # You would never do this manually in practice - it is very inefficient and see reasons around timestamp 36:00
    # counts = logits.exp()
    # prob = counts / counts.sum(1, keepdim=True)
    # loss = -prob[torch.arange(emb.shape[0]), Y].log().mean()
    loss = F.cross_entropy(logits, Y[ix]) # use builtin cross_entropy() instead of calc manually above - better and efficient. Also use the mini-batch by passing ix into indexing into Y - makes it MUCH FASTER
    print(loss.item()) # expresses how well the neural network works with current set of parameters

    # Backward Pass
    for p in parameters:
        p.grad = None # same as setting it to 0 in pytorch
    # use .backward to populate the gradients
    loss.backward()

    # update the parameters
    LEARNING_RATE = 0.1
    for p in parameters:
        p.data += -LEARNING_RATE * p.grad # nudge the parameters by the learning rate



# The loss should show as decreasing more and more with more iterations

# NOTE: Overfitting is occurring with the above because you have so many parameters (3,378) for so few examples (26 or 32)
# This makes the loss very easy to make low
# See timestamp 40:40

15.423449516296387
13.593306541442871
16.120805740356445
13.175605773925781
10.533838272094727
12.658477783203125
12.15114688873291
14.161653518676758
13.379488945007324
11.827338218688965
12.270668029785156
9.604423522949219
9.870196342468262
9.376617431640625
9.859009742736816
10.342354774475098
7.551503658294678
6.221015453338623
10.087067604064941
7.332768440246582
9.642024040222168
6.610126972198486
11.33931827545166
8.919302940368652
9.880581855773926
8.215899467468262
9.752113342285156
8.777703285217285
8.369864463806152
7.183315277099609
7.9839067459106445
7.251392841339111
10.04757308959961
9.06892204284668
8.216794967651367
5.529954433441162
4.962522506713867
9.235088348388672
5.746991157531738
5.8533148765563965
6.736738681793213
6.80381441116333
5.472954750061035
4.757626056671143
4.711280345916748
3.965932846069336
6.654690742492676
5.054947376251221
4.748981475830078
6.900294303894043
6.73411750793457
5.934909343719482
4.919130802154541
6.0589375495910645
5.05915260314941

In [54]:
# we can use built in function for classification to get the loss
builtin = F.cross_entropy(logits, Y) 
print('builtin cross entropy: ', builtin)
print('original loss: ', loss) # manual calc is the same as the builtin cross_entropy()


builtin cross entropy:  tensor(21.7302, grad_fn=<NllLossBackward0>)
original loss:  tensor(21.7302, grad_fn=<NllLossBackward0>)


In [None]:
# indexes into the dataset - random ints and there are 32 of them
torch.randint(0, X.shape[0], (32,))

tensor([175, 670, 387,  11, 541,   4, 699, 596, 381, 737, 323, 417, 443, 482,
        599, 114, 318, 337, 116, 306, 293, 625, 783, 685, 375, 411, 523, 755,
        379, 643, 457, 709])