In [1]:
# https://youtu.be/TCH_1BHY58I?t=1868

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the training corpus: one name per line.
# The context manager closes the file handle as soon as the read is done,
# instead of leaving an open handle behind for the life of the kernel.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
# total number of names (lines) read from the file
len(words)

32033

In [5]:
# Vocabulary: every distinct character that appears in the names, in sorted order.
chars = sorted(set(''.join(words)))

# Character -> integer id. Letters are numbered from 1 upward; id 0 is
# reserved for the '.' start/stop marker, which is added afterwards.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Integer id -> character (the inverse of stoi).
itos = dict((idx, ch) for ch, idx in stoi.items())
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


# #1

In [6]:
# Build the dataset: for every character of every name (plus the final '.'),
# store the indices of the block_size characters that precede it (the context)
# and the index of the character itself (the label).
block_size = 3 # context length: how many characters do we take to predict the next one?
contexts, targets = [], []
for name in words[:5]:
    print(name)

    # a name starts with an all-'.' context (index 0 repeated block_size times)
    window = [0] * block_size

    for char in name + '.':  # the trailing '.' is the stop character
        target = stoi[char]
        contexts.append(window)
        targets.append(target)
        print(''.join(itos[i] for i in window), '---->', itos[target])

        # slide the window: drop the oldest index and append the newest.
        # A new list is built each time, so rows already stored are not modified.
        window = [*window[1:], target]

X = torch.tensor(contexts)
Y = torch.tensor(targets)

emma
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [7]:
# sanity check: X has one row of block_size context indices per example, Y one integer label per example
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

## Build the Lookup Table (embedding space)
Map each of the 27 characters into a low-dimensional embedding space (2 dimensions to start with).

# #2

In [8]:
# initially use a 27 --> 2 dimensional space; 27 characters in lexicon, 2 parameter embedding
# Seed the global RNG first so that Restart Kernel -> Run All reproduces the same
# "random" starting values here and in every torch.randn call that follows.
torch.manual_seed(2147483647)
C = torch.randn((27, 2)) # random starting values, untrained
C[5]

tensor([0.6538, 0.1818])

In [9]:
# Example: one-hot encode character index 5 as a length-27 vector.
# one_hot produces integers, so the result is explicitly converted to float32.
char_index = torch.tensor(5)
example_row = F.one_hot(char_index, num_classes=27).to(torch.float32)
example_row

tensor([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [10]:
example_row @ C  # output is identical to C[5]: a one-hot row vector times C picks out exactly row 5 (a linear layer with no bias and no nonlinearity)
# the effect is to take the C values for that row and feed them into the neural network as the embedding

tensor([0.6538, 0.1818])

In [11]:
# The first layer therefore uses plain row lookups into C, since they give the
# same result as the one-hot product. Indexing with a tensor of row numbers
# fetches all of those rows in one go.
row_ids = torch.tensor([5, 6, 7, 7, 7]) # the same row may be requested more than once
C[row_ids]

tensor([[0.6538, 0.1818],
        [0.6006, 0.4620],
        [1.0696, 2.0331],
        [1.0696, 2.0331],
        [1.0696, 2.0331]])

In [12]:
C[X].shape  # looks up a row of the (27, 2) matrix from #2 for every index in the (32, 3) tensor X --> (32, 3, 2)

torch.Size([32, 3, 2])

In [13]:
# X is the collection of the 3 preceding character indices for every example:
# (row = example, column = position within the 3-character context)
X[13, 2] # row 13 is the 14th context, from above: ..a ----> v
# column 2 is the last character of that context, so this returns the index for 'a'

tensor(1)

In [14]:
C[X][13, 2] # the embedding for the character at X[13, 2], i.e. 'a'

tensor([ 0.6385, -0.9289])

In [15]:
C[1] # another way to get the same embedding: index 1 is 'a' (index 0 is '.')

tensor([ 0.6385, -0.9289])

In [16]:
# same check for the '.' character (index 0): X[13, 1] is 0, so C[X][13, 1] equals C[0]
X[13, 1], C[0], C[X][13, 1]

(tensor(0), tensor([0.7421, 0.1760]), tensor([0.7421, 0.1760]))

In [17]:
# every index in X (the 3 preceding characters of each example) is replaced
# by the row of C stored at that index
# the result keeps X's shape (examples, 3 context positions) and gains a trailing
# dimension holding the 2 values of each character's vector from C

emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [18]:
# row 3 of X is the context 'emm'; emb[3] holds the rows of C for its three indices
print(f"indices for 'emm': {X[3]}")
print(f"vector embeddings for indices of X at 'emm':\n {emb[3]} ")

indices for 'emm': tensor([ 5, 13, 13])
vector embeddings for indices of X at 'emm':
 tensor([[ 0.6538,  0.1818],
        [-0.3289, -0.2409],
        [-0.3289, -0.2409]]) 


### Hidden Layer: [page 6](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

In [35]:
# Hidden-layer parameters.
# Each example supplies 3 characters x 2 embedding values (as defined at C) = 6 inputs;
# the 100 hidden units are a free design choice.
n_inputs, n_hidden = 6, 100
W1 = torch.randn(n_inputs, n_hidden)
b1 = torch.randn(n_hidden)
print(W1.shape, b1.shape)

torch.Size([6, 100]) torch.Size([100])


In [20]:
# goal: emb @ W1 + b1, but emb is (32, 3, 2) and W1 takes 6 inputs per row --> need (32, 6)
# take the C vectors at each of the 3 context positions and join them side by side
# listing the positions by hand ties this to block_size = 3 --> torch.unbind removes that limit
torch.cat((emb[:, 0], emb[:, 1], emb[:, 2]), dim=1).shape

torch.Size([32, 6])

#### Technique 1

In [21]:
# torch.unbind removes the given dimension and returns the slices along it
# (torch.unbind(emb, 1)) # produces a tuple of 3 tensors: the 2D vectors from C for each of the 3 positions in dimension 1

In [25]:
# Split emb along dimension 1 (one slice per context position) and join the
# slices side by side. Unlike listing the positions by hand, this follows
# block_size automatically, because block_size is the length of dimension 1.
# Downside: cat allocates a new tensor for the result.
per_position = emb.unbind(dim=1)
emb2 = torch.cat(per_position, dim=1)
emb2.shape

torch.Size([32, 6])

#### Technique 2

In [27]:
# More efficient way to get the same layout: view only reinterprets the shape of
# the existing data, so no new tensor contents are created.
# The row count is taken from emb (not hard-coded as 32) and -1 lets pytorch work
# out the flattened width, so this keeps working for any dataset size.
emb3 = emb.view(emb.shape[0], -1)
emb3.shape

torch.Size([32, 6])

In [29]:
# emb2 == emb3   # elementwise comparison

#### Generalized use of the Hidden Layer

Take the time to confirm the shapes of the operands:
- emb.view(-1, 6) --> (32, 6)
- W1 --> (6, 100)
- b1 --> (100,) ; confirm the [broadcasting rules](https://pytorch.org/docs/stable/notes/broadcasting.html) ; shapes are aligned from the trailing dimension, so b1 is treated as (1, 100) and the same bias row is added to each of the 32 rows of the product

In [33]:
# hidden layer: a 100-long activation vector for each of the 32 samples
flat_emb = emb.view(-1, 6)  # the -1 value leads pytorch to derive the row count (32)
h_ = flat_emb @ W1 + b1  # pre-activations; b1 is broadcast across the rows
h = h_.tanh()
h.shape

torch.Size([32, 100])

### Output Layer
Derives the probabilities for the next character from the 3 input characters (the context).

In [36]:
# Output-layer parameters: 100 hidden activations in, one score out per character (27).
n_hidden, n_chars = 100, 27
W2 = torch.randn(n_hidden, n_chars)
b2 = torch.randn(n_chars)

In [37]:
# raw output scores: (32, 100) @ (100, 27) + (27,) --> one score per character for each sample
logits = h @ W2 + b2

In [39]:
# one row per sample, one column per character
logits.shape

torch.Size([32, 27])

In [40]:
# probabilities for each of 27 characters based on the preceding 3 character values
# softmax by hand: exponentiate the logits, then normalize each row to sum to 1.
# The row maximum is subtracted before exp(): it cancels in the ratio, so prob is
# unchanged, but exp() can no longer overflow to inf (which would make the row nan).
counts = (logits - logits.max(1, keepdim=True).values).exp()
prob = counts / counts.sum(1, keepdim=True)

In [41]:
# check: one probability per character for each sample, and a row sums to 1
prob.shape, prob[0].sum()

(torch.Size([32, 27]), tensor(1.))

In [43]:
# the probability that prob assigns to the correct next character (the index in Y)
# for every sample: row i of prob is read at column Y[i]
# the closer to 1 the better trained the network is at predicting the correct Y
# the row indices are derived from Y instead of hard-coding 32, so this still
# works when the dataset is built from more than the first 5 names

prob[torch.arange(Y.shape[0]), Y]

Y's shape:  torch.Size([32])


tensor([1.5532e-15, 9.8704e-10, 3.7192e-05, 5.2907e-02, 4.0825e-04, 1.9278e-08,
        7.3502e-14, 4.1476e-12, 1.6223e-10, 3.2840e-11, 2.7298e-10, 1.2534e-09,
        3.5036e-08, 8.6059e-01, 2.3232e-05, 7.0813e-03, 7.7228e-13, 5.9000e-09,
        8.2822e-01, 6.5854e-10, 8.3075e-16, 3.2893e-17, 1.8251e-11, 1.8070e-06,
        4.0434e-04, 5.0777e-05, 8.6054e-09, 3.9314e-07, 1.1683e-04, 6.1279e-11,
        1.0775e-11, 1.2653e-01])