In [87]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline


In [88]:
words = open('/content/name.txt','r').read().splitlines()

In [89]:
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [90]:
len(words)

32033

In [91]:
# build the vocabulary of characters and mapping to from integers

cahrs = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(cahrs)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [92]:
# build the dataset

block_size = 3 # context length: how many characters do we take to
X,Y = [],[]
for w in words[:5]:
  print(w)
  context = [0]*block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    print(''.join(itos[i] for i in context), '----->',itos[ix])
    context = context[1:] + [ix]
X = torch.tensor(X)
Y = torch.tensor(Y)

emma
... -----> e
..e -----> m
.em -----> m
emm -----> a
mma -----> .
olivia
... -----> o
..o -----> l
.ol -----> i
oli -----> v
liv -----> i
ivi -----> a
via -----> .
ava
... -----> a
..a -----> v
.av -----> a
ava -----> .
isabella
... -----> i
..i -----> s
.is -----> a
isa -----> b
sab -----> e
abe -----> l
bel -----> l
ell -----> a
lla -----> .
sophia
... -----> s
..s -----> o
.so -----> p
sop -----> h
oph -----> i
phi -----> a
hia -----> .


In [93]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [94]:
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [95]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

First we need to make the lookup table C

In [96]:
C = torch.randn((27,2))

In [97]:
C

tensor([[ 0.1632,  0.0027],
        [-0.3457, -0.7611],
        [ 0.3607, -1.6767],
        [ 0.0512, -0.1271],
        [-1.4184,  0.1704],
        [ 0.6454, -0.6561],
        [-1.8666, -1.3483],
        [ 0.0731,  0.2369],
        [-0.0827, -0.2833],
        [-0.4592,  1.8215],
        [-0.9676,  0.5895],
        [-0.7444,  0.0755],
        [-1.0749, -0.0457],
        [-0.9963,  1.0991],
        [ 0.5406, -0.2172],
        [ 0.1269,  0.4111],
        [ 1.3428,  0.6710],
        [-0.2983,  0.1832],
        [-0.2311, -0.3014],
        [ 1.8139,  0.3023],
        [-0.0461, -0.5044],
        [-1.0156, -0.0462],
        [-0.5005,  0.6779],
        [ 0.5221,  0.4044],
        [ 0.3632,  0.0488],
        [ 2.2957, -0.9266],
        [ 0.6050,  0.1067]])

In [98]:
C[5]

tensor([ 0.6454, -0.6561])

In [99]:
F.one_hot(torch.tensor(5),num_classes=27).float() @ C


tensor([ 0.6454, -0.6561])

In [100]:
C[5]

tensor([ 0.6454, -0.6561])

In [101]:
C[[5,6,7]]

tensor([[ 0.6454, -0.6561],
        [-1.8666, -1.3483],
        [ 0.0731,  0.2369]])

In [102]:
C[X]

tensor([[[ 0.1632,  0.0027],
         [ 0.1632,  0.0027],
         [ 0.1632,  0.0027]],

        [[ 0.1632,  0.0027],
         [ 0.1632,  0.0027],
         [ 0.6454, -0.6561]],

        [[ 0.1632,  0.0027],
         [ 0.6454, -0.6561],
         [-0.9963,  1.0991]],

        [[ 0.6454, -0.6561],
         [-0.9963,  1.0991],
         [-0.9963,  1.0991]],

        [[-0.9963,  1.0991],
         [-0.9963,  1.0991],
         [-0.3457, -0.7611]],

        [[ 0.1632,  0.0027],
         [ 0.1632,  0.0027],
         [ 0.1632,  0.0027]],

        [[ 0.1632,  0.0027],
         [ 0.1632,  0.0027],
         [ 0.1269,  0.4111]],

        [[ 0.1632,  0.0027],
         [ 0.1269,  0.4111],
         [-1.0749, -0.0457]],

        [[ 0.1269,  0.4111],
         [-1.0749, -0.0457],
         [-0.4592,  1.8215]],

        [[-1.0749, -0.0457],
         [-0.4592,  1.8215],
         [-0.5005,  0.6779]],

        [[-0.4592,  1.8215],
         [-0.5005,  0.6779],
         [-0.4592,  1.8215]],

        [[-0.5005,  0

In [103]:
C[X].shape

torch.Size([32, 3, 2])

In [104]:
X[13,2]

tensor(1)

In [105]:
C[X][13,2]

tensor([-0.3457, -0.7611])

In [106]:
C[1]

tensor([-0.3457, -0.7611])

In [107]:
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [108]:
W1 = torch.rand((6,100))
b1 = torch.rand(100)


In [109]:
# emb @ W1 + b
emb.view(-1,6)@ W1 + b1

tensor([[ 1.0623,  0.4600,  0.9239,  ...,  0.6606,  0.9302,  0.7236],
        [ 1.0494,  0.0911,  0.6176,  ...,  0.8898,  1.0178,  1.1041],
        [ 0.7524,  1.2686,  1.3595,  ...,  0.1403,  0.7741,  0.0549],
        ...,
        [ 2.3355,  1.0718,  1.8633,  ...,  1.7967,  1.8805,  1.8334],
        [ 2.6052,  1.7226,  2.1039,  ...,  0.9023,  1.8168,  0.9355],
        [ 1.4063, -0.5952,  0.6512,  ...,  0.4944,  0.1216, -0.3667]])

In [110]:
torch.cat([emb[:,0,:], emb[:,1,:], emb[:,2,:]],1).shape

torch.Size([32, 6])

In [111]:
torch.cat(torch.unbind(emb,1),1).shape

torch.Size([32, 6])

In [112]:
a = torch.arange(18)

In [113]:
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

As long as the multiplipcation of the number is the same, it will work

In [114]:
a.view(2,9)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [115]:
a.view(3,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [116]:
a.storage()

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [117]:
h = emb.view(emb.shape[0],6) @ W1 + b1

In [118]:
h.shape

torch.Size([32, 100])

In [119]:
h = torch.tanh(emb.view(-1,6)@ W1 + b1)

In [120]:
h

tensor([[ 0.7865,  0.4301,  0.7277,  ...,  0.5788,  0.7307,  0.6191],
        [ 0.7816,  0.0908,  0.5494,  ...,  0.7113,  0.7690,  0.8020],
        [ 0.6366,  0.8534,  0.8763,  ...,  0.1393,  0.6493,  0.0548],
        ...,
        [ 0.9814,  0.7901,  0.9530,  ...,  0.9465,  0.9545,  0.9502],
        [ 0.9891,  0.9382,  0.9707,  ...,  0.7174,  0.9485,  0.7331],
        [ 0.8867, -0.5336,  0.5725,  ...,  0.4577,  0.1210, -0.3511]])

In [121]:
h.shape

torch.Size([32, 100])

In [122]:
W2 = torch.randn((100,27))
b2 = torch.randn(27)


In [123]:
logits = h@W2 + b2

In [124]:
logits.shape

torch.Size([32, 27])

In [125]:
counts = logits.exp()
prob = counts/counts.sum(1,keepdim=True)

In [126]:
prob.shape

torch.Size([32, 27])

In [127]:
prob[0].sum()

tensor(1.0000)

In [131]:
loss = -prob[torch.arange(32),Y].log().mean()

In [132]:
loss

tensor(17.7954)