In [68]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [69]:
#@title Load Dataset

words1 = open('mkd_names_1.txt', 'r').read().splitlines()
words2 = open('mkd_names_2.txt', 'r').read().splitlines()

words1 = [x.lower() if isinstance(x, str) else x for x in words1]
words2 = [x.lower() if isinstance(x, str) else x for x in words2]

words2_new = [w.split()[0] for w in words2]

words = words1 + words2_new
words = list(dict.fromkeys(words))  # list(set(words))

words[:5]

['аврам', 'александар', 'алимпиј', 'атанас', 'ангел']

In [70]:
#@title Build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s: i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}
print(itos)

{1: 's', 2: 'а', 3: 'б', 4: 'в', 5: 'г', 6: 'д', 7: 'е', 8: 'ж', 9: 'з', 10: 'и', 11: 'к', 12: 'л', 13: 'м', 14: 'н', 15: 'о', 16: 'п', 17: 'р', 18: 'с', 19: 'т', 20: 'у', 21: 'ф', 22: 'х', 23: 'ц', 24: 'ч', 25: 'ш', 26: 'ѓ', 27: 'ѕ', 28: 'ј', 29: 'љ', 30: 'њ', 0: '.'}


In [71]:
context = [0] * 3
print(''.join('a' for i in range(3)))

aaa


In [72]:
#@title Build the dataset

block_size = 3  # context (window) length: how many characters do we take to predict the next one (moving window)
X, Y = [], []

for w in words[:5]:
  print(w)
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    print(''.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix]  # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

аврам
... ---> а
..а ---> в
.ав ---> р
авр ---> а
вра ---> м
рам ---> .
александар
... ---> а
..а ---> л
.ал ---> е
але ---> к
лек ---> с
екс ---> а
кса ---> н
сан ---> д
анд ---> а
нда ---> р
дар ---> .
алимпиј
... ---> а
..а ---> л
.ал ---> и
али ---> м
лим ---> п
имп ---> и
мпи ---> ј
пиј ---> .
атанас
... ---> а
..а ---> т
.ат ---> а
ата ---> н
тан ---> а
ана ---> с
нас ---> .
ангел
... ---> а
..а ---> н
.ан ---> г
анг ---> е
нге ---> л
гел ---> .


In [73]:
X.shape, Y.shape, X.dtype, Y.dtype

(torch.Size([38, 3]), torch.Size([38]), torch.int64, torch.int64)

In [74]:
X[0], Y[0]

(tensor([0, 0, 0]), tensor(2))

In [75]:
VOCAB_SIZE = 30 + 1
EMBED_SIZE = 2
C = torch.rand((VOCAB_SIZE, EMBED_SIZE))

In [76]:
C[5]

tensor([0.0598, 0.9335])

In [77]:
F.one_hot(torch.tensor(5), num_classes=VOCAB_SIZE).float() @ C

tensor([0.0598, 0.9335])

In [78]:
emb = C[X]
emb.shape

torch.Size([38, 3, 2])

In [79]:
W1 = torch.randn((6, 100))
b1 = torch.rand(100)

In [80]:
# emb @ W1 + b1  --> This won't work
# RuntimeError: mat1 and mat2 shapes cannot be multiplied (114x2 and 6x100)

In [81]:
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1).shape  # Inefficient, as it creates new memory.

torch.Size([38, 6])

In [82]:
torch.cat(torch.unbind(emb, dim=1), dim=1).shape

torch.Size([38, 6])

In [83]:
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [84]:
a.shape

torch.Size([18])

In [85]:
a.view(9, 2)  # more efficient way

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

In [86]:
a.storage()

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [87]:
h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1)  # emb.view(-1, 6) this also works
h.shape

torch.Size([38, 100])

In [88]:
W2 = torch.randn((100, VOCAB_SIZE))
b2 = torch.rand(VOCAB_SIZE)

logits = h @ W2 + b2
logits.shape

torch.Size([38, 31])

In [89]:
counts = logits.exp()
prob = counts / counts.sum(dim=1, keepdim=True)
prob.shape

torch.Size([38, 31])

In [90]:
loss = -prob[torch.arange(38), Y].log().mean()
loss

tensor(17.4565)

Putting everything together:

In [91]:
g = torch.Generator().manual_seed(123)  # for reproducibility
C = torch.randn((VOCAB_SIZE, 2), generator=g)

W1 = torch.randn((6, 100))
b1 = torch.rand(100)

W2 = torch.randn((100, VOCAB_SIZE))
b2 = torch.rand(VOCAB_SIZE)

parameters = [W1, b1, W2, b2]
sum(p.nelement() for p in parameters)  # number of parameters in total

3831

In [99]:
for p in parameters:
  p.requires_grad = True

for i in range(1000):

  # Forward pass
  emb = C[X]
  h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # emb.view(emb.shape[0], 6) also works
  logits = h @ W2 + b2
  # counts = logits.exp()
  # prob = counts / counts.sum(dim=1, keepdim=True)
  # loss = -prob[torch.arange(38), Y].log().mean()
  loss = F.cross_entropy(logits, Y)
  # print(loss.item())

  # Backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # Update
  for p in parameters:
    p.data -= 0.1 * p.grad

print(loss.item())

0.21912384033203125


In [96]:
logits.max(1)

torch.return_types.max(
values=tensor([14.2162, 11.2371, 13.2242, 17.7287, 16.5510, 24.1478, 14.2162, 11.2371,
        14.1586, 13.3232, 14.3684, 16.8727, 15.6981, 16.2222, 12.9035, 20.7307,
        14.3791, 14.2162, 11.2371, 14.1586, 13.0246, 15.3155, 15.8626, 16.3694,
        17.3394, 14.2162, 11.2371, 18.7028, 14.3941, 17.3519, 14.5328, 20.7451,
        14.2162, 11.2371, 17.2718, 18.6365, 16.3216, 13.8586],
       grad_fn=<MaxBackward0>),
indices=tensor([ 2, 12, 17,  2, 13,  0,  2, 12,  7, 11, 18,  2, 14,  6,  2, 17,  0,  2,
        12,  7, 13, 16, 10, 28,  0,  2, 12,  2, 14,  2, 18,  0,  2, 12,  5,  7,
        12,  0]))

In [97]:
Y

tensor([ 2,  4, 17,  2, 13,  0,  2, 12,  7, 11, 18,  2, 14,  6,  2, 17,  0,  2,
        12, 10, 13, 16, 10, 28,  0,  2, 19,  2, 14,  2, 18,  0,  2, 14,  5,  7,
        12,  0])

We are overfitting, but not completely. We can't completely overfit because for some training samples the input is the same (...), but the target is different (a different letter).

In [None]:
block_size = 3  # context (window) length: how many characters do we take to predict the next one (moving window)
X, Y = [], []

for w in words:
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    context = context[1:] + [ix]  # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

g = torch.Generator().manual_seed(123)  # for reproducibility
C = torch.randn((VOCAB_SIZE, 2), generator=g)

W1 = torch.randn((6, 100))
b1 = torch.rand(100)
W2 = torch.randn((100, VOCAB_SIZE))
b2 = torch.rand(VOCAB_SIZE)
parameters = [W1, b1, W2, b2]

for p in parameters:
  p.requires_grad = True

BATCH_SIZE = 32

for i in range(10000):

  # minibatch construct
  ix = torch.randint(0, X.shape[0], (32, ))

  # Forward pass
  emb = C[X[ix]]  # (32, 3, 2)
  h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # (32, 100)
  logits = h @ W2 + b2
  # counts = logits.exp()
  # prob = counts / counts.sum(dim=1, keepdim=True)
  # loss = -prob[torch.arange(38), Y].log().mean()
  loss = F.cross_entropy(logits, Y[ix])
  # print(loss.item())

  # Backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # Update
  for p in parameters:
    p.data -= 0.1 * p.grad

emb = C[X]  # (32, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)  # (32, 100)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)
print(loss.item())

1.856877326965332
