In [13]:
import json
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(123)

<torch._C.Generator at 0x7f4516de6770>

# Dataset

## Load

MKD songs custom dataset

In [14]:
def read_json_songs_to_str(filename='mkd_songs.json'):
  with open(filename, 'r', encoding='utf-8') as json_file:
    return json.load(json_file)

In [15]:
songs = []
for filename in ['mkd_songs_1.json', 'mkd_songs_2.json', 'mkd_songs_3.json']:
  songs.extend(read_json_songs_to_str(filename))
len(songs)

298

In [16]:
songs[0]

'Денови\nКако на вратот ѓердани\nниски камења студени,\nтака на плешки денови\nлегнале та натежнале\nДенови ли се — денови\nаргатски маки големи!\nСтани си утре порано\nдојди си вечер подоцна,\nнаутро радост понеси\nнавечер тага донеси —\nај пуст да е, пуст да би\nостанал живот кучешки!\nРоди се човек — роб биди\nроди се човек — скот умри\nскотски цел живот работи\nза други, туѓи имоти.\nЗа туѓи бели дворови,\nкопај си црни гробови!\nЗа себе само ’ргај си\nза себе маки тргај си —\nнижи си ѓердан денови\nнижи си алки ковани,\nнижи си синџир железен\nоколу вратот навезен!'

The import worked, let's proceed.

## Data preprocessing

In [17]:
#@title Initial vocabulary
chars = sorted(list(set(''.join(songs))))
vocab_size = len(chars)
print(vocab_size)

117


In [18]:
print(''.join(chars))


 !"'()*,-.0123456789:;?ACHIMOPTV[]`acejopuxyèЃЈЉЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѐёѓѕјљњќѝџ–—‘’“”„…


In [19]:
'è' == 'ѐ'

False

- Replace latinic letters (ACHMOPTV) with corresponding cyrillic letters.
- Remove arabic numbers such as 6, 7, 8, etc. from 'Огинот'
- Replace '…' (elipsis) with three dots.
- Combine '–' (en dash) and '—' (em dash) to just '-' (en dash).
- Replace 'ѐ' with 'e' and 'ѝ' with 'и'.
  - For a fancier model this shall not be done, but here we can simplify.
- Drop: ‘’“”„
  - They do not add much value for the given task.
- Replace '*' with space.
  - There are some lines such as '* * *', which mean nothing except a new line.
- Replace 'u' with 'и'.
- Remove 'I' and 'V', as they are used for roman numbers, which we don't need.
- Remove '2х' and 'x2'.
- Replace '6' with 'б'.
  - Mistake in the text.

And some others...

In [20]:
# for idx, song in enumerate(songs):
#   for char in song:
#     if char == 'V':
#       print(idx)
#       print(song)
#       break

In [21]:
#@title Preprocessing steps
processed_songs = []

for song in songs:
  # Rule 1: Replace Latinic letters (ACHMOPTV) with corresponding Cyrillic letters
  latin_to_cyrillic = str.maketrans("ACHMOPTacejopxy", "АСНМОПТасејорху")
  song = song.translate(latin_to_cyrillic)

  # Rule 2: Remove Arabic numbers
  song = song.replace('х2', ' ')
  song = song.replace('2х', ' ')
  song = song.replace('х 2', ' ')
  song = song.replace('2 пати', ' ')
  song = song.replace('2пати', ' ')
  song = song.replace('3х', ' ')
  song = re.sub(r'^\d+\.\s*', '\n', song, flags=re.MULTILINE)
  song = re.sub(r'\d+', '\n', song)

  # Special characters
  song = song.replace('…', '...')
  song = song.replace(';', ',')
  song = re.sub(r'[‘’“”„"\'`\[\]\(\)]', '', song)
  song = re.sub(r"[-–—]", "-", song)
  song = song.replace('*', ' ')

  # Special letters
  song = song.replace('ѐ', 'е').replace('è', 'е').replace('ѝ', 'и').replace('ё', 'е')

  # Remaining wrong letters
  song = song.replace("u", "и")
  song = song.replace('6', 'б')

  # Rule 9: Replace Roman numbers followed by a dot with a space
  song = re.sub(r"\b[IV]+\.\s*", "", song)
  song = re.sub(r"[IV]", "", song)

  # song = re.sub(r'-+\n', '\n', song)
  song = re.sub(r'^-+\s*', '', song, flags=re.MULTILINE)
  song = re.sub(r'-+\n', '\n', song)
  song = re.sub(r'^\.+\n', '', song, flags=re.MULTILINE)

  song = song.replace('Р А Н И', 'РАНИ')
  song = song.replace('Н А Р О Д Н И О Т П А БОТНИ К . . .', 'НАРОДНИОТ ПАБОТНИК...')
  song = song.replace('Б Р А Т О У Б И Е Ц', 'БРАТОУБИЕЦ')
  song = song.replace('П А Т Р И О Т', 'ПАТРИОТ')
  song = song.replace('Н А М Е А Н А', 'НА МЕАНА')
  song = song.replace('А Ј Д У Ш К О Л И Б Е', 'АЈДУШКО ЛИБЕ')
  song = song.replace('Ч О Р Б А Д Ж И СПИРО', 'ЧОРБАДЖИ СПИРО')
  song = song.replace('Л Е А Р И Т Е', 'ЛЕАРИТЕ')
  song = song.replace('П Р О Т А Т А', 'ПРОТАТА')
  song = song.replace('А М А Л О Т', 'АМАЛОТ')
  song = song.replace('Ж Е Т В А Р И Т Е', 'ЖЕТВАРИТЕ')
  song = song.replace('О Г И Н О Т', 'ОГИНОТ')
  song = song.replace('\nпоема\n', '\n')

  # Clean up extra spaces and newlines
  song = re.sub(r'\n\s*\n+', '\n\n', song)

  # Add processed song to the list
  processed_songs.append(song)

In [22]:
#@title Final vocabulary
chars = sorted(list(set(''.join(processed_songs))))
vocab_size = len(chars)
print(vocab_size)

68


In [23]:
len(processed_songs)

298

In [24]:
# for idx_song, song in enumerate(processed_songs):
#   for idx, char in enumerate(song):
#     if char == ')':
#       print(idx_song)
#       print(song[idx-10:idx+10])
#       break

In [25]:
print(''.join(chars))


 !,-.:?ЃЈЉЌЏАБВГДЕЖЗИКЛМНОПРСТУФХЦЧШабвгдежзиклмнопрстуфхцчшѓѕјљњќџ


In [26]:
chars_concat = ''.join(chars)

capital_letters = sum(1 for c in chars_concat if c.isupper())
lowercase_letters = sum(1 for c in chars_concat if c.islower())
special_characters = sum(1 for c in chars_concat if not c.isalnum())

print(f'Capital letters: {capital_letters}')
print(f'Lower case letters: {lowercase_letters}')
print(f'Special characters: {special_characters}')
print(f'Total: {capital_letters+lowercase_letters+special_characters}')

Capital letters: 29
Lower case letters: 31
Special characters: 8
Total: 68


In [27]:
# for ps in processed_songs:
#   print(ps)
#   print(50*'-')

In [28]:
type(''.join(processed_songs))

str

In [29]:
def save_songs_to_json(songs, filename='mkd_songs.json'):
    with open(filename, 'w', encoding='utf-8') as json_file:
        json.dump(songs, json_file, indent=4, ensure_ascii=False)

save_songs_to_json(songs=''.join(processed_songs), filename='mkd_songs_processed.json')

## Data splitting

In [30]:
#@title Encoder/Decoder
# Create a mapping from characters to integers
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for i, s in enumerate(chars)}

encode = lambda x: [stoi[c] for c in x]
decode = lambda x: ''.join(itos[i] for i in x)

print(encode('еј Дејан'))
print(decode(encode('еј Дејан')))

[42, 63, 1, 17, 42, 63, 37, 49]
еј Дејан


In [31]:
text = ''.join(processed_songs)
data = torch.tensor(encode(text), dtype=torch.long)  # Encode the text and store it to a tensor
print(data.shape)
print(data[:100])

torch.Size([203554])
tensor([17, 42, 49, 50, 39, 45,  0, 22, 37, 46, 50,  1, 49, 37,  1, 39, 52, 37,
        54, 50, 54,  1, 61, 42, 52, 41, 37, 49, 45,  0, 49, 45, 53, 46, 45,  1,
        46, 37, 48, 42, 65, 37,  1, 53, 54, 55, 41, 42, 49, 45,  3,  0, 54, 37,
        46, 37,  1, 49, 37,  1, 51, 47, 42, 60, 46, 45,  1, 41, 42, 49, 50, 39,
        45,  0, 47, 42, 40, 49, 37, 47, 42,  1, 54, 37,  1, 49, 37, 54, 42, 43,
        49, 37, 47, 42,  0, 17, 42, 49, 50, 39])


In [32]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]
print(train_data.shape, val_data.shape)

torch.Size([183198]) torch.Size([20356])


## Data Loader

In [33]:
block_size = 8
train_data[:block_size+1]

tensor([17, 42, 49, 50, 39, 45,  0, 22, 37])

In [34]:
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
  context = x[:t+1]
  target = y[t]
  print(f'When the input is {context}, the target is {target}')

When the input is tensor([17]), the target is 42
When the input is tensor([17, 42]), the target is 49
When the input is tensor([17, 42, 49]), the target is 50
When the input is tensor([17, 42, 49, 50]), the target is 39
When the input is tensor([17, 42, 49, 50, 39]), the target is 45
When the input is tensor([17, 42, 49, 50, 39, 45]), the target is 0
When the input is tensor([17, 42, 49, 50, 39, 45,  0]), the target is 22
When the input is tensor([17, 42, 49, 50, 39, 45,  0, 22]), the target is 37


In [35]:
torch.manual_seed(1337)
batch_size = 4  # how many independent sequences we'll process in parallel
block_size = 8  # what is the maximum context length for predictions

def get_batch(split):
  # generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size, ))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  return x, y

xb, yb = get_batch('train')
print('Inputs:')
print(xb.shape)
print(xb)
print('Targets:')
print(yb.shape)
print(yb)

print(50*'-')

for b in range(batch_size):  # batch dimension
  for t in range(block_size):  # time dimension
    context = xb[b, :t+1]
    target = yb[b, t]
    print(f'When the input is {context.tolist()}, the target is {target}')


Inputs:
torch.Size([4, 8])
tensor([[37,  3,  0, 37,  1, 51, 37,  1],
        [50, 48, 37,  1, 41, 37,  1, 53],
        [47, 37, 46, 54, 45,  1, 51, 47],
        [ 1, 53, 54, 52, 37, 41, 49, 37]])
Targets:
torch.Size([4, 8])
tensor([[ 3,  0, 37,  1, 51, 37,  1, 63],
        [48, 37,  1, 41, 37,  1, 53, 45],
        [37, 46, 54, 45,  1, 51, 47, 37],
        [53, 54, 52, 37, 41, 49, 37,  1]])
--------------------------------------------------
When the input is [37], the target is 3
When the input is [37, 3], the target is 0
When the input is [37, 3, 0], the target is 37
When the input is [37, 3, 0, 37], the target is 1
When the input is [37, 3, 0, 37, 1], the target is 51
When the input is [37, 3, 0, 37, 1, 51], the target is 37
When the input is [37, 3, 0, 37, 1, 51, 37], the target is 1
When the input is [37, 3, 0, 37, 1, 51, 37, 1], the target is 63
When the input is [50], the target is 48
When the input is [50, 48], the target is 37
When the input is [50, 48, 37], the target is 1
When

# Simplest baseline: Bigram Language Model

In [None]:
#@title Model definition

class BigramLanguageModel(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()
    # Each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    # idx and targets are both (B,T) tensor of integers
    logits = self.token_embedding_table(idx)  # (B,T,C)
    # logits are the "probs" for the next character
    if targets == None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B,T) array of indices in the current context
    # we want to predict the next token(s), given the context
    for _ in range(max_new_tokens):
      # get the predictions
      logits, loss = self(idx)  # calls forward
      # focus only on the last time step (retrieve the last element in the time dimension)
      logits = logits[:, -1, :]  # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1)  # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1)  # (B, (T+1))
    return idx
    # This function at the moment is kinda silly, because we pass a lot of
    # history, but use just the last character.
    # We do this because we will reuse it for the following models.


m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 68])
tensor(4.4151, grad_fn=<NllLossBackward0>)


Let's calculate what is the "random chance" loss:

In [None]:
-torch.tensor(1/68).log()

tensor(4.2195)

Our loss is a bit higher than this. The initial predictions aren't good, but we will improve ofc. For visualization purposes, let's try to perform inference:

In [None]:
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


нцЃРљгсљ,нШ.ГХаКтИњшЕАачѕЛШжЃќжк
хџкирЖЕфУрИфЌУњлЏБФНаџјЏЖНРгСљ!ЗтазчснгуѕНЈцхПлПГофГЧ:ФџепЗуГкурВжТ


We can confirm that the predictions are "garbage".

In [None]:
#@title PyTorch Optimizer

optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
101 % 100

1

In [None]:
batch_size = 32

for steps in range(10000):
  # sample a new batch
  xb, yb = get_batch('train')
  # evaluate the loss
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  if steps % 1000 == 0:
    print(loss.item())

4.624702453613281
3.6476426124572754
3.115750312805176
2.738480567932129
2.5497429370880127
2.594252347946167
2.6938440799713135
2.4024441242218018
2.5566020011901855
2.4998960494995117


In [None]:
print(decode(m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


наљуја..
дриднама ми-зем,
Ехејдено праловане мејдаре минкречура набра Думлам пистажахам е
На!
Амини нечадемо,
жел Јоили
Тешт дЏПаГруата ќенах да.
ФТато дајкажунито ти
Зачкмикрзрк крдеБЕдено сит.
пи ќе сеќ смазојна з лида млегн ќезева ел и 
Зе етиј готизечако гоќери!
Курчеги толаш мо

Онори о докри дат т зо
Тре, До, дале ра ст.а мошада беја не тк е чи да кум мицеранцив..Дедота зе роданадури де ијанажи намеро ису се пом нено Ма. дл рви?
Сте ви,
Еле се гото вмо в?И набувра Аја н...
Смовеги м м,
ни 


Dramatic improvements, but we are still far from good.

# The mathematical trick in self attention

In [None]:
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [None]:
#@title Version 1:
# We want x[b,t] = mean_{i<=t} x[b,i]

xbow = torch.zeros((B,T,C))  # bag-of-words
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1]  # (t, C)  -> take everything up to and including the t-th token
    xbow[b, t] = torch.mean(xprev, 0)

In [None]:
x[0]

tensor([[ 0.4005,  1.8703],
        [-1.6798,  1.2621],
        [-0.0083,  1.2187],
        [ 0.7576, -0.3020],
        [ 0.1721, -1.6713],
        [ 0.1849, -0.3305],
        [ 0.0101, -1.0563],
        [-0.4005,  1.8449]])

In [None]:
xbow[0]

tensor([[ 0.4005,  1.8703],
        [-0.6397,  1.5662],
        [-0.4292,  1.4504],
        [-0.1325,  1.0123],
        [-0.0716,  0.4755],
        [-0.0288,  0.3412],
        [-0.0233,  0.1416],
        [-0.0704,  0.3545]])

The first row is the same. The second row is average of first and second, the third is average of 1, 2 and 3, etc...


The for loops aren't efficient. Let's see the trick now:

In [None]:
torch.tril(torch.ones(3, 3))  # lower triangular part

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [None]:
a = torch.tril(torch.ones(3, 3))
a = a.divide(torch.sum(a, dim=1, keepdim=True))
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 0.],
        [4., 0.],
        [7., 3.]])
--
c=
tensor([[2.0000, 0.0000],
        [3.0000, 0.0000],
        [4.3333, 1.0000]])


In [None]:
#@title Version 2

weights = torch.tril(torch.ones(T, T))
weights = weights / weights.sum(1, keepdim=True)
weights.shape  # (T, T)

torch.Size([8, 8])

In [None]:
xbow2 = weights @ x  # (we will broadcast and add B, T, T) @ (B, T, C) -> (B, T, C)
xbow2.shape

torch.Size([4, 8, 2])

In [None]:
torch.allclose(xbow, xbow2)

True

In [None]:
#@title Version 3: Use SoftMax

tril = torch.tril(torch.ones(T, T))  # lower triangular ones
weights = torch.zeros((T, T))
weights = weights.masked_fill(tril == 0, float('-inf'))  # Upper triangular part will become -inf <=> The future cannot communicate with the past
print(weights)
weights = F.softmax(weights, dim=-1)
print(weights)
xbow3 = weights @ x
print(torch.allclose(xbow, xbow3))

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
True


In [None]:
#@title Version 4: Self-attention
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# Let's see a single Head perorm self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)  # "what do I contain"
query = nn.Linear(C, head_size, bias=False)  # "what am I looking for" = search request in a database
value = nn.Linear(C, head_size, bias=False)

k = key(x)  # (B, T, head_size=16)
q = query(x)  # (B, T, head_size=16)
weights = q @ k.transpose(-2, -1)  # (B, T, head_size) @ (B, head_size, T) -> (B, T, T)

tril = torch.tril(torch.ones(T, T))  # lower triangular ones
# weights = torch.zeros((T, T))
weights = weights.masked_fill(tril == 0, float('-inf'))  # Upper triangular part will become -inf <=> The future cannot communicate with the past
# We don't need prev. line for encoder
weights = F.softmax(weights, dim=-1)

v = value(x)
out = weights @ v  # (B, T, head_size)
print(out.shape)

torch.Size([4, 8, 16])


In [None]:
weights[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3205, 0.6795, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5622, 0.2009, 0.2369, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2093, 0.1084, 0.3560, 0.3263, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0163, 0.1768, 0.7661, 0.0179, 0.0228, 0.0000, 0.0000, 0.0000],
        [0.0161, 0.3655, 0.0394, 0.1056, 0.0365, 0.4368, 0.0000, 0.0000],
        [0.0450, 0.2589, 0.0459, 0.1734, 0.0923, 0.0251, 0.3595, 0.0000],
        [0.2048, 0.1150, 0.0428, 0.1056, 0.0518, 0.0072, 0.3571, 0.1158]],
       grad_fn=<SelectBackward0>)

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.

  - Token: Arrows pointing to it
    - 1: 1
    - 2: 1, 2
    - 3: 1, 2, 3
    - 4: 1, 2, 3, 4
    - etc.

- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other.
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

In [None]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
weights = q @ k.transpose(-2, -1)
print(k.var(), q.var(), weights.var())
weights = q @ k.transpose(-2, -1) * head_size**-0.5
print(k.var(), q.var(), weights.var())

tensor(0.9661) tensor(1.0400) tensor(15.9833)
tensor(0.9661) tensor(1.0400) tensor(0.9990)


The variance is preserved.

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

The softmax starts to sharpen towards the highest number, i.e. it will converge towards one-hot encoding quickly (even at initialization).

## LayerNorm

In [None]:
class BatchNorm1d:

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # Parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # Buffers (trained with a running 'momentum update')
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    # Calculate the forward pass
    if self.training:
      xmean = x.mean(0, keepdim=True)  # Batch mean
      xvar = x.var(0, keepdim=True)  # Batch variance
    else:
      xmean = self.running_mean
      xvar = self.running_var
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # Normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    # Update the buffers (Exponential moving average)
    if self.training:
      with torch.no_grad():
        self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

In [None]:
module = BatchNorm1d(100)
x = torch.randn(32, 100)
x = module(x)
print(x.shape)
print(x[:,0].mean(), x[:,0].std())  # mean, std of one feature across all batch inputs
print(x[0,:].mean(), x[0,:].std())  # mean, std of a single input from the batch, of its features

torch.Size([32, 100])
tensor(2.9802e-08) tensor(1.0000)
tensor(-0.0935) tensor(1.0192)


In [None]:
class LayerNorm1d:  # (used to be BatchNorm1d)

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):  # calculate the forward pass
    # just change 0 to 1
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)  # normalize rows to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
print(x.shape)
print(x[:,0].mean(), x[:,0].std())
print(x[0,:].mean(), x[0,:].std())

torch.Size([32, 100])
tensor(0.1469) tensor(0.8803)
tensor(-9.5367e-09) tensor(1.0000)


# GPT: Whole code

In [40]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import json

# Hyperparameters
batch_size = 64  # How many independent sequences will we process in parallel?
block_size = 256  # What is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4  # bigger NN => smaller learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embed = 384  # embedding dimension
n_head = 6  # 384/6 = 64 => each head will be 64-dimensional
n_layer = 6
dropout = 0.2

# ------------

torch.manual_seed(123)

def read_json_songs_to_str(filename='mkd_songs.json'):
    with open(filename, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
text = read_json_songs_to_str('mkd_songs_processed.json')

# Vocabulary (all the unique characters that occur in this text)
chars = sorted(list(set(text)))
vocab_size = len(chars)
print('Vocabulary size:', vocab_size)
# Encoder/Decoder: Create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda x: [stoi[c] for c in x]  # Encoder: take a string, output a list of integers
decode = lambda x: ''.join([itos[i] for i in x])  # Decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))  # The first 90% will be train, the rest val
train_data = data[:n]
val_data = data[n:]

# Data loader
def get_batch(split):
    # Generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    '''one head of self-attention'''
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))  # to add a variable tril to the model
        self.dropout = nn.Dropout(dropout)  # to prevent some of the nodes from randomly communicating

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)  # (B, T, C)
        q = self.query(x)  # (B, T, C)
        v = self.value(x)   # (B, T, C)
        # Compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C**(-0.5)  # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # decoder block
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # Perform the weighted aggregation of the values
        out = wei @ v  # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    ''' multiple heads of self-attention in parallel '''
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size=head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # concatenate the outputs of all heads that run in parallel
        out = self.proj(out)  # linear projection
        out = self.dropout(out)
        return out

class FeedForward(nn.Module):
    ''' a simple linear layer followed by non-linearity'''
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),  # 4 times the embedding size, as in the original transformer
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),  # projection
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    ''' A transformer block: communication followed by computation '''
    def __init__(self, n_embed, n_head):
        # n_embed: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(num_heads=n_head, head_size=head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        # We will apply Layer norm before passing through the corresponding module
        # (this is a deviation from the original paper but it is more common now)
        x = x + self.sa(self.ln1(x))  # Residual connection
        x = x + self.ffwd(self.ln2(x))  # Residual connection
        return x

class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # Each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)  # (V, C)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)  # (T, C)
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=4) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed)  # final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)  # (C, V)

    def forward(self, idx, targets=None):
        # idx and targets are both (B, T) tensor of integers
        B, T = idx.shape
        token_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))  # (T, C)
        x = token_emb + pos_emb  # (B, T, C) + (T, C) -> (B, T, C)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, C)
        # logits are the "probs" for the next character
        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B,T) array of indices in the current context
        # we want to predict the next token(s), given the context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)  # calls forward
            # focus only on the last time step (retrieve the last element in the time dimension)
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, (T+1))
        return idx
        # This function at the moment is kinda silly, because
        # we pass a lot of  history, but use just the last character.
        # We do this because we will reuse it for the following models.

model = GPTLanguageModel().to(device)

# Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(500*'-')
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # Generate from the model
        context = torch.zeros((1, 1), dtype=torch.long, device=device)
        print(decode(model.generate(context, max_new_tokens=100)[0].tolist()))

    # Sample a new batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=1000)[0].tolist()))

Vocabulary size: 68
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
step 0: train loss 4.4301, val loss 4.4271

орљЧожИнЃтлпќПќ?ОпнЦЏЌ!,вј-ЖСЈРУр:жГцосхшаИЃЧљНх!пфПЈАЏЛѓѕЉвзГ.ѓЗ дУубдзињАлжАЛџРЧИиљсмбСИшСаЈ!ЃКџрЧ
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Up to 1500 steps training is going well, after that the model starts to overfit.

Vocabulary size: 68


1. step 0: train loss 4.4301, val loss 4.4271

  орљЧожИнЃтлпќПќ?ОпнЦЏЌ!,вј-ЖСЈРУр:жГцосхшаИЃЧљНх!пфПЈАЏЛѓѕЉвзГ.ѓЗ дУубдзињАлжАЛџРЧИиљсмбСИшСаЈ!ЃКџрЧ

2. step 500: train loss 2.2438, val loss 2.3122

  Суна е памат наа Анице мерџа ефан, Довеси,\
  Бе пофтефе со омле,\
  Ле јт оф пчет, ме прелино спуша мо,\
  Д

3. step 1000: train loss 1.6628, val loss 1.8578

  амченах врах на на пратлихнавт,\
  по не мела платнегне,\
  Вда прртугнави водини!\
  Захо по Стргна пратен у

4. step 1500: train loss 1.2045, val loss 1.6788

  покуси грло\
  Депчеха пољусна,\
  Ми гдемало либе\
  Двајца не му да и преп.\
  Ајде дали му\
  Кој ти славај мајк

5. step 2000: train loss 0.8328, val loss 1.7946

  тор на црне, горо,\
  земе на е братко, плака,\
  Всега маки.\
  Свеко да братко, почи и три минцови\
  а гради

6. step 2500: train loss 0.5163, val loss 2.0596

  идаф крлети испрвта,\
  О твојата слзи будухте,\
  грмно и пророј, шуми раниот!\
\
  Шумот је бил, прикале\
  ен

7. step 3000: train loss 0.2887, val loss 2.3827

  -полни стегнал,\
  светат маки души в таурчи,\
  браќа цел на срце жнее\
  от ради в портење.\
\
  Секоја немаш с\

8. step 3500: train loss 0.1740, val loss 2.7028

  рамна ке те излажам\
  за верите славја\
  на моево врваве\
  стаха му на Тори чедо.\
  Слушај, синко, пушка сја

9. step 4000: train loss 0.1267, val loss 2.9846

  Лика си ја да си ја дојдам.\
  Што не слушај, пофа ќе си ја ме\
  Македонски ја зарпот тамба.\
  Тагине от ер

10. step 4500: train loss 0.1048, val loss 3.1717

  милома мома мамо ино за до ме нападар,\
  чекај ми е мило моме,\
  бегај ми е мило да коњче,\
  Оф аман, оф а


Post-training results (5000 tokens):

  Со си зелени, мајкин стресојте,\
  Отвој ми леле, севдо мори\
  мавме клета.\  
  Ајде купијам извадина\
  вадиони шета ми долж да коленам\
  без леб да вратиш лице?\
  Ајде едестраш ли гори,\
  удрел еден брег на широка.\  
  Не ми реме сакаш бре,\
  сака да си растрмаш\
  улав ти род кара...\
  Оште и вликиот \
  Наљути мајкиот морена,\
  настанах, настах\
  иродни на душата\
  и см жени мја даде\
  салнуто си мајка\
  туку, мајко ќе назвезе\
  женеки по тагата потегне\
  и белки да го тагата неме.\
\
  Ако умра им има имат ми иди\
  срце што животини,\
  не што се да одават,\
  над тага да драгат\
  и на утринта ружи свет\
  да не радост потегната!\
\
  Ста ме, од викаш, одавна доба\
  на чедено утро да оди\
  дод грабит денот мои.\
  И ке ке чува цути не ми е?\
  Зар велит: бујнак да бегат,\
  и ли бегат, мили веднаш, зар бегат,\
  радо ви примкна падни силна,\
  душа да буриче да врлика.\
  Сега нек ога светит:\
\
  Стрите тешки Боже светели Маре Помоноса,\
  Го дружи сестрит ние.\
  Кога виде секо црна, коња јачи\
  Да преку сака се стоит снитите \
  а мојте лудо ни срце\
  и дружи што ми градини\
  как\