# Day 26 - Introducing generative models

#### This chapter covers

* An explanation of the text generation problem.
* An introduction to unsupervised learning.
* Learning structure using the attention mechanism.
* Building up from simple probabilistic models to deep learning models.
* The transformer architecture and its variants and applications.

## A motivating example: generating names character by character

* We need to be able to map between characters and integers
* We shall use the special character $\$$ to denote the beginning and the end of a name

In [1]:
# Vocabulary: "$" (index 0) marks both the start and the end of a name and is
# followed by the 26 lowercase ASCII letters.
vocab = "$abcdefghijklmnopqrstuvwxyz"
vocab_size = len(vocab)

# Two-way lookup tables between characters and their integer ids.
i_to_ch = dict(enumerate(vocab))
ch_to_i = {ch: i for i, ch in i_to_ch.items()}

* A crude first step is uniform random sampling

In [2]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
from torch import nn
import torch.nn.functional as F

# Equal logits give every vocabulary entry, including the "$" terminator at
# index 0, the same sampling probability.
uniform_probs = F.softmax(torch.ones(vocab_size), dim=0)
for i in range(5):
    letters = []
    # Keep drawing characters until the terminator (index 0) comes up.
    while (drawn := torch.multinomial(uniform_probs, 1).item()) != 0:
        letters.append(i_to_ch[drawn])
    generated = "".join(letters)
    print(f"Name {i+1}:", generated)

Name 1: zpuevmmykoejo
Name 2: jrrttmmywahjrqunqsmzklhixawlsobgbmzbcvcdc
Name 3: nxpsackyde
Name 4: vcujbckryarjmvbcoangkghyrxgijizoiuvjimeffjnqhgodvkkqjgmcfnkuuytienkvvuxprfaxevfzz
Name 5: pzjdwynriegdgwmhurloyjwehafhcinojtiotq


* These names are not particularly good

## Self-supervised learning

* We steal the [names of American children](https://www.ssa.gov/oact/babynames/names.zip)

In [3]:
# Load the name records. Every non-empty line must consist of exactly three
# comma-separated fields; only the first one (the name) is kept, lower-cased
# and wrapped in "$" start/end markers.
names = []
# An explicit encoding keeps the result independent of the platform default.
with open('./DLPT/data/text/names/yob2023.txt', 'r', encoding='utf-8') as file:
    for line in file:
        record = line.lower().strip()
        if not record:
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing in the three-way unpacking below.
            continue
        name, _, _ = record.split(',')
        names.append("$" + name + "$")
len(names), names[:3], names[-3:]

(31682,
 ['$olivia$', '$emma$', '$charlotte$'],
 ['$zymirr$', '$zyquan$', '$zyrin$'])

* We can now calculate the frequencies of all bigrams

In [4]:
# Count how often each character is directly followed by each other character.
# Rows index the first character of a bigram, columns the second.
bigram = torch.zeros((vocab_size, vocab_size))
total = 0
for name in names:
    for first, second in zip(name[:-1], name[1:]):
        bigram[ch_to_i[first], ch_to_i[second]] += 1
        total += 1
# Turn the raw counts into relative frequencies over all observed bigrams.
bigram = bigram / total

* And then, we can sample according to these, instead of the uniform distribution

In [5]:
# Sample five names from the bigram table: start at "$", repeatedly draw the
# next character from the row of the current one, stop when "$" is drawn again.
boundary_idx = ch_to_i["$"]
for i in range(5):
    current_idx = boundary_idx
    letters = []
    while True:
        # torch.multinomial treats the row as (unnormalised) sampling weights.
        current_idx = torch.multinomial(bigram[current_idx], 1).item()
        if current_idx == boundary_idx:
            break
        letters.append(i_to_ch[current_idx])
    generated = "".join(letters)
    print(f"name {i}: {generated}")

name 0: josondr
name 1: zirio
name 2: cendeonsuglunahlessemiam
name 3: kh
name 4: m


* These are names

## Generating our training data

In [6]:
example_name = "$ada$"


def encode(word):
    """Map a string to a 1-D integer tensor of vocabulary indices.

    Raises KeyError for any character that is not a key of ``ch_to_i``.
    """
    # The explicit dtype keeps the result an integer tensor even for an empty
    # word (torch.tensor([]) would otherwise default to a float tensor).
    return torch.tensor([ch_to_i[c] for c in word], dtype=torch.long)


def decode(tensor_i):
    """Map an iterable of scalar index tensors back to the string they encode."""
    return ''.join(i_to_ch[i.item()] for i in tensor_i)


print(encode(example_name))
print(decode(encode(example_name)))

# Inputs are the full "$name$" index sequences; targets are the same sequences
# shifted left by one position, so position t is paired with character t + 1.
name_indices = [encode(name) for name in names]
target_indices = [name_index[1:] for name_index in name_indices]

tensor([0, 1, 4, 1, 0])
$ada$


In [7]:
from torch.nn.utils.rnn import pad_sequence

# Inputs: every name right-padded with 0 up to the longest sequence.
X = pad_sequence(name_indices, batch_first=True, padding_value=0)
max_name_length = max(len(name) for name in names)
# Every target is one element shorter than its input, so padding the targets
# on their own would yield a narrower matrix. A throw-away row of full length
# forces the padded width to max_name_length; it is sliced off again right
# away. It is passed in a fresh list so that target_indices is not modified.
width_anchor = torch.zeros(max_name_length, dtype=torch.long)
Y = pad_sequence([*target_indices, width_anchor], batch_first=True,
                 padding_value=-1)[:-1]
print(X[0])
print(Y[0])

tensor([ 0, 15, 12,  9, 22,  9,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])
tensor([15, 12,  9, 22,  9,  1,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])


In [8]:
def get_batch(batch_size=64):
    """Return a random mini-batch ``(inputs, labels)`` taken from ``X`` and ``Y``.

    Row indices are drawn with ``torch.randint``, so the same row may occur
    more than once in a batch. Both tensors are moved to the global ``device``.
    """
    rows = torch.randint(0, X.size(0), (batch_size,))
    return tuple(
        table[rows].to(device=device, non_blocking=True) for table in (X, Y)
    )

In [9]:
# Peek at a tiny batch: the inputs first, then the matching labels.
inputs, labels = get_batch(3)
print(inputs, labels, sep="\n")

tensor([[ 0,  3,  8,  1, 18, 22,  9, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  5, 13, 13,  1, 12, 25, 14, 14,  5,  0,  0,  0,  0,  0,  0,  0],
        [ 0, 13,  9, 14,  4,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
       device='cuda:0')
tensor([[ 3,  8,  1, 18, 22,  9, 11,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 5, 13, 13,  1, 12, 25, 14, 14,  5,  0, -1, -1, -1, -1, -1, -1, -1],
        [13,  9, 14,  4,  1,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]],
       device='cuda:0')


## Embeddings and multi-layer perceptrons

* PyTorch provides the `Embedding` module, which simplifies the embedding process

In [10]:
# A lookup table holding one trainable 3-dimensional vector per vocabulary entry.
embedding_dim = 3
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Index 1 appears twice, so the first two result rows are the same vector.
example_input = torch.tensor([1, 1, 0, 2])
input_embd = embedding(example_input)
print(input_embd.size())
input_embd

torch.Size([4, 3])


tensor([[ 1.3064,  1.4244,  1.4227],
        [ 1.3064,  1.4244,  1.4227],
        [ 0.3055, -0.7097, -0.9754],
        [ 0.4251, -0.0573,  0.6399]], grad_fn=<EmbeddingBackward0>)

In [11]:
class SequenceMLP(nn.Module):
    """MLP that scores the next token from the zero-padded prefix seen so far.

    For every position ``i`` of the input, the tokens ``0..i`` are placed in a
    buffer of ``max_sequence_length`` slots (remaining slots hold index 0),
    embedded, flattened and passed through ``Linear -> ReLU -> Linear``.

    Args:
        vocab_size: number of distinct token ids; also the size of the output.
        max_sequence_length: number of slots in the prefix buffer.
        embedding_dim: size of each token embedding.
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, vocab_size, max_sequence_length,
                 embedding_dim, hidden_dim=32):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_sequence_length = max_sequence_length
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim * max_sequence_length,
                                hidden_dim)
        self.relu = nn.ReLU()
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(batch, seq_len, vocab_size)``.

        Args:
            x: integer tensor of token ids with shape ``(batch, seq_len)``.

        Raises:
            ValueError: if ``seq_len`` exceeds ``max_sequence_length``.
        """
        device = self.linear.weight.device
        batch_size, seq_len = x.shape
        max_len = self.max_sequence_length
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds "
                f"max_sequence_length {max_len}"
            )

        # Right-pad the token ids with 0 up to the buffer size.
        padded = torch.zeros(batch_size, max_len, dtype=torch.long,
                             device=device)
        padded[:, :seq_len] = x.to(device=device, dtype=torch.long)

        # keep[i, j] is True when slot j belongs to the prefix ending at
        # position i; all later slots are reset to the padding index 0.
        slots = torch.arange(max_len, device=device)
        ends = torch.arange(seq_len, device=device)
        keep = slots.unsqueeze(0) <= ends.unsqueeze(1)
        prefixes = padded.unsqueeze(1) * keep  # (batch, seq_len, max_len)

        # One embedding lookup for all prefixes instead of one per position.
        emb = self.embedding(prefixes)  # (batch, seq_len, max_len, emb_dim)
        sequence_embeddings = emb.view(batch_size, seq_len,
                                       max_len * self.embedding_dim)

        x = self.linear(sequence_embeddings)
        x = self.relu(x)
        x = self.out(x)
        return x

In [12]:
embedding_dim = 3
# The width of the padded input matrix is the longest sequence the model sees.
max_sequence_length = X.size(1)
model = SequenceMLP(
    vocab_size=vocab_size,
    max_sequence_length=max_sequence_length,
    embedding_dim=embedding_dim,
).to(device=device)

In [13]:
import torch.optim as optim
from tqdm.auto import tqdm


def train(model, optimizer, num_steps=10_001, loss_report_interval=1_000):
    """Train ``model`` on random mini-batches obtained from ``get_batch``.

    Args:
        model: module mapping a batch of inputs to logits whose last dimension
            indexes the classes.
        optimizer: optimizer that updates the parameters of ``model``.
        num_steps: exclusive upper bound of the step counter, which starts at
            1; ``num_steps - 1`` updates are performed.
        loss_report_interval: every this many steps the mean loss over the
            most recent ``loss_report_interval`` steps is printed.
    """
    losses = []
    for i in tqdm(range(1, num_steps), desc="Epochs"):
        inputs, labels = get_batch()
        optimizer.zero_grad()
        logits = model(inputs)
        # Flatten all positions into one axis; label -1 marks padding and is
        # excluded from the loss.
        loss = F.cross_entropy(logits.view(-1, logits.shape[-1]),
                               labels.view(-1), ignore_index=-1)
        losses.append(loss.item())
        if i % loss_report_interval == 0:
            # Computed outside the f-string: a replacement field spread over
            # several lines needs Python 3.12+, and a newline after the format
            # spec is not handled consistently across interpreter releases.
            average_loss = (sum(losses[-loss_report_interval:])
                            / loss_report_interval)
            print(f'Average loss at step {i}: {average_loss:.4f}')
        loss.backward()
        optimizer.step()


# Adam over all parameters of the current model, with a learning rate of 1e-2.
optimizer = optim.Adam(model.parameters(), lr=1e-2)

In [14]:
# 100,000 update steps (the step counter stops before num_steps), with the
# running average loss printed every 10,000 steps.
train(model, optimizer, num_steps=100_001, loss_report_interval=10_000)

Epochs:   0%|          | 0/100000 [00:00<?, ?it/s]

Average loss at step 10000: 2.3325
Average loss at step 20000: 2.2936
Average loss at step 30000: 2.2839
Average loss at step 40000: 2.2821
Average loss at step 50000: 2.2802
Average loss at step 60000: 2.2779
Average loss at step 70000: 2.2767
Average loss at step 80000: 2.2739
Average loss at step 90000: 2.2736
Average loss at step 100000: 2.2729


In [15]:
def generate_samples(model, num_samples=1, max_len=max_name_length):
    """Sample sequences from ``model`` token by token and print them decoded.

    Every sequence starts with token 0. At each of the ``max_len`` steps the
    model is run on the sequences so far and the next token is drawn from the
    softmax of the logits at the last position. Each sequence is printed up to
    (not including) its first sampled token 0, or in full if none was sampled.

    Args:
        model: module mapping ``(batch, seq)`` token ids to
            ``(batch, seq, vocab)`` logits.
        num_samples: number of sequences to generate.
        max_len: number of tokens sampled per sequence. The default is the
            value ``max_name_length`` had when this function was defined.
    """
    # Build the seed tokens on the model's device so that the model does not
    # have to be moved to the CPU first.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else "cpu"
    sequences = torch.zeros((num_samples, 1), dtype=torch.long, device=device)

    # Sampling needs no gradients; skipping autograd saves time and memory.
    with torch.no_grad():
        for _ in range(max_len):
            logits = model(sequences)[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            sequences = torch.cat((sequences, idx_next), dim=1)

    for sequence in sequences.cpu():
        # Position 0 is always the start token, so a second zero (if any) is
        # the sampled terminator.
        terminators = torch.where(sequence == 0)[0]
        # Without a terminator keep all max_len sampled tokens; the sequence
        # holds max_len + 1 entries, so slicing up to max_len would drop one.
        end = terminators[1] if len(terminators) > 1 else len(sequence)
        print(decode(sequence[1:end]))

In [16]:
# Move the model to the CPU and print ten sampled names.
generate_samples(model.to(device="cpu"), num_samples=10)

ruviah
nadar
zoriah
errly
assisteh
maillen
modyohah
koviane
luenza
lenian


## Attention

* Attention can be used as a mechanism to adjust the embedding of a token based on the surrounding context

### Dot Product Self-attention

* A self-attention block, denoted $\text{sa}[\cdot]$, takes $N$ inputs $x_1, x_2, \dots, x_N$, each of dimension $D$
* $Values$ are a simple, linear transformation of the inputs
* They can thus be calculated by applying a `Linear(D, D)` layer to them
* The $n$-th output of the self-attention block, $\text{sa}_n[x_1, \dots, x_N]$, is a weighted sum of the input values

$$
\text{sa}_n\left[x_1, \dots, x_N\right]=\sum_{m=1}^{N}a\left[x_m,x_n\right]v_m
$$
* Here, $a[x_m, x_n]$ is a learned weight function
* This means that self-attention transforms each token in a sequence into a weighted average of all tokens
* To compute $a$, we generate a $query$ and a $key$ for each token, which are both simply `Linear(D, D)` transformations again
* For $a[x_m, x_n]$, we take the dot products of $q_n$ with all keys $k_1, \dots, k_N$, apply a softmax over these $N$ scores, and read off the entry that belongs to $k_m$

$$
a[x_m, x_n]=\operatorname{softmax}_m\left[q_n\cdot k_\bullet\right]=\frac{\exp\left(q_n\cdot k_m\right)}{\sum_{m'=1}^N\exp\left(q_n\cdot k_{m'}\right)}
$$

In [17]:
# Fixed seed so the example is reproducible: two tokens with three features each.
torch.manual_seed(0)
x = torch.rand(2, 3)

# Three independent random affine maps of x. The generator is consumed in
# order, so the random draws happen for query, key, value in turn, each time
# the weight before the bias.
query, key, value = (
    F.linear(x, weight=torch.rand(3, 3), bias=torch.rand(3))
    for _ in range(3)
)

In [18]:
def dot_product_attention_single(q, k, v):
    """Unscaled, unmasked dot-product attention for one unbatched sequence.

    Each output row is a softmax-weighted combination of the rows of ``v``,
    weighted by the dot products of the matching row of ``q`` with the rows
    of ``k``.
    """
    scores = torch.matmul(q, k.T)
    weights = scores.softmax(dim=-1)
    return torch.matmul(weights, v)

In [19]:
# Apply the attention function to the random query/key/value example.
dot_product_attention_single(query, key, value)

tensor([[1.3667, 0.6913, 1.0614],
        [1.3718, 0.6930, 1.0627]])

### Scaled dot product causal self-attention

* We can extend this attention mechanism by handling batching, causal masking (hiding the future), and scaling the weights to avoid vanishing and exploding gradients

In [20]:
# Fixed seed for reproducibility; x now carries a leading batch axis of size 1.
torch.manual_seed(0)
x = torch.rand((1, 2, 3))

In [21]:
# Three independent random affine maps of the batched x. The generator is
# consumed in order, so the random draws happen for query, key, value in turn,
# each time the weight before the bias.
query, key, value = (
    F.linear(x, weight=torch.rand(3, 3), bias=torch.rand(3))
    for _ in range(3)
)

In [22]:
def scaled_dot_product_causal_attention(q, k, v):
    """Scaled dot-product attention with a causal (lower-triangular) mask.

    Args:
        q: queries of shape ``(..., L, D)``.
        k: keys of shape ``(..., S, D)``.
        v: values of shape ``(..., S, Dv)``.
        Any number of leading batch dimensions is accepted.

    Returns:
        A tuple ``(output, attn_weights)``: ``output`` has shape
        ``(..., L, Dv)`` and ``attn_weights`` has shape ``(..., L, S)``, where
        row ``i`` only puts weight on positions ``j <= i``.
    """
    # Transposing the last two axes (rather than axes 1 and 2) keeps this
    # correct for inputs with extra leading dimensions, e.g. a head axis.
    scores = q @ k.transpose(-2, -1)
    # The mask is created on the same device as the scores so the function
    # also works for tensors that do not live on the default device.
    mask = torch.tril(torch.ones(scores.shape[-2:], device=scores.device),
                      diagonal=0)
    # -inf scores become exactly zero weight after the softmax.
    scores = scores.masked_fill(mask == 0, value=float('-inf'))
    # Divide by sqrt(D) so the scores do not grow with the key dimension.
    scores = scores / (k.shape[-1] ** 0.5)
    attn_weights = F.softmax(scores, dim=-1)
    output = attn_weights @ v
    return output, attn_weights

In [23]:
# Run the causal attention on the batched example and show the output.
output, attn_weights = scaled_dot_product_causal_attention(query, key, value)
output

tensor([[[1.6253, 0.7788, 1.1252],
         [1.3849, 0.6974, 1.0659]]])

* Of course, there is a native PyTorch solution for this

In [24]:
# Cross-check the hand-written result against PyTorch's built-in causal attention.
expected_output = F.scaled_dot_product_attention(query, key, value, is_causal=True)
print(torch.allclose(output, expected_output))

True


* Using this new attention mechanism, we can hopefully generate better names

In [25]:
class AttentionMLP(nn.Module):
    """Token embedding -> causal self-attention -> MLP -> vocabulary logits.

    Args:
        n_embd: size of the token embeddings and of the q/k/v projections.
        vocab_size: number of distinct token ids; also the size of the output.
        block_size: longest sequence length the causal mask covers.
        n_hidden: width of the hidden layer of the MLP.

    Attributes:
        attn_weights: attention weights of the most recent forward pass, shape
            ``(batch, seq_len, seq_len)``; ``None`` before the first call.
    """

    def __init__(self, n_embd, vocab_size, block_size, n_hidden=64):
        super().__init__()
        self.tok_embd = nn.Embedding(vocab_size, n_embd)
        self.attn_weights = None

        self.query_proj = nn.Linear(n_embd, n_embd)
        self.key_proj = nn.Linear(n_embd, n_embd)
        self.value_proj = nn.Linear(n_embd, n_embd)

        # Lower-triangular causal mask. It is registered as a buffer, so it
        # follows the module through .to()/.cuda(); no module-level device
        # variable is needed to construct it.
        self.register_buffer("mask", torch.tril(torch.ones(
            (block_size, block_size)), diagonal=0))

        self.mlp = nn.Sequential(
            nn.Linear(n_embd, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_embd)
        )

        self.output_proj = nn.Linear(n_embd, vocab_size)

    def forward(self, x):
        """Return logits of shape ``(batch, seq_len, vocab_size)``.

        Args:
            x: integer tensor of token ids with shape ``(batch, seq_len)``.

        Raises:
            ValueError: if ``seq_len`` exceeds the ``block_size`` the mask was
                built for.
        """
        x = self.tok_embd(x)
        seq_len = x.shape[1]
        block_size = self.mask.shape[0]
        if seq_len > block_size:
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size {block_size}"
            )

        q = self.query_proj(x)
        k = self.key_proj(x)
        v = self.value_proj(x)

        attn_weights = q @ k.transpose(1, 2)
        # Hide future positions: -inf scores become zero weight after softmax.
        attn_weights = attn_weights.masked_fill(
            self.mask[:seq_len, :seq_len] == 0, value=float('-inf'))
        # Scale by sqrt(d_k); a plain Python float avoids allocating a tensor
        # on every call.
        attn_weights = attn_weights / (k.shape[-1] ** 0.5)
        # Kept on the module so the weights can be inspected after the call.
        self.attn_weights = F.softmax(attn_weights, dim=-1)
        x = self.attn_weights @ v
        x = self.mlp(x)

        x = self.output_proj(x)
        return x

In [26]:
# Build the attention model with 32-dimensional embeddings, using the longest
# name length as block size, and train it with the same schedule as before.
model = AttentionMLP(
    n_embd=32,
    vocab_size=vocab_size,
    block_size=max_name_length,
).to(device=device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-2)
train(model, optimizer, num_steps=100_001, loss_report_interval=10_000)

Epochs:   0%|          | 0/100000 [00:00<?, ?it/s]

Average loss at step 10000: 2.2593
Average loss at step 20000: 2.2265
Average loss at step 30000: 2.2205
Average loss at step 40000: 2.2178
Average loss at step 50000: 2.2173
Average loss at step 60000: 2.2161
Average loss at step 70000: 2.2145
Average loss at step 80000: 2.2145
Average loss at step 90000: 2.2139
Average loss at step 100000: 2.2149


In [27]:
# Move the attention model to the CPU and print ten sampled names.
generate_samples(model.to(device="cpu"), 10)

pema
distran
tarianta
narayah
analinalah
jianah
blacalyen
ronen
uuzre
kairzo
