In [3]:
import torch
import torch.nn as nn
from datasets import load_dataset

dataset = load_dataset("nampdn-ai/tiny-lessons")

%reload_ext autoreload
%autoreload 2

dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'source', 's', 'len', 'idx', 'textbook'],
        num_rows: 20000
    })
})

In [4]:
train = dataset["train"]

# Vocabulary

Here we create word tokens for simplicity

In [5]:
from tiktoken import get_encoding

# Using GPT2 tokenizer
tokenizer = get_encoding("gpt2")

def encode(batch):
    batch["token"] =  [tokenizer.encode(x) for x in batch["text"]]
    return batch

train_ds = train.map(encode, batched=True)
train_ds

Dataset({
    features: ['text', 'source', 's', 'len', 'idx', 'textbook', 'token'],
    num_rows: 20000
})

In [6]:
print(train_ds[0]["text"])
print("-"*20)
print(train_ds[0]["token"])

By the end of 2014, we plan to revamp our health education and disease prevention. program, which was suspended in September 2013. We will conduct health education. sessions at the community centers, group sessions for specific conditions (Diabetes,. Hypertension, etc.), as well as community forums and health fair. We will recruit. health agents to conduct needs assessment in our area of intervention, identify. the health and economic needs, educate and encourage people to access preventive. services, such as vaccination, prenatal care, and health maintenance for chronic. diseases.. The health agents will also identify pregnant women, newborns, infants with high risk of malnutrition to help them obtain proper care, vaccination, food assistance, etc. They will also identify people with certain conditions such as HIV, Tuberculosis, and Malaria to help them access available services.
--------------------
[3886, 262, 886, 286, 1946, 11, 356, 1410, 284, 2710, 696, 674, 1535, 3707, 290, 4369

# Transformer Decoder

## Config

In [7]:
n_vocab = tokenizer.n_vocab
context = 32

n_vocab

50257

## Architecture

We will use a single attention layer for this example

In [8]:
class SmallDecoderTransformer(nn.Module):
    def __init__(self):
        super().__init__()

        self.embedding = nn.Embedding(n_vocab, 20)

        self.position_embedding = nn.Embedding(context, 20) # context, 20
        # We limit the positional encoding to 100 which means that we cannot encode more than 100 tokens
        # In the paper they use a mix of sin/cos but they also show that we can learn positional encoding
        # Thus we train it instead

        # We train query/key/value to be able to learn the best representation of the tokens
        self.query = nn.Linear(20, 20)
        self.key = nn.Linear(20, 20)
        self.value = nn.Linear(20, 20)

        # Our mapping from the embedding dimension to the vocabulary size
        # Will be used to 'decode' the embedding into token ids
        self.fc = nn.Linear(20, n_vocab)
        
        # Dropout for optimal training
        self.dropout = nn.Dropout(0.1)
        self.norm = nn.LayerNorm(20)

    def forward(self, x):
        positions = torch.arange(x.shape[0], device=x.device)

        # x is the tokenized text
        # Get the embedding from the tokenized text
        # We are not using any batch size in this example
        x = self.dropout(self.norm(self.embedding(x) + self.position_embedding(positions))) # (T, E)

        # T is the length of the tokenized text
        # E is the embedding size

        # Get the query, key and value
        q = self.query(x) # (T, E)
        k = self.key(x) # ...
        v = self.value(x) # ...

        # Query will train and will represent what the current token 't' is looking for
        # Key will represent the current token 't' and what it is
        # Value will represent the current token 't' and what it is worth

        energy = q @ k.T # (T, T)
        # Energy represents the similarity between the query and the key

        # The dot product can be very large. Thus we divide by the square root of the embedding size to make the result smaller / normalized
        energy /= torch.sqrt(torch.tensor(20.).float())

        # We want to mask the energy so that we do not look at the future tokens
        mask = torch.tril(torch.ones(energy.shape, device=x.device)) # Triangular lower (All zeros above the diagonal / right)
        energy.masked_fill_(mask == 0, float("-inf")) # Replace mask zeros with -inf 🤔

        # When we apply a softmax, exp(-inf) = 0 making all upper right values in the tril be 0s
        attention = energy.softmax(dim=-1) @ v # (T, E)
        # We increase the similarities by the values of the tokens
        # This will make the tokens that are similar to the query have a higher value

        x = self.fc(attention) # (T, n_vocab)
        # Back to the vocabulary size


        # The result isn't softmax normalized
        return x

In [9]:
import torch.optim as optim

model = SmallDecoderTransformer().cuda()

In [10]:

def train(model, epochs=1):
    losses = []

    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    
    model.train()
    for _ in range(epochs):
        # Create a small train loop
        for i, data in enumerate(train_ds.with_format("pt")):
            tokens = data["token"]
            # We need to have at least context tokens.

            # We only take context tokens since that is our limit
            space = tokens.shape[0] - 1 - context

            # For example purposes, we will exclude less than context tokens
            if space <= 0:
                continue

            start_token = torch.randint(space, (1,)).item()

            # Select random part of sentence
            tokens = tokens[start_token:start_token+context+1].cuda()
            # Offset by 1
            targets = tokens[1:]
            tokens = tokens[:-1]

            optimizer.zero_grad()
            logits = model(tokens)
            loss = nn.functional.cross_entropy(logits, targets)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            if i % 5000 == 0 and i != 0:
                print("L=", sum(losses) / len(losses))
                losses.clear()

In [14]:
train(model, 5)

L= 8.219899945105583
L= 7.8757515724182126
L= 7.72456556763649
L= 7.548945513606071
L= 7.444138275671005
L= 7.426736467015743
L= 7.375375216209888
L= 7.327864888477325
L= 7.344251172828674
L= 7.320551260489226
L= 7.298398310899734
L= 7.309708138847351
L= 7.293686679601669
L= 7.271412104320526
L= 7.293289126253128


In [17]:
from collections import deque

past_tokens = deque(maxlen=context)

text_idx = 3

# With the current structure, we need to start with at least 100 tokens
sample_tokens = train_ds.with_format("pt")["token"][text_idx]

# Fill the queue with the first 100 tokens
past_tokens.extend(sample_tokens[:context].tolist())

print(sample_tokens.shape)

torch.Size([183])


In [31]:
from time import sleep

model.eval()
print(tokenizer.decode(past_tokens), end="")
for _ in range(100):
    # print(past_tokens)
    logits = model(torch.tensor(past_tokens).cuda())

    next_token_logits = logits[-1]

    probs_a = next_token_logits.softmax(dim=-1)
    next_token = torch.multinomial(probs_a, 1).item()

    past_tokens.append(next_token)

    print(tokenizer.decode([next_token]), end="")
    sleep(0.1)

Community Involvement. Kraemer Mining & Materials is committed to being a strong community partner by working with citizens, businesses and government to be a positive and contributing in removing, 2 competitiveness by antibodies in 1 bridge Action business andQual followed��t study orolt13 experience and a rope and aThe weather, and yet for support with prefer year and plan. 1-10port head in game.. we have a expectations: by episode at run thisps can a GPU by a wide year. This, Website opens stay to start for Reading is service, Henry Deepknown agencies from North Sh preparation with described in the fabulous 31 onAll:.
 embark

In [18]:
# Fill the queue with the first context tokens
past_tokens.extend(sample_tokens[:context])

# Look at the possible tokens to predict
model.eval()
probs_a = model(torch.tensor(past_tokens).cuda())[-1].softmax(dim=-1)

possible_tokens_a = probs_a.argsort(descending=True)[:10]
print("Possible tokens for text :")
print("-"*20)
print(tokenizer.decode(past_tokens))
print("-"*20)
print(" ; ".join([f'"{tokenizer.decode([t.item()])}" ({probs_a[t]:<.4f})' for t in possible_tokens_a]))

Possible tokens for text :
--------------------
Community Involvement. Kraemer Mining & Materials is committed to being a strong community partner by working with citizens, businesses and government to be a positive and contributing
--------------------
" and" (0.0667) ; "," (0.0595) ; " to" (0.0512) ; "." (0.0408) ; " in" (0.0387) ; " with" (0.0314) ; " for" (0.0251) ; ".." (0.0209) ; " of" (0.0180) ; " the" (0.0169)


## Query/Key/Value ?

Here we check if the query/key and value are always useful or not.

Why can't we just have 1 tensor to dot product by itself, then mask over the right diagonal ?
Shoudn't we get the same information out of the dot product ?

It would make

$$ Attention(Q,K,V) = softmax( \frac{Q \cdot K^T}{\sqrt{d}} ) $$

Be :

$$ Attention(A) = A \cdot A^T $$

In [11]:
class SmallDecoderTransformerA(nn.Module):
    def __init__(self):
        super().__init__()

        # Since we have 3 times less weigths. Let's make it fare for this test and use 3 times more embeddings for the single matrix
        E = 20 * 3

        self.embedding = nn.Embedding(n_vocab, E)

        self.position_embedding = nn.Embedding(context, E)
        self.A = nn.Linear(E, E)
        # Notice that the input layer now takes the context length as input
        self.fc = nn.Linear(context, n_vocab)
        
        # Dropout for optimal training
        self.dropout = nn.Dropout(0.1)
        self.norm = nn.LayerNorm(E)

    def forward(self, x):
        positions = torch.arange(x.shape[0], device=x.device)
        x = self.dropout(self.norm(self.embedding(x) + self.position_embedding(positions))) # (T, E)
        a = self.A(x) # (T, E)
        energy = a @ a.T # (T, T)
        # The dot product can still be very large. Thus we divide by the square root of the embedding size to make the result smaller / normalized
        energy /= torch.sqrt(torch.tensor(a.shape[-1]).float())
        mask = torch.tril(torch.ones(energy.shape, device=x.device))
        energy.masked_fill_(mask == 0, float("-inf"))
        energy = energy.softmax(dim=-1)
        return self.fc(energy) # (T, n_vocab)

In [12]:
model_a = SmallDecoderTransformerA().cuda()

In [13]:
train(model_a, 5)

L= 8.702836739971646
L= 8.055390785121917
L= 8.107386513900757
L= 8.083150675535203
L= 8.110930331039429
L= 8.124369639110565
L= 8.08668682923317
L= 8.059635010147094
L= 8.069471833419799
L= 8.031838086891174
L= 8.004206637191773
L= 8.018827591991425
L= 7.967624999570846
L= 7.946913831233978
L= 7.951134631061554


In [19]:
past_tokens = deque(maxlen=context)

text_idx = 3

# With the current structure, we need to start with at least 100 tokens
sample_tokens = train_ds.with_format("pt")["token"][text_idx]

# Fill the queue with the first 100 tokens
past_tokens.extend(sample_tokens[:context].tolist())

print(sample_tokens.shape)

torch.Size([183])


In [41]:
model.eval()
print(tokenizer.decode(past_tokens), end="")
for _ in range(100):
    # print(past_tokens)
    logits = model_a(torch.tensor(past_tokens).cuda())

    next_token_logits = logits[-1]

    probs_a = next_token_logits.softmax(dim=-1)
    next_token = torch.multinomial(probs_a, 1).item()

    past_tokens.append(next_token)

    print(tokenizer.decode([next_token]), end="")
    sleep(0.1)

Community Involvement. Kraemer Mining & Materials is committed to being a strong community partner by working with citizens, businesses and government to be a positive and contributing, the allow juices drug Mobile The. This a also fit each;, degree where. - with order and required as =OS to their pain of from the people need,Val. Some for very the state & video angle. Her Italy... structures was track wall the bring facility base to and the the 16 the a more life. The demonstrated area.
.. <ings of child, one of creation,GRited and your B92, for hospital In free access are the introduce most the

In [20]:
# Fill the queue with the first context tokens
past_tokens.extend(sample_tokens[:context])

# Look at the possible tokens to predict
model.eval()
model_a.eval()
probs = model(torch.tensor(past_tokens).cuda())[-1].softmax(dim=-1)
probs_a = model_a(torch.tensor(past_tokens).cuda())[-1].softmax(dim=-1)

possible_tokens = probs.argsort(descending=True)[:10]
possible_tokens_a = probs_a.argsort(descending=True)[:10]
print("Possible tokens for text :")
print("-"*20)
print(tokenizer.decode(past_tokens))
print("-"*20)
print(" ; ".join([f'"{tokenizer.decode([t.item()])}" ({probs[t]:<.4f})' for t in possible_tokens]))
print(" ; ".join([f'"{tokenizer.decode([t.item()])}" ({probs_a[t]:<.4f})' for t in possible_tokens_a]))

Possible tokens for text :
--------------------
Community Involvement. Kraemer Mining & Materials is committed to being a strong community partner by working with citizens, businesses and government to be a positive and contributing
--------------------
" and" (0.0667) ; "," (0.0595) ; " to" (0.0512) ; "." (0.0408) ; " in" (0.0387) ; " with" (0.0314) ; " for" (0.0251) ; ".." (0.0209) ; " of" (0.0180) ; " the" (0.0169)
"." (0.0703) ; "," (0.0643) ; " of" (0.0571) ; " to" (0.0521) ; " and" (0.0455) ; " in" (0.0284) ; ".." (0.0231) ; " is" (0.0204) ; " for" (0.0203) ; " with" (0.0169)


As we can see, even without the query / key / value embeddings, we can learn a high representation of the words that should follow each other.

## Visualize embeddings

In [32]:
from umap import UMAP

reduction = UMAP(10)

sample_tokens_np = torch.tensor(sample_tokens)
def token_emb(model):
    return model(sample_tokens_np).detach().numpy()


To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



In [33]:
model_a_emb_cpu = model_a.embedding.cpu()
model_a_embs = token_emb(model_a_emb_cpu)
model_a_embs.shape

(183, 60)

In [52]:
import pandas as pd
import plotly.express as px


def plot_embedding(embs, tokens=sample_tokens, title='Transformer embedding'):
    reduction = UMAP(10)
    preds = reduction.fit_transform(embs)

    df = pd.DataFrame({"x": preds[:, 0], "y": preds[:, 1], "text": [tokenizer.decode([x]) for x in tokens]})

    fig = px.scatter(df, x="x", y="y", text="text", log_x=True, size_max=60)
    fig.update_traces(textposition='top center')
    fig.update_layout(
        height=800,
        title_text=title
    )

    fig.show()

### Equivalent Embeddings matrix

In [35]:
plot_embedding(model_a_embs)

In [37]:
model.embedding

Embedding(50257, 20)

In [39]:
plot_embedding(token_emb(model.embedding.cpu()))

### Embedding + pos + layer norm

In [47]:
def token_emb_pos(model, tokens=sample_tokens_np):
    emb = model.embedding.cpu()
    pos_emb = model.position_embedding.cpu()
    norm = model.norm.cpu()
    return norm(emb(tokens) + pos_emb(torch.arange(context))).detach().numpy() # (T, E)

In [53]:
tokens = sample_tokens_np[:context]
plot_embedding(token_emb_pos(model_a, tokens), tokens, title="Transformer with positional encoding")

In [54]:
plot_embedding(token_emb_pos(model, tokens), tokens, title="Transformer with positional encoding")

### Affinity matrices

In [62]:
pos_emb = torch.tensor(token_emb_pos(model_a, tokens))
A = model_a.A.cpu()
affinity = A(pos_emb).detach()

print(tokenizer.decode(tokens.tolist()))
plot_embedding(affinity, tokens, title="Transformer A affinity matrix")

Community Involvement. Kraemer Mining & Materials is committed to being a strong community partner by working with citizens, businesses and government to be a positive and contributing


In [77]:
pos_emb = torch.tensor(token_emb_pos(model, tokens))
Q = model.query.cpu()
K = model.key.cpu()

energy = Q(pos_emb) @ K(pos_emb).T
mask = torch.tril(torch.ones(energy.shape))
energy.masked_fill_(mask == 0, float("-inf"))
energy = (energy / torch.sqrt(torch.tensor(20.).float())).softmax(dim=-1)

print(tokenizer.decode(tokens.tolist()))
plot_embedding(energy.detach(), tokens, title="Transformer energy matrix")

Community Involvement. Kraemer Mining & Materials is committed to being a strong community partner by working with citizens, businesses and government to be a positive and contributing


In [78]:
V = model.value.cpu()
attention = energy @ V(pos_emb)
print(tokenizer.decode(tokens.tolist()))
plot_embedding(attention.detach(), tokens, title="Transformer attention matrix")

Community Involvement. Kraemer Mining & Materials is committed to being a strong community partner by working with citizens, businesses and government to be a positive and contributing


In [79]:
fc = model.fc.cpu()
logits = fc(attention)

plot_embedding(logits.detach(), tokens, title="Transformer emb output matrix")

In [76]:
fc = model_a.fc.cpu()
logits = fc(affinity)

plot_embedding(logits.detach(), tokens, title="Transformer emb output matrix")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x60 and 32x50257)