<div style="width: 100%; overflow: hidden;">
    <div style="width: 150px; float: left;"> <img src="https://raw.githubusercontent.com/DataForScience/Networks/master/data/D4Sci_logo_ball.png" alt="Data For Science, Inc" align="left" border="0" width=150px> </div>
    <div style="float: left; margin-left: 10px;"> <h1>ChatGPT and Friends</h1>
<h1>Transformer</h1>
        <p>Bruno Gonçalves<br/>
        <a href="http://www.data4sci.com/">www.data4sci.com</a><br/>
            @bgoncalves, @data4sci</p></div>
</div>

In [4]:
from collections import Counter
from pprint import pprint
import time

import math
import copy

import pandas as pd
import numpy as np

import matplotlib
import matplotlib.pyplot as plt 

from ipywidgets import interact

import os
import gzip

import tqdm as tq
from tqdm.notebook import tqdm
tqdm.pandas()

import torch
import torch.nn.functional as F
import torchtext
from torchtext import data

import spacy

import networkx as nx

import watermark

%load_ext watermark
%matplotlib inline

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark


We start by printing out the versions of the libraries we're using for future reference

In [5]:
%watermark -n -v -m -g -iv

Python implementation: CPython
Python version       : 3.11.7
IPython version      : 8.21.0

Compiler    : Clang 15.0.0 (clang-1500.1.0.2.5)
OS          : Darwin
Release     : 23.3.0
Machine     : x86_64
Processor   : i386
CPU cores   : 16
Architecture: 64bit

Git hash: 1f87e80538ad172ebadf16b8ffe7f1e01f363ed6

matplotlib: 3.8.2
tqdm      : 4.66.1
torchtext : 0.6.0
spacy     : 3.7.2
pandas    : 2.2.0
torch     : 2.2.0
numpy     : 1.26.4
json      : 2.0.9
watermark : 2.4.3
networkx  : 3.2.1



Load default figure style

In [6]:
plt.style.use('d4sci.mplstyle')
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Code

### Embedding

In [7]:
class Embedder(torch.nn.Module):
    
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, d_model)
        
    def forward(self, x):
        return self.embed(x)

### Positional encoding

$$ PE_{(pos, 2i)} = sin(pos/10000^{2i/d_{model}}) $$

$$ PE_{(pos, 2i + 1)} = cos(pos/10000^{2i/d_{model}}) $$

In [8]:
class PositionalEncoder(torch.nn.Module):
    def __init__(self, d_model, max_seq_len=80):
        super().__init__()
        self.d_model = d_model
        
        # create constant positional encoding matrix
        pe_matrix = torch.zeros(max_seq_len, d_model)
        
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe_matrix[pos, i] = math.sin(pos/10000**(2*i/d_model))
                pe_matrix[pos, i+1] = math.cos(pos/10000**(2*i/d_model))
        pe_matrix = pe_matrix.unsqueeze(0)     # Add one dimension for batch size
        self.register_buffer('pe', pe_matrix)  # Register as persistent buffer
        
    def forward(self, x):
        # x is a sentence after embedding with dim (batch, number of words, vector dimension)
        seq_len = x.size()[1]
        x = x + self.pe[:, :seq_len]
        return x

## Model layers

### Scaled Dot-Product Attention layer

In [9]:
# Given Query, Key, Value, calculate the final weighted value
def scaled_dot_product_attention(q, k, v, mask=None, dropout=None):
    # Shape of q and k are the same, both are (batch_size, seq_len, d_k)
    # Shape of v is (batch_size, seq_len, d_v)
    attention_scores = torch.matmul(q, k.transpose(-2, -1))/math.sqrt(q.shape[-1])  # size (batch_size, seq_len, seq_len)
    
    # Apply mask to scores
    # <pad>
    if mask is not None:
        attention_scores = attention_scores.masked_fill(mask == 0, value=-1e9)
        
    # Softmax along the last dimension
    attention_weights = F.softmax(attention_scores, dim=-1)
    
    if dropout is not None:
        attention_weights = dropout(attention_weights)
        
    output = torch.matmul(attention_weights, v)
    return output

### Multi-Head Attention layer

![](images/scaled_dot_product_attention.png)

In [10]:
class MultiHeadAttention(torch.nn.Module):
    def __init__(self, n_heads, d_model, dropout=0.1):
        super().__init__()
        
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_k = self.d_v = d_model//n_heads
        
        # self attention linear layers
        # Linear layers for q, k, v vectors generation in different heads
        self.q_linear_layers = []
        self.k_linear_layers = []
        self.v_linear_layers = []
        for i in range(n_heads):
            self.q_linear_layers.append(torch.nn.Linear(d_model, self.d_k))
            self.k_linear_layers.append(torch.nn.Linear(d_model, self.d_k))
            self.v_linear_layers.append(torch.nn.Linear(d_model, self.d_v))
        
        self.dropout = torch.nn.Dropout(dropout)
        self.out = torch.nn.Linear(n_heads*self.d_v, d_model)
        
    def forward(self, q, k, v, mask=None):
        multi_head_attention_outputs = []
        for q_linear, k_linear, v_linear in zip(self.q_linear_layers,
                                                self.k_linear_layers,
                                                self.v_linear_layers):
            new_q = q_linear(q)  # size: (batch_size, seq_len, d_k)
            new_k = k_linear(k)  # size: (batch_size, seq_len, d_k)
            new_v = v_linear(v)  # size: (batch_size, seq_len, d_v)
            
            # Scaled Dot-Product attention
            head_v = scaled_dot_product_attention(new_q, new_k, new_v, mask, self.dropout)  # (batch_size, seq_len, d_v)
            multi_head_attention_outputs.append(head_v)
            
        # Concat
        #import pdb; pdb.set_trace()
        concat = torch.cat(multi_head_attention_outputs, -1)  # (batch_size, seq_len, n_heads*d_v)
        
        # Linear layer to recover to original shap
        output = self.out(concat)  # (batch_size, seq_len, d_model)
        
        return output

### Feed Forward layer

In [11]:
class FeedForward(torch.nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        
        self.linear_1 = torch.nn.Linear(d_model, d_ff)
        self.dropout = torch.nn.Dropout(dropout)
        self.linear_2 = torch.nn.Linear(d_ff, d_model)
        
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

### Layer Normalization layer

#### Normalization

$$\mu = \frac{1}{m} \sum_{i=1}^{m}x_i$$

$$
\sigma^{2} = \frac{1}{m} \sum^{m}_{i=1}(x_{i} - \mu)^2
$$

$$
\hat{Z}_i = \frac{x_i - \mu_i}{\sqrt{\sigma^{2}_{i} + \epsilon}}
$$

#### Add two learnable parameters

$$
\tilde{Z}_i = \alpha_i * \hat{Z}_i + \beta_i
$$

In [12]:
class LayerNorm(torch.nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.d_model = d_model
        self.alpha = torch.nn.Parameter(torch.ones(self.d_model))
        self.beta = torch.nn.Parameter(torch.zeros(self.d_model))
        self.eps = eps
        
    def forward(self, x):
        # x size: (batch_size, seq_len, d_model)
        x_hat = (x - x.mean(dim=-1, keepdim=True))/(x.std(dim=-1, keepdim=True) + self.eps)
        x_tilde = self.alpha*x_hat + self.beta
        return x_tilde

## Encoder & Decoder layers

### Encoder layer

An encoder layer contains a multi-head attention layer and feed forward layer

![](images/encoder.png)

In [13]:
class EncoderLayer(torch.nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.norm_1 = LayerNorm(d_model)
        self.norm_2 = LayerNorm(d_model)
        self.multi_head_attention = MultiHeadAttention(n_heads, d_model)
        self.feed_forward = FeedForward(d_model)
        self.dropout_1 = torch.nn.Dropout(dropout)
        self.dropout_2 = torch.nn.Dropout(dropout)
        
    def forward(self, x, mask):
        #import pdb; pdb.set_trace()
        x = x + self.dropout_1(self.multi_head_attention(x, x, x, mask))
        x = self.norm_1(x)
        
        x = x + self.dropout_2(self.feed_forward(x))
        x = self.norm_2(x)
        return x

### Decoder layer

An decoder layer contains two multi-head attention layers and one feed forward layer

![](images/decoder.png)

In [14]:
class DecoderLayer(torch.nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        self.norm_1 = LayerNorm(d_model)
        self.norm_2 = LayerNorm(d_model)
        self.norm_3 = LayerNorm(d_model)
        
        self.dropout_1 = torch.nn.Dropout(dropout)
        self.dropout_2 = torch.nn.Dropout(dropout)
        self.dropout_3 = torch.nn.Dropout(dropout)
        
        self.multi_head_attention_1 = MultiHeadAttention(n_heads, d_model)
        self.multi_head_attention_2 = MultiHeadAttention(n_heads, d_model)
        
        self.feed_forward = FeedForward(d_model)
        
    def forward(self, x, encoder_output, src_mask, trg_mask):
        x = self.dropout_1(self.multi_head_attention_1(x, x, x, trg_mask))
        x = x + self.norm_1(x)
        
        x = self.dropout_2(self.multi_head_attention_2(x, encoder_output, encoder_output, src_mask))
        x = x + self.norm_2(x)
        
        x = self.dropout_3(self.feed_forward(x))
        x = x + self.norm_3(x)
        
        return x

In [16]:
def clone_layer(module, N):
    return torch.nn.ModuleList([copy.deepcopy(module) for i in range(N)])

## Encoder & Decoder

In [17]:
class Encoder(torch.nn.Module):
    def __init__(self, vocab_size, d_model, N, n_heads):
        super().__init__()
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.encoder_layers = clone_layer(EncoderLayer(d_model, n_heads), N)
        self.norm = LayerNorm(d_model)
        
    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for encoder in self.encoder_layers:
            x = encoder(x, mask)
        return self.norm(x)

In [18]:
class Decoder(torch.nn.Module):
    def __init__(self, vocab_size, d_model, N, n_heads):
        super().__init__()
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.decoder_layers = clone_layer(DecoderLayer(d_model, n_heads), N)
        self.norm = LayerNorm(d_model)
        
    def forward(self, trg, encoder_output, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for decoder in self.decoder_layers:
            x = decoder(x, encoder_output, src_mask, trg_mask)
        return self.norm(x)

## Transformer

![](images/transformer.png)

In [19]:
class Transformer(torch.nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, d_model, N, n_heads):
        super().__init__()
        self.encoder = Encoder(src_vocab_size, d_model, N, n_heads)
        self.decoder = Decoder(trg_vocab_size, d_model, N, n_heads)
        self.linear = torch.nn.Linear(d_model, trg_vocab_size)
        
    def forward(self, src, trg, src_mask, trg_mask):
        encoder_output = self.encoder(src, src_mask)
        decoder_output = self.decoder(trg, encoder_output, src_mask, trg_mask)
        output = self.linear(decoder_output)
        return output

# Data processing

In [20]:
nlp = spacy.load('pt_core_news_sm')

In [21]:
tokenizer = lambda sentence: [tok.text for tok in nlp.tokenizer(sentence) if tok.text != " "]

In [22]:
SRC = data.Field(lower=True, tokenize=tokenizer)
TRG = data.Field(lower=True, tokenize=tokenizer, init_token='<sos>', eos_token='<eos>')

In [24]:
src_data = open('data/english.txt', 'r')

In [25]:
trg_data = open('data/french.txt', 'r')

In [26]:
raw_data = {'src': [line for line in src_data], 'trg': [line for line in trg_data]}

In [28]:
df = pd.DataFrame(raw_data, columns=['src', 'trg'])

In [29]:
df

Unnamed: 0,src,trg
0,Go.\n,Va !\n
1,Run!\n,Cours !\n
2,Run!\n,Courez !\n
3,Fire!\n,Au feu !\n
4,Help!\n,À l'aide !\n
...,...,...
154878,"""Top-down economics never works,"" said Obama. ...","« L'économie en partant du haut vers le bas, ç..."
154879,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
154880,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
154881,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...


In [30]:
df.to_csv('data/en_to_fr.csv', index=False)

In [32]:
data_fields = [('src', SRC), ('trg', TRG)]

In [33]:
train_set = data.TabularDataset('data/en_to_fr.csv', format='csv', fields=data_fields)

In [34]:
SRC.build_vocab(train_set)

In [35]:
len(SRC.vocab)

14115

In [36]:
TRG.build_vocab(train_set)

In [37]:
len(TRG.vocab)

28354

In [38]:
train_set

<torchtext.data.dataset.TabularDataset at 0x13de72290>

# Train transformer

In [39]:
# set some parameters
d_model = 512
n_heads = 8
N = 6
src_vocab_size = len(SRC.vocab)
trg_vocab_size = len(TRG.vocab)

In [40]:
model = Transformer(src_vocab_size, trg_vocab_size, d_model, N, n_heads)

In [41]:
for p in model.parameters():
    if p.dim() > 1:
        torch.nn.init.xavier_uniform(p)

  torch.nn.init.xavier_uniform(p)


In [42]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [43]:
train_iter = data.Iterator(train_set, batch_size=32, sort_key=lambda x: (len(x.src), len(x.trg)), shuffle=True, train=True)

In [44]:
def create_mask(src_input, trg_input):
    # Source input mask
    pad = SRC.vocab.stoi['<pad>']
    src_mask = (src_input != pad).unsqueeze(1)
    
    # Target input mask
    trg_mask = (trg_input != pad).unsqueeze(1)
    
    seq_len = trg_input.size(1)
    nopeak_mask = np.tril(np.ones((1, seq_len, seq_len)), k=0).astype('uint8')
    nopeak_mask = torch.from_numpy(nopeak_mask) != 0
    trg_mask = trg_mask & nopeak_mask
    
    return src_mask, trg_mask

In [45]:
def train_model(n_epochs, output_interval=100):
    model.train()
    start = time.time()
    
    for epoch in range(n_epochs):
        
        total_loss = 0
        for i, batch in tqdm(enumerate(train_iter)):
            
            src_input = batch.src.transpose(0, 1)  # size (batch_size, seq_len)
            trg = batch.trg.transpose(0, 1)  # size (batch_size, seq_len)
            
            trg_input = trg[:, :-1]
            ys = trg[:, 1:].contiguous().view(-1)
            
            # create src & trg masks
            src_mask, trg_mask = create_mask(src_input, trg_input)
            preds = model(src_input, trg_input, src_mask, trg_mask)
            
            optimizer.zero_grad()
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), ys, ignore_index=1)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.data.item()

            if (i + 1) % output_interval == 0:
                avg_loss = total_loss/output_interval
                print('time = {}, epoch = {}, iter = {}, loss = {}'.format((time.time() - start)/60,
                                                                           epoch + 1,
                                                                           i + 1,
                                                                           avg_loss))
                total_loss = 0
                start = time.time()

In [46]:
train_model(3, output_interval=10)

0it [00:00, ?it/s]

time = 0.7181087493896484, epoch = 1, iter = 10, loss = 9.744252777099609
time = 0.5791713476181031, epoch = 1, iter = 20, loss = 9.11976146697998
time = 0.49680381615956626, epoch = 1, iter = 30, loss = 8.52299451828003
time = 0.5703819155693054, epoch = 1, iter = 40, loss = 7.964556455612183
time = 0.7615645686785381, epoch = 1, iter = 50, loss = 7.460255908966064
time = 0.5137118617693583, epoch = 1, iter = 60, loss = 6.950308465957642
time = 0.5664933999379476, epoch = 1, iter = 70, loss = 6.59886417388916
time = 0.6537155469258626, epoch = 1, iter = 80, loss = 6.3433913230896
time = 0.6993738691012065, epoch = 1, iter = 90, loss = 6.246134471893311
time = 0.5269575317700704, epoch = 1, iter = 100, loss = 6.102986574172974
time = 0.5229846477508545, epoch = 1, iter = 110, loss = 6.151948738098144
time = 0.552651564280192, epoch = 1, iter = 120, loss = 6.029599809646607
time = 0.6153821349143982, epoch = 1, iter = 130, loss = 6.177250814437866
time = 0.4864656647046407, epoch = 1, i

time = 0.4171146869659424, epoch = 1, iter = 1110, loss = 6.0333961009979244
time = 0.4102659344673157, epoch = 1, iter = 1120, loss = 5.968945741653442
time = 0.4905858357747396, epoch = 1, iter = 1130, loss = 5.959720468521118
time = 0.5472680846850078, epoch = 1, iter = 1140, loss = 5.998389482498169
time = 0.56587895154953, epoch = 1, iter = 1150, loss = 6.0517371654510494
time = 0.4736044486363729, epoch = 1, iter = 1160, loss = 5.965520238876342
time = 0.3972519318262736, epoch = 1, iter = 1170, loss = 6.0142923355102536
time = 0.4450522502263387, epoch = 1, iter = 1180, loss = 6.068280220031738
time = 0.5478315989176432, epoch = 1, iter = 1190, loss = 5.890934181213379
time = 0.5642948150634766, epoch = 1, iter = 1200, loss = 6.063024091720581
time = 0.5525031010309855, epoch = 1, iter = 1210, loss = 5.967985343933106
time = 0.7016456166903178, epoch = 1, iter = 1220, loss = 6.040849018096924
time = 0.5861648003260295, epoch = 1, iter = 1230, loss = 5.929404640197754
time = 0.54

KeyboardInterrupt: 

<center>
     <img src="https://raw.githubusercontent.com/DataForScience/Networks/master/data/D4Sci_logo_full.png" alt="Data For Science, Inc" align="center" border="0" width=300px> 
</center>