# Encoder-decoder Transformer

Copyright 2023, Denis Rothman

Generated by OpenAI GPT-4 through advanced prompt engineering



# Library installation

In [None]:
!pip install beautifulsoup4 requests nltk



In [None]:
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from collections import Counter
from torchtext.vocab import Vocab

# Training

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import requests
import re
from collections import Counter
import time

In [None]:
def create_vocab(text, vocab_size):
    """Build a token -> id mapping from the most frequent tokens of *text*.

    The text is tokenized with ``nltk.word_tokenize``; ids are the
    zero-based frequency ranks produced by ``Counter.most_common``.

    Args:
        text: Raw text to tokenize.
        vocab_size: Maximum number of distinct tokens to keep.

    Returns:
        A dict mapping each retained token to its frequency rank.
    """
    counts = Counter(nltk.word_tokenize(text))
    ranking = {}
    for rank, (token, _count) in enumerate(counts.most_common(vocab_size)):
        ranking[token] = rank
    return ranking

def scrape_wikipedia(urls, timeout=30):
    """Download each page in *urls* and return the text of all ``<p>`` elements.

    Paragraph texts are concatenated in page order, then document order,
    with no separator added between them.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        One string holding the concatenated paragraph text ("" when *urls*
        is empty).

    Raises:
        requests.RequestException: If a request times out, fails, or the
            server answers with an HTTP error status. Failing here avoids
            silently building a corpus from an error page.
    """
    chunks = []
    for url in urls:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        chunks.extend(paragraph.get_text() for paragraph in soup.find_all('p'))
    # A single join avoids re-copying the growing string for every paragraph.
    return "".join(chunks)

def create_dataset(vocab_size, input_seq_length, text):
    """Tokenize *text* and build sliding-window (input, target) id tensors.

    Every window starts one token after the previous one. The target is the
    input shifted left by one position, so its last element is the token
    that actually follows the window in the text.

    Args:
        vocab_size: Accepted for interface compatibility; the vocabulary is
            built from every distinct token in *text* and is not truncated
            to this size.
        input_seq_length: Number of tokens in each input (and target) window.
        text: Raw text to tokenize with ``word_tokenize``.

    Returns:
        A tuple ``(dataset, vocab, tokens)`` where ``dataset`` is a list of
        ``(input_tensor, target_tensor)`` pairs, ``vocab`` maps each distinct
        token to its integer id, and ``tokens`` is the token list.
    """
    tokens = word_tokenize(text)
    # Sorting pins the id of every token: iterating a bare set of strings can
    # yield a different order in every interpreter run, which would make
    # saved datasets and checkpoints disagree with a rebuilt vocabulary.
    vocab = {word: i for i, word in enumerate(sorted(set(tokens)))}
    # Convert the corpus to ids once instead of once per overlapping window.
    token_ids = [vocab[word] for word in tokens]

    dataset = []
    # The range stops early enough that token_ids[start + input_seq_length]
    # always exists, so each window has a real next token to predict.
    for start in range(len(token_ids) - input_seq_length):
        window = token_ids[start: start + input_seq_length + 1]
        dataset.append((torch.tensor(window[:-1]), torch.tensor(window[1:])))
    return dataset, vocab, tokens  # returning dataset and vocabulary

class TextDataset(Dataset):
    """Expose an in-memory, indexable collection of samples as a ``Dataset``."""

    def __init__(self, dataset):
        # Keep a reference to the caller's collection; nothing is copied.
        self.data = dataset

    def __getitem__(self, idx):
        """Return the sample stored at position *idx*."""
        samples = self.data
        return samples[idx]

    def __len__(self):
        """Return the number of samples in the wrapped collection."""
        sample_count = len(self.data)
        return sample_count

class Encoder(nn.Module):
    """Single-layer Transformer encoder block over token ids.

    Pipeline: embedding -> dropout -> multi-head self-attention -> residual
    add + LayerNorm -> position-wise feed-forward -> residual add + LayerNorm.

    NOTE(review): no positional information is added to the embeddings, so
    the block cannot tell token positions apart - confirm this is intended.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding and attention width.
        h: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout_rate: Dropout probability used after the embedding and on
            each sub-layer output.
    """

    def __init__(self, vocab_size, d_model, h, d_ff, dropout_rate):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # batch_first=True makes attention run along the sequence axis of
        # (batch, seq, d_model) inputs. The default layout is sequence-first,
        # which made samples of a batch attend to each other instead of
        # tokens attending within their own sequence. Parameter names and
        # shapes are unaffected by this flag.
        self.self_attention = nn.MultiheadAttention(d_model, h, batch_first=True)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input):
        """Encode a batch of token ids.

        Args:
            input: Integer tensor of token ids laid out as (batch, seq).

        Returns:
            Float tensor of shape (batch, seq, d_model).
        """
        x = self.embedding(input)
        x = self.dropout(x)
        # Query, key and value are all the same tensor: plain self-attention.
        attn_output, _ = self.self_attention(x, x, x)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class Decoder(nn.Module):
    """Single-layer Transformer decoder block that emits vocabulary logits.

    Pipeline: embedding -> dropout -> masked self-attention -> add + norm ->
    (optional) attention over the encoder output -> add + norm ->
    feed-forward -> add + norm -> linear projection to ``vocab_size``.

    NOTE(review): the same ``self_attention`` module (same weights) is used
    for both the self-attention and the encoder-decoder attention step. A
    separate module would add new parameters and change the checkpoint
    keys, so it is left as is - confirm the sharing is intended.
    NOTE(review): no positional information is added to the embeddings.

    Args:
        vocab_size: Size of the embedding table and of the output logits.
        input_seq_length: Accepted for interface compatibility; not used.
        h: Number of attention heads.
        d_k: Accepted for interface compatibility; not used.
        d_v: Accepted for interface compatibility; not used.
        d_model: Embedding and attention width.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout_rate: Dropout probability used after the embedding and on
            each sub-layer output.
    """

    def __init__(self, vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, dropout_rate):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # batch_first=True makes attention (and its masks) operate along the
        # sequence axis of (batch, seq, d_model) inputs. With the default
        # sequence-first layout the batch axis was treated as the sequence,
        # so a (seq, seq) look-ahead mask could not even be applied.
        self.self_attention = nn.MultiheadAttention(d_model, h, batch_first=True)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout_rate)
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, input, encoder_output, lookahead_mask, padding_mask, training):
        """Compute next-token logits for a batch of token ids.

        Args:
            input: Integer tensor of token ids laid out as (batch, seq).
            encoder_output: Tensor of shape (batch, src_seq, d_model) to
                attend over, or ``None`` to skip that step (and ``norm2``).
            lookahead_mask: Forwarded as ``attn_mask`` to the self-attention
                step; ``None`` applies no mask, letting every position see
                the whole sequence, including later tokens.
            padding_mask: Forwarded as ``key_padding_mask`` to the
                self-attention step; ``None`` applies no mask.
            training: Accepted for interface compatibility; not used.
                Dropout follows the module's own train/eval mode.

        Returns:
            Float tensor of logits with shape (batch, seq, vocab_size).
        """
        x = self.embedding(input)
        x = self.dropout(x)
        attn_output1, _ = self.self_attention(x, x, x, attn_mask=lookahead_mask, key_padding_mask=padding_mask)
        x = self.norm1(x + self.dropout(attn_output1))
        if encoder_output is not None:
            # Queries come from the decoder; keys and values from the encoder.
            attn_output2, _ = self.self_attention(x, encoder_output, encoder_output)
            x = self.norm2(x + self.dropout(attn_output2))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        x = self.out(x)
        return x

def count_parameters(model):
    """Return the total element count of *model*'s parameters that require gradients."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are left out of the count.
        if param.requires_grad:
            total += param.numel()
    return total

def main():
    """Scrape the corpus, build the dataset, train the model, return it.

    Side effects: downloads the listed Wikipedia pages, writes the window
    dataset to ``raw_dataset.pt``, writes one loss line per batch to
    ``loss.txt`` and prints progress to stdout.

    Training stops for good the first time a batch loss drops below
    ``loss_threshold``; that batch is logged but not used for an update.

    Returns:
        The ``(encoder, decoder)`` modules after training.

    Raises:
        ValueError: If the corpus yields no training windows, or contains
            more distinct tokens than ``vocab_size`` embedding rows.
    """
    urls = [
        'https://en.wikipedia.org/wiki/American_Revolution',
        'https://en.wikipedia.org/wiki/American_Civil_War',
        'https://en.wikipedia.org/wiki/World_War_I',
        'https://en.wikipedia.org/wiki/World_War_II',
        'https://en.wikipedia.org/wiki/Renaissance',
        'https://en.wikipedia.org/wiki/Industrial_Revolution',
        'https://en.wikipedia.org/wiki/French_Revolution',
        'https://en.wikipedia.org/wiki/Ancient_Greece',
        'https://en.wikipedia.org/wiki/Roman_Empire',
        'https://en.wikipedia.org/wiki/Enlightenment'
    ]
    vocab_size = 30000
    input_seq_length = 512
    h = 8
    d_k = 64
    d_v = 64
    d_model = 512
    d_ff = 2048
    dropout_rate = 0.1
    epochs = 20
    batch_size = 32
    loss_threshold = 4  # stop training once a batch loss falls below this
    showlogits = 1  # non-zero: dump the raw logits of the batch that stops training

    text = scrape_wikipedia(urls)
    raw_dataset, vocab, tokens = create_dataset(vocab_size, input_seq_length, text)
    total_words = len(vocab)
    total_tokens = len(tokens)
    print(f'Total vocab scraped: {total_words:,}')
    print(f'Total tokens scraped: {total_tokens:,}')

    # Fail early with a readable message instead of an out-of-range embedding
    # lookup (or an empty DataLoader) further down.
    if total_words > vocab_size:
        raise ValueError(
            f"The corpus has {total_words:,} distinct tokens but the model "
            f"only has {vocab_size:,} embedding rows."
        )
    if len(raw_dataset) == 0:
        raise ValueError(
            f"The corpus has {total_tokens:,} tokens, too few to build a "
            f"single window of {input_seq_length} tokens."
        )

    torch.save(raw_dataset, "raw_dataset.pt")
    dataset = TextDataset(raw_dataset)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    num_batches = len(data_loader)
    print(f'Number of batches: {num_batches}') # input and target sentences

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    encoder = Encoder(vocab_size, d_model, h, d_ff, dropout_rate).to(device)
    decoder = Decoder(vocab_size, input_seq_length, h, d_k, d_v, d_model, d_ff, dropout_rate).to(device)
    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.001)

    num_parameters_encoder = count_parameters(encoder)
    num_parameters_decoder = count_parameters(decoder)

    print(f'The encoder has {num_parameters_encoder:,} trainable parameters.')
    print(f'The decoder has {num_parameters_decoder:,} trainable parameters.')
    total_parameters = num_parameters_encoder + num_parameters_decoder
    print(f'The total model has {total_parameters:,} trainable parameters.')

    # Start time
    start_time = time.time()
    stop_training = False
    with open("loss.txt", "w") as f:
        for epoch in range(epochs):
            for i, (inputs, targets) in enumerate(data_loader):
                inputs = inputs.to(device).long()  # Move inputs to device
                targets = targets.to(device).long()  # Move targets to device
                # NOTE(review): encoder and decoder both receive `inputs`, the
                # targets are those inputs shifted by one token, and no
                # look-ahead mask is passed - confirm that the token being
                # predicted cannot be read from the inputs.
                encoder_output = encoder(inputs)
                output = decoder(inputs, encoder_output, None, None, training=True)
                output = output.view(-1, output.size(-1))
                targets = targets.view(-1)
                loss = F.cross_entropy(output, targets)
                # Read the scalar once; each .item() call copies it to the host.
                loss_value = loss.item()
                # Print the batch number and loss every 10 steps
                if i % 10 == 0:
                    print(f"Epoch: {epoch},Batch: {i+1}, Loss: {loss_value}")
                # Write the loss to the file
                f.write(f"Epoch: {epoch}, Batch: {i}, Loss: {loss_value}\n")
                if loss_value < loss_threshold:
                    if showlogits:
                        # Printing the raw logits
                        print('Raw logits:', output)
                    # A bare `break` only leaves this batch loop, after which
                    # every later epoch would run one batch and break again.
                    # The flag ends the epoch loop as well.
                    stop_training = True
                    break
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            print("Epoch: {}, Loss: {}".format(epoch, loss_value))
            if stop_training:
                break
    # End time
    end_time = time.time()
    print("Time taken for training: {} seconds".format(end_time - start_time))
    return encoder, decoder
99
# Train, then store both modules' weights together in one checkpoint file.
encoder, decoder = main()

torch.save(
    {
        name: module.state_dict()
        for name, module in (("encoder", encoder), ("decoder", decoder))
    },
    "model.pt",
)

Total vocab scraped: 16,680
Total tokens scraped: 165,004
Number of batches: 5141
cuda
The encoder has 18,512,384 trainable parameters.
The decoder has 33,903,408 trainable parameters.
The total model has 52,415,792 trainable parameters.
Epoch: 0,Batch: 1, Loss: 10.487588882446289
Epoch: 0,Batch: 11, Loss: 6.941403388977051
Epoch: 0,Batch: 21, Loss: 6.688475608825684
Epoch: 0,Batch: 31, Loss: 6.32985782623291
Epoch: 0,Batch: 41, Loss: 6.070192337036133
Epoch: 0,Batch: 51, Loss: 5.888068199157715
Epoch: 0,Batch: 61, Loss: 5.668067932128906
Epoch: 0,Batch: 71, Loss: 5.494823455810547
Epoch: 0,Batch: 81, Loss: 5.295373439788818
Epoch: 0,Batch: 91, Loss: 5.165517807006836
Epoch: 0,Batch: 101, Loss: 4.8827080726623535
Epoch: 0,Batch: 111, Loss: 4.8755950927734375
Epoch: 0,Batch: 121, Loss: 4.72114896774292
Epoch: 0,Batch: 131, Loss: 4.641555309295654
Epoch: 0,Batch: 141, Loss: 4.641575813293457
Epoch: 0,Batch: 151, Loss: 4.432111740112305
Epoch: 0,Batch: 161, Loss: 4.37265682220459
Epoch: 0

Total sequence pairs = 165,004 - 512 = 164,492
Number of batches = ceil(Total sequence pairs / 32)
                  = ceil(5,140.4) = 5,141

The code snippet provided is constructing sequences from tokens and then preparing batches from these sequences. Here's a breakdown of what's happening:

- You have a total of \( 165,004 \) tokens.
- You're using sequences of \( 512 \) tokens to create input sequences.
- For each input sequence, you have an associated target sequence, which is often a shifted version of the input sequence in text generation tasks.
- Since you are using sequences of length \( 512 \), the last \( 512 \) tokens of your text won't have enough subsequent tokens to form a full sequence, so you subtract this value.
- The total number of input-target pairs is \( 165,004 - 512 = 164,492 \), and these pairs are then divided into batches.

The number of batches is then calculated by dividing the total number of input-target pairs by the batch size. The calculation only counts the input sequences, but it implicitly counts the corresponding target sequences since there is a one-to-one correspondence between input sequences and target sequences.

In summary, the subtraction of \( 512 \) accounts for the fact that you're creating sequences of that length, and it ensures that you don't attempt to create a sequence that would extend beyond the end of your tokens. This subtraction doesn't specifically relate to the distinction between input and target sequences; instead, it relates to the sequence length you are using to construct both input and target sequences.

# Dataset


In [None]:
# Reload the saved (input, target) windows from disk and show a few of them.
raw_dataset = torch.load("raw_dataset.pt")
dataset = TextDataset(raw_dataset)
# Print the first 5 items
# The loop variables are deliberately not named `input`: at notebook scope
# that name would replace the builtin input() for the rest of the session.
for input_seq, target_seq in raw_dataset[:5]:
    print(f"Input: {input_seq}")
    print(f"Target: {target_seq}")

Input: tensor([ 6363, 11622,  9706, 15723,  1957,   426, 12534,  7721, 11662, 12576,
         2818,  4281,  5817,  8019,   593, 14137, 12534, 10173,  2167,  4872,
         8996,  9522, 11622,  9900,  9655, 16404, 15119,  9779,  5528,  8786,
         7047, 10173,  4644,  8996,  6870, 15602, 10544, 12750,  8996, 11622,
         9706, 16404,  8996,   488,  6266, 15046,  7379, 13757,  5528,  8996,
         5817,  2089, 12534,  5744,  8996,  7458,  5413, 13182,  8996,  7587,
        10444,  1154, 12353,  8504,  3759,  1578, 13447, 12750,  2307, 12534,
         2732,  2112,  2167, 11622,  4909, 13782,  7047, 13400,  5316, 15576,
         8996,  5817,  3262, 16404,  1149,   115,  4281, 15119, 11420, 11221,
         1938, 10875,  5320,  2167, 13162,  7047,  8996, 14869, 16404,  5817,
         3809,  4792,  4950,  8996,   443,  1149, 13755,  3289, 11717, 12750,
         1254,  4281,  7379,  1477, 11655, 16404, 15119, 13551, 14436, 11971,
        15576,  3809, 14059,  2167,  7718,  8996, 14869, 

# Model

In [None]:
# Read the saved checkpoint back from disk
checkpoint = torch.load("model.pt")

# List every top-level entry of the checkpoint
print("Model checkpoint:")
for key in checkpoint:
    value = checkpoint[key]
    print(f"Key: {key}")
    if not isinstance(value, dict):
        # Not a dictionary: show the stored value as is
        print(f"Value: {value}")
        continue
    # Dictionaries (the encoder and decoder state_dicts) are summarised by their keys
    print(f"Value keys: {value.keys()}")

Model checkpoint:
Key: encoder
Value keys: odict_keys(['embedding.weight', 'self_attention.in_proj_weight', 'self_attention.in_proj_bias', 'self_attention.out_proj.weight', 'self_attention.out_proj.bias', 'feed_forward.0.weight', 'feed_forward.0.bias', 'feed_forward.2.weight', 'feed_forward.2.bias', 'norm1.weight', 'norm1.bias', 'norm2.weight', 'norm2.bias'])
Key: decoder
Value keys: odict_keys(['embedding.weight', 'self_attention.in_proj_weight', 'self_attention.in_proj_bias', 'self_attention.out_proj.weight', 'self_attention.out_proj.bias', 'feed_forward.0.weight', 'feed_forward.0.bias', 'feed_forward.2.weight', 'feed_forward.2.bias', 'norm1.weight', 'norm1.bias', 'norm2.weight', 'norm2.bias', 'norm3.weight', 'norm3.bias', 'out.weight', 'out.bias'])


Reference  Original Transformer

5.1    Training Data and Batching. We trained on the standard WMT 2014 English-German dataset consisting of about **4.5 million sentence pairs.** Sentences were encoded using byte-pair encoding [3], which has a shared **source-target vocabulary of about 37000 tokens**. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a **32000 word-piece vocabulary** [38]. Sentence pairs were batched together by approximate sequence length. **Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens.**

Comparison

Your Dataset:

You have 15,000 words, and you are constructing sequences of 512 tokens for both input and target.
This results in 14,488 sequence pairs, where each sequence pair consists of an input sequence of 512 tokens and a target sequence of 512 tokens.
Therefore, the total number of tokens in a single sequence pair is 1,024 (512 + 512).
Multiplying the total number of sequence pairs (14,488) by the number of tokens per sequence pair (1,024) gives you the total number of tokens in your entire dataset: 14,835,712.

The original text describes two different training scenarios, one for English-German and another for English-French translation. Here's an analysis and comparison between the described scenarios and your current setup:

### English-German Training:
- **Dataset Size:** About 4.5 million sentence pairs.
- **Vocabulary Size:** About 37,000 tokens, encoded using byte-pair encoding.
- **Model Configuration:** Not mentioned explicitly, but the shared vocabulary size gives us some insight into the model's capabilities.

### English-French Training:
- **Dataset Size:** 36 million sentences.
- **Vocabulary Size:** Split into a 32,000 word-piece vocabulary.
- **Model Configuration:** Not explicitly stated.
- **Batching:** Sentences batched by approximate length, with each batch containing approximately 25,000 source tokens and 25,000 target tokens (50,000 tokens total per batch).

### Your Scenario:
- **Dataset Size:** You've scraped around 15,000 words, with input sequences of 512 tokens, resulting in a dataset size of around 14,488 sequence pairs (15,000 - 512; the windows advance one token at a time, so consecutive sequences overlap).
- **Vocabulary Size:** Depending on the tokenization method used, the vocabulary size may vary.
- **Model Configuration:** `d_model` is set to 512.
- **Batching:** If you decide to match the batch size described for English-French (50,000 tokens per batch), you would have approximately 296 batches in your dataset.

### Comparison:
Your dataset is much smaller than either of the datasets described in the original text. The volume of data in terms of tokens and sentence pairs is significantly lower in your case.

The described English-French training scenario is the one that mentions specific details about batching and token counts. Compared to that, your dataset's structure is different, and your total number of tokens (around 14.8 million) would be divided into different batch sizes, depending on how you decide to structure the training.

In summary, while your setup shares some similarities in terms of handling tokens and sequence lengths, the scale and complexity of the datasets and possibly the model configurations are quite different. If your aim is to replicate or approximate the described training scenarios, you would likely need to consider adjustments in terms of the dataset size, vocabulary, and model architecture.

5.2    Hardware and Schedule. We trained our models on **one machine with 8 NVIDIA P100 GPUs.** For our base models using the hyperparameters described throughout the paper, each training step took about 0.4 seconds. We trained the base models for a total of **100,000 steps** or 12 hours. For our big models (described on the bottom line of table 3), step time was 1.0 seconds. The big models were trained for 300,000 steps (3.5 days).

5.3    Optimizer. We used the **Adam optimizer** [20] with $\beta_1 = 0.9$, $\beta_2 = 0.98$ and $\epsilon = 10^{-9}$. We varied the learning rate over the course of training, according to the formula: $$lrate = d_{\text{model}}^{-0.5} \cdot \min\left(step\_num^{-0.5},\ step\_num \cdot warmup\_steps^{-1.5}\right) \qquad (3)$$ This corresponds to increasing the learning rate linearly for the first $warmup\_steps$ training steps, and decreasing it thereafter proportionally to the inverse square root of the step number. We used $warmup\_steps = 4000$.

5.4    Regularization. We employ **three types of regularization** during training: Residual **Dropout**: We apply dropout [33] to the output of each sub-layer, before it is added to the sub-layer input and normalized. In addition, we apply dropout to the sums of the embeddings and the positional encodings in both the encoder and decoder stacks. For the base model, we use a rate of $P_{drop} = 0.1$.