In [1]:
from datasets import load_dataset, load_from_disk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from minbpe import BasicTokenizer

import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.utils.data import  Dataset, DataLoader

from evaluate import load

## Read Data

In [None]:
# Fetch the CNN/DailyMail dataset (configuration "3.0.0") through the `datasets` library.
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Optional: uncomment to keep a local copy that can be reloaded with load_from_disk.
# ds.save_to_disk("data/cnn_dailymail_dataset")

ds

In [2]:
# Alternative to the download cell: reload the dataset from the copy previously saved to disk.
ds = load_from_disk("data/cnn_dailymail_dataset")
ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [3]:
# Keep only the two text columns of every split; the 'id' column is dropped.
train_data = ds['train'].select_columns(["article", "highlights"])
val_data = ds['validation'].select_columns(["article", "highlights"])
test_data = ds['test'].select_columns(["article", "highlights"])
train_data, val_data, test_data

(Dataset({
     features: ['article', 'highlights'],
     num_rows: 287113
 }),
 Dataset({
     features: ['article', 'highlights'],
     num_rows: 13368
 }),
 Dataset({
     features: ['article', 'highlights'],
     num_rows: 11490
 }))

## Tokenization

### Tokenizer

In [None]:
# Tokenizer training corpus: the first 1,000 training articles joined with single spaces.
# The displayed value is the corpus length in characters.
all_articles_text = " ".join(train_data["article"][:1000])
len(all_articles_text)

3530528

In [None]:
# Train a fresh BasicTokenizer on the corpus with a target vocabulary size of 1024.
tokenizer = BasicTokenizer()
tokenizer.train(all_articles_text, vocab_size=1024)

In [None]:
# Encode the first training article and peek at its first five token ids.
encoded_article_0 = tokenizer.encode(train_data['article'][0])
encoded_article_0[:5]

[76, 79, 78, 68, 79]

In [26]:
# Decode the ids again and check that the round trip reproduces the original article text.
decoded_article_0 = tokenizer.decode(encoded_article_0)
decoded_article_0, decoded_article_0 == train_data['article'][0]

('LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details 

In [None]:
# Reserve four special-token ids immediately after the largest id in the tokenizer vocabulary.
# <unk> is not expected to be needed for this corpus, but its id is still reserved so that
# <pad> keeps the id (max_vocab_id + 4) that the rest of the notebook relies on.
# max() is used instead of "last key of the dict" so the result does not depend on insertion order.
max_vocab_id = max(tokenizer.vocab)
tokenizer.special_tokens = {
    "<sos>": max_vocab_id + 1,
    "<eos>": max_vocab_id + 2,
    "<unk>": max_vocab_id + 3,
    "<pad>": max_vocab_id + 4,
}

In [None]:
# Round-trip a string that contains the special-token markers through encode/decode.
# NOTE(review): this equality would also hold if encode() treated "<sos>", "<eos>" and "<pad>"
# as ordinary text instead of mapping them to their special ids — confirm which one happens.
encoded_special_0 = tokenizer.encode(f"<sos> {train_data['article'][0]} <eos> <pad> <pad>")
decoded_special_0 = tokenizer.decode(encoded_special_0)
decoded_special_0, decoded_special_0 == f"<sos> {train_data['article'][0]} <eos> <pad> <pad>"

('<sos> LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. De

In [30]:
# Persist the trained tokenizer under the "model/model_article_1000" prefix;
# the load cell reads it back from "model/model_article_1000.model".
tokenizer.save("model/model_article_1000")

In [4]:
# Restore the previously saved tokenizer from disk instead of retraining it.
tokenizer = BasicTokenizer()
tokenizer.load("model/model_article_1000.model")

### Tokenize Data

In [None]:
def tokenize_fields(example):
    """Replace the 'article' and 'highlights' text of one example with token ids.

    The example mapping is updated in place (using the notebook-level `tokenizer`)
    and the same object is returned, which is the shape `Dataset.map` expects.
    """
    for field in ("article", "highlights"):
        example[field] = tokenizer.encode(example[field])
    return example

# Tokenize a subset of each split: the first 10,000 training examples and the first
# 2,500 examples of the validation and test splits.
train_tokenized = train_data.select(range(10_000)).map(tokenize_fields)
val_tokenized = val_data.select(range(2500)).map(tokenize_fields)
test_tokenized = test_data.select(range(2500)).map(tokenize_fields)

In [None]:
# Cache the tokenized subsets on disk so tokenization does not have to be repeated.
train_tokenized.save_to_disk("data/cnn_train_tokenized_10k")
val_tokenized.save_to_disk("data/cnn_val_tokenized_2500")
test_tokenized.save_to_disk("data/cnn_test_tokenized_2500")

In [5]:
# Alternative to the tokenization cells: reload the tokenized subsets previously saved to disk.
train_tokenized = load_from_disk("data/cnn_train_tokenized_10k")
val_tokenized = load_from_disk("data/cnn_val_tokenized_2500")
test_tokenized = load_from_disk("data/cnn_test_tokenized_2500")

## Vectorizer

In [6]:
class Vectorizer:
  """Turns token-id sequences into fixed-length index lists.

  Every sequence is framed by the tokenizer's <sos> and <eos> ids and padded
  on the right with the <pad> id.
  """

  def __init__(self, tokenizer: "BasicTokenizer"):
    """Cache the tokenizer, its vocabulary size and the special-token ids.

    The annotation is quoted so that defining the class does not require the
    tokenizer type to be importable.
    """
    self.tokenizer = tokenizer
    self.vocab_size = len(tokenizer.vocab)
    self.sos_idx = tokenizer.special_tokens["<sos>"]
    self.eos_idx = tokenizer.special_tokens["<eos>"]
    self.pad_idx = tokenizer.special_tokens["<pad>"]

  def index_vectorize(self, tokens, max_length=1024):
    """Return a list of exactly `max_length` ids: <sos>, tokens, <eos>, then <pad> filler.

    Args:
      tokens: sliceable sequence of token ids (list, tuple, ...); it is not modified.
      max_length: total output length including <sos> and <eos>.

    Returns:
      A new list of length `max_length`. Inputs longer than `max_length - 2`
      are truncated, and the truncated sequence still ends with <eos>.

    Raises:
      ValueError: if `max_length` is smaller than 2, because <sos> and <eos>
        alone already need two positions.
    """
    if max_length < 2:
      raise ValueError(f"max_length must be at least 2 to hold <sos> and <eos>, got {max_length}")

    # list(...) makes the concatenation work for any sliceable sequence, not just lists.
    body = list(tokens[:max_length - 2])
    indices = [self.sos_idx, *body, self.eos_idx]
    indices.extend([self.pad_idx] * (max_length - len(indices)))
    return indices

In [7]:
# One vectorizer instance, built from the tokenizer, is shared by articles and highlights.
article_vectorizer = Vectorizer(tokenizer)

In [61]:
# Sanity check: vectorize the encoded first article (encoded_article_0 comes from the
# tokenizer section) and decode it to see the <sos> framing.
tokenizer.decode(article_vectorizer.index_vectorize(encoded_article_0))

'<sos>LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Deta

## Dataset

In [8]:
class IndexArticleDataset(Dataset):
  """Map-style dataset that pairs tokenized inputs with tokenized targets.

  Each item is a dict with fixed-length id tensors: 'x' (length
  `max_input_length`) and 'y' (length `max_target_length`), both produced by
  the vectorizer's `index_vectorize`.
  """

  def __init__(self, input, target, vectorizer: "Vectorizer", max_input_length=1024, max_target_length=128):
    """Store the parallel sequences and the vectorizer.

    Args:
      input: indexable collection of token-id sequences (the model inputs).
      target: indexable collection of token-id sequences, parallel to `input`.
      vectorizer: object providing `index_vectorize` and the sos/eos/pad ids.
      max_input_length: fixed length of every 'x' tensor.
      max_target_length: fixed length of every 'y' tensor.

    Raises:
      ValueError: if `input` and `target` do not have the same number of items.
    """
    # Fail early: a mismatch would otherwise surface as an IndexError mid-epoch
    # or silently ignore the surplus targets.
    if len(input) != len(target):
      raise ValueError(f"input and target must have the same length, got {len(input)} and {len(target)}")

    self.input = input
    self.target = target
    self.vectorizer = vectorizer

    self.max_input_length = max_input_length
    self.max_target_length = max_target_length

    self.sos_index = vectorizer.sos_idx
    self.eos_index = vectorizer.eos_idx
    self.pad_index = vectorizer.pad_idx

  def __len__(self):
    """Number of (input, target) pairs."""
    return len(self.input)

  def __getitem__(self, index):
    """Return {'x': input ids, 'y': target ids} for one example as integer tensors."""
    x_ids = self.vectorizer.index_vectorize(self.input[index], self.max_input_length)
    y_ids = self.vectorizer.index_vectorize(self.target[index], self.max_target_length)
    # The dtype is spelled out because the ids are used as embedding indices.
    return {'x': torch.as_tensor(x_ids, dtype=torch.long),
            'y': torch.as_tensor(y_ids, dtype=torch.long)}

  def get_vectorizer(self):
    """Return the vectorizer used by this dataset."""
    return self.vectorizer

  def get_num_batches(self, batch_size):
    """Number of full batches of `batch_size` (a trailing partial batch is not counted)."""
    return len(self) // batch_size

In [9]:
# Build one dataset per split from the tokenized columns. All three share the same
# vectorizer and lengths: inputs padded/truncated to 512 ids, targets to 128 ids.
train_dataset = IndexArticleDataset(train_tokenized['article'], 
                                    train_tokenized['highlights'], 
                                    article_vectorizer,
                                    max_input_length=512,
                                    max_target_length=128)

val_dataset = IndexArticleDataset(val_tokenized['article'], 
                                  val_tokenized['highlights'], 
                                  article_vectorizer,
                                  max_input_length=512,
                                  max_target_length=128)

test_dataset = IndexArticleDataset(test_tokenized['article'], 
                                   test_tokenized['highlights'], 
                                   article_vectorizer,
                                   max_input_length=512,
                                   max_target_length=128)

In [85]:
# Fetch the first training example and confirm the fixed lengths of 'x' and 'y'.
out_dict = train_dataset.__getitem__(0)
len(out_dict['x']), len(out_dict['y'])

(512, 128)

In [87]:
# Decode both tensors of that example back to text to eyeball the framing.
tokenizer.decode(out_dict['x'].numpy()), tokenizer.decode(out_dict['y'].numpy())

('<sos>LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Det

## Dataloader

In [10]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
  """Yield batches from `dataset` with every tensor moved to `device`.

  A DataLoader is created with the given batch_size / shuffle / drop_last
  settings. Each batch it produces is a dict, and a new dict with the same
  keys is yielded whose values have been transferred with `.to(device)`.
  """
  loader = DataLoader(dataset=dataset,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      drop_last=drop_last)

  for batch in loader:
    yield {field: values.to(device) for field, values in batch.items()}

In [91]:
# Pull a single batch (32 examples) from generate_batches, print its shapes and contents,
# then stop after that first batch.
for out_dict in generate_batches(dataset=train_dataset, batch_size=32):
  x_data = out_dict['x']
  y_target = out_dict['y']
  print(f"x shape: {x_data.shape}, y shape: {y_target.shape}")
  print(x_data)
  print(y_target)
  break

x shape: torch.Size([32, 512]), y shape: torch.Size([32, 128])
tensor([[1024,   40,   69,  ...,   78,  900, 1025],
        [1024,   40,   84,  ...,  401,  276, 1025],
        [1024,   78,   69,  ..., 1027, 1027, 1027],
        ...,
        [1024,   76,   79,  ...,  735,  373, 1025],
        [1024,  702,   65,  ...,  456,  412, 1025],
        [1024,  651,   32,  ...,  926,  549, 1025]])
tensor([[1024,   69,  115,  ..., 1027, 1027, 1027],
        [1024,   65,   99,  ..., 1027, 1027, 1027],
        [1024,   80,  304,  ...,  331,   67, 1025],
        ...,
        [1024,   82,  265,  ..., 1027, 1027, 1027],
        [1024,   78,   69,  ..., 1027, 1027, 1027],
        [1024,   75,  612,  ...,  334,  473, 1025]])


## Model & Training

In [11]:
# Fixed seed for reproducibility; applied to NumPy and to PyTorch (including all CUDA devices).
SEED = 42

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [None]:
class Head(nn.Module):
  """A single scaled dot-product self-attention head.

  Projects the input to keys, queries and values of width `head_dim` (no
  bias). When `is_masked` is true a lower-triangular mask of size
  `max_input_length` is registered as a buffer and applied so that a position
  only attends to itself and earlier positions.
  """

  def __init__(self, embed_dim, head_dim, max_input_length, dropout=0.5, is_masked=True):
    super().__init__()
    self.key = nn.Linear(embed_dim, head_dim, bias=False)
    self.query = nn.Linear(embed_dim, head_dim, bias=False)
    self.value = nn.Linear(embed_dim, head_dim, bias=False)
    self.dropout = nn.Dropout(dropout)
    self.is_masked = is_masked
    if is_masked:
      # Ones on and below the diagonal mark the positions a query may look at.
      causal = torch.ones(max_input_length, max_input_length).tril()
      self.register_buffer('mask', causal)

  def forward(self, x):
    """Apply self-attention to `x`, unpacked as (batch, time_steps, embed_dim)."""
    _, seq_len, _ = x.shape

    queries = self.query(x)
    keys = self.key(x)
    values = self.value(x)

    # Dot-product scores scaled by sqrt(head_dim).
    scores = (queries @ keys.transpose(-2, -1)) / (np.sqrt(keys.shape[-1]))
    if self.is_masked:
      blocked = self.mask[:seq_len, :seq_len] == 0
      scores = scores.masked_fill(blocked, float('-inf'))

    # Dropout is applied to the attention weights themselves.
    weights = self.dropout(torch.softmax(scores, dim=-1))
    return weights @ values
  
class MultiHeadAttention(nn.Module):
  """Runs `num_heads` attention heads side by side and mixes their outputs.

  Each head has width `embed_dim // num_heads`. The head outputs are
  concatenated on the last dimension, projected back to `embed_dim` by a
  linear layer and passed through dropout.
  """

  def __init__(self, num_heads, embed_dim, max_input_length, dropout=0.5, is_masked=True):
    super().__init__()
    per_head_dim = embed_dim // num_heads
    head_modules = [
      Head(embed_dim, per_head_dim, max_input_length, dropout, is_masked)
      for _ in range(num_heads)
    ]
    self.heads = nn.ModuleList(head_modules)
    self.linear = nn.Linear(per_head_dim * num_heads, embed_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Return the projected, dropped-out concatenation of all head outputs."""
    per_head_out = [head(x) for head in self.heads]
    merged = torch.cat(per_head_out, dim=-1)
    return self.dropout(self.linear(merged))
  
class FeedFoward(nn.Module):
  """Position-wise feed-forward block: Linear -> ReLU -> Linear -> Dropout.

  The hidden layer is four times as wide as `embed_dim`
  (for example embed_dim=512 gives a hidden width of 2048).
  """

  def __init__(self, embed_dim, dropout=0.5):
    super().__init__()
    hidden_dim = 4 * embed_dim
    self.layers = nn.Sequential(
      nn.Linear(embed_dim, hidden_dim),
      nn.ReLU(),
      nn.Linear(hidden_dim, embed_dim),
      nn.Dropout(dropout),
    )

  def forward(self, x):
    """Apply the feed-forward stack to `x`."""
    out = self.layers(x)
    return out
  
class SubLayer(nn.Module):
  """Wraps a layer with a residual connection followed by layer normalization."""

  def __init__(self, layer: nn.Module, embed_dim):
    super().__init__()
    self.Layer = layer
    self.ln = nn.LayerNorm(embed_dim)

  def forward(self, x):
    """Return LayerNorm(x + layer(x)); the normalization comes after the residual sum."""
    residual_sum = x + self.Layer(x)
    return self.ln(residual_sum)
  
class Encoder(nn.Module):
  """One encoder block: unmasked multi-head self-attention, then a feed-forward layer.

  Both parts are wrapped in SubLayer, so each is followed by a residual
  connection and layer normalization.
  """

  def __init__(self, num_heads, embed_dim, max_input_length, dropout=0.5):
    super().__init__()
    attention = MultiHeadAttention(num_heads, embed_dim, max_input_length, dropout, is_masked=False)
    self.multi_head_attention = SubLayer(attention, embed_dim)
    self.feed_forward = SubLayer(FeedFoward(embed_dim, dropout), embed_dim)

  def forward(self, x):
    """Run `x` through the attention sub-layer and then the feed-forward sub-layer."""
    return self.feed_forward(self.multi_head_attention(x))

In [14]:
# Hyperparameters for the smoke test below; the two lengths match the values used
# when the datasets were built.
max_input_length = 512
max_target_length = 128
batch_size = 32

# NOTE(review): the embedding needs a row for every id that can appear in a batch, including
# the special-token ids (1024-1027 in the batch printed earlier) — confirm that
# len(tokenizer.vocab) counts them.
vocab_size = len(tokenizer.vocab)
num_heads = 8
embed_dim = 512
dropout = 0.5

# Smoke test: embed one training batch and pass it through a single Encoder block.
# The model and embedding are not moved to `device`, and generate_batches defaults to "cpu".
model = Encoder(num_heads, embed_dim, max_input_length, dropout)
embed = nn.Embedding(vocab_size, embed_dim, padding_idx=article_vectorizer.pad_idx)
for out_dict in generate_batches(dataset=train_dataset, batch_size=batch_size):
  output = model(embed(out_dict['x']))
  break  # only the first batch is needed
output.shape, output

(torch.Size([32, 512, 512]),
 tensor([[[ 0.6405, -0.1673,  1.5610,  ..., -1.5898,  2.7004,  1.2969],
          [ 0.8247, -2.4709, -1.2504,  ..., -0.8810, -0.5874, -1.6542],
          [ 2.9421,  0.5028, -0.1382,  ..., -0.0732, -1.9269, -0.1504],
          ...,
          [-4.5145,  2.2122,  1.4346,  ...,  1.3832,  3.5295, -3.6478],
          [-0.6273,  0.3386,  0.1103,  ..., -1.1479,  1.1510, -0.9009],
          [-3.4660,  0.1393,  3.3567,  ...,  1.1316,  3.6283,  0.0147]],
 
         [[-3.3873, -0.4156,  0.4918,  ...,  0.0259,  1.5445,  3.3582],
          [ 0.1686,  4.2974,  1.2911,  ..., -2.2388,  0.6872, -0.2633],
          [-1.6014, -0.5566,  3.7714,  ...,  0.8337, -0.1632, -1.3584],
          ...,
          [ 0.5006, -0.2951, -2.3882,  ...,  1.0986, -0.7568,  1.6341],
          [-0.8417, -2.5384,  0.3607,  ...,  1.3841, -0.2138,  3.9314],
          [-1.1533, -0.3058,  2.7287,  ...,  1.2510,  3.5344,  0.7337]],
 
         [[ 2.1097,  1.4932,  0.4418,  ..., -2.0781,  3.0396,  0.5167],

In [71]:
# Smoke test of the feed-forward sub-layer alone on one embedded batch.
# Note: this rebinds `model` and `embed`, replacing the Encoder objects from the previous cell.
model = SubLayer(FeedFoward(embed_dim, dropout), embed_dim)
embed = nn.Embedding(vocab_size, embed_dim, padding_idx=article_vectorizer.pad_idx)
for out_dict in generate_batches(dataset=train_dataset, batch_size=32):
  output = model(embed(out_dict['x']))
  break  # only the first batch is needed
output.shape, output

(torch.Size([32, 512, 512]),
 tensor([[[-3.2848e-03, -6.2605e-01,  3.2050e+00,  ...,  2.8122e+00,
            8.4817e-01,  4.2711e-01],
          [ 1.5457e+00,  1.5078e+00, -5.5269e-01,  ...,  1.7982e+00,
            1.1940e+00,  5.9078e-01],
          [ 1.2618e+00, -4.9616e-01, -3.4182e+00,  ..., -7.2561e-01,
            2.7547e+00, -1.4425e+00],
          ...,
          [-1.7603e+00, -1.8944e+00,  2.3924e+00,  ..., -1.3581e-01,
            8.2968e-01, -2.5232e+00],
          [-5.3222e-01, -7.9528e-01,  5.1655e-01,  ...,  1.3098e+00,
           -1.6992e+00,  1.6046e+00],
          [ 1.6983e+00, -1.1008e+00,  2.2834e-01,  ...,  6.8362e-01,
           -7.9560e-01, -2.7744e+00]],
 
         [[ 4.6308e-02, -5.7645e-01,  1.6314e+00,  ...,  1.0671e+00,
           -6.3364e-02, -1.9410e+00],
          [ 2.4141e+00,  1.4357e+00,  2.4945e+00,  ...,  5.2852e-01,
           -8.9884e-02,  1.8258e-01],
          [-1.7752e-01,  2.3532e-01, -7.8018e-01,  ..., -6.5597e-02,
           -2.2782e-01,  1.0