In [2]:
from datasets import load_dataset, load_from_disk

import numpy as np

from minbpe import BasicTokenizer

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import  Dataset, DataLoader

from Transformer import Transformer_Encoder_Decoder

## Read Data

In [3]:
# ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Save to disk
# ds.save_to_disk("data/cnn_dailymail_dataset")

# If you already save to disk
ds = load_from_disk("data/cnn_dailymail_dataset")

ds

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
train_data = ds['train'].select_columns(["article", "highlights"])
val_data = ds['validation'].select_columns(["article", "highlights"])
test_data = ds['test'].select_columns(["article", "highlights"])
train_data, val_data, test_data

(Dataset({
     features: ['article', 'highlights'],
     num_rows: 287113
 }),
 Dataset({
     features: ['article', 'highlights'],
     num_rows: 13368
 }),
 Dataset({
     features: ['article', 'highlights'],
     num_rows: 11490
 }))

## Tokenizer Training

In [5]:
# The data used for training
all_articles_text = " ".join(train_data["article"][:1000])
len(all_articles_text)

3530528

In [None]:
tokenizer = BasicTokenizer()
tokenizer.train(all_articles_text, vocab_size=1024)

In [None]:
# Add special tokens, we do not need <unk> here because data is in english and fit in ASCII
max_vocab_id = list(tokenizer.vocab.keys())[-1]
tokenizer.special_tokens = {
    "<sos>": max_vocab_id + 1,
    "<eos>": max_vocab_id + 2,
    "<unk>": max_vocab_id + 3,
    "<pad>": max_vocab_id + 4,
}

# Save to disk
tokenizer.save("model/model_article_1000")

## Tokenizer

In [None]:
tokenizer = BasicTokenizer()

# Load from disk
tokenizer.load("model/model_article_1000.model")

## Tokenize the data

In [None]:
def tokenize_fields(example):
    example["article"] = tokenizer.encode(example["article"])
    example["highlights"] = tokenizer.encode(example["highlights"])
    return example

# Tokenize each split
train_tokenized = train_data.select(range(10_000)).map(tokenize_fields)
val_tokenized = val_data.select(range(2500)).map(tokenize_fields)
test_tokenized = test_data.select(range(2500)).map(tokenize_fields)

# Save to disk
train_tokenized.save_to_disk("data/cnn_train_tokenized_10k")
val_tokenized.save_to_disk("data/cnn_val_tokenized_2500")
test_tokenized.save_to_disk("data/cnn_test_tokenized_2500")

In [None]:
# Load from disk
train_tokenized = load_from_disk("data/cnn_train_tokenized_10k")
val_tokenized = load_from_disk("data/cnn_val_tokenized_2500")
test_tokenized = load_from_disk("data/cnn_test_tokenized_2500")

## Vectorizer

In [4]:
class Vectorizer:
  def __init__(self, tokenizer: BasicTokenizer):
    self.tokenizer = tokenizer
    self.vocab_size = len(tokenizer.vocab)
    self.sos_idx = tokenizer.special_tokens["<sos>"]
    self.eos_idx = tokenizer.special_tokens["<eos>"]
    self.pad_idx = tokenizer.special_tokens["<pad>"]

  def index_vectorize(self, tokens, max_length=1024):
    indices = tokens[:max_length - 2]
    indices = [self.sos_idx] + indices + [self.eos_idx]
    indices += [self.pad_idx] * (max_length - len(indices))
    return indices
  
article_vectorizer = Vectorizer(tokenizer)

## Dataset

In [5]:
class IndexArticleDataset(Dataset):
  def __init__(self, input, target, vectorizer: Vectorizer, max_input_length=1024, max_target_length=128):
    self.input = input
    self.target = target
    self.vectorizer = vectorizer
    
    self.max_input_length = max_input_length
    self.max_target_length = max_target_length
    
    self.sos_index = vectorizer.sos_idx
    self.eos_index = vectorizer.eos_idx
    self.pad_index = vectorizer.pad_idx

    # Precompute indexed and padded input/target sequences
    self.indexed_input = [
      torch.as_tensor(
        vectorizer.index_vectorize(example, max_length=max_input_length), dtype=torch.long
      )
      for example in input
    ]
    self.indexed_target = [
      torch.as_tensor(
        vectorizer.index_vectorize(example, max_length=max_target_length), dtype=torch.long
      )
      for example in target
    ]

  def __len__(self):
    return len(self.input)

  def __getitem__(self, index):
    return {
            'x': self.indexed_input[index],
            'y': self.indexed_target[index]
        }
    # return {'x': torch.as_tensor(self.vectorizer.index_vectorize(self.input[index], self.max_input_length)),
    #         'y': torch.as_tensor(self.vectorizer.index_vectorize(self.target[index], self.max_target_length))}

  def get_vectorizer(self):
    return self.vectorizer 
  
  def get_num_batches(self, batch_size):
    return len(self) // batch_size 

In [6]:
train_dataset = IndexArticleDataset(train_tokenized['article'], 
                                    train_tokenized['highlights'], 
                                    article_vectorizer,
                                    max_input_length=512,
                                    max_target_length=128)

val_dataset = IndexArticleDataset(val_tokenized['article'], 
                                  val_tokenized['highlights'], 
                                  article_vectorizer,
                                  max_input_length=512,
                                  max_target_length=128)

test_dataset = IndexArticleDataset(test_tokenized['article'], 
                                   test_tokenized['highlights'], 
                                   article_vectorizer,
                                   max_input_length=512,
                                   max_target_length=128)

## Dataloader

In [7]:
def generate_batches(dataset, batch_size, 
                     shuffle=True,
                     drop_last=True, 
                     device="cpu"):
  
  dataloader = DataLoader(dataset=dataset, 
                          batch_size=batch_size,
                          shuffle=shuffle, 
                          drop_last=drop_last)

  for data_dict in dataloader:
    out_data_dict = {}
    for name, tensor in data_dict.items():
      out_data_dict[name] = data_dict[name].to(device)
    yield out_data_dict

## Env

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

max_input_length = 512
max_target_length = 128

vocab_size = len(tokenizer.vocab)
pad_idx = tokenizer.special_tokens['<pad>']

num_layers = 6
num_heads = 8
embed_dim = 512
input_dropout = 0.1

lr = 0.001
epochs = 10
eval_interval = 1
batch_size = 32
batch_number_train = len(train_dataset) // batch_size
batch_number_val = len(val_dataset) // batch_size

## Training

In [None]:
SEED = 42

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

model = Transformer_Encoder_Decoder(vocab_size, num_layers, num_heads, embed_dim, max_input_length, max_target_length, pad_idx, input_dropout).to(device)
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.AdamW(model.parameters(), lr=lr)

for epoch in range(epochs):
  model.train()

  train_loss = 0

  for out_dict in generate_batches(dataset=train_dataset, batch_size=batch_size, device=device):
    x = out_dict['x']
    y = out_dict['y']

    y_logits = model(x, y[:, :-1])
    
    B, T, C = y_logits.shape
    loss = loss_fn(y_logits.reshape(B * T, C), y[:, 1:].reshape(B * T))
    # print(loss)
    train_loss += loss.item()

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  train_loss /= batch_number_train

  if epoch % eval_interval == 0 or epoch == epochs-1:
    model.eval()
    val_loss = 0

    with torch.inference_mode():
      for out_dict in generate_batches(dataset=val_dataset, batch_size=batch_size, device=device):
        x = out_dict['x']
        y = out_dict['y']

        y_logits = model(x, y[:, :-1])
    
        B, T, C = y_logits.shape
        loss = loss_fn(y_logits.reshape(B * T, C), y[:, 1:].reshape(B * T))
        val_loss += loss.item()

    val_loss /= batch_number_val
    
    print(f"Epoch {epoch} | Train loss: {train_loss:.4f} | Val loss: {val_loss:.4f}")

Epoch 0 | Train loss: 6.0024 | Val loss: 6.0043
Epoch 1 | Train loss: 5.9716 | Val loss: 5.9968
Epoch 2 | Train loss: 5.9515 | Val loss: 6.9783
Epoch 3 | Train loss: 5.9316 | Val loss: 6.7167
Epoch 4 | Train loss: 5.9271 | Val loss: 6.8649
Epoch 5 | Train loss: 5.9247 | Val loss: 6.5532
Epoch 6 | Train loss: 5.9230 | Val loss: 6.9263
Epoch 7 | Train loss: 5.9216 | Val loss: 7.1859
Epoch 8 | Train loss: 5.9205 | Val loss: 6.3286
Epoch 9 | Train loss: 5.9204 | Val loss: 6.6835


In [None]:
# Save model weights and checkpoint

torch.save(model.state_dict(), 'model/model_weights.pth')

torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}, 'model/checkpoint.pth')

## Testing

In [29]:
SEED = 42

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

batch_number_test = len(test_dataset) // batch_size

model = Transformer_Encoder_Decoder(vocab_size, num_layers, num_heads, embed_dim, max_input_length, max_target_length, pad_idx, input_dropout).to(device)
model.load_state_dict(torch.load('model/model_weights.pth'))
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)

model.eval()
test_loss = 0

with torch.inference_mode():
  for out_dict in generate_batches(dataset=test_dataset, batch_size=batch_size, device=device):
    x = out_dict['x']
    y = out_dict['y']

    y_logits = model(x, y[:, :-1])
    
    B, T, C = y_logits.shape
    loss = loss_fn(y_logits.reshape(B * T, C), y[:, 1:].reshape(B * T))
    test_loss += loss.item()

test_loss /= batch_number_test

print(f"Test loss: {test_loss:.4f}")

Test loss: 6.6941


## Generate Summary

In [26]:
new_model = Transformer_Encoder_Decoder(vocab_size, num_layers, num_heads, embed_dim, max_input_length, max_target_length, pad_idx, input_dropout).to(device)
new_model.load_state_dict(torch.load('model/model_weights.pth'))

<All keys matched successfully>

In [27]:
input = article_vectorizer.index_vectorize(train_tokenized['article'][0], max_length=512)
input_tensor = torch.as_tensor(input).unsqueeze(0).to(device='cuda')
output_tensor = new_model.generate(input_tensor, tokenizer.special_tokens['<sos>'], tokenizer.special_tokens['<eos>'])

print("Input : ", tokenizer.decode(input_tensor.squeeze().cpu().numpy()))
print("Output: ", tokenizer.decode(output_tensor.squeeze().cpu().numpy()))

Input :  <sos>LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. 

In [28]:
input = article_vectorizer.index_vectorize(test_tokenized['article'][0], max_length=512)
input_tensor = torch.as_tensor(input).unsqueeze(0).to(device='cuda')
output_tensor = new_model.generate(input_tensor, tokenizer.special_tokens['<sos>'], tokenizer.special_tokens['<eos>'])

print("Input : ", tokenizer.decode(input_tensor.squeeze().cpu().numpy()))
print("Output: ", tokenizer.decode(output_tensor.squeeze().cpu().numpy()))

Input :  <sos>(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday'