In [2]:
import sys
import os
import warnings

# Put the project folder at the front of the module search path so the local
# modules imported further down can be found. The membership test keeps the
# entry unique when this cell is executed more than once.
if sys.path.count('GRU_vs_Transformer_amazon_polarity/') == 0:
  sys.path.insert(0, 'GRU_vs_Transformer_amazon_polarity/')

# Only warn (do not stop the notebook) when PyTorch cannot see a CUDA device.
import torch
if not torch.cuda.is_available():
  warnings.warn('CUDA is not available.')



In [4]:
%matplotlib inline
import urllib.request
import time
import json

import numpy as np
import matplotlib.pyplot as plt

from typing import List, Dict, Union, Optional, Tuple
import datasets
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader


In [3]:
import transformers
from transformers import AutoModel
from datasets import load_dataset
from tokenizers import Tokenizer

from GRU_encoder_decoder import EncoderDecoder
from transformer import Transformer

In [5]:
# Load the "amazon_polarity" dataset through the `datasets` library, caching the
# files under assignment/data. The training split is requested in full; the test
# split is requested with the slice spec "test[:1000]".
dataset_train = datasets.load_dataset("amazon_polarity", split="train", cache_dir="assignment/data")
dataset_test = datasets.load_dataset("amazon_polarity", split="test[:1000]", cache_dir="assignment/data")

Found cached dataset amazon_polarity (/Users/hamzaabdelhedi/Projects/ml_projects/GRU_vs_Transformer_amazon_polarity/assignment/data/amazon_polarity/amazon_polarity/3.0.0/a27b32b7e7b88eb274a8fa8ba0f654f1fe998a87c22547557317793b5d2772dc)
Found cached dataset amazon_polarity (/Users/hamzaabdelhedi/Projects/ml_projects/GRU_vs_Transformer_amazon_polarity/assignment/data/amazon_polarity/amazon_polarity/3.0.0/a27b32b7e7b88eb274a8fa8ba0f654f1fe998a87c22547557317793b5d2772dc)


In [6]:
# Let's have a quick look at a few samples in our test set.
n_samples_to_see = 3
for i in range(n_samples_to_see):
  # Fetch the example once, then print each of its fields on its own line.
  sample = dataset_test[i]
  print("-" * 30)
  for field in ("title", "content", "label"):
    print(f"{field}:", sample[field])

------------------------------
title: Great CD
content: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"
label: 1
------------------------------
title: One of the best game music soundtracks - for a game I didn't really play
content: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. 

### 1️⃣ Tokenize the `text`
Tokenize the `text` portion of each sample (i.e. parse the text into smaller chunks). Tokenization can happen in many ways; traditionally, it was done by splitting on white space. With transformer-based models, tokenization is performed based on the frequency of occurrence of "chunks of text". This frequency can be learned in many different ways; however, the most common one is the [**wordpiece**](https://arxiv.org/pdf/1609.08144v2.pdf) model. 
> The wordpiece model is generated using a data-driven approach to maximize the language-model likelihood
of the training data, given an evolving word definition. Given a training corpus and a number of desired
tokens $D$, the optimization problem is to select $D$ wordpieces such that the resulting corpus is minimal in the
number of wordpieces when segmented according to the chosen wordpiece model.

Under this model:
1. Not all things can be converted to tokens depending on the model. For example, most models have been pretrained without any knowledge of emojis. So their token will be `[UNK]`, which stands for unknown.
2. Some words will be mapped to multiple tokens!
3. Depending on the kind of model, your tokens may or may not respect capitalization

In [7]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer published under the name "bert-base-uncased";
# the following cells use it to demonstrate tokenization and encoding.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [8]:
#title 🔍 Quick look at tokenization
# The sample sentence mixes capital letters, an emoji and uncommon words so the
# output below shows lower-casing, an `[UNK]` token and `##` sub-word pieces.
input_sample = "Welcome to Baba Sanfour Github. Please leave a star on the repo. This is a tutorial on 🤗(HUGGING FACE) Library :DDD."
# Last expression of the cell: the resulting token list is shown as the cell output.
tokenizer.tokenize(input_sample)

['welcome',
 'to',
 'baba',
 'san',
 '##fo',
 '##ur',
 'gi',
 '##th',
 '##ub',
 '.',
 'please',
 'leave',
 'a',
 'star',
 'on',
 'the',
 'rep',
 '##o',
 '.',
 'this',
 'is',
 'a',
 'tutor',
 '##ial',
 'on',
 '[UNK]',
 '(',
 'hugging',
 'face',
 ')',
 'library',
 ':',
 'dd',
 '##d',
 '.']

### 2️⃣ Encoding
Once we have tokenized the text, we need to convert these chunks to numbers so we can feed them to our model. This conversion is basically a look-up in a dictionary **from `str` $\to$ `int`**. The tokenizer object can also perform this work. While doing so, it also adds the *special* tokens needed by the model to the encodings. 

In [9]:
# 🔍 Quick look at token encoding { run: "auto"}
# Encode the sample once and reuse the ids for both the raw print-out and the
# decode round trip.
encoded_sample = tokenizer.encode(input_sample)
print("--> Token Encodings:\n", encoded_sample)
print("-." * 15)
print("--> Token Encodings Decoded:\n", tokenizer.decode(encoded_sample))

--> Token Encodings:
 [101, 6160, 2000, 14208, 2624, 14876, 3126, 21025, 2705, 12083, 1012, 3531, 2681, 1037, 2732, 2006, 1996, 16360, 2080, 1012, 2023, 2003, 1037, 14924, 4818, 2006, 100, 1006, 17662, 2227, 1007, 3075, 1024, 20315, 2094, 1012, 102]
-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.
--> Token Encodings Decoded:
 [CLS] welcome to baba sanfour github. please leave a star on the repo. this is a tutorial on [UNK] ( hugging face ) library : ddd. [SEP]


### 3️⃣ Truncate/Pad samples
Since the samples in a batch will not all have the same sequence length, we need to truncate the longer sequences (i.e. the ones that exceed a predefined maximum length) and pad the shorter ones so that all the samples in the batch have equal length. Once this is achieved, we convert the result to `torch.Tensor`s and return it. These tensors will then be retrieved from the [dataloader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader).

In [10]:
from typing import List, Dict, Union
class Collate:
    """Callable batch collator: tokenizes each sample's "title" and attaches its label.

    Instances are meant to be handed to a DataLoader as `collate_fn`.
    """

    def __init__(self, tokenizer: str, max_len: int) -> None:
        """Load the named pretrained tokenizer and store the target sequence length.

        Args:
            tokenizer: name passed to `AutoTokenizer.from_pretrained`.
            max_len: length passed as `max_length` when tokenizing a batch.
        """
        self.tokenizer_name = tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        self.max_len = max_len

    def __call__(self, batch: List[Dict[str, Union[str, int]]]) -> Dict[str, torch.Tensor]:
        """Turn a list of samples into a dictionary of tensors.

        Args:
            batch: samples providing at least the keys "title" and "label".

        Returns:
            The tokenizer's output entries plus a "labels" entry holding the
            labels as a 64-bit integer tensor.
        """
        titles = [sample["title"] for sample in batch]
        encoded = self.tokenizer(
            titles,
            padding="max_length",  # pad every sequence up to max_length
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
            return_token_type_ids=False,
        )

        targets = [int(sample["label"]) for sample in batch]
        collated = dict(encoded)
        collated["labels"] = torch.tensor(targets, dtype=torch.long)
        return collated

In [11]:
#🧑‍🍳 Setting up the collate function 
# Same tokenizer name as the demo cells above; Collate loads its own copy from it.
tokenizer_name = "bert-base-uncased" 
# Every title is padded/truncated to this many tokens (Collate uses padding="max_length").
sample_max_length = 256
collate = Collate(tokenizer=tokenizer_name, max_len=sample_max_length)

In [12]:
# Seed PyTorch's random number generator so runs are repeatable.
torch.random.manual_seed(0)

class basicClassifier(nn.Module):
    """Pretrained `AutoModel` backbone followed by one linear classification layer."""

    def __init__(self, backbone: str, backbone_hidden_size: int, nb_classes: int):
        """Load the backbone and create the classification layer.

        Args:
            backbone: model name handed to `AutoModel.from_pretrained`.
            backbone_hidden_size: input width of the linear classifier.
            nb_classes: number of output classes.
        """
        super().__init__()
        self.backbone = backbone
        self.backbone_hidden_size = backbone_hidden_size
        self.nb_classes = nb_classes
        self.back_bone = AutoModel.from_pretrained(
            backbone,
            output_attentions=False,
            output_hidden_states=False,
        )
        self.classifier = torch.nn.Linear(backbone_hidden_size, nb_classes)

    def forward(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: Optional[torch.Tensor] = None
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Compute class logits and, when labels are given, the cross-entropy loss.

        Returns:
            `logits` alone when `labels` is None, otherwise `(loss, logits)`.
        """
        # First element of the backbone output, then position 0 along dim 1
        # (the slot of the leading [CLS] token added by the tokenizer).
        first_output = self.back_bone(input_ids, attention_mask=attention_mask)[0]
        cls_representation = first_output[:, 0]
        logits = self.classifier(cls_representation)

        if labels is None:
            return logits

        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.nb_classes), labels.view(-1))
        return loss, logits

class ClassifierLSTM(nn.Module):
    """Classifier made of the project's `EncoderDecoder` backbone and a linear output layer."""

    def __init__(self, nb_classes: int, encoder_only: bool = False, dropout=0.5):
        """Build the backbone and the classification layer.

        Args:
            nb_classes: number of output classes.
            encoder_only: forwarded unchanged to `EncoderDecoder`.
            dropout: forwarded unchanged to `EncoderDecoder`.
        """
        super().__init__()
        self.nb_classes = nb_classes
        self.encoder_only = encoder_only
        self.back_bone = EncoderDecoder(dropout=dropout, encoder_only=encoder_only)
        # Classifier input width is fixed at 256 - presumably the backbone's
        # output size; TODO confirm against EncoderDecoder.
        self.classifier = torch.nn.Linear(256, nb_classes)

    def forward(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: Optional[torch.Tensor] = None
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Compute class logits and, when labels are given, the cross-entropy loss.

        Returns:
            `logits` alone when `labels` is None, otherwise `(loss, logits)`.
        """
        # The backbone returns a pair; only its first element feeds the classifier.
        pooled, _unused = self.back_bone(input_ids, attention_mask)
        logits = self.classifier(pooled)

        if labels is None:
            return logits

        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.nb_classes), labels.view(-1))
        return loss, logits


class ClassifierTransformer(nn.Module):
    """Classifier made of the project's `Transformer` backbone and a linear output layer."""

    def __init__(self, nb_classes: int, num_heads: int = 4, num_layers: int = 4, block: str="prenorm", dropout: float = 0.3):
        """Build the backbone and the classification layer.

        Args:
            nb_classes: number of output classes.
            num_heads, num_layers, block, dropout: forwarded unchanged to `Transformer`.
        """
        super().__init__()
        self.nb_classes = nb_classes
        self.back_bone = Transformer(num_heads=num_heads, num_layers=num_layers, block=block, dropout=dropout)
        # Classifier input width is fixed at 256 - presumably the backbone's
        # output size; TODO confirm against Transformer.
        self.classifier = torch.nn.Linear(256, nb_classes)

    def forward(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor, labels: Optional[torch.Tensor] = None
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Compute class logits and, when labels are given, the cross-entropy loss.

        Returns:
            `logits` alone when `labels` is None, otherwise `(loss, logits)`.
        """
        # The backbone output is passed to the classifier as is.
        pooled = self.back_bone(input_ids, attention_mask)
        logits = self.classifier(pooled)

        if labels is None:
            return logits

        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.nb_classes), labels.view(-1))
        return loss, logits

In [13]:

def train_one_epoch(
    model: torch.nn.Module, training_data_loader: DataLoader, optimizer: torch.optim.Optimizer, logging_frequency: int, testing_data_loader: DataLoader, logger: dict):
    """Train `model` for one pass over `training_data_loader`.

    Every `logging_frequency` steps the mean training loss of the last window
    is recorded and the model is evaluated on `testing_data_loader`.

    Args:
        model: module called as `model(**batch)`; element 0 of its output is the loss.
        training_data_loader: iterable of dict batches (values must support `.to`).
        optimizer: optimizer stepping the model's parameters.
        logging_frequency: number of steps between two logging/evaluation points.
        testing_data_loader: loader handed to `evaluate` at each logging point.
        logger: dict with list entries 'train_time', 'train_losses', 'eval_accs',
            'eval_losses' and 'eval_time'; 'train_time' and 'eval_time' must be
            non-empty because cumulative values are appended.

    Returns:
        (mean training loss over all steps, wall-clock seconds for the epoch,
        evaluation time included).

    Raises:
        ZeroDivisionError: if `training_data_loader` has length 0.
    """
    model.train()
    epoch_loss = 0
    logging_loss = 0
    start_time = time.time()
    mini_start_time = time.time()
    # NOTE(review): anomaly detection is a debugging aid that is switched on
    # globally here and never switched off; kept to preserve existing behavior.
    torch.autograd.set_detect_anomaly(True)
    for step, batch in enumerate(training_data_loader):
        # Send the batch to wherever the model's parameters live instead of
        # relying on a notebook-global `device`.
        model_device = next(model.parameters()).device
        batch = {key: value.to(model_device) for key, value in batch.items()}

        # Clear the gradients of the previous step; without this every
        # update would use the sum of all gradients seen so far.
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        step_loss = loss.item()
        epoch_loss += step_loss
        logging_loss += step_loss

        if (step + 1) % logging_frequency == 0:
            # Training time is measured before evaluation starts, so the
            # cumulative 'train_time' excludes time spent in `evaluate`.
            freq_time = time.time()-mini_start_time
            logger['train_time'].append(freq_time+logger['train_time'][-1])
            logger['train_losses'].append(logging_loss/logging_frequency)
            print(f"Training loss @ step {step+1}: {logging_loss/logging_frequency}")
            eval_acc, eval_loss, eval_time = evaluate(model, testing_data_loader)
            logger['eval_accs'].append(eval_acc)
            logger['eval_losses'].append(eval_loss)
            logger['eval_time'].append(eval_time+logger['eval_time'][-1])

            logging_loss = 0
            mini_start_time = time.time()

    return epoch_loss / len(training_data_loader), time.time()-start_time


def evaluate(model: torch.nn.Module, test_data_loader: DataLoader):
    """Compute accuracy and mean loss of `model` over `test_data_loader`.

    The model is put in eval mode for the pass and switched back to train mode
    before returning. Works for any number of classes.

    Args:
        model: module called as `model(**batch)` returning `(loss, logits)`,
            with logits indexed as (sample, class).
        test_data_loader: iterable of dict batches containing a "labels" entry.

    Returns:
        (accuracy in percent, mean loss per batch, elapsed seconds).

    Raises:
        ZeroDivisionError: if the loader yields no samples.
    """
    model.eval()
    # Evaluate on the device the model already lives on instead of relying on
    # a notebook-global `device`.
    eval_device = next(model.parameters()).device
    eval_loss = 0
    n_correct = 0
    n_seen = 0
    start_time = time.time()
    with torch.no_grad():
        for batch in test_data_loader:
            batch = {key: value.to(eval_device) for key, value in batch.items()}
            outputs = model(**batch)
            eval_loss += outputs[0].item()
            predictions = outputs[1].argmax(dim=1)
            targets = batch["labels"].view(-1)
            # Plain counters: no per-class table, so labels are not limited
            # to {0, 1}.
            n_correct += (predictions == targets).sum().item()
            n_seen += targets.numel()
    accuracy = (100.0 * n_correct) / n_seen
    model.train()
    return accuracy,  eval_loss / len(test_data_loader), time.time() - start_time


In [14]:
def get_logger(model):
  """Create a fresh log dictionary for one experiment.

  The two time series start with a 0 so cumulative times can be appended;
  'parameters' is the number of trainable elements in `model.back_bone`.
  """
  trainable_backbone_params = sum(
    weight.numel() for weight in model.back_bone.parameters() if weight.requires_grad
  )
  return {
    'train_time': [0],
    'eval_time': [0],
    'train_losses': [],
    'eval_accs': [],
    'eval_losses': [],
    'parameters': trainable_backbone_params,
  }

def put_in_dictionary(train_loss, train_time, eval_loss, eval_time, eval_acc, target_logger=None):
  """Store the end-of-run summary values in a log dictionary.

  Also drops the leading placeholder element from the 'train_time' and
  'eval_time' series.

  Args:
    train_loss, train_time, eval_loss, eval_time, eval_acc: summary values.
    target_logger: dictionary to update in place. When omitted, the
      module-level `logger` variable is used, as in the original
      five-argument call.
  """
  if target_logger is None:
    # Backward-compatible fallback for callers relying on the global.
    target_logger = logger
  target_logger["total_train_loss"] = train_loss
  target_logger["total_train_time"] = train_time
  target_logger["final_eval_loss"] = eval_loss
  target_logger["final_eval_time"] = eval_time
  target_logger["final_eval_acc"] = eval_acc
  target_logger['train_time'] = target_logger['train_time'][1:]
  target_logger['eval_time'] = target_logger['eval_time'][1:]

def save_logs(dictionary, log_dir, exp_id):
  """Write `dictionary` as indented JSON to <log_dir>/<exp_id>/args.json.

  The experiment folder is created when missing; an existing args.json is
  overwritten.
  """
  experiment_dir = os.path.join(log_dir, exp_id)
  os.makedirs(experiment_dir, exist_ok=True)
  target_path = os.path.join(experiment_dir, "args.json")
  with open(target_path, "w") as handle:
    json.dump(dictionary, handle, indent=2)


In [None]:
# This is the code chunk to compare the different implementations:
# - GRU with encoder only
# - GRU with encoder decoder
# - Transformer with 4 heads 2 layers prenorm block
# - Transformer with 4 heads 4 layers prenorm block
# - Transformer with 4 heads 2 layers postnorm block
# - A basic Hugging Face implementation of bert.
# Running this cell will take hours or even days. You can run each model on its own. Change HP if you don't have enough memory.
# To plot the results you will find all of them in the log folder

# Use the GPU when there is one, otherwise fall back to the CPU instead of
# failing on the first `.to(device)` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"--> Device selected: {device}")

nb_epoch = 1
batch_size = 512
logging_frequency = 5 
learning_rate = 1e-5

train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate)
test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, collate_fn=collate)

# One zero-argument factory per experimental setting, so a model is only built
# when its setting is reached and an unknown setting fails loudly (KeyError)
# instead of silently reusing the previous model.
model_factories = {
  1: lambda: ClassifierLSTM(nb_classes=2, dropout=0.3, encoder_only=True),
  2: lambda: ClassifierLSTM(nb_classes=2, dropout=0.3, encoder_only=False),
  3: lambda: ClassifierTransformer(nb_classes=2, num_heads=4, num_layers=2, block='prenorm', dropout=0.3),
  4: lambda: ClassifierTransformer(nb_classes=2, num_heads=4, num_layers=4, block='prenorm', dropout=0.3),
  5: lambda: ClassifierTransformer(nb_classes=2, num_heads=4, num_layers=2, block='postnorm', dropout=0.3),
  6: lambda: basicClassifier(backbone="bert-base-uncased", backbone_hidden_size=768, nb_classes=2),
}

for experimental_setting in range(1,7):
  # Same seed before every model construction so each run starts from the same RNG state.
  torch.random.manual_seed(0)
  model = model_factories[experimental_setting]()

  # Per-setting value: the shared `logging_frequency` is left untouched.
  setting_logging_frequency = logging_frequency
  if experimental_setting == 6:
    # Freeze the pretrained backbone; only the linear classifier is trained.
    for parameter in model.back_bone.parameters():
      parameter.requires_grad = False
    # Evaluate far less often for this setting (presumably because each
    # evaluation pass through BERT is slow - TODO confirm).
    setting_logging_frequency = 703

  # setting up the optimizer (only parameters that still require gradients)
  optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate)
  model.to(device)
  # `logger` must keep this exact global name: put_in_dictionary reads it.
  logger = get_logger(model)
  train_loss, train_time = train_one_epoch(model, train_loader, optimizer, setting_logging_frequency, test_loader, logger)
  eval_acc, eval_loss, eval_time  = evaluate(model, test_loader)
  put_in_dictionary(train_loss, train_time, eval_loss, eval_time, eval_acc)
  print(f"    Epoch: {1} Loss/Test: {eval_loss}, Loss/Train: {train_loss}, Acc/Test: {eval_acc}, Train Time: {train_time}, Eval Time: {eval_time}")
  # NOTE(review): "/log" is an absolute path at the filesystem root; confirm
  # this is intended rather than a relative "log" folder.
  save_logs(logger, "/log", str(experimental_setting))


In [15]:
#END