In [38]:
%reload_ext autoreload
%autoreload 2

# IMDB sentiment analysis with Transformers from scratch
Each sample is a sequence of embedded words. The output is one of two classes: positive or negative.
Input:
* batch of different sample sentences
* sample is a sentence of dim t x embedding_size (where t is max sequence)

In [58]:
import warnings
warnings.filterwarnings('ignore')
from _context import src
from src.models.model_utils import device_selection
from src.models.predict_model import ClassSequenceTransformer
#from src.models import TransformerClassification
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.tensorboard import SummaryWriter
from torchtext import data, datasets, vocab

import numpy as np
import random, tqdm, sys, math, gzip


In [86]:
# Hyper-parameters shared by every cell below.

# Data
vocab_size = 50000        # size of the token vocabulary handed to the model
batch_size = 8            # reviews per batch
num_classes = 2           # positive / negative

# Model
embeding_size = 128       # width of each token embedding
transformer_heads = 8     # attention heads per transformer block
depth = 4                 # number of stacked transformer blocks

# Optimisation
lr = 1e-4                 # peak Adam learning rate
warm = 10000              # warm-up length; the scheduler ramps over warm / batch_size steps
epochs = 100              # passes over the training split in the full run


## Dataset

In [41]:
# Review text: lower-cased tokens, batch dimension first. With
# include_lengths=True each batch's `.text` is a (padded_tokens, lengths) pair.
text = data.Field(
    lower=True,
    include_lengths=True,
    batch_first=True,
)

# Sentiment label: one non-sequential value per review.
label = data.Field(sequential=False)

In [42]:
# Load the IMDB reviews as train / test datasets using the two fields above.
train, test = datasets.IMDB.splits(text, label)
# Build the token vocabulary from the training split only. Two slots of
# `vocab_size` are held back — presumably for the special <unk>/<pad> tokens;
# TODO confirm against the torchtext version in use.
text.build_vocab(train, max_size=vocab_size - 2)
# Build the label vocabulary from the training split.
label.build_vocab(train)

In [43]:
# Batch iterators over both splits, with tensors created on the device chosen
# by the project's device_selection() helper.
train_iter, test_iter = data.BucketIterator.splits(
    (train, test),
    batch_size=batch_size,
    device=device_selection(),
)

In [44]:
# Longest padded sequence length (time dimension) seen in any training batch.
# Note: this walks the whole training iterator once.
max_sequence = max(batch.text[0].size(1) for batch in train_iter)

In [45]:
# Pull a single batch from the training iterator to inspect its layout.
s = seq = next(iter(train_iter))
# Show the batch summary together with its padded sequence length.
seq, seq.text[0].size(1)

(
 [torchtext.data.batch.Batch of size 8 from IMDB]
 	[.text]:('[torch.LongTensor of size 8x402]', '[torch.LongTensor of size 8]')
 	[.label]:[torch.LongTensor of size 8],
 402)

In [46]:
# Shape of the padded token tensor of the sampled batch: (batch, time).
seq.text[0].size()

torch.Size([8, 402])

In [47]:
# Label ids of the sampled batch. They come out as 1/2 rather than 0/1, which
# is why the training loops below subtract 1 before computing the loss.
seq.label

tensor([2, 1, 1, 1, 2, 2, 2, 2])

# Classify Transformer

* Sequence-to-sequence model followed by global average pooling; the resulting vector is projected to a smaller dimension, and a softmax is applied at the end.

* Positional embedding: a transformer is position-invariant (a different order of the inputs produces the same output). We add positional information to the input vectors by generating positional vectors that are added to the input words. A positional *embedding* lets the model learn the vector for each position (this does not work well for sequence lengths the network has not seen before). A positional *encoding* instead uses a fixed function that maps each position to a vector; the network does not learn the mapping, and choosing the function is a complex hyperparameter to fix.

<img src="images/transformer_model.svg"  width="500" height="600">



In [73]:
# Instantiate the project's transformer classifier from the hyper-parameters
# defined at the top of the notebook.
model = ClassSequenceTransformer(
    num_classes=num_classes,
    embeding_size=embeding_size,
    transformer_heads=transformer_heads,
    depth=depth,
    vocab_size=vocab_size,
    max_sequence=max_sequence,
)

In [49]:
# Report whether a CUDA device is visible to PyTorch in this environment.
torch.cuda.is_available()

False

In [50]:
# Adam over all model parameters at the configured peak learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Linear warm-up: the learning rate is multiplied by a factor that grows from
# 0 to 1 over the first warm / batch_size scheduler steps and then stays at 1.
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda step: min(step / (warm / batch_size), 1.0),
)

# Train

In [92]:
# Sanity check: overfit a single batch.
# A freshly initialised model is trained repeatedly on one fixed batch; the
# loss on that batch should fall towards zero and accuracy should reach 1.0.
# If it does not, the model or the training step is broken.
model = ClassSequenceTransformer(
    num_classes=num_classes,
    embeding_size=embeding_size,
    transformer_heads=transformer_heads,
    depth=depth,
    vocab_size=vocab_size,
    max_sequence=max_sequence,
)
# NOTE(review): batches live on device_selection() but the model is never moved
# there. This works on CPU; confirm/move the model before running on a GPU.
optimizer = torch.optim.Adam(lr=lr, params=model.parameters())
# Linear warm-up: lr is scaled by a factor growing from 0 to 1 over
# warm / batch_size scheduler steps.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda i: min(i / (warm / batch_size), 1.0))

# Take the first training batch and reuse it for every step.
one_batch = next(iter(train_iter))

# `text` and `label` are the torchtext Field objects defined in the Dataset
# section; use different names here so those fields are not overwritten by
# tensors (which would break re-running the dataset cells).
tokens = one_batch.text[0]
# Label ids from the iterator are 1/2; shift them to 0/1 for the loss.
targets = one_batch.label - 1
# Truncate to the longest length the model was built for. The slice must be
# assigned back, otherwise it has no effect.
if tokens.size(1) > max_sequence:
    tokens = tokens[:, :max_sequence]

total_iters = 50
total_epochs = 10
tensorboard = SummaryWriter(log_dir=".")
samples = 0  # number of samples processed so far; used as the tensorboard step
try:
    for epoch in range(total_epochs):
        print("Epoch: {}/{}".format(epoch, total_epochs))

        # --- training steps on the fixed batch ---
        model.train(True)
        for _ in tqdm.tqdm(range(total_iters)):
            optimizer.zero_grad()
            pred = model(tokens)
            # nll_loss expects log-probabilities — assumes the model ends in a
            # log-softmax; TODO confirm in ClassSequenceTransformer.
            loss = F.nll_loss(pred, targets)
            loss.backward()
            optimizer.step()
            scheduler.step()
            samples += tokens.size(0)
            tensorboard.add_scalar('classification/train-loss', float(loss.item()), samples)
        print("Train loss ", float(loss.item()))

        # --- evaluation on the same batch (no gradients, eval mode) ---
        model.train(False)
        with torch.no_grad():
            pred = model(tokens)
            val_loss = F.nll_loss(pred, targets).item()
            correct = (pred.argmax(dim=1) == targets).sum().item()
        accuracy = float(correct) / tokens.size(0)
        print("Acc val: {}".format(accuracy))
        tensorboard.add_scalar('classification/val-loss', val_loss, samples)
        print("Val loss ", val_loss)
finally:
    # Flush and release the event file even if the run is interrupted.
    tensorboard.close()

  0%|          | 0/50 [00:00<?, ?it/s]

Epoch: 0/10


100%|██████████| 50/50 [00:25<00:00,  1.93it/s]
100%|██████████| 1/1 [00:00<00:00,  5.82it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train loss  0.2967316210269928
Acc val: 1.0
Val loss  0.2892437279224396
Epoch: 1/10


100%|██████████| 50/50 [00:25<00:00,  1.96it/s]
100%|██████████| 1/1 [00:00<00:00,  5.62it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train loss  0.07365045696496964
Acc val: 1.0
Val loss  0.07171900570392609
Epoch: 2/10


100%|██████████| 50/50 [00:24<00:00,  2.00it/s]
100%|██████████| 1/1 [00:00<00:00,  5.86it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train loss  0.02310607209801674
Acc val: 1.0
Val loss  0.022653289139270782
Epoch: 3/10


100%|██████████| 50/50 [00:25<00:00,  1.97it/s]
100%|██████████| 1/1 [00:00<00:00,  5.81it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train loss  0.009804179891943932
Acc val: 1.0
Val loss  0.009660748764872551
Epoch: 4/10


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]
100%|██████████| 1/1 [00:00<00:00,  5.62it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train loss  0.005123793613165617
Acc val: 1.0
Val loss  0.00506583834066987
Epoch: 5/10


100%|██████████| 50/50 [00:25<00:00,  1.97it/s]
100%|██████████| 1/1 [00:00<00:00,  5.38it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train loss  0.003079689107835293
Acc val: 1.0
Val loss  0.0030519049614667892
Epoch: 6/10


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]
100%|██████████| 1/1 [00:00<00:00,  5.72it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train loss  0.0020407685078680515
Acc val: 1.0
Val loss  0.0020256286952644587
Epoch: 7/10


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]
100%|██████████| 1/1 [00:00<00:00,  5.74it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train loss  0.0014514847425743937
Acc val: 1.0
Val loss  0.0014424878172576427
Epoch: 8/10


100%|██████████| 50/50 [00:25<00:00,  1.94it/s]
100%|██████████| 1/1 [00:00<00:00,  5.16it/s]
  0%|          | 0/50 [00:00<?, ?it/s]

Train loss  0.0010884369257837534
Acc val: 1.0
Val loss  0.0010826787911355495
Epoch: 9/10


100%|██████████| 50/50 [00:28<00:00,  1.74it/s]
100%|██████████| 1/1 [00:00<00:00,  5.18it/s]

Train loss  0.000849991396535188
Acc val: 1.0
Val loss  0.0008460917742922902





In [78]:
# Full training run: train on the IMDB training split and evaluate on the test
# split after every epoch, logging losses to tensorboard.
model = ClassSequenceTransformer(
    num_classes=num_classes,
    embeding_size=embeding_size,
    transformer_heads=transformer_heads,
    depth=depth,
    vocab_size=vocab_size,
    max_sequence=max_sequence,
)
# NOTE(review): batches live on device_selection() but the model is never moved
# there. This works on CPU; confirm/move the model before running on a GPU.
optimizer = torch.optim.Adam(lr=lr, params=model.parameters())
# Linear warm-up: lr is scaled by a factor growing from 0 to 1 over
# warm / batch_size scheduler steps.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda i: min(i / (warm / batch_size), 1.0))

tensorboard = SummaryWriter(log_dir=".")
samples = 0  # number of training samples seen so far; used as the tensorboard step
try:
    for epoch in range(epochs):
        print("Epoch: {}/{}".format(epoch, epochs))

        # --- training pass ---
        model.train(True)
        for batch_seq in tqdm.tqdm(train_iter):
            # `text` and `label` are the torchtext Field objects defined in the
            # Dataset section; use different names so they are not overwritten.
            # Label ids from the iterator are 1/2; shift them to 0/1.
            tokens, targets = batch_seq.text[0], batch_seq.label - 1
            # The slice must be assigned back, otherwise truncation is a no-op.
            if tokens.size(1) > max_sequence:
                tokens = tokens[:, :max_sequence]
            optimizer.zero_grad()
            pred = model(tokens)
            # nll_loss expects log-probabilities — assumes the model ends in a
            # log-softmax; TODO confirm in ClassSequenceTransformer.
            loss = F.nll_loss(pred, targets)
            loss.backward()
            optimizer.step()
            scheduler.step()
            samples += tokens.size(0)
            tensorboard.add_scalar('classification/train-loss', float(loss.item()), samples)
        # Loss of the last training batch of this epoch.
        print("Train loss ", float(loss.item()))

        # --- evaluation pass over the test split (no gradients, eval mode) ---
        model.train(False)
        qty = 0
        correct = 0
        loss_val = []
        with torch.no_grad():
            for batch_seq in tqdm.tqdm(test_iter):
                tokens, targets = batch_seq.text[0], batch_seq.label - 1
                # Test reviews can be longer than anything seen when
                # max_sequence was measured on the training split.
                if tokens.size(1) > max_sequence:
                    tokens = tokens[:, :max_sequence]
                pred = model(tokens)
                loss_val.append(F.nll_loss(pred, targets).item())
                qty += tokens.size(0)
                correct += (pred.argmax(dim=1) == targets).sum().item()

        # Report once per epoch, after all test batches, instead of once per
        # batch: avoids flooding the output and writing many val-loss points
        # at the same tensorboard step.
        if qty:
            accuracy = float(correct) / qty
            mean_val_loss = float(np.mean(loss_val))  # unweighted mean over batches
            print("Acc val: {}".format(accuracy))
            tensorboard.add_scalar('classification/val-loss', mean_val_loss, samples)
            print("Val loss ", mean_val_loss)
finally:
    # Flush and release the event file even if the run is interrupted.
    tensorboard.close()

  0%|          | 0/3125 [00:00<?, ?it/s]

Epoch: 0/5


  0%|          | 8/3125 [00:08<54:36,  1.05s/it]  


KeyboardInterrupt: 

In [81]:
# Turn off the `shuffle` flag on the training iterator.
# NOTE(review): this runs after the training cells above, so it only affects
# later iterations over train_iter — confirm this is intended.
train_iter.shuffle=False

False