# PyTorch Transformer Model for Classification: Input-Output
[Site: PyTorch Transformer Model for Classification: Input-Output](https://jamesmccaffrey.wordpress.com/2021/03/02/pytorch-transformer-model-for-classification-input-output/)

In [1]:
# imdb_transformer_io.py
# IMDB classification, loosely based on the PyTorch docs

# Input-Output only. Work in progress. Surely has bugs.

# Python 3.7.6, PyTorch 1.7.0
# Windows 10
# uses BucketIterator - so results not reproducible.
# data has been manually pruned from 25,000 train and
# 25,000 test to 200 train and 200 test

import torch as T
import torchtext as tt
import numpy as np
import math
import warnings  # most of torchtext is deprecated
# suppress every warning category; the torchtext legacy API is
# deprecated and would otherwise flood the demo output
warnings.filterwarnings("ignore")

# the whole demo runs on the CPU
device = T.device("cpu")

In [2]:
class TransformerModel(T.nn.Module):
  """Transformer-encoder model for IMDB binary classification.

  Pipeline: embedding -> scale by sqrt(embed_dim) -> positional
  encoding -> TransformerEncoder -> max pooling over the sequence
  axis -> linear layer -> log_softmax.

  Args:
    n_tokens: num distinct words == vocabulary size
    embed_dim: num values used to represent each word/token
    n_heads: num attention heads, needed by TransformerEncoderLayer
    n_hid: num hidden nodes in the NN part of TransformerEncoderLayer
    n_eclayers: num TransformerEncoderLayer layers in the encoder
    drop_p: dropout probability, used by PositionalEncoding AND
      TransformerEncoderLayer
    trace: when True (the default) forward() prints every
      intermediate tensor with its shape and waits for Enter after
      each one; pass False to run without console interaction
  """

  def __init__(self, n_tokens, embed_dim, n_heads, n_hid, \
    n_eclayers, drop_p=0.5, trace=True):
    super().__init__()
    self.embed_dim = embed_dim
    self.trace = trace

    self.embedder = T.nn.Embedding(n_tokens, embed_dim)
    self.pos_encoder = PositionalEncoding(embed_dim, drop_p)
    enc_layer = T.nn.TransformerEncoderLayer(embed_dim, \
      n_heads, n_hid, drop_p)
    self.transformer_encoder = \
      T.nn.TransformerEncoder(enc_layer, n_eclayers)
    # map the embed_dim pooled values to 2 classes
    self.to_logits = T.nn.Linear(embed_dim, 2)

    # small symmetric initial weights, zero bias
    T.nn.init.uniform_(self.embedder.weight, -0.01, 0.01)
    T.nn.init.uniform_(self.to_logits.weight, -0.01, 0.01)
    T.nn.init.zeros_(self.to_logits.bias)

  def _show(self, title, tensor):
    # display one intermediate value and pause; no-op if not tracing
    if not self.trace:
      return
    print(title)
    print(tensor)
    print(tensor.shape)
    input()

  def forward(self, src, src_mask):
    """Compute log-probabilities of the 2 classes for a batch.

    Args:
      src: token ids, assumed to be [seq, bat]. Most default data
        loaders give [bat, seq], so the caller must either reshape
        before calling, or configure the loader with
        batch_first=False if available.
      src_mask: accepted so the call signature stays stable, but
        NOT used -- no mask is applied for this classification
        problem.

    Returns:
      Tensor of shape [bat, 2] holding log_softmax values, suitable
      for a negative-log-likelihood style loss.
    """
    self._show("src entering forward, before encoder * sqrt:", src)
    # [seq, bat]

    z = self.embedder(src) * math.sqrt(self.embed_dim)
    self._show("z after embedding, before pos_encoder ", z)
    # [seq, bat, emb]

    z = self.pos_encoder(z)
    self._show("z after pos_encoder, before trans_encoder ", z)
    # [seq, bat, emb]

    oupt = self.transformer_encoder(z)
    self._show( \
      "prelim output from trans_encoder, before pooling:", oupt)
    # same shape as the embedded input: seq-to-seq

    # combine output sequences to 'one word' of review;
    # max() over dim 0 returns (values, indices) -- keep values
    oupt = oupt.max(dim=0)[0]      # [bat, emb] == [bat, 'one word']
    # avg pooling is one of many promising alternatives
    # oupt = oupt.mean(dim=0)
    self._show( \
      "prelim oupt after max pooling, before to_logits", oupt)

    # map 'one word' to 2 predictions (as logits)
    oupt = self.to_logits(oupt)    # [bat, class]
    self._show( \
      "prelim oupt after to_logits, before log_softmax:", oupt)

    # apply log_softmax so the result can be used by a loss func
    result = T.nn.functional.log_softmax(oupt, dim=1)
    # adjacent literals are concatenated; a backslash inside the
    # string would leak the source indentation into the message
    self._show("final result after log_softmax, "
      "before return statement:", result)

    return result

In [4]:
class PositionalEncoding(T.nn.Module):
  """Add fixed sinusoidal position values to embeddings, then dropout.

  Assumes input is [seq, bat, emb] - which word, which sentence,
  which embed_val -- unintuitive.

  Args:
    d_model: embedding dimension (last axis of the input); may be
      even or odd
    drop_p: dropout probability applied after the addition
    max_len: longest sequence length the precomputed table supports
  """

  def __init__(self, d_model, drop_p=0.1, max_len=5000):
    super().__init__()
    self.dropout = T.nn.Dropout(p=drop_p)

    position = T.arange(0, max_len, dtype=T.float).unsqueeze(1)
    div_term = T.exp(T.arange(0, d_model, 2).float() * \
      (-math.log(10000.0) / d_model))
    angles = position * div_term   # [max_len, ceil(d_model / 2)]

    pe = T.zeros(max_len, d_model)
    pe[:, 0::2] = T.sin(angles)
    # an odd d_model has one fewer odd column than even columns
    pe[:, 1::2] = T.cos(angles[:, :d_model // 2])

    # a buffer follows the module through .to()/.cuda(), so no
    # global device is needed here; persistent=False keeps the
    # table out of state_dict so saved checkpoints are unaffected
    self.register_buffer("pe", pe.unsqueeze(1), persistent=False)
    # pe is [max_len, 1, d_model]; broadcasts over the batch axis

  def forward(self, x):
    """Return dropout(x + positional values for the first seq rows)."""
    x = x + self.pe[:x.size(0), :]
    return self.dropout(x)

In [9]:
print("\nBegin ")

# 0. get ready: seed both random number generators
T.manual_seed(1)
np.random.seed(1)

# 2. create IMDB data iterators
print("\nGetting (pruned) IMDB data and building vocabulary ")

max_vocab_tokens = 500
bat_size = 3

# batch_first=False makes each batch [seq,bat], which is the
# shape that the model's forward() expects
TEXT = tt.legacy.data.Field(
  lower=True,
  include_lengths=True,
  batch_first=False,
  tokenize="basic_english",
)
LABEL = tt.legacy.data.Field(sequential=False)

train_ds, test_ds = tt.legacy.datasets.IMDB.splits(TEXT, LABEL)
TEXT.build_vocab(train_ds, max_size=max_vocab_tokens - 2)
LABEL.build_vocab(train_ds)  # unk, neg, pos

train_itr, test_itr = tt.legacy.data.BucketIterator.splits(
  (train_ds, test_ds),
  shuffle=True,
  batch_size=bat_size,
  device=device,
)
print("Data successfully fetched")

# 3. create model
print("\nCreating TransformerModel object ")

n_tokens = len(TEXT.vocab.stoi)
embed_dim = 4
n_heads = 2
n_hid = 10
n_eclayers = 2
drop_p = 0.2

model = TransformerModel(
  n_tokens, embed_dim, n_heads, n_hid, n_eclayers, drop_p=drop_p
).to(device)
print("Model successfully created ")

# 4. feed batches to the model, display shapes as processing

lr = 0.0
max_seq_len = 5
max_epochs = 3

# Nothing is trained in this demo. Real training would also need
# a loss function (T.nn.CrossEntropyLoss or
# T.nn.functional.nll_loss), an optimizer (T.optim.Adam or
# T.optim.SGD over model.parameters() with lr), and optionally a
# warmup schedule (T.optim.lr_scheduler.LambdaLR) plus gradient
# clipping.

model.train()  # set mode
print("\nStarting training (not really)")
for epoch in range(max_epochs):
  print(f"\nEpoch {epoch}")

  for bat_idx, batch in enumerate(train_itr):
    print(f" batch: {bat_idx}")

    # batch.text is a pair because include_lengths=True;
    # element [0] holds the token ids, [seq, bat] required.
    # keep at most the first max_seq_len tokens of each review
    inpt = batch.text[0][:max_seq_len, :]
    lbl = batch.label - 1           # 0, 1
    print("  targets: ")
    print("  ", lbl, sep="")
    input()

    # call forward(), display intermediate values and shapes
    oupt = model(inpt, None)        # no mask to forward()

    # a real training step would continue here:
    #   zero the optimizer's gradients (before the forward pass)
    #   compute the loss from oupt and lbl
    #   call backward() to compute gradients
    #   T.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    #   call step() to update wts and biases

print("\nEnd demo ")


Begin 

Getting (pruned) IMDB data and building vocabulary 
Data successfully fetched

Creating TransformerModel object 
Model successfully created 

Starting training (not really)

Epoch 0
 batch: 0
  targets: 
  tensor([1, 0, 0])
abc


NotImplementedError: ignored