This is an implementation of a custom transformer for text classification. It classifies long text (500-1000 words) into binary sentiment classes (positive or negative).

In [None]:
import random
import torch, pickle, math
import numpy as np
import pandas as pd
from torchtext.vocab import Vectors
from torchtext import data
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import *
from torch.nn import TransformerEncoder, TransformerEncoderLayer
#from nltk.corpus.stopwords import words

The dataset



In [None]:
# Read the raw training CSV with pandas' default parsing (comma separated,
# header inferred) and preview the first rows.
path = 'drive/My Drive/Colab Notebooks/Colab Datasets/longtext/training_set.csv'
train_df = pd.read_csv(path)
train_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Dimensions of the training frame as (rows, columns).
train_df.shape

(45000, 2)

In [None]:
def preprocess(x, max_len=None):
    """Pad or truncate a token list to a fixed length.

    Args:
        x: list of token strings for one example.
        max_len: target length. When omitted, the module-level
            ``max_seq_len`` is read at call time.

    Returns:
        A new list of exactly ``max_len`` tokens: the leading ``max_len``
        tokens of ``x``, followed by ``'<pad>'`` fillers when ``x`` is
        shorter. The caller's list is never modified.
    """
    if max_len is None:
        max_len = max_seq_len
    # Slicing copies, so padding below cannot leak into the caller's list.
    tokens = list(x[:max_len])
    tokens.extend(['<pad>'] * (max_len - len(tokens)))
    return tokens


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence-first tensor.

    The table is stored in the non-trainable buffer ``pe`` with shape
    ``(max_len, 1, d_model)``; even feature columns hold sines and odd
    columns hold cosines.

    Args:
        d_model: size of the feature dimension (odd sizes are supported).
        dropout: dropout probability applied after the addition.
        max_len: number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so the cosine half only takes the frequencies it has room for.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model): broadcasts over dim 1.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe)``.

        ``x`` is indexed by position along dim 0 (the table is sliced with
        ``x.size(0)``) and the encoding broadcasts over dim 1, i.e. the
        expected layout is ``(seq_len, batch, d_model)``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Transformer-encoder classifier over frozen pretrained embeddings.

    Token ids of shape ``(batch, seq_len)`` are embedded, position-encoded,
    passed through a stack of encoder layers, flattened per example and
    projected to ``out_dim`` log-probabilities.

    Args:
        glove: pretrained embedding matrix passed to
            ``nn.Embedding.from_pretrained`` (kept frozen).
        vocab_size: accepted for interface compatibility; not read here.
        embed_dim: embedding / model width.
        nhead: number of attention heads per encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout probability for positional encoding and encoder.
        attn_dim: width of the encoder feed-forward sub-layer.
        out_dim: number of output classes.
        seq_len: fixed number of tokens per example; sizes the output layer.
            When omitted, the module-level ``max_seq_len`` is used.
    """

    def __init__(self, glove, vocab_size, embed_dim, nhead, nlayers, dropout, attn_dim, out_dim, seq_len=None):
        super(TransformerModel, self).__init__()
        if seq_len is None:
            seq_len = max_seq_len
        self.embed_dim = embed_dim
        self.embed_layer = nn.Embedding.from_pretrained(glove, freeze=True)
        self.pos_encoder = PositionalEncoding(embed_dim, dropout)
        encoder_layer = TransformerEncoderLayer(embed_dim, nhead, attn_dim, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layer, nlayers)
        self.out_layer = nn.Linear(embed_dim * seq_len, out_dim)

    def forward(self, inpt):
        """Return log-probabilities of shape ``(batch, out_dim)``.

        ``inpt`` holds token ids laid out as ``(batch, seq_len)``.
        """
        embeds = self.embed_layer(inpt) * math.sqrt(self.embed_dim)
        # The positional encoder indexes positions along dim 0 and the encoder
        # (built without batch_first) attends along dim 0 as well, so switch
        # to (seq_len, batch, embed). Without this, positions would follow
        # the batch index and attention would mix different examples.
        embeds = embeds.transpose(0, 1)
        embeds = self.pos_encoder(embeds)
        out = self.transformer_encoder(embeds)
        # Back to (batch, seq_len, embed) before flattening each example;
        # reshape copes with the non-contiguous result of transpose.
        out = out.transpose(0, 1).reshape(inpt.size(0), -1)
        out = self.out_layer(out)
        scores = F.log_softmax(out, dim=1)
        return scores


#Training the model
def train(model, iterator, opt=None, criterion=None):
    """Run one pass over ``iterator`` and update ``model`` after every batch.

    Args:
        model: module called as ``model(batch.review)``.
        iterator: iterable of batches exposing ``review`` and ``sentiment``.
        opt: optimizer to step. Defaults to the module-level ``optimizer``.
        criterion: loss callable ``criterion(output, target)``. Defaults to
            the module-level ``lossFunction``.

    Returns:
        List with one Python float loss per batch, in iteration order.
    """
    # Fall back to the notebook globals so existing two-argument calls work.
    if opt is None:
        opt = optimizer
    if criterion is None:
        criterion = lossFunction

    model.train()
    losses = []
    for batch in iterator:
        opt.zero_grad()
        modeloutput = model(batch.review)
        loss = criterion(modeloutput, batch.sentiment)
        loss.backward()
        opt.step()
        losses.append(loss.item())
    return losses


#Evaluating the model
def evaluate(model, iterator):
    """Collect gold labels and argmax predictions over ``iterator``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: module called as ``model(batch.review)``; its output is
            arg-maxed along dim 1.
        iterator: iterable of batches exposing ``review`` and ``sentiment``.

    Returns:
        ``(actuals, predictions)``: two flat lists of Python ints, aligned
        element by element.
    """
    model.eval()
    actuals = []
    predictions = []
    with torch.no_grad():
        for batch in iterator:
            modeloutput = model(batch.review)
            # One bulk conversion per batch instead of one .item() call
            # (and device sync) per element.
            predictions.extend(modeloutput.argmax(dim=1).reshape(-1).tolist())
            actuals.extend(batch.sentiment.reshape(-1).tolist())
    return actuals, predictions

In [None]:
# Seed every random number generator in play, not only torch's.
# NOTE(review): torchtext's iterator shuffling is assumed to draw from
# Python's `random` state - confirm against the installed version.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Every review is padded or truncated to this many tokens by `preprocess`.
# NOTE(review): the notebook intro mentions 500-1000 word reviews, so only
# the first 60 tokens of each review reach the model.
max_seq_len = 60

In [None]:
# Fields describing how each CSV column is processed: reviews are padded or
# truncated by `preprocess` and batched batch-first; sentiments are labels.
text = data.Field(batch_first=True, preprocessing=preprocess)
label = data.LabelField()

# Pair each column name with the field that handles it.
colmapper = [
    ('review', text),
    ('sentiment', label),
]

# Load the train / validation / test splits from one folder, skipping the
# header row of every file.
train_data, val_data, test_data = data.TabularDataset.splits(
    path='drive/My Drive/Colab Notebooks/Colab Datasets/longtext',
    format='csv',
    fields=colmapper,
    skip_header=True,
    train='training_set.csv',
    validation='validation_set.csv',
    test='test_set.csv',
)

#Explore what is loaded.
#print(vars(train_data.examples[0])['review'])


In [None]:
# Vocabulary: load the pretrained vectors from disk, then build the text
# vocabulary from the training split only. Tokens missing from the vectors
# are initialised with torch.Tensor.normal_.
vectors = Vectors(
    name='drive/My Drive/Colab Notebooks/Colab Datasets/glove2B100d.txt',
    cache='./',
)
text.build_vocab(
    train_data,
    vectors=vectors,
    unk_init=torch.Tensor.normal_,
)
label.build_vocab(train_data)

In [None]:
# Batch iterators for the three splits, placed on the GPU when one exists.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    device=device,
    batch_size=32,
    sort_within_batch=False,
    # Sort key combines the review length with the length of the label.
    sort_key=lambda example: data.interleave_keys(
        len(example.review), len(example.sentiment)
    ),
)

In [None]:
# Hyper-parameters: sizes derived from the vocabularies, the rest fixed here.
vocab_size = len(text.vocab)
out_dim = len(label.vocab)
embed_dim = 100
attn_dim = 100
nhead = 2
nlayers = 2
dropout = 0.1

# Build the classifier on top of the pretrained vectors and move it to the
# selected device.
model = TransformerModel(
    text.vocab.vectors,
    vocab_size,
    embed_dim,
    nhead,
    nlayers,
    dropout,
    attn_dim,
    out_dim,
)
model = model.to(device)

# Negative log-likelihood pairs with the model's log_softmax output.
lossFunction = nn.NLLLoss()

# Only trainable parameters are optimised (the embedding layer is frozen).
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=0.001)

In [None]:
#Train for 10 epochs, recording the summed batch loss of each epoch.
losses = []
for epoch in range(0, 10):
    epoch_losses = train(model, train_iterator)
    losses.append(sum(epoch_losses))
np.savetxt("losses.csv", losses, delimiter=",")

# Persist the model as it stands after the final epoch. No validation-based
# selection happens in this loop, so "best" here means "last".
# The context manager closes the file handle even if pickling fails.
with open("bestModel.p", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
#test the best model
# SECURITY: pickle.load executes arbitrary code from the file; only load a
# bestModel.p that this notebook wrote itself.
# The context manager closes the file handle once the model is loaded.
with open("bestModel.p", "rb") as model_file:
    model = pickle.load(model_file)
actuals, predictions = evaluate(model, test_iterator)
print('Confusion matrix:')
print(confusion_matrix(actuals, predictions))
# With one label per example, micro-averaged F1 equals accuracy, so the two
# lines below report the same number.
print('F1 score: %f' % f1_score(actuals, predictions, average='micro'))
print('Accuracy score: %f' % accuracy_score(actuals, predictions))

Confusion matrix:
[[190  62]
 [ 74 174]]
F1 score: 0.728000
Accuracy score: 0.728000
