## Libraries

In [1]:
import spacy
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import torchtext.data
from torchtext import vocab
from torchtext.vocab import Vocab
from captum.attr import LayerIntegratedGradients, TokenReferenceBase, visualization
from model import CNN

warnings.filterwarnings('ignore')
preprocessor = spacy.load('en_core_web_sm')
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

## Load data and build vocabulary

In [2]:
# Load IMDB data
TEXTS = torchtext.data.Field(
    lower = True,
    tokenize = 'spacy',
    tokenizer_language = 'en_core_web_sm'
)

LABELS = torchtext.data.LabelField(dtype = torch.float)

# split data
train, _ = torchtext.datasets.IMDB.splits(
    text_field = TEXTS,
    label_field = LABELS,
    train = 'train',
    test = 'test',
    path = 'data/aclImdb'
)

# Load GloVe vectors
loaded_vectors = torchtext.vocab.Vectors('glove-6B-50d.txt') #loaded_vectors = vocab.GloVe(name = '6B', dim = 50)

# Build vocabulary based on training data
TEXTS.build_vocab(train, vectors = loaded_vectors, max_size = len(loaded_vectors.stoi))
# Assing vectors to vocabulary tokens
TEXTS.vocab.set_vectors(stoi = loaded_vectors.stoi, vectors = loaded_vectors.vectors, dim = loaded_vectors.dim)
# Build label vocabulary
LABELS.build_vocab(train)

# print vocabulary size
print(f'Vocabulary size: {len(TEXTS.vocab)}')

# save tokenizer
torch.save(TEXTS, 'tokenizer.pt')

Vocabulary Size:  101513


## Load tokenizer

In [2]:
tokenizer = torch.load('tokenizer.pt')

## Load model

In [8]:
model = CNN()

model.load_state_dict(torch.load('model.pt'))
model.eval()
model = model.to(DEVICE)

print(model)

CNN(
  (embedding): Embedding(101982, 50, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 50), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 50), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 50), stride=(1, 1))
  )
  (fc): Linear(in_features=300, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


## Prediction function

In [6]:
def analyse_string(model, sentence, min_len = 7):
    # reference/baseline token index = pad token index
    pad_index = tokenizer.vocab.stoi['pad']
    token_reference = TokenReferenceBase(reference_token_idx = pad_index)
    # initialize feature attribution model
    lig = LayerIntegratedGradients(model, model.embedding)
    # tokenize text
    text = [token.text for token in preprocessor.tokenizer(sentence)]
    # pad to minimum length
    if len(text) < min_len:
        text += ['pad'] * (min_len - len(text))
    # get indices for token strings
    indices = [tokenizer.vocab.stoi[token] for token in text]
    # clear gradients
    model.zero_grad()
    # initialize input tensor
    indices_tensor = torch.tensor(indices, device = DEVICE)
    # induce batch dim
    indices_tensor = indices_tensor.unsqueeze(0)
    # save input seuqnce length
    seq_length = len(text) #min_len
    # predict probability with model
    pred_prob = torch.sigmoid(model(indices_tensor)).item()
    # generate reference indices
    reference_indices = token_reference.generate_reference(seq_length, device = DEVICE).unsqueeze(0)
    # compute attributions using layer-integrated gradients
    attributions = lig.attribute(
        indices_tensor,
        reference_indices,
        n_steps = 500
    )
    # sum attributions along embedding dimensions and norm to [-1, 1]
    attributions = attributions.sum(dim = 2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.cpu().detach().numpy()
    # return probability, attributions and text
    return pred_prob, attributions, text

In [7]:
prob, attrs, text = analyse_string(model, 'It was a truly fantastic performance today!')

print(prob, '\n', attrs, '\n', text)

0.9998109936714172 
 [ 0.01607962 -0.10480973 -0.37624715 -0.37292588  0.51327613  0.32549393
  0.5588112  -0.16264103] 
 ['It', 'was', 'a', 'truly', 'fantastic', 'performance', 'today', '!']
