In [1]:
!pip install torchtext --upgrade

import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
import torchtext.experimental
import torchtext.experimental.vectors
from torchtext.experimental.datasets.raw.text_classification import RawTextIterableDataset
from torchtext.experimental.datasets.text_classification import TextClassificationDataset
from torchtext.experimental.functional import sequential_transforms, vocab_func, totensor

import collections
import random
import time

Requirement already up-to-date: torchtext in /usr/local/lib/python3.6/dist-packages (0.7.0)


In [2]:
# Seed PyTorch and Python's `random` module; `random.shuffle` is what drives
# the train/validation split performed later in the notebook.
seed = 1234

torch.manual_seed(seed)
random.seed(seed)
# Request deterministic cuDNN algorithms and disable the cuDNN auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
raw_train_data, raw_test_data = torchtext.experimental.datasets.raw.IMDB()

In [4]:
print(raw_train_data)

<torchtext.experimental.datasets.raw.text_classification.RawTextIterableDataset object at 0x7fe56e6390b8>


In [5]:
raw_train_data = list(raw_train_data)

print(raw_train_data[0])

(0, 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between

In [6]:
raw_test_data = list(raw_test_data)

print(raw_test_data[0])

(0, 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as they 

In [7]:
print(f'Number of training examples: {len(raw_train_data):,}')
print(f'Number of testing examples: {len(raw_test_data):,}')

Number of training examples: 25,000
Number of testing examples: 25,000


In [8]:
def get_train_valid_split(raw_train_data, split_ratio = 0.8):
    """Randomly partition `raw_train_data` into training and validation sets.

    The input is copied into a new list, shuffled with `random.shuffle`, and
    cut at `int(len * split_ratio)`: the leading part becomes the training
    set and the remainder the validation set.

    Returns:
        (train_data, valid_data), each wrapped in a `RawTextIterableDataset`.
    """
    # Work on a copy so the caller's sequence is not reordered.
    examples = list(raw_train_data)
    random.shuffle(examples)

    cutoff = int(len(examples) * split_ratio)
    train_examples, valid_examples = examples[:cutoff], examples[cutoff:]

    return RawTextIterableDataset(train_examples), RawTextIterableDataset(valid_examples)

In [9]:
raw_train_data, raw_valid_data = get_train_valid_split(raw_train_data)

In [10]:
raw_train_data = list(raw_train_data)
raw_valid_data = list(raw_valid_data)

In [11]:
print(f'Number of training examples: {len(raw_train_data):,}')
print(f'Number of validation examples: {len(raw_valid_data):,}')
print(f'Number of testing examples: {len(raw_test_data):,}')

Number of training examples: 20,000
Number of validation examples: 5,000
Number of testing examples: 25,000


In [12]:
class Tokenizer:
    """Tokenizer wrapper adding optional lower-casing and length truncation.

    Args:
        tokenize_fn: forwarded to `torchtext.data.utils.get_tokenizer` to obtain
            the callable that splits a string into tokens.
        lower: when True, every token is lower-cased after splitting.
        max_length: when not None, only the first `max_length` tokens are kept.
    """
    def __init__(self, tokenize_fn = 'basic_english', lower = True, max_length = None):
        
        self.tokenize_fn = torchtext.data.utils.get_tokenizer(tokenize_fn)
        self.lower = lower
        self.max_length = max_length
        
    def tokenize(self, s):
        """Split string `s` into a list of tokens, then lower-case/truncate as configured."""
        
        tokens = self.tokenize_fn(s)
        
        if self.lower:
            tokens = [token.lower() for token in tokens]
            
        if self.max_length is not None:
            # Slice with the instance's own limit. The bare name `max_length`
            # would resolve to a module-level global (NameError if absent).
            tokens = tokens[:self.max_length]
            
        return tokens

In [13]:
max_length = 250

tokenizer = Tokenizer(max_length = max_length)

In [14]:
s = "this film is terrible. i hate it and it's bad!"

print(tokenizer.tokenize(s))

['this', 'film', 'is', 'terrible', '.', 'i', 'hate', 'it', 'and', 'it', "'", 's', 'bad', '!']


In [15]:
def build_vocab_from_data(raw_data, tokenizer, **vocab_kwargs):
    """Count token frequencies over `raw_data` and build a torchtext `Vocab`.

    Args:
        raw_data: iterable of `(label, text)` pairs; the label is ignored.
        tokenizer: object whose `tokenize(text)` returns the tokens of `text`.
        **vocab_kwargs: forwarded unchanged to `torchtext.vocab.Vocab`.
    """
    counts = collections.Counter(
        token
        for _, text in raw_data
        for token in tokenizer.tokenize(text)
    )

    return torchtext.vocab.Vocab(counts, **vocab_kwargs)

In [16]:
min_freq = 2
max_size = 25_000

vocab = build_vocab_from_data(raw_train_data, tokenizer, min_freq = min_freq, max_size = max_size)

In [17]:
vocab.freqs.most_common(20)

[('the', 189116),
 ('.', 188121),
 (',', 153127),
 ('a', 93793),
 ('and', 92021),
 ('of', 82137),
 ('to', 74996),
 ("'", 73288),
 ('is', 61511),
 ('it', 56770),
 ('i', 55777),
 ('in', 52098),
 ('this', 46729),
 ('that', 40676),
 ('s', 33495),
 ('was', 29882),
 ('movie', 27993),
 ('as', 25564),
 ('with', 24610),
 ('for', 24381)]

In [18]:
vocab.itos[:10]

['<unk>', '<pad>', 'the', '.', ',', 'a', 'and', 'of', 'to', "'"]

In [19]:
vocab.stoi['the']

2

In [20]:
def process_raw_data(raw_data, tokenizer, vocab):
    """Wrap raw `(label, text)` pairs in a `TextClassificationDataset`.

    Text goes through `tokenizer.tokenize`, then `vocab_func(vocab)`, then
    conversion to a `torch.long` tensor; labels are only converted to a
    `torch.long` tensor.
    """
    examples = [(label, text) for label, text in raw_data]

    label_pipeline = sequential_transforms(totensor(dtype=torch.long))
    text_pipeline = sequential_transforms(
        tokenizer.tokenize,
        vocab_func(vocab),
        totensor(dtype=torch.long),
    )

    # The dataset expects the transforms in (label, text) order.
    return TextClassificationDataset(examples, vocab, (label_pipeline, text_pipeline))

In [21]:
train_data = process_raw_data(raw_train_data, tokenizer, vocab)

In [22]:
label, indexes = train_data[0]

print(indexes)

tensor([  588,  7875,    10,     5,    58,   211,     6,     5,    58,     0,
          170,     3,    32,    80,    18,     9,   329,     7,  1480,     9,
           17,     2,    34,    12,   388,   122,     4,     6,    11,   277,
            5,   200,     7,   116,    20,   113,   654,  1791,     7,   125,
         1453,    19,    39,    88,    96,  2494,  1109,     4,     6,    43,
          134,     2,  1791,     7,     2,   490,     3,    43,    10,     9,
         5697,     9,    76,    44,     2,   421,   515,     8,   277,     2,
          364,     7,     2,   490,     8,     2,   599,   829,   441,     3,
            2,  4172,   113,    30,   120,  1982,     4,    34,  2865,     6,
           34,  2320,    42,   883,    20,     5, 19273,  5565,  1642,     3,
            2,   568,     7,    41,   166,  2865,  2092,  1895,    42,  1450,
            8,    35,    41,  2074,  2421,     7,     2,  6348,  6215,   337,
        21887,     2,  2865,  2242,    13,     2,   364,     6, 

In [23]:
print([vocab.itos[i] for i in indexes])

['david', 'mamet', 'is', 'a', 'very', 'interesting', 'and', 'a', 'very', '<unk>', 'director', '.', 'his', 'first', 'movie', "'", 'house', 'of', 'games', "'", 'was', 'the', 'one', 'i', 'liked', 'best', ',', 'and', 'it', 'set', 'a', 'series', 'of', 'films', 'with', 'characters', 'whose', 'perspective', 'of', 'life', 'changes', 'as', 'they', 'get', 'into', 'complicated', 'situations', ',', 'and', 'so', 'does', 'the', 'perspective', 'of', 'the', 'viewer', '.', 'so', 'is', "'", 'homicide', "'", 'which', 'from', 'the', 'title', 'tries', 'to', 'set', 'the', 'mind', 'of', 'the', 'viewer', 'to', 'the', 'usual', 'crime', 'drama', '.', 'the', 'principal', 'characters', 'are', 'two', 'cops', ',', 'one', 'jewish', 'and', 'one', 'irish', 'who', 'deal', 'with', 'a', 'racially', 'charged', 'area', '.', 'the', 'murder', 'of', 'an', 'old', 'jewish', 'shop', 'owner', 'who', 'proves', 'to', 'be', 'an', 'ancient', 'veteran', 'of', 'the', 'israeli', 'independence', 'war', 'triggers', 'the', 'jewish', 'ident

In [24]:
valid_data = process_raw_data(raw_valid_data, tokenizer, vocab)
test_data = process_raw_data(raw_test_data, tokenizer, vocab)

In [25]:
class Collator:
    """Builds padded mini-batches from `(label, text_tensor)` examples."""

    def __init__(self, pad_idx):
        # Value written into the padded positions of shorter sequences.
        self.pad_idx = pad_idx

    def collate(self, batch):
        """Return `(labels, text)` for a list of `(label, text_tensor)` pairs.

        `labels` is a LongTensor with one entry per example; `text` is the
        result of `pad_sequence` over the text tensors, filled with `pad_idx`.
        """
        label_seq, text_seq = zip(*batch)

        label_tensor = torch.LongTensor(label_seq)
        padded_text = nn.utils.rnn.pad_sequence(text_seq, padding_value = self.pad_idx)

        return label_tensor, padded_text

In [26]:
pad_idx = vocab['<pad>']
collator = Collator(pad_idx)

In [27]:
# Number of examples per mini-batch, shared by all three loaders.
batch_size = 128

# Only the training loader shuffles; validation and test keep dataset order.
# Every loader builds its batches through `collator.collate`.
train_iterator = torch.utils.data.DataLoader(train_data, 
                                             batch_size, 
                                             shuffle = True, 
                                             collate_fn = collator.collate)

valid_iterator = torch.utils.data.DataLoader(valid_data, 
                                             batch_size, 
                                             shuffle = False, 
                                             collate_fn = collator.collate)

test_iterator = torch.utils.data.DataLoader(test_data, 
                                            batch_size, 
                                            shuffle = False, 
                                            collate_fn = collator.collate)

In [28]:
class NBOW(nn.Module):
    """Bag-of-words classifier: embed tokens, average over the sequence, apply a linear layer."""

    def __init__(self, input_dim, emb_dim, output_dim, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings = input_dim,
                                      embedding_dim = emb_dim,
                                      padding_idx = pad_idx)
        self.fc = nn.Linear(in_features = emb_dim, out_features = output_dim)

    def forward(self, text):
        """Map `text` of shape [seq len, batch size] to scores of shape [batch size, output dim]."""
        # [seq len, batch size] -> [seq len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # Average over the sequence dimension -> [batch size, emb dim],
        # then project -> [batch size, output dim].
        return self.fc(token_vectors.mean(dim = 0))

In [29]:
input_dim = len(vocab)
emb_dim = 100
output_dim = 2

model = NBOW(input_dim, emb_dim, output_dim, pad_idx)

In [30]:
def count_parameters(model):
    """Return the total number of elements across `model`'s parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [31]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,500,402 trainable parameters


In [32]:
glove = torchtext.experimental.vectors.GloVe(name = '6B',
                                             dim = emb_dim)

In [33]:
glove.vectors['the']

tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
         0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
        -0.5203, -0.1459,  0.8278,  0.27

In [34]:
glove.vectors['shoggoth']

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])

In [35]:
glove.vectors['The']

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.])

In [36]:
def get_pretrained_embedding(vectors, vocab, unk_token):
    """Build an embedding matrix for `vocab` from pretrained `vectors`.

    A token counts as unknown when `vectors[token]` is element-wise equal to
    `vectors[unk_token]`; its row is left as zeros and the token is recorded.

    Returns:
        (pretrained_embedding, unk_tokens): a `[len(vocab), emb dim]` tensor
        and the list of tokens that had no distinct pretrained vector.
    """
    unk_vector = vectors[unk_token]
    emb_dim = unk_vector.shape[-1]

    # Rows start as zeros, so unknown tokens need no explicit assignment.
    pretrained_embedding = torch.zeros(len(vocab), emb_dim)
    unk_tokens = []

    for row, token in enumerate(vocab.itos):
        candidate = vectors[token]

        if (candidate == unk_vector).all():
            unk_tokens.append(token)
            continue

        pretrained_embedding[row] = candidate

    return pretrained_embedding, unk_tokens

In [37]:
unk_token = '<unk>'

pretrained_embedding, unk_tokens = get_pretrained_embedding(glove.vectors, vocab, unk_token)

In [38]:
pretrained_embedding

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4769,  0.6460, -0.2009,  ..., -0.2221, -0.2449,  0.8116],
        [ 0.7019, -0.0129,  0.7528,  ..., -0.8730,  0.3202,  0.0773],
        [-0.1876,  0.1964,  0.4381,  ...,  0.0729, -0.5052,  0.3773]])

In [39]:
len(unk_tokens)

661

In [40]:
print(unk_tokens[:10])

['<unk>', '<pad>', '\x96', '****', 'hadn', 'camera-work', '*1/2', '100%', '$1', '*****']


In [41]:
model.embedding.weight.data

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.5903, -0.1947, -0.2415],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.7289, -0.7336,  1.5624,  ..., -0.5592, -0.4480, -0.6476],
        ...,
        [ 0.0914,  1.5196,  0.4670,  ...,  0.6393, -0.0332,  0.0185],
        [-0.6290,  0.4650, -0.7165,  ..., -1.3171,  2.0381, -2.0497],
        [-1.1222, -0.0240, -1.0878,  ..., -0.4948, -0.3874,  0.0339]])

In [42]:
model.embedding.weight.data.copy_(pretrained_embedding)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4769,  0.6460, -0.2009,  ..., -0.2221, -0.2449,  0.8116],
        [ 0.7019, -0.0129,  0.7528,  ..., -0.8730,  0.3202,  0.0773],
        [-0.1876,  0.1964,  0.4381,  ...,  0.0729, -0.5052,  0.3773]])

In [43]:
optimizer = optim.Adam(model.parameters())

In [44]:
criterion = nn.CrossEntropyLoss()

In [45]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using: {device}')

Using: cuda


In [46]:
model = model.to(device)
criterion = criterion.to(device)

In [47]:
def calculate_accuracy(predictions, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    The class is taken as the argmax over dimension 1 of `predictions`; the
    result is a float tensor equal to (number correct) / `labels.shape[0]`.
    """
    predicted_class = predictions.argmax(dim = 1, keepdim = True)
    is_correct = torch.eq(predicted_class, labels.view_as(predicted_class))
    return is_correct.sum().float() / labels.shape[0]

In [48]:
def train(model, iterator, optimizer, criterion, device):
    """Run one training pass over `iterator`, updating `model` via `optimizer`.

    `iterator` yields `(labels, text)` batches, which are moved to `device`.

    Returns:
        (loss, accuracy): the per-batch loss and accuracy values summed and
        divided by `len(iterator)`.
    """
    model.train()

    running_loss = 0
    running_acc = 0

    for labels, text in iterator:
        labels, text = labels.to(device), text.to(device)

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        predictions = model(text)
        loss = criterion(predictions, labels)
        acc = calculate_accuracy(predictions, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [49]:
def evaluate(model, iterator, criterion, device):
    """Score `model` on `iterator` in eval mode with gradient tracking disabled.

    `iterator` yields `(labels, text)` batches, which are moved to `device`.

    Returns:
        (loss, accuracy): the per-batch loss and accuracy values summed and
        divided by `len(iterator)`.
    """
    model.eval()

    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for labels, text in iterator:
            labels, text = labels.to(device), text.to(device)

            predictions = model(text)

            running_loss += criterion(predictions, labels).item()
            running_acc += calculate_accuracy(predictions, labels).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [50]:
def epoch_time(start_time, end_time):
    """Split the interval `end_time - start_time` (seconds) into whole minutes and whole seconds.

    Both parts are truncated with `int()`; returns `(minutes, seconds)`.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [51]:
# Train for a fixed number of epochs, checkpointing whenever validation loss improves.
n_epochs = 10

# Lowest validation loss seen so far; starts at +inf so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(n_epochs):

    # The timed span covers both the training pass and the validation pass.
    start_time = time.monotonic()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    
    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the best weights on disk; each improvement overwrites 'nbow-model.pt'.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'nbow-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 7s
	Train Loss: 0.670 | Train Acc: 63.32%
	 Val. Loss: 0.632 |  Val. Acc: 72.81%
Epoch: 02 | Epoch Time: 0m 7s
	Train Loss: 0.586 | Train Acc: 75.22%
	 Val. Loss: 0.532 |  Val. Acc: 79.30%
Epoch: 03 | Epoch Time: 0m 7s
	Train Loss: 0.480 | Train Acc: 82.33%
	 Val. Loss: 0.442 |  Val. Acc: 83.69%
Epoch: 04 | Epoch Time: 0m 7s
	Train Loss: 0.394 | Train Acc: 86.19%
	 Val. Loss: 0.388 |  Val. Acc: 85.39%
Epoch: 05 | Epoch Time: 0m 7s
	Train Loss: 0.336 | Train Acc: 88.18%
	 Val. Loss: 0.352 |  Val. Acc: 86.54%
Epoch: 06 | Epoch Time: 0m 7s
	Train Loss: 0.296 | Train Acc: 89.64%
	 Val. Loss: 0.332 |  Val. Acc: 86.95%
Epoch: 07 | Epoch Time: 0m 7s
	Train Loss: 0.265 | Train Acc: 90.85%
	 Val. Loss: 0.316 |  Val. Acc: 87.44%
Epoch: 08 | Epoch Time: 0m 7s
	Train Loss: 0.240 | Train Acc: 91.85%
	 Val. Loss: 0.304 |  Val. Acc: 87.95%
Epoch: 09 | Epoch Time: 0m 7s
	Train Loss: 0.220 | Train Acc: 92.60%
	 Val. Loss: 0.298 |  Val. Acc: 88.38%
Epoch: 10 | Epoch Time: 0m 7

In [52]:
# Restore the checkpoint written by the training loop (the weights with the
# lowest validation loss) before scoring the held-out test set.
model.load_state_dict(torch.load('nbow-model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion, device)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.326 | Test Acc: 86.38%


In [53]:
def predict_sentiment(tokenizer, vocab, model, device, sentence):
    """Return the model's probability of the last output class for `sentence`.

    Args:
        tokenizer: object whose `tokenize(sentence)` returns a list of tokens.
        vocab: object whose `stoi` mapping converts a token to its index.
        model: module called with a `[seq len, 1]` LongTensor of token indexes.
        device: device the input tensor is moved to before the forward pass.
        sentence: the raw string to score.

    Returns:
        A Python float: the softmax probability of the final class.

    Note:
        Switches `model` to eval mode and leaves it there.
    """
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    indexes = [vocab.stoi[token] for token in tokens]
    # [seq len] -> [seq len, 1]: a batch holding this single sentence.
    tensor = torch.LongTensor(indexes).unsqueeze(-1).to(device)
    # Inference only: disable autograd for the forward pass, as `evaluate` does.
    with torch.no_grad():
        prediction = model(tensor)
        probabilities = nn.functional.softmax(prediction, dim = -1)
    pos_probability = probabilities.squeeze()[-1].item()
    return pos_probability

In [54]:
sentence = 'the absolute worst movie of all time.'

predict_sentiment(tokenizer, vocab, model, device, sentence)

1.9899568104442267e-10

In [55]:
sentence = 'one of the greatest films i have ever seen in my life.'

predict_sentiment(tokenizer, vocab, model, device, sentence)

0.9999610185623169

In [56]:
sentence = "i thought it was going to be one of the greatest films i have ever seen in my life, \
but it was actually the absolute worst movie of all time."

predict_sentiment(tokenizer, vocab, model, device, sentence)

0.21951617300510406

In [57]:
sentence = "i thought it was going to be the absolute worst movie of all time, \
but it was actually one of the greatest films i have ever seen in my life."

predict_sentiment(tokenizer, vocab, model, device, sentence)

0.21951621770858765