In [1]:
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader, Subset
import pandas as pd
import os
import random
from torch.utils.data import random_split
import re 
from collections import Counter, OrderedDict

In [2]:
from typing import Dict, List

**Project one - predicting the sentiment of IMDb movie reviews:**

Sentiment analysis is concerned with analysing the expressed opinion of a sentence or text document. Following section: we implement a multilayer RNN for sentiment analysis using a many-to-one architecture. 

In [3]:
class TextDataset(Dataset):
    def __init__(self, root_dir, split='train', transform = None):
        self.samples = []
        self.transform = transform 

        combined_path = os.path.join(root_dir,split)
        for label_name in ['pos','neg']:
            label_dir = os.path.join(combined_path, label_name)
            label = 1 if label_name == 'pos' else 0
            for fname in os.listdir(label_dir):
                if fname.endswith('.txt'):
                    path = os.path.join(label_dir, fname)
                    self.samples.append((path, label))
        random.shuffle(self.samples)
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        path, label = self.samples[idx]
        with open(path, 'r', encoding='utf-8') as f:
            text = f.read()
        if self.transform:
            text = self.transform(text)
        return text, label    

In [4]:
root_dir = "/Users/blaise/Documents/ML/Machine-Learning-and-Big-Data-Analytics/data/aclImdb"
train_dataset = TextDataset(root_dir=root_dir, split='train')
test_dataset = TextDataset(root_dir=root_dir, split='test')

In [5]:
len(train_dataset), len(test_dataset)

(25000, 25000)

In [6]:
train_dataset[0]

('This "movie" is such a bad work! Nothing seems to even try and be realistic. Plot is weak, acting - miserable, actors wondering around like in a 1st year production, trying very hard to act with no chance at all from the beginning. What a flop! What a waste of time, money and effort to all concerned including the audience. Well, as in any thriller, here too are murders, corpses and blood. Just imagine someone who 5 minutes ago, committed a murder with a knife, and came out calm and smiling, not to mention clean as a whistle, as if slashing one\'s throat is done by a virtual agent. Also, this murder was supposed to be done by a tiny fragile woman on a high strong male, and she cut his throat!!! Did she ask him, politely, to bend down for her? Much more stupidity of that same kind is going on and on leaving the audience wondering if this meant to be a joke which just turned out to be a bad one. Continuity is another huge problem as for instance: The eager-hungry groom is lying in bed, 

Creating a validation set from the training set:

In [7]:
train_dataset, valid_dataset = random_split(train_dataset, [20000,5000])

In [8]:
len(train_dataset), len(valid_dataset)

(20000, 5000)

To prepare the data for input to a NN, we need to encode it to numeric values. We need to find the unique words(tokens) in the training dataset. We will use the python Counter class to find the unique tokens in the text. We are only interested in the unique word and won't require word counts as in BoW (Bag of Words) models. word counts here are created as a side product. To split the text into words - we will use a tokenizer function, which also cleans up the text, removing html markups, as well as punctuation and other non-letter characters. 

In [9]:
def tokenizer(text):
    text = re.sub('<[^>]*>','', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub("[^\w']+",' ', text.lower()) + ' ' +' '.join(emoticons).replace('-','')
    tokenized = text.split()
    return tokenized

  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  text = re.sub("[^\w']+",' ', text.lower()) + ' ' +' '.join(emoticons).replace('-','')


In [10]:
line, label = train_dataset[0]

In [11]:
line

'My first review of 2010 is "Into The Blue 2: The Reef". The story is about two divers played by Chris Carmack and Laura Vandervoort who love to explore hidden treasures at a bottom of a local reef. One day after a day of exploring they are approached by a couple played by David Anders and Marsha Thomason. They tell the young divers that they want to hire them to explore the reef and find a rare artifact about Columbus\' hidden treasure that is reported at the bottom of the reef.<br /><br />Next day the four dive to the bottom of the reef and of coarse after a whole day of diving they find nothing. A few more days past and the two hired divers found out that they a part of a major deadly plot in which they can\'t escape otherwise they will be killed. They were hired to find two big containers. One contains a nuclear reactor and the other contains a core.<br /><br />The movie also has a back story about another person (brother of the lead character) trying to patch things up with his gi

In [12]:
tokens = tokenizer(line)

In [13]:
tokens

['my',
 'first',
 'review',
 'of',
 '2010',
 'is',
 'into',
 'the',
 'blue',
 '2',
 'the',
 'reef',
 'the',
 'story',
 'is',
 'about',
 'two',
 'divers',
 'played',
 'by',
 'chris',
 'carmack',
 'and',
 'laura',
 'vandervoort',
 'who',
 'love',
 'to',
 'explore',
 'hidden',
 'treasures',
 'at',
 'a',
 'bottom',
 'of',
 'a',
 'local',
 'reef',
 'one',
 'day',
 'after',
 'a',
 'day',
 'of',
 'exploring',
 'they',
 'are',
 'approached',
 'by',
 'a',
 'couple',
 'played',
 'by',
 'david',
 'anders',
 'and',
 'marsha',
 'thomason',
 'they',
 'tell',
 'the',
 'young',
 'divers',
 'that',
 'they',
 'want',
 'to',
 'hire',
 'them',
 'to',
 'explore',
 'the',
 'reef',
 'and',
 'find',
 'a',
 'rare',
 'artifact',
 'about',
 "columbus'",
 'hidden',
 'treasure',
 'that',
 'is',
 'reported',
 'at',
 'the',
 'bottom',
 'of',
 'the',
 'reef',
 'next',
 'day',
 'the',
 'four',
 'dive',
 'to',
 'the',
 'bottom',
 'of',
 'the',
 'reef',
 'and',
 'of',
 'coarse',
 'after',
 'a',
 'whole',
 'day',
 'of',


In [14]:
Counter(tokens)

Counter({'the': 19,
         'a': 18,
         'of': 12,
         'and': 9,
         'to': 8,
         'they': 8,
         'is': 6,
         'reef': 5,
         'by': 4,
         'day': 4,
         'that': 4,
         'story': 3,
         'about': 3,
         'two': 3,
         'divers': 3,
         'bottom': 3,
         'find': 3,
         'which': 3,
         'this': 3,
         'into': 2,
         'blue': 2,
         '2': 2,
         'played': 2,
         'explore': 2,
         'hidden': 2,
         'at': 2,
         'one': 2,
         'after': 2,
         'couple': 2,
         'hired': 2,
         'part': 2,
         'plot': 2,
         'in': 2,
         'be': 2,
         'contains': 2,
         'movie': 2,
         'also': 2,
         'has': 2,
         'waste': 2,
         'it': 2,
         'characters': 2,
         'my': 1,
         'first': 1,
         'review': 1,
         '2010': 1,
         'chris': 1,
         'carmack': 1,
         'laura': 1,
         'vandervoort': 1,
  

Building the vocabulary:

In [15]:
token_counts = Counter()

for line, label in train_dataset:
    tokens = tokenizer(line)
    token_counts.update(tokens)
print('Vocab size: ', len(token_counts))

Vocab size:  80366


Next up, we go to mapping each unique word to a unique integer. This can be done manually using a python dictionary, where the keys are the unique tokens (words) and the value associated with each key is a unique integer. 

Don't have the torchtext package so - implementing the vocabulary class:

In [16]:
class Vocabulary:
    def __init__(self, ordered_dict: OrderedDict, unk_index: int=None):
        # ordered dict maps token -> frequency (frequency is ignored after init)
        self.itos: List[str] = list(ordered_dict.keys()) # index -> token
        self.stoi: Dict[str,int] = {tok:idx for idx,tok in enumerate(self.itos)}
        self.unk_index = unk_index
        self.default_index = unk_index 
    
    def insert_token(self, token: str, index: int) -> None:
        """Insert a token at a specific index (shifts existing entries)"""
        if token in self.stoi:
            # token already exists -> remove old entry
            old_idx = self.stoi.pop(token)
            # shift everything after old_idx down
            for t, i in self.stoi.items():
                if i > old_idx:
                    self.stoi[t] = i-1
            self.itos = [t for t,_ in sorted(self.stoi.items(), key=lambda x: x[1])]

        # insert at the requested index
        self.itos.insert(index, token)
        self.stoi[token] = index
        # shift everything >= index up by 1
        for t,i in self.stoi.items():
            if i >= index and t!=token:
                self.stoi[t] = i+1
    
    def set_default_index(self, idx: int) -> None:
        self.default_index = idx
    
    # convenience methods 
    def __getitem__(self, token:str) -> int:
        return self.stoi.get(token, self.default_index)
    
    def __len__(self) -> int:
        return len(self.itos)
    
    def lookup_indices(self, tokens: List[str]) -> List[int]:
        return [self[t] for t in tokens]
    
    def lookup_tokens(self, indices:List[int]) -> List[str]:
        return [self.itos[i] for i in indices if i < len(self.itos)]

In [17]:
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_dict =OrderedDict(sorted_by_freq_tuples)
vocab = Vocabulary(ordered_dict)
vocab.insert_token("<pad>",0)
vocab.insert_token("<unk>",1)
vocab.set_default_index(1)

To demonstrate the working of the vocab object, we will convert an example input text into a list of integer values:

In [18]:
print([vocab[token] for token in ['this','is','an','example']])

[11, 7, 32, 456]


Any token not in the vocabulary will get assigned the index 1 -  the unknown word/token index. Another reserved value is the integer 0, which serves as a placeholder, a so-called padding token, for adjusting the sequence length. We can now define the text pipeline function to transform each text accordingly

In [19]:
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

We will generate batches of samples using DataLoader and pass the data processing pipelines declared previously to the argument collate_fn. We will wrap the text encoding into the collate_batch function.

In [20]:
def collate_batch(batch):
    label_list, text_list, lengths = [], [], []

    for _text, _label in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return padded_text_list, label_list, lengths

In [21]:
# Take a small batch
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False, collate_fn=collate_batch)

Let's illustrate how this padding works:
- lets observe some of the results from the above operations...

In [22]:
text_batch, label_batch, length_batch = next(iter(dataloader))

In [23]:
print(text_batch)

tensor([[  57,   83,  721,  ...,    4,  352, 1338],
        [   2, 1031, 4706,  ...,    0,    0,    0],
        [  10, 3003,   11,  ...,    0,    0,    0],
        [ 240, 1613,  420,  ...,    0,    0,    0]])


In [24]:
print(label_batch)

tensor([0, 0, 0, 1])


In [25]:
print(length_batch)

tensor([277, 202, 253, 222])


In [26]:
print(text_batch.shape)

torch.Size([4, 277])


Finally, let us divide all 3 datasets into data loaders with a batch size of 32

In [27]:
batch_size = 128
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

Now, the data is in a suitable format for an rnn model, which is implemented belowL

**Embedding layers for sentence encoding:**

Given a set of tokens of size n+2 (n is the size of the token set, plus index 0 is reserved for the padding placeholder, and 1 is for the words not present in the token set), an embedding matrix of size (n+2)xembedding_dim will be created where each row of this matrix represents numeric features associated with a token. Therefore, when an integer index, i, is given as input to the embedding, it will lookup the corresponding row of the matrix at index i and return numeric features. The embedding matrix serves as the input to our NN models, In practice creating an embedding layer is simply done using nn.Embedding.

In [28]:
embedding = nn.Embedding(
    num_embeddings=10,
    embedding_dim=3,
    padding_idx=0
)

In [29]:
# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[ 3.5943e-01,  1.6515e-01, -1.1147e+00],
         [ 8.9677e-01,  9.6746e-01,  1.1298e+00],
         [ 7.7055e-04, -2.0528e+00, -7.1877e-01],
         [ 1.6979e+00, -2.8334e-01,  2.3272e-01]],

        [[ 7.7055e-04, -2.0528e+00, -7.1877e-01],
         [-1.9676e+00,  2.5564e-01,  2.4446e-01],
         [ 8.9677e-01,  9.6746e-01,  1.1298e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00]]],
       grad_fn=<EmbeddingBackward0>)


- Inputs to the embedding layer must have a rank of 2 - with the dimensionality of batch_size x input_length, where input_length is the length of sequences (here, 4). Output of the embedding layer will then have output of size: batchsize x inputlength x embedding_dim. Where embedding_dim is the size of the embedding features. The other argument provided to the embedding_layer, num_embeddings, correspinds to the unique integer values that the model will receive as input (for instance, n+2, set to 10 in this example). Therefore the embedding matrix in this case has the size 10x3.
- padding_idx indicates the token index for padding (here, 0), which, if specified, will not contribute to the gradient updates during training. In our example, the length of the original sequence of the second example is 3, which we padded with 1 more element 0. The embedding output of the padded element is [0,0,0]

**Building an RNN model:**
- Now, we can build an RNN model. Using the nn.Module class, we can combine the embedding layer, the recurrent layers of the RNN,  and the fully connected non-recurrent layers. For the recurrent layers, we can use any of the following implementations:
  - RNN: a regular RNN layer, that is, a fully connected recurrent layer
  - LSTM:  a long short-term memory RNN, which is useful for capturing long-term dependencies
  - GRU: a recurrent layer with a gated recurrent unit. 
- To see how a multilayer RNN model can be built using one of these recurrent layers, in the following example, we create an RNN model with 2 recurrent layers of type RNN. Finally, we will add a non-recurrent fully connected layer as the output layer, which will return a single output value as the prediction

In [30]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)
    
    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1,:,:] # we use the final hidden state from the last hidden layer as the input to the fc layer, as this is equal to the output of rnn's last time step
        out = self.fc(out)
        return out

model = RNN(64, 32)
print(model)      


RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)


In [31]:
device = torch.device("mps" if torch.mps.is_available() else "cpu")
# device = torch.device("cpu")

In [32]:
# model.to(device)
# model(torch.randn(5, 3, 64).to(device))

Building RNNs with these recurrent layers is as straightforward as can be seen above. In the next subsection, we go back to the sentiment analysis task and build an RNN model to solve that

**Building an RNN model for the sentiment analysis task:**
- Because of the length of the sequences we've got and the need to store and propagate early learned useful context over long sequences we are going to use an LSTM layer to account for this (long range effects). We create an RNN model for sentiment analysis, starting with an embedding layer producing word embeddings of feature size 20 (embed_dim=20). Then, a recurrent layer of type LSTM eill be added. Finally, we add a fully connected layer as a hidden layer and another fully connected layer as the output layer, which will return a single class-membership probability value using the logistic sigmoid activation as the prediction.

In [33]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(
            out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True
        )
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [34]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(1)

<torch._C.Generator at 0x11d3edeb0>

In [35]:
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size).to(device)

In [36]:
model

RNN(
  (embedding): Embedding(80368, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [44]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad(set_to_none=True)
        label_batch = label_batch.float()
        text_batch, label_batch = text_batch.to(device), label_batch.to(device)
        pred = model(text_batch, lengths)[:,0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += (
            (pred >= 0.5).float() == label_batch
        ).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [45]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            label_batch = label_batch.float()
            text_batch, label_batch = text_batch.to(device), label_batch.to(device)
            pred = model(text_batch, lengths)[:,0]
            loss = loss_fn(pred, label_batch)
            total_acc += (
                (pred >= 0.5).float() == label_batch
            ).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [46]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [40]:
num_epochs = 10
torch.manual_seed(1)
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch: {epoch} | accuracy: {acc_train:.4f} | val_accuracy: {acc_valid:.4f}')


Epoch: 0 | accuracy: 0.5581 | val_accuracy: 0.6226
Epoch: 1 | accuracy: 0.6668 | val_accuracy: 0.7200
Epoch: 2 | accuracy: 0.6828 | val_accuracy: 0.7420
Epoch: 3 | accuracy: 0.7689 | val_accuracy: 0.7678
Epoch: 4 | accuracy: 0.8109 | val_accuracy: 0.7712
Epoch: 5 | accuracy: 0.8307 | val_accuracy: 0.7962
Epoch: 6 | accuracy: 0.8565 | val_accuracy: 0.7898
Epoch: 7 | accuracy: 0.8801 | val_accuracy: 0.8342
Epoch: 8 | accuracy: 0.8858 | val_accuracy: 0.8272
Epoch: 9 | accuracy: 0.8940 | val_accuracy: 0.8336


Evaluate on the test data:

In [41]:
acc_test, _ = evaluate(test_dl)

In [42]:
print(f'test accuracy: {acc_test:.4f}')

test accuracy: 0.8302


**More on the bidirectional RNN:**
- In addition, we will set the bidirectional configuration of the LSTM to true, which will make the recurrent layer pass through input sequences from both directions, start to end, as well as in the reverse direction:

In [43]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size, embed_dim, padding_idx=0
        )
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden,cell) = self.rnn(out)
        out = torch.cat((hidden[-2,:,:], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [60]:
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size).to(device)

In [61]:
batch_size = 256
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [62]:
num_epochs = 10
torch.manual_seed(1)
for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    print(f'Epoch: {epoch} | accuracy: {acc_train:.4f} | val_accuracy: {acc_valid:.4f}')


Epoch: 0 | accuracy: 0.5036 | val_accuracy: 0.4856
Epoch: 1 | accuracy: 0.5036 | val_accuracy: 0.4856


  emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
  text = re.sub("[^\w']+",' ', text.lower()) + ' ' +' '.join(emoticons).replace('-','')


KeyboardInterrupt: 