# Sentiment Analysis on IMDB Rating
Implemented using RNN

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import numpy as np
import matplotlib.pyplot as plt
import datasets
import re
from collections import Counter, OrderedDict

## Preprocessing

In [10]:
#Load test and train data
train_data, test_data = datasets.load_dataset('imdb', split=['train','test'])

#Split test data into train (20k) and validate (5k)
from torch.utils.data.dataset import random_split
torch.manual_seed(1)
train_data, valid_data = random_split(list(train_data),[20000,5000])

In [23]:
def tokenizer(text):
    # Remove HTML tags
    text = re.sub('<[^>]*>', '', text)
    # Extract emoticons
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    # Eliminate excessive whitespace and convert text to lowercase
    text = re.sub(r'[\W]+', ' ', text.lower())
    # Append emoticons at the end, removing the "nose" for standardization
    text = text + ' ' + ' '.join(emoticons).replace('-', '')
    #Split by white space
    tokenized = text.split()
    return tokenized

In [29]:
#How many unique tokens are in the text corpus?
token_counts = Counter()
for review in train_data:
    text = review['text']
    tokens = tokenizer(text)
    token_counts.update(tokens)
print('number of tokens', len(token_counts))

number of tokens 69006


In [98]:
#Map each token to a unique integer. In reverse frequency order. 0 and 1 placeholders
#Sort counter in reverse frequency order
sorted_dict = sorted(
    token_counts.items(), key=lambda x:x[1], reverse=True
)
ordered_dict = OrderedDict(sorted_dict)

#Word_index contains word:index pairs
word_index = {}
counter = 2
for word, freq in ordered_dict.items():
    word_index[word] = counter
    counter += 1

#0 reserverd for padding. 1 reserved for unknown words
word_index['<pad>'] = 0
word_index['<unk>'] = 1

#Demonstrate encoding scheme works
def word_index_conversion(text):
    encoding = []
    tokens = tokenizer(text)
    for token in tokens:
        encoding.append(word_index.get(token,1))
    return encoding

#Testing
print(word_index_conversion("Roses are red"))
print(word_index_conversion("roSes ARE reD :)"))

[11558, 26, 736]
[11558, 26, 736, 2152]


In [81]:
def build_dataloader(batch):
    label_list, text_list, lengths = [], [], []
    for review in batch:
        text = review['text']
        label = review['label']
        label_list.append(label)
        processed_text = torch.tensor(word_index_conversion(text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    #Ensure all sequence in minibatch have same length to store efficiently as tensor
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return padded_text_list, label_list, lengths

In [92]:
#Load a small sample with batchsize of 4
from torch.utils.data import DataLoader
dataloader = DataLoader(train_data,batch_size=4,shuffle=False, collate_fn=build_dataloader)
text_batch, label_batch, length_batch = next(iter(dataloader))
#Length of text_batch is maximum in the minibatch

In [95]:
#Divide into batches of size 32
batch_size = 32
train_dl = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=build_dataloader)
valid_dl = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=build_dataloader)
test_dl = DataLoader(test_data, batch_size=batch_size, shuffle=True, collate_fn=build_dataloader)

## Dimensionality Reduction
One way to encode the index is via one hot encoding. Results in sparse feature vectors \
Suffer from curse of dimensionality \
A better approach is to map each word to a vector of fixed size with real-valued elements \
-> Advantage: Reduction in dimensionality of the feature space \
-> Extraction of salient features since the embedding layer in an NN can be optimized \
Let n be the number of unique words \
Embedding matrix is of size (n+2) x embedding_dim. Reserve 2 spots for \<unknown\> and \<pad\> \
Given integer index i, simply look up the row at index i

In [108]:
embedding = nn.Embedding(num_embeddings=100, 
                         embedding_dim=3, #Dim of embedding space
                         padding_idx=0) #Which index indicates padding. Doesn't contribute to gradient update
#Sample training data
text_encoded_input = torch.LongTensor([word_index_conversion("Roses are red"), 
                                      word_index_conversion("Violets are blue")])
print(embedding(text_encoded_input))

IndexError: index out of range in self