# RNN Movie Review Predictor

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as du
from torch.utils.data import Dataset
from torchvision import datasets, transforms, models
import torchvision.utils as vutils
import matplotlib.pyplot as plt
import os
import cv2 
import string

In [2]:
class Movie_Review_Data(Dataset):
    '''
    data_path: location of dataset
    seq_len: maximum length of a sentence
    embeddings_size: length of a word embedding vector
    '''
    def __init__(self, data_path, seq_len, embeddings, embeddings_size):
        super(Movie_Review_Data, self).__init__()
        self.seq_len = seq_len
        self.embeddings = embeddings
        self.embeddings_size = embeddings_size
        data_dict = None
        with open(data_path, 'rb') as handle:
            data_dict = pickle.load(handle)
        if(data_dict is None):
            return "Invalid data path"
        self.data = []
        self.labels = []
        for d in data_dict.items():
            self.data.append(d[1:][0][1])
            # self.labels.append(np.round(d[1:][0][0])-1)
            self.labels.append(1 if d[1:][0][0] > 7 else 0)
        self.labels = torch.tensor(self.labels, dtype=torch.long)
        # self.x = torch.ones(self.data.shape[0], seq_len, embeddings_size) * embeddings["_"]  # the embedding of "_" is used as a padding token
        # self.y = torch.tensor(self.labels)
        # for i, d in enumerate(self.data):
        #     for j, word in enumerate(d.split()):
        #         try:
        #             embed = embeddings[(word.lower()).translate(str.maketrans('', '', string.punctuation))]
        #             self.x[i,j] = embed
        #         except:
        #             word = word.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
        #             for w in word.split():
        #                 try:
        #                     embed = embeddings[(w.lower()).translate(str.maketrans('', '', string.punctuation))]
        #                     self.x[i,j] = embed
        ##              except:
        ##                  print(f"Unable to parse word: {word}")
                

    def __len__(self):
        '''return len of dataset'''
        return len(self.data)
    def __getitem__(self, idx):
        '''return sequence, future sequence'''
        # return self.x[idx], self.y[idx]
        self.x = torch.ones(self.seq_len, self.embeddings_size) * self.embeddings["_"]  # the embedding of "_" is used as a padding token
        for j, word in enumerate(self.data[idx].split()):
            if (j >= self.seq_len):
                break
            try:
                embed = self.embeddings[(word.lower()).translate(str.maketrans('', '', string.punctuation))]
            except:
                word = word.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
                for w in word.split():
                    try:
                        embed = self.embeddings[(w.lower()).translate(str.maketrans('', '', string.punctuation))]
                    except:
                        embed = self.embeddings["_"]
                    else:
                        self.x[j] = torch.tensor(embed)
            else:
                self.x[j] = torch.tensor(embed)
        return self.x, self.labels[idx]

In [3]:
class LSTM(nn.Module):
    def __init__(self, in_feat, hidden_dim, num_layers, out_dim, dropout, seq_len):
        '''
        in_dim: input layer dim
        hidden_layers: hidden layers in lstm
        '''
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(in_feat, hidden_dim, num_layers, bidirectional=True, dropout=dropout, batch_first=True)
        self.flatten = nn.Flatten()
        # self.fc1 = nn.Linear(hidden_dim*seq_len*2, (hidden_dim*seq_len*2)//2)
        # self.fc2 = nn.Linear((hidden_dim*seq_len*2)//2, out_dim)
        self.fc1 = nn.Linear(hidden_dim*2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        # x, _ = self.lstm(x)
        # x = self.fc1(F.relu(self.flatten(x)))
        # x = self.fc2(F.relu(x))
        _, (hidden, _) = self.lstm(x)
        x = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        # x = self.fc1(F.relu(self.flatten(x)))
        x = self.fc1(self.flatten(x))
        x = self.fc2(F.relu(x))
        return x

In [4]:
device = f'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f"using device: {device}")
batch_size = 4
learning_rate = 1e-4
max_epochs = 100
hidden_dim = 128
num_layers = 2
dropout = 0.2
# out_dim = 10
out_dim = 2
seq_len = 2000
embeddings_size = 200
seed = 0

torch.manual_seed(seed)
model = LSTM(embeddings_size, hidden_dim, num_layers, out_dim, dropout, seq_len)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
embeddings = {}
with open(f"Embeddings/glove.twitter.27B/glove.twitter.27B.{embeddings_size}d.txt", "r", encoding="utf-8") as f: # parsing file and saving each word embedding in a hashmap as {word: embedding}
    for line in f:
        values = line.split()
        embeddings[values[0]] = np.array(values[1:], dtype="float32")

# load training data in batches
SAVE_LOCATION = './data/'
train_loader = du.DataLoader(dataset=Movie_Review_Data(f'{SAVE_LOCATION}processed/data_train.pkl', seq_len, embeddings, embeddings_size, ),
                             batch_size=batch_size,
                             shuffle=True)
dev_loader = du.DataLoader(dataset=Movie_Review_Data(f'{SAVE_LOCATION}processed/data_dev.pkl', seq_len, embeddings, embeddings_size),
                             batch_size=batch_size,
                             shuffle=True)
test_loader = du.DataLoader(dataset=Movie_Review_Data(f'{SAVE_LOCATION}processed/data_test.pkl', seq_len, embeddings, embeddings_size),
                             batch_size=batch_size,
                             shuffle=True)
# send model over to device
model = model.to(device)
model.train()

using device: cuda:0


LSTM(
  (lstm): LSTM(200, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=2, bias=True)
)

In [5]:
last_loss = np.inf
train_loss_list = []
valid_loss_list = []
valid_acc_list = []
# iterating through all epochs
for epoch in range(1, max_epochs + 1):    
    # training step
    train_loss = 0.
    train_accuracy = 0.
    model.train()
    # iterating through entire dataset in batches
    for batch_idx, (data, target) in enumerate(tqdm(train_loader)):
        # sending batch over to device
        data, target = data.to(device), target.to(device)
        # zeroing out previous gradients
        optimizer.zero_grad()
        # getting predictions from model
        pred = model(data)
        # # calculating BCE loss between predictions and labels
        loss = F.cross_entropy(pred, target)
        train_loss += loss.item()
        # # calculating backprop and using an adam optimizer for update step 
        loss.backward()
        optimizer.step()
        train_accuracy += torch.sum(torch.argmax(pred, dim=1) == target)

    dev_loss = 0.
    dev_accuracy = 0.
    with torch.no_grad():
        model.eval()
        # iterating through entire dataset in batches
        for batch_idx, (data, target) in enumerate(tqdm(dev_loader)):
            # sending batch over to device
            data, target = data.to(device), target.to(device)
            # zeroing out previous gradients
            optimizer.zero_grad()
            # getting predictions from model
            pred = model(data)
            # # calculating BCE loss between predictions and labels
            loss = F.cross_entropy(pred, target)
            dev_loss += loss.item()
            dev_accuracy += torch.sum(torch.argmax(pred, dim=1) == target)
    train_loss /= len(train_loader.dataset)
    train_accuracy /= len(train_loader.dataset)
    dev_loss /= len(dev_loader.dataset)
    dev_accuracy /= len(dev_loader.dataset)
    print(f"Epoch: {epoch}, training_loss {train_loss}, training_accuracy {train_accuracy}, dev_loss {dev_loss}, dev_accuracy {dev_accuracy}")
    

100%|██████████| 2000/2000 [08:06<00:00,  4.11it/s]
100%|██████████| 250/250 [00:37<00:00,  6.64it/s]


Epoch: 1, training_loss 0.1653024829812348, training_accuracy 0.5997500419616699, dev_loss 0.15155702260136605, dev_accuracy 0.706000030040741


100%|██████████| 2000/2000 [08:58<00:00,  3.71it/s]
100%|██████████| 250/250 [00:39<00:00,  6.36it/s]


Epoch: 2, training_loss 0.15947962354309858, training_accuracy 0.6472500562667847, dev_loss 0.15162844339013098, dev_accuracy 0.6950000524520874


100%|██████████| 2000/2000 [08:58<00:00,  3.71it/s]
100%|██████████| 250/250 [00:40<00:00,  6.10it/s]


Epoch: 3, training_loss 0.15166995834745467, training_accuracy 0.6842500567436218, dev_loss 0.1545852716565132, dev_accuracy 0.659000039100647


100%|██████████| 2000/2000 [07:55<00:00,  4.20it/s]
100%|██████████| 250/250 [00:33<00:00,  7.38it/s]


Epoch: 4, training_loss 0.15370827978104354, training_accuracy 0.6681250333786011, dev_loss 0.15093438228964806, dev_accuracy 0.7020000219345093


100%|██████████| 2000/2000 [08:34<00:00,  3.88it/s]
100%|██████████| 250/250 [00:37<00:00,  6.69it/s]


Epoch: 5, training_loss 0.151339020896703, training_accuracy 0.687250018119812, dev_loss 0.14966784504055977, dev_accuracy 0.7000000476837158


 35%|███▌      | 709/2000 [02:43<04:57,  4.33it/s]


KeyboardInterrupt: 