In [1]:
from data_utils import load_imdb, get_train_test_split

import os, sys, random, functools
os.environ['CUDA_VISIBLE_DEVICES'] = '7'
import numpy as np
from collections import Counter
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

torch.backends.cudnn.benchmark = True

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/jz288/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# load raw data and split it into train/test

In [2]:
x_train, x_test, y_train, y_test = get_train_test_split("IMDBDataset.csv", test_size=0.2)

length of train data is 40000
length of test data is 10000


# tokenizer

In [3]:
def build_vocab(x_train, min_freq=1, hparams=None):
    word_list = []
    stop_words = set(stopwords.words('english'))
    for sent in x_train:
        for word in sent.lower().split():
            if word not in stop_words and word != '':
                word_list.append(word)

    corpus = Counter(word_list)
    # sorting on the basis of most common words
    corpus_ = [word for word, freq in corpus.items() if freq >= min_freq]
    # creating a dict
    # here assume the pad token is 0 and unknown token is 1
    vocab = {w: i+2 for i, w in enumerate(corpus_)}
    vocab[hparams.PAD_TOKEN] = hparams.PAD_INDEX
    vocab[hparams.UNK_TOKEN] = hparams.UNK_INDEX
    return vocab

def tokenize(vocab, sentence):
    return [vocab[word] for word in sentence.lower().split() if word in vocab.keys()]

# a custom pytorch dataset class

In [4]:
class IMDB(Dataset):
    def __init__(self, x, y, vocab, max_length=256) -> None:
        self.x = x
        self.y = y
        self.vocab = vocab
        self.max_length = max_length

    def __getitem__(self, idx):
        token_ids = tokenize(self.vocab, self.x[idx])
        if self.max_length:
            token_ids = token_ids[:self.max_length]

        label = 1 if self.y[idx] == 'positive' else 0
        return {"ids": token_ids, "length": len(token_ids), "label": label}

    def __len__(self):
        return len(self.x)

# LSTM model

In [5]:
def init_weights(m):
    if isinstance(m, nn.Embedding):
        nn.init.xavier_normal_(m.weight)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM) or isinstance(m, nn.GRU):
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.zeros_(param)
            elif 'weight' in name:
                nn.init.orthogonal_(param)
                
class LSTM(nn.Module):
    def __init__(
        self, 
        vocab_size: int, 
        embedding_dim: int, 
        hidden_dim: int, 
        output_dim: int, 
        n_layers: int, 
        dropout_rate: float, 
        pad_index: int,
        bidirectional: bool,
        **kwargs):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout_rate, batch_first=True,
                           bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.apply(init_weights)

    def forward(self, ids:torch.Tensor, length:torch.Tensor):
        embedded = self.dropout(self.embedding(ids))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, length, batch_first=True,
                                                            enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        hidden = self.dropout(hidden[-1])
        prediction = self.fc(hidden)
        return prediction

# training/evaluation function

In [6]:
def train(dataloader, model, criterion, optimizer, scheduler, device):
    model.train()
    epoch_losses = []
    epoch_acc = 0

    for batch in tqdm(dataloader, desc='training...', file=sys.stdout):
        ids = batch['ids'].to(device)
        length = batch['length']
        label = batch['label'].to(device)
        prediction = model(ids, length)
        loss = criterion(prediction, label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_losses.append(loss.item())
        epoch_acc += get_correct(prediction, label)
        scheduler.step()

    return epoch_losses, epoch_acc / len(dataloader.dataset)

def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_acc = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids = batch['ids'].to(device)
            length = batch['length']
            label = batch['label'].to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label)
            epoch_losses.append(loss.item())
            epoch_acc += get_correct(prediction, label)

    return epoch_losses, epoch_acc / len(dataloader.dataset)

def get_correct(prediction, label):
    predicted_classes = prediction.argmax(dim=-1)
    return predicted_classes.eq(label).sum().item()

def predict_sentiment(text, model, vocab, device):
    tokens = tokenize(vocab, text)
    ids = [vocab[t] if t in vocab else UNK_INDEX for t in tokens]
    length = torch.LongTensor([len(ids)])
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)
    prediction = model(tensor, length).squeeze(dim=0)
    probability = torch.softmax(prediction, dim=-1)
    predicted_class = prediction.argmax(dim=-1).item()
    predicted_probability = probability[predicted_class].item()
    return predicted_class, predicted_probability

def collate(batch, pad_index):
    batch_ids = [torch.LongTensor(i['ids']) for i in batch]
    batch_ids = nn.utils.rnn.pad_sequence(batch_ids, padding_value=pad_index, batch_first=True)
    batch_length = torch.Tensor([i['length'] for i in batch])
    batch_label = torch.LongTensor([i['label'] for i in batch])
    batch = {'ids': batch_ids, 'length': batch_length, 'label': batch_label}
    return batch

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

class ConstantWithWarmup(torch.optim.lr_scheduler._LRScheduler):
    def __init__(
        self,
        optimizer,
        num_warmup_steps: int,
    ):
        self.num_warmup_steps = num_warmup_steps
        super().__init__(optimizer)

    def get_lr(self):
        if self._step_count <= self.num_warmup_steps:
            # warmup
            scale = 1.0 - (self.num_warmup_steps - self._step_count) / self.num_warmup_steps
            lr = [base_lr * scale for base_lr in self.base_lrs]
            self.last_lr = lr
        else:
            lr = self.base_lrs
        return lr

def train_and_test_model_with_hparams(data, hparams, model_type="lstm", **kwargs):
    torch.manual_seed(hparams.SEED)
    random.seed(hparams.SEED)
    np.random.seed(hparams.SEED)

    x_train, x_test, y_train, y_test = data
    vocab = build_vocab(x_train, hparams=hparams)
    vocab_size = len(vocab)
    print(f'Length of vocabulary is {vocab_size}')

    train_data = IMDB(x_train, y_train, vocab, hparams.MAX_LENGTH)
    test_data = IMDB(x_test, y_test, vocab, hparams.MAX_LENGTH)

    collate_fn = functools.partial(collate, pad_index=hparams.PAD_INDEX)

    train_dataloader = torch.utils.data.DataLoader(
        train_data, batch_size=hparams.BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(
        test_data, batch_size=hparams.BATCH_SIZE, collate_fn=collate_fn)
    
    # Model
    model = LSTM(
        vocab_size, 
        hparams.EMBEDDING_DIM, 
        hparams.HIDDEN_DIM, 
        hparams.OUTPUT_DIM,
        hparams.N_LAYERS,
        hparams.DROPOUT_RATE, 
        hparams.PAD_INDEX,
        hparams.BIDIRECTIONAL,
        **kwargs)
    num_params = count_parameters(model)
    print(f'The model has {num_params:,} trainable parameters')
    print('='*50)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    optimizer = optim.RMSprop(
            model.parameters(), lr=hparams.LR, weight_decay=hparams.WD, eps=1e-6, momentum=.9)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    # Start training
    best_test_loss = float('inf')
    train_losses = []
    train_accs = []
    test_losses = []
    test_accs = []
    
    # Warmup Scheduler
    WARMUP_STEPS = 200
    lr_scheduler = ConstantWithWarmup(optimizer, WARMUP_STEPS)

    best_acc = 0.
    for epoch in range(hparams.N_EPOCHS):
        train_losses, train_acc = train(train_dataloader, model, criterion, optimizer, lr_scheduler, device)
        test_losses, test_acc = evaluate(test_dataloader, model, criterion, device)
        
        if test_acc >= best_acc:
            best_acc = test_acc
            #torch.save(
            #    model.state_dict(),
            #    'best_lstm.pth'
            #)

        tqdm.write(f'epoch: {epoch+1}')
        tqdm.write(f'train_loss: {np.mean(train_losses):.3f}, train_acc: {train_acc:.3f}')
        tqdm.write(f'test_loss: {np.mean(test_losses):.3f}, test_acc: {test_acc:.3f}')
        tqdm.write('-'*50)

# training/evaluation

In [7]:
class HyperParams:
    def __init__(self):
        # Constance hyperparameters. They have been tested and don't need to be tuned.
        self.PAD_INDEX = 0
        self.UNK_INDEX = 1
        self.PAD_TOKEN = '<pad>'
        self.UNK_TOKEN = '<unk>'
        self.STOP_WORDS = set(stopwords.words('english'))
        self.MAX_LENGTH = 256
        self.BATCH_SIZE = 96
        self.EMBEDDING_DIM = 1
        self.HIDDEN_DIM = 100
        self.OUTPUT_DIM = 2
        self.N_LAYERS = 1
        self.DROPOUT_RATE = 0
        self.LR = 0.001
        self.N_EPOCHS = 5
        self.WD = 0
        self.BIDIRECTIONAL = False
        self.SEED = 2

In [8]:
hparams = HyperParams()
hparams.N_LAYERS = 1
hparams.DROPOUT_RATE = 0.
hparams.WD = 0.0001
hparams.BIDIRECTIONAL = False
hparams.MAX_LENGTH = 512
train_and_test_model_with_hparams([x_train, x_test, y_train, y_test], hparams)

Length of vocabulary is 318916
The model has 360,318 trainable parameters
training...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 417/417 [00:15<00:00, 26.58it/s]
evaluating...: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 105/105 [00:01<00:00, 61.51it/s]
epoch: 1
train_loss: 0.544, train_acc: 0.711
test_loss: 0.428, test_acc: 0.815
--------------------------------------------------
training...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 417/417 [00:15<00:00, 27.04it/s]
evaluating...: 100%|███████████████████████████████████████████████████████████

|embedding dim |layers |hidden dim |bidirectional |parameters |dropout rate |wd |epochs |train acc |test acc |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|1 |1 |100 |False |360,318 |0 |0 |5 |0.992 |0.874 |
|1 |1 |100 |False |360,318 |0 |0 |10 |0.988 |0.833 |
|5 |1 |50 |False |1,606,082 |0 |0 |5 |0.994 |0.873 |
|1 |1 |100 |False |360,318 |0.1 |0 |5 |0.989 |0.876 |
|1 |1 |100 |False |360,318 |0.2 |0 |5 |0.851 |0.885 |
|1 |1 |100 |False |360,318 |0.3 |0 |5 |0.949 |0.888 |
|1 |1 |100 |False |360,318 |0 |5e-5 |5 |0.941 |0.882 |
|1 |1 |100 |False |360,318 |0 |1e-4 |5 |0.922 |0.891 |
|1 |1 |100 |False |360,318 |0 |5e-4 |5 |0.800 |0.733 |
|1 |1 |100 |False |360,318 |0.1 |1e-4 |5 |0.916 |0.888 |
|1 |1 |100 |False |360,318 |0.3 |1e-4 |5 |0.895 |0.887 |
|1 |2 |100 |False |441,118 |0 |1e-4 |5 |0.900 |0.879 |
|1 |1 |100 |True |401,518 |0 |1e-4 |5 |0.861 |0.881 |

# evaluate on synthetic data

In [8]:
method = 'ngram' # bayes
reviews, sentiments = load_imdb(f"synthetic_data/{method}.csv")

In [9]:
def evaluate(dataloader, model, criterion, device):
    model.eval()
    epoch_losses = []
    epoch_acc = 0
    correct = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='evaluating...', file=sys.stdout):
            ids = batch['ids'].to(device)
            length = batch['length']
            label = batch['label'].to(device)
            prediction = model(ids, length)
            loss = criterion(prediction, label)
            epoch_losses.append(loss.item())
            epoch_acc += get_correct(prediction, label)
            correct.append(
                prediction.argmax(1).eq(label).cpu()
            )

    return epoch_losses, epoch_acc / len(dataloader.dataset), torch.cat(correct)


def test_model_with_hparams(x_train, x_test_syn, y_test_syn, hparams, model_ckpt_path, model_type="lstm", **kwargs):
    vocab = build_vocab(x_train, hparams=hparams)
    vocab_size = len(vocab)
    print(f'Length of vocabulary is {vocab_size}')

    test_data = IMDB(x_test_syn, y_test_syn, vocab, hparams.MAX_LENGTH)

    collate_fn = functools.partial(collate, pad_index=hparams.PAD_INDEX)

    test_dataloader = torch.utils.data.DataLoader(
        test_data, batch_size=hparams.BATCH_SIZE, collate_fn=collate_fn, shuffle=False)
    
    # Model
    model = LSTM(
        vocab_size, 
        hparams.EMBEDDING_DIM, 
        hparams.HIDDEN_DIM, 
        hparams.OUTPUT_DIM,
        hparams.N_LAYERS,
        hparams.DROPOUT_RATE, 
        hparams.PAD_INDEX,
        hparams.BIDIRECTIONAL,
        **kwargs)
    num_params = count_parameters(model)
    print(f'The model has {num_params:,} trainable parameters')
    print('='*50)
    
    model.load_state_dict(torch.load(model_ckpt_path))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    test_losses, test_acc, test_correct = evaluate(test_dataloader, model, criterion, device)
    print(f'test_acc: {test_acc:.3f}')
    return test_correct

In [10]:
hparams = HyperParams()
hparams.N_LAYERS = 1
hparams.DROPOUT_RATE = 0.
hparams.WD = 0.0001
hparams.BIDIRECTIONAL = False
correct = test_model_with_hparams(
    x_train, reviews, sentiments, hparams, 
    'best_lstm.pth'
)

Length of vocabulary is 318916
The model has 360,318 trainable parameters
evaluating...: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 190.21it/s]
test_acc: 0.797


# qualitative examination

In [34]:
indices = np.array([10,20,30])

num = 5
np.array(x_test)[correct.numpy().nonzero()[0][indices]]

array(["I found it real shocking at first to see William Shakespeare's love masterpiece reworked into a gory, violent and kinky sensual movie adaptation. But after you watched it once, it sort of grows on you when you watch it the second and third times, as you come over the shock and start appreciating the movie on its own merits - solid acting, good dialogue, nice sequencing and choreography, not-too-bad soundtrack and some of the (special) effects that go on. Oh, and also the ending. What a riot! eos",
       "The beginning of this movie had me doubting that it would be little more than a typical B sci-fi flick. But, as it progressed I began to get interested and I saw the whole thing through. The premise is interesting, original, and has the makings of making a classic. Alas, it instead ended up a mediocre movie, done in by the usual factors which turn a potentially good movie into a bad movie (bad acting, low budget etc.). I'm interested to see how this would turn out if it were r

In [35]:
np.array(y_test)[correct.numpy().nonzero()[0][indices]]

array(['positive', 'negative', 'negative'], dtype=object)

In [30]:
np.array(x_test)[1-correct.numpy().nonzero()[0][:num]]

array(["This movie is beautiful in all ways. It is visually stunning, and this is a good thing since the dialogue would only take up a page or two of paper. The acting is superb; it is subtle, passionate and intense. Ben Daniels does a fabulous job of turning himself into an animal, and mixing that wild nature with a man's overbearing passion and honor. There is not one flaw, not one mistake or wrong moment to be found anywhere. It is completely perfect, but only if you understand what you're going to experience. It isn't a movie for anyone who wants normality. eos",
       "I thought the film could be a bit more complex,in a psychological sense perhaps, but the action and voice acting were top notch. The animation was heavy CG in many scenes, but very good ones at that. This is one of the Batman Returns/Forever type films, which include romances and the conflicts of Wayne and motives for dating. 007 fans would love this, and so would the females, great theme song! Wayne was portrayed 

In [31]:
np.array(y_test)[1-correct.numpy().nonzero()[0][:num]]

array(['positive', 'positive', 'negative', 'negative', 'negative'],
      dtype=object)

In [22]:
hparams = HyperParams()
hparams.N_LAYERS = 1
hparams.DROPOUT_RATE = 0.
hparams.WD = 0.0001
hparams.BIDIRECTIONAL = False
test_model_with_hparams(
    x_train, reviews, sentiments, hparams, 
    'best_lstm.pth'
)

Length of vocabulary is 318916
The model has 360,318 trainable parameters
evaluating...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 342.81it/s]
test_acc: 0.355
