In [1]:
from copy import deepcopy
import time
import random
import numpy as np
from sklearn.metrics import roc_curve, auc

import nltk

nltk.download("punkt")
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn

from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from torchtext.data import Iterator

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
RANDOM_SEED = 2020
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# DATA_PATH = "data/processed/"
DATA_PATH = "/content/"

## Dataset

In [3]:
TEXT = Field(
    sequential=True,
    use_vocab=True,
    tokenize=word_tokenize,
    lower=True,
    batch_first=True,
)
LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
)


cola_train_data, cola_valid_data, cola_test_data = TabularDataset.splits(
    path=DATA_PATH,
    train="cola_train.tsv",
    validation="cola_valid.tsv",
    test="cola_test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=1
)

TEXT.build_vocab(cola_train_data, min_freq=2)


cola_train_iterator, cola_valid_iterator, cola_test_iterator = BucketIterator.splits(
    (cola_train_data, cola_valid_data, cola_test_data), 
    batch_size=32, 
    device=None,
    sort=False,
)


sat_train_data, sat_valid_data, sat_test_data = TabularDataset.splits(
    path=DATA_PATH,
    train="sat_train.tsv",
    validation="sat_valid.tsv",
    test="sat_test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=1
)

sat_train_iterator, sat_valid_iterator, sat_test_iterator = BucketIterator.splits(
    (sat_train_data, sat_valid_data, sat_test_data), 
    batch_size=8, 
    device=None,
    sort=False,
)

## LSTM Pooling Classifier

In [4]:
class LSTMPoolingClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, pad_idx):
        super(LSTMPoolingClassifier, self).__init__()
        self.embed_layer = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=pad_idx)
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.ih2h = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers,
                            bidirectional=True, batch_first=True, dropout=0.5)
        self.pool2o = nn.Linear(2 * hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.embed_layer(x)
        o, _ = self.ih2h(x)
        pool = nn.functional.max_pool1d(o.transpose(1, 2), x.shape[1])
        pool = pool.transpose(1, 2).squeeze()
        pool = self.dropout(pool)
        output = self.sigmoid(self.pool2o(pool))
        return output.squeeze()

In [5]:
def train(model: nn.Module,
          iterator: Iterator,
          optimizer: torch.optim.Optimizer,
          criterion: nn.Module,
          device: str):
    model.train()

    epoch_loss = 0

    for _, batch in enumerate(iterator):
        optimizer.zero_grad()

        text = batch.text
        if text.shape[0] > 1:
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)

            output = model(text).flatten()
            loss = criterion(output, label)
            loss.backward()

            optimizer.step()

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: Iterator,
             criterion: nn.Module,
             device: str):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)
            output = model(text).flatten()
            loss = criterion(output, label)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def test(
    model: nn.Module,
    iterator: Iterator,
    device: str):

    with torch.no_grad():
        y_real = []
        y_pred = []
        model.eval()
        for batch in iterator:
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)

            output = model(text).flatten().cpu()

            y_real += [label]
            y_pred += [output]

        y_real = torch.cat(y_real)
        y_pred = torch.cat(y_pred)

    fpr, tpr, _ = roc_curve(y_real, y_pred)
    auroc = auc(fpr, tpr)

    return auroc

def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Pretrain with cola dataset

In [6]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_pool_classifier = LSTMPoolingClassifier(
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
_ = lstm_pool_classifier.to(device)

optimizer = torch.optim.Adam(lstm_pool_classifier.parameters())
bce_loss_fn = nn.BCELoss()

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_pool_classifier, cola_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_pool_classifier, cola_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

test_auroc = test(lstm_pool_classifier, cola_test_iterator, device)

print(f"| CoLA Dataset Test AUROC: {test_auroc:.5f}")

Epoch: 01 | Time: 0m 4s
	Train Loss: 0.60920
	 Val. Loss: 0.61525
Epoch: 02 | Time: 0m 4s
	Train Loss: 0.59613
	 Val. Loss: 0.61253
Epoch: 03 | Time: 0m 4s
	Train Loss: 0.56938
	 Val. Loss: 0.63960
Epoch: 04 | Time: 0m 4s
	Train Loss: 0.53397
	 Val. Loss: 0.62946
Epoch: 05 | Time: 0m 4s
	Train Loss: 0.48429
	 Val. Loss: 0.71793
Epoch: 06 | Time: 0m 4s
	Train Loss: 0.43031
	 Val. Loss: 0.75253
Epoch: 07 | Time: 0m 4s
	Train Loss: 0.37760
	 Val. Loss: 0.83188
Epoch: 08 | Time: 0m 4s
	Train Loss: 0.31898
	 Val. Loss: 0.84299
Epoch: 09 | Time: 0m 4s
	Train Loss: 0.26919
	 Val. Loss: 1.04541
Epoch: 10 | Time: 0m 4s
	Train Loss: 0.22640
	 Val. Loss: 1.07994
Epoch: 11 | Time: 0m 4s
	Train Loss: 0.18746
	 Val. Loss: 1.26130
Epoch: 12 | Time: 0m 4s
	Train Loss: 0.15813
	 Val. Loss: 1.37582
Epoch: 13 | Time: 0m 4s
	Train Loss: 0.12598
	 Val. Loss: 1.60522
Epoch: 14 | Time: 0m 4s
	Train Loss: 0.11131
	 Val. Loss: 1.52224
Epoch: 15 | Time: 0m 4s
	Train Loss: 0.10088
	 Val. Loss: 1.50670
Epoch: 16 

In [7]:
before_tuning_lstm_pool_classifier = deepcopy(lstm_pool_classifier)

pool_sat_test_auroc = test(lstm_pool_classifier, sat_test_iterator, device)
print(f'| SAT Dataset Test AUROC: {pool_sat_test_auroc:.5f}')

| SAT Dataset Test AUROC: 0.54545


## Fine Tuning

In [8]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20


for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_pool_classifier, sat_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_pool_classifier, sat_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

pool_tuned_test_auroc = test(lstm_pool_classifier, sat_test_iterator, device)

print(f"| SAT Dataset Test AUROC: {pool_tuned_test_auroc:.5f}")

Epoch: 01 | Time: 0m 0s
	Train Loss: 0.62461
	 Val. Loss: 0.53197
Epoch: 02 | Time: 0m 0s
	Train Loss: 0.49336
	 Val. Loss: 0.51208
Epoch: 03 | Time: 0m 0s
	Train Loss: 0.47483
	 Val. Loss: 0.50871
Epoch: 04 | Time: 0m 0s
	Train Loss: 0.47292
	 Val. Loss: 0.50922
Epoch: 05 | Time: 0m 0s
	Train Loss: 0.45890
	 Val. Loss: 0.50705
Epoch: 06 | Time: 0m 0s
	Train Loss: 0.47014
	 Val. Loss: 0.51051
Epoch: 07 | Time: 0m 0s
	Train Loss: 0.44802
	 Val. Loss: 0.49968
Epoch: 08 | Time: 0m 0s
	Train Loss: 0.42673
	 Val. Loss: 0.49511
Epoch: 09 | Time: 0m 0s
	Train Loss: 0.40429
	 Val. Loss: 0.51105
Epoch: 10 | Time: 0m 0s
	Train Loss: 0.41333
	 Val. Loss: 0.60787
Epoch: 11 | Time: 0m 0s
	Train Loss: 0.36903
	 Val. Loss: 0.48466
Epoch: 12 | Time: 0m 0s
	Train Loss: 0.38612
	 Val. Loss: 0.55703
Epoch: 13 | Time: 0m 0s
	Train Loss: 0.41015
	 Val. Loss: 0.55473
Epoch: 14 | Time: 0m 0s
	Train Loss: 0.32040
	 Val. Loss: 0.50700
Epoch: 15 | Time: 0m 0s
	Train Loss: 0.29411
	 Val. Loss: 0.56336
Epoch: 16 

In [9]:
print(f"Before fine-tuning SAT Dataset Test AUROC: {pool_sat_test_auroc:.5f}")
print(f"After fine-tuning SAT Dataset Test AUROC: {pool_tuned_test_auroc:.5f}")

Before fine-tuning SAT Dataset Test AUROC: 0.54545
After fine-tuning SAT Dataset Test AUROC: 0.58081


In [13]:
def demo(classifier, device):
    sat_test = [ 
        "Speculations about the meaning and purpose of prehistoric art [rely] heavily on analogies drawn with modern-day hunter-gatherer societies.",
        "Such primitive societies, [as] Steven Mithen emphasizes in The Prehistory of the Modern Mind, tend to view man and beast, animal and plant, organic and inorganic spheres, as participants in an integrated, animated totality.",
        "The dual expressions of this tendency are anthropomorphism (the practice of regarding animals as humans) and totemism (the practice of regarding humans as animals), both of [which] spread through the visual art and the mythology of primitive cultures.",
        "When considered in this light, the visual preoccupation of early humans with the nonhuman creatures [inhabited] their world becomes profoundly meaningful.",
        "In the practice of totemism, he has suggested, an unlettered humanity “broods upon [itself] and its place in nature.”",
    ]
    sat_label = [1, 1, 1, 0, 1]
    sat_test = list(map(lambda x: x.replace("[", "").replace("]", ""), sat_test))
    tokenized_sentences = [word_tokenize(sentence) for sentence in sat_test]
    sentences = []
    for tokenized_sentence in tokenized_sentences:
        sentences.append([TEXT.vocab.stoi[word] for word in tokenized_sentence])

    classifier.eval()
    predict = []
    for sentence in sentences:
        sentence = torch.LongTensor([sentence])
        predict += [classifier(sentence.to(device)).item()]
    return predict

In [14]:
demo(before_tuning_lstm_pool_classifier, device)

  self.dropout, self.training, self.bidirectional, self.batch_first)


[0.9988611936569214,
 0.9957309365272522,
 0.9773626923561096,
 0.9539982676506042,
 0.8067118525505066]

In [15]:
demo(lstm_pool_classifier, device)

[0.9928597211837769,
 0.993202805519104,
 0.9178707599639893,
 0.9915441274642944,
 0.9843350648880005]