In [1]:
from copy import deepcopy
import time
import random
import numpy as np
from sklearn.metrics import roc_curve, auc

import nltk

nltk.download("punkt")
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn

from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from torchtext.data import Iterator

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
RANDOM_SEED = 2020
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# DATA_PATH = "data/processed/"
DATA_PATH = "/content/"

In [3]:
import pandas as pd

train_df = []
valid_df = []
test_df = []
for data_name in ["sat", "cola"]:
    train_df += [pd.read_csv(f"{DATA_PATH}/{data_name}_train.tsv", sep="\t")]
    valid_df += [pd.read_csv(f"{DATA_PATH}/{data_name}_valid.tsv", sep="\t")]
    test_df += [pd.read_csv(f"{DATA_PATH}/{data_name}_test.tsv", sep="\t")]

train_df.append(test_df.pop(1))
train_df = pd.concat(train_df)
valid_df = pd.concat(valid_df)
test_df = pd.concat(test_df)

train_df.to_csv("mix_train.tsv", sep="\t", index=False)
valid_df.to_csv("mix_valid.tsv", sep="\t", index=False)
test_df.to_csv("mix_test.tsv", sep="\t", index=False)

## Dataset

In [4]:
TEXT = Field(
    sequential=True,
    use_vocab=True,
    tokenize=word_tokenize,
    lower=True,
    batch_first=True,
)
LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
)


mix_train_data, mix_valid_data, mix_test_data = TabularDataset.splits(
    path=DATA_PATH,
    train="mix_train.tsv",
    validation="mix_valid.tsv",
    test="mix_test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=1,
)

TEXT.build_vocab(mix_train_data, min_freq=2)


mix_train_iterator, mix_valid_iterator, mix_test_iterator = BucketIterator.splits(
    (mix_train_data, mix_valid_data, mix_test_data),
    batch_size=32,
    device=None,
    sort=False,
)

## LSTM Classifier

In [5]:
class LSTMClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, pad_idx):
        super().__init__()
        self.embed_layer = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=pad_idx)
        self.lstm_layer = nn.LSTM(
            input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, bidirectional=True, dropout=0.5
        )
        self.last_layer = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.Dropout(0.5),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        embed_x = self.embed_layer(x)
        output, (_, _) = self.lstm_layer(embed_x)
        last_output = output[:, -1, :]
        last_output = self.last_layer(last_output)
        return last_output

In [6]:
def train(model: nn.Module, iterator: Iterator, optimizer: torch.optim.Optimizer, criterion: nn.Module, device: str):
    model.train()

    epoch_loss = 0

    for _, batch in enumerate(iterator):
        optimizer.zero_grad()

        text = batch.text
        if text.shape[0] > 1:
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)

            output = model(text).flatten()
            loss = criterion(output, label)
            loss.backward()

            optimizer.step()

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module, iterator: Iterator, criterion: nn.Module, device: str):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)
            output = model(text).flatten()
            loss = criterion(output, label)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def test(model: nn.Module, iterator: Iterator, device: str):

    with torch.no_grad():
        y_real = []
        y_pred = []
        model.eval()
        for batch in iterator:
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)

            output = model(text).flatten().cpu()

            y_real += [label]
            y_pred += [output]

        y_real = torch.cat(y_real)
        y_pred = torch.cat(y_pred)

    fpr, tpr, _ = roc_curve(y_real, y_pred)
    auroc = auc(fpr, tpr)

    return auroc


def epoch_time(start_time: int, end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Pretrain with cola dataset

In [7]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_classifier = LSTMClassifier(
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
_ = lstm_classifier.to(device)

optimizer = torch.optim.Adam(lstm_classifier.parameters())
bce_loss_fn = nn.BCELoss()

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_classifier, mix_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_classifier, mix_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.5f}")
    print(f"\t Val. Loss: {valid_loss:.5f}")

test_auroc = test(lstm_classifier, mix_test_iterator, device)

print(f"| SAT Dataset Test AUROC: {test_auroc:.5f}")

Epoch: 01 | Time: 0m 6s
	Train Loss: 0.61426
	 Val. Loss: 0.61013
Epoch: 02 | Time: 0m 6s
	Train Loss: 0.61105
	 Val. Loss: 0.61659
Epoch: 03 | Time: 0m 6s
	Train Loss: 0.60937
	 Val. Loss: 0.60751
Epoch: 04 | Time: 0m 6s
	Train Loss: 0.61026
	 Val. Loss: 0.61041
Epoch: 05 | Time: 0m 6s
	Train Loss: 0.63725
	 Val. Loss: 0.61229
Epoch: 06 | Time: 0m 6s
	Train Loss: 0.60929
	 Val. Loss: 0.60745
Epoch: 07 | Time: 0m 6s
	Train Loss: 0.60911
	 Val. Loss: 0.60779
Epoch: 08 | Time: 0m 6s
	Train Loss: 0.60752
	 Val. Loss: 0.60850
Epoch: 09 | Time: 0m 6s
	Train Loss: 0.60842
	 Val. Loss: 0.60936
Epoch: 10 | Time: 0m 6s
	Train Loss: 0.60775
	 Val. Loss: 0.60795
Epoch: 11 | Time: 0m 6s
	Train Loss: 0.60837
	 Val. Loss: 0.60838
Epoch: 12 | Time: 0m 6s
	Train Loss: 0.60814
	 Val. Loss: 0.60780
Epoch: 13 | Time: 0m 6s
	Train Loss: 0.60756
	 Val. Loss: 0.60844
Epoch: 14 | Time: 0m 6s
	Train Loss: 0.60784
	 Val. Loss: 0.60756
Epoch: 15 | Time: 0m 6s
	Train Loss: 0.60693
	 Val. Loss: 0.60760
Epoch: 16 

In [8]:
lstm_sat_test_auroc = test(lstm_classifier, mix_test_iterator, device)
print(f'| SAT Dataset Test AUROC: {lstm_sat_test_auroc:.5f}')

| SAT Dataset Test AUROC: 0.41919


In [9]:
def demo(classifier, device):
    sat_test = [ 
        "Speculations about the meaning and purpose of prehistoric art [rely] heavily on analogies drawn with modern-day hunter-gatherer societies.",
        "Such primitive societies, [as] Steven Mithen emphasizes in The Prehistory of the Modern Mind, tend to view man and beast, animal and plant, organic and inorganic spheres, as participants in an integrated, animated totality.",
        "The dual expressions of this tendency are anthropomorphism (the practice of regarding animals as humans) and totemism (the practice of regarding humans as animals), both of [which] spread through the visual art and the mythology of primitive cultures.",
        "When considered in this light, the visual preoccupation of early humans with the nonhuman creatures [inhabited] their world becomes profoundly meaningful.",
        "In the practice of totemism, he has suggested, an unlettered humanity “broods upon [itself] and its place in nature.”",
    ]
    sat_label = [1, 1, 1, 0, 1]
    sat_test = list(map(lambda x: x.replace("[", "").replace("]", ""), sat_test))
    tokenized_sentences = [word_tokenize(sentence) for sentence in sat_test]
    sentences = []
    for tokenized_sentence in tokenized_sentences:
        sentences.append([TEXT.vocab.stoi[word] for word in tokenized_sentence])

    predict = []
    for sentence in sentences:
        sentence = torch.LongTensor([sentence])
        predict += [lstm_classifier(sentence.to(device)).item()]
    return predict

In [11]:
demo(lstm_classifier, device)

[0.7096898555755615,
 0.7096898555755615,
 0.7096898555755615,
 0.7096898555755615,
 0.7095038890838623]