In [1]:
import time
import random
import numpy as np
from sklearn.metrics import roc_curve, auc

import nltk

nltk.download("punkt")
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn

from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from torchtext.data import Iterator

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
RANDOM_SEED = 2020
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# DATA_PATH = "data/processed/"
DATA_PATH = "/content/"

## Dataset

In [3]:
TEXT = Field(
    sequential=True,
    use_vocab=True,
    tokenize=word_tokenize,
    lower=True,
    batch_first=True,
)
LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
)


sat_train_data, sat_valid_data, sat_test_data = TabularDataset.splits(
    path=DATA_PATH,
    train="sat_train.tsv",
    validation="sat_valid.tsv",
    test="sat_test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=1,
)

sat_train_iterator, sat_valid_iterator, sat_test_iterator = BucketIterator.splits(
    (sat_train_data, sat_valid_data, sat_test_data),
    batch_size=8,
    device=None,
    sort=False,
)

TEXT.build_vocab(sat_train_data, min_freq=2)

## LSTM Classifier

In [4]:
class LSTMClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, pad_idx):
        super().__init__()
        self.embed_layer = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=pad_idx)
        self.lstm_layer = nn.LSTM(
            input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, bidirectional=True, dropout=0.5
        )
        self.last_layer = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.Dropout(0.5),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        embed_x = self.embed_layer(x)
        output, (_, _) = self.lstm_layer(embed_x)
        last_output = output[:, -1, :]
        last_output = self.last_layer(last_output)
        return last_output

In [5]:
def train(model: nn.Module, iterator: Iterator, optimizer: torch.optim.Optimizer, criterion: nn.Module, device: str):
    model.train()

    epoch_loss = 0

    for _, batch in enumerate(iterator):
        optimizer.zero_grad()

        text = batch.text
        if text.shape[0] > 1:
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)

            output = model(text).flatten()
            loss = criterion(output, label)
            loss.backward()

            optimizer.step()

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module, iterator: Iterator, criterion: nn.Module, device: str):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)
            output = model(text).flatten()
            loss = criterion(output, label)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def test(model: nn.Module, iterator: Iterator, device: str):

    with torch.no_grad():
        y_real = []
        y_pred = []
        model.eval()
        for batch in iterator:
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)

            output = model(text).flatten().cpu()

            y_real += [label]
            y_pred += [output]

        y_real = torch.cat(y_real)
        y_pred = torch.cat(y_pred)

    fpr, tpr, _ = roc_curve(y_real, y_pred)
    auroc = auc(fpr, tpr)

    return auroc


def epoch_time(start_time: int, end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Pretrain with cola dataset

In [6]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_classifier = LSTMClassifier(
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
_ = lstm_classifier.to(device)

optimizer = torch.optim.Adam(lstm_classifier.parameters())
bce_loss_fn = nn.BCELoss()

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_classifier, sat_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_classifier, sat_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f"Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s")
    print(f"\tTrain Loss: {train_loss:.5f}")
    print(f"\t Val. Loss: {valid_loss:.5f}")

test_auroc = test(lstm_classifier, sat_test_iterator, device)

print(f"| SAT Dataset Test AUROC: {test_auroc:.5f}")

Epoch: 01 | Time: 0m 0s
	Train Loss: 0.51519
	 Val. Loss: 0.50922
Epoch: 02 | Time: 0m 0s
	Train Loss: 0.48105
	 Val. Loss: 0.49235
Epoch: 03 | Time: 0m 0s
	Train Loss: 0.46303
	 Val. Loss: 0.49233
Epoch: 04 | Time: 0m 0s
	Train Loss: 0.48957
	 Val. Loss: 0.49120
Epoch: 05 | Time: 0m 0s
	Train Loss: 0.48043
	 Val. Loss: 0.49304
Epoch: 06 | Time: 0m 0s
	Train Loss: 0.45758
	 Val. Loss: 0.49220
Epoch: 07 | Time: 0m 0s
	Train Loss: 0.47639
	 Val. Loss: 0.49857
Epoch: 08 | Time: 0m 0s
	Train Loss: 0.46478
	 Val. Loss: 0.49970
Epoch: 09 | Time: 0m 0s
	Train Loss: 0.47604
	 Val. Loss: 0.49115
Epoch: 10 | Time: 0m 0s
	Train Loss: 0.47026
	 Val. Loss: 0.49443
Epoch: 11 | Time: 0m 0s
	Train Loss: 0.48204
	 Val. Loss: 0.50864
Epoch: 12 | Time: 0m 0s
	Train Loss: 0.48837
	 Val. Loss: 0.50010
Epoch: 13 | Time: 0m 0s
	Train Loss: 0.48064
	 Val. Loss: 0.50467
Epoch: 14 | Time: 0m 0s
	Train Loss: 0.47022
	 Val. Loss: 0.49901
Epoch: 15 | Time: 0m 0s
	Train Loss: 0.46334
	 Val. Loss: 0.48793
Epoch: 16 

In [7]:
test_auroc = test(lstm_classifier, sat_test_iterator, device)

print(f"| SAT Dataset Test AUROC: {test_auroc:.5f}")

| SAT Dataset Test AUROC: 0.31818


1. Speculations about the meaning and purpose of prehistoric art [rely] heavily on analogies drawn with modern-day hunter-gatherer societies. 
2. Such primitive societies,  [as] Steven Mithen emphasizes in The Prehistory of the Modern Mind, tend to view man and beast, animal and plant, organic and inorganic spheres, as participants in an integrated, animated totality. 
3. The dual expressions of this tendency are anthropomorphism (the practice of regarding animals as humans) and totemism (the practice of regarding humans as animals), both of [which] spread through the visual art and the mythology of primitive cultures. 
4. When considered in this light, the visual preoccupation of early humans with the nonhuman creatures [inhabited] their world becomes profoundly meaningful. 
5. In the practice of totemism, he has suggested, an unlettered humanity “broods upon [itself] and its place in nature.” 

In [8]:
def demo(classifier, device):
    sat_test = [ 
        "Speculations about the meaning and purpose of prehistoric art [rely] heavily on analogies drawn with modern-day hunter-gatherer societies.",
        "Such primitive societies, [as] Steven Mithen emphasizes in The Prehistory of the Modern Mind, tend to view man and beast, animal and plant, organic and inorganic spheres, as participants in an integrated, animated totality.",
        "The dual expressions of this tendency are anthropomorphism (the practice of regarding animals as humans) and totemism (the practice of regarding humans as animals), both of [which] spread through the visual art and the mythology of primitive cultures.",
        "When considered in this light, the visual preoccupation of early humans with the nonhuman creatures [inhabited] their world becomes profoundly meaningful.",
        "In the practice of totemism, he has suggested, an unlettered humanity “broods upon [itself] and its place in nature.”",
    ]
    sat_label = [1, 1, 1, 0, 1]
    sat_test = list(map(lambda x: x.replace("[", "").replace("]", ""), sat_test))
    tokenized_sentences = [word_tokenize(sentence) for sentence in sat_test]
    sentences = []
    for tokenized_sentence in tokenized_sentences:
        sentences.append([TEXT.vocab.stoi[word] for word in tokenized_sentence])

    predict = []
    for sentence in sentences:
        sentence = torch.LongTensor([sentence])
        predict += [lstm_classifier(sentence.to(device)).item()]
    return predict

In [11]:
demo(lstm_classifier, device)

[0.6600065231323242,
 0.6600065231323242,
 0.6600065231323242,
 0.6600065231323242,
 0.6573175191879272]