In [1]:
import numpy as np

import time
import random
from sklearn.metrics import roc_curve, auc

import nltk

nltk.download("punkt")
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn

from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator
from torchtext.data import Iterator

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
RANDOM_SEED = 2020
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

DATA_PATH = "data/processed/"

## Dataset

In [3]:
TEXT = Field(
    sequential=True,
    use_vocab=True,
    tokenize=word_tokenize,
    lower=True,
    batch_first=True,
)
LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
)


cola_train_data, cola_valid_data, cola_test_data = TabularDataset.splits(
    path=DATA_PATH,
    train="cola_train.tsv",
    validation="cola_valid.tsv",
    test="cola_test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=1
)

TEXT.build_vocab(cola_train_data, min_freq=2)


cola_train_iterator, cola_valid_iterator, cola_test_iterator = BucketIterator.splits(
    (cola_train_data, cola_valid_data, cola_test_data), 
    batch_size=32, 
    device=None,
    sort=False,
)


sat_train_data, sat_valid_data, sat_test_data = TabularDataset.splits(
    path=DATA_PATH,
    train="sat_train.tsv",
    validation="sat_valid.tsv",
    test="sat_test.tsv",
    format="tsv",
    fields=[("text", TEXT), ("label", LABEL)],
    skip_header=1
)

sat_train_iterator, sat_valid_iterator, sat_test_iterator = BucketIterator.splits(
    (sat_train_data, sat_valid_data, sat_test_data), 
    batch_size=8, 
    device=None,
    sort=False,
)

## LSTM Classifier

In [4]:
class LSTMClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, pad_idx):
        super().__init__()
        self.embed_layer = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=pad_idx)
        self.lstm_layer = nn.LSTM(
            input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, bidirectional=True, dropout=0.5
        )
        self.last_layer = nn.Sequential(
            nn.Linear(hidden_size*2, hidden_size),
            nn.Dropout(0.5),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        embed_x = self.embed_layer(x)
        output, (_, _) = self.lstm_layer(embed_x)
        last_output = output[:,-1,:]
        last_output = self.last_layer(last_output)
        return last_output

In [5]:
def train(model: nn.Module,
          iterator: Iterator,
          optimizer: torch.optim.Optimizer,
          criterion: nn.Module,
          device: str):
    # https://tutorials.pytorch.kr/beginner/torchtext_translation_tutorial.html
    model.train()

    epoch_loss = 0

    for _, batch in enumerate(iterator):
        optimizer.zero_grad()

        text = batch.text
        if text.shape[0] > 1:
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)

            output = model(text).flatten()
            loss = criterion(output, label)
            loss.backward()

            optimizer.step()

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: Iterator,
             criterion: nn.Module,
             device: str):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)
            output = model(text).flatten()
            loss = criterion(output, label)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def test(
    model: nn.Module,
    iterator: Iterator,
    device: str):

    with torch.no_grad():
        y_real = []
        y_pred = []
        model.eval()
        for batch in iterator:
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)

            output = model(text).flatten().cpu()

            y_real += [label]
            y_pred += [output]

        y_real = torch.cat(y_real)
        y_pred = torch.cat(y_pred)

    fpr, tpr, _ = roc_curve(y_real, y_pred)
    auroc = auc(fpr, tpr)

    return auroc

def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Pretrain with cola dataset

In [6]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_classifier = LSTMClassifier(
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
_ = lstm_classifier.to(device)

optimizer = torch.optim.Adam(lstm_classifier.parameters())
bce_loss_fn = nn.BCELoss()

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_classifier, cola_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_classifier, cola_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

test_auroc = test(lstm_classifier, cola_test_iterator, device)

print(f'| Test AUROC: {test_auroc:.5f}')

Epoch: 01 | Time: 0m 5s
	Train Loss: 0.61555
	 Val. Loss: 0.61686
Epoch: 02 | Time: 0m 5s
	Train Loss: 0.61094
	 Val. Loss: 0.61838
Epoch: 03 | Time: 0m 5s
	Train Loss: 0.61013
	 Val. Loss: 0.61736
Epoch: 04 | Time: 0m 5s
	Train Loss: 0.63810
	 Val. Loss: 0.61793
Epoch: 05 | Time: 0m 5s
	Train Loss: 0.61023
	 Val. Loss: 0.61750
Epoch: 06 | Time: 0m 5s
	Train Loss: 0.60996
	 Val. Loss: 0.61784
Epoch: 07 | Time: 0m 5s
	Train Loss: 0.60867
	 Val. Loss: 0.61822
Epoch: 08 | Time: 0m 5s
	Train Loss: 0.61013
	 Val. Loss: 0.61761
Epoch: 09 | Time: 0m 5s
	Train Loss: 0.60868
	 Val. Loss: 0.61935
Epoch: 10 | Time: 0m 5s
	Train Loss: 0.60879
	 Val. Loss: 0.61786
Epoch: 11 | Time: 0m 5s
	Train Loss: 0.60929
	 Val. Loss: 0.61792
Epoch: 12 | Time: 0m 5s
	Train Loss: 0.60853
	 Val. Loss: 0.62832
Epoch: 13 | Time: 0m 5s
	Train Loss: 0.60849
	 Val. Loss: 0.61932
Epoch: 14 | Time: 0m 5s
	Train Loss: 0.60850
	 Val. Loss: 0.61786
Epoch: 15 | Time: 0m 5s
	Train Loss: 0.60764
	 Val. Loss: 0.61783
Epoch: 16 

In [7]:
lstm_sat_test_auroc = test(lstm_classifier, sat_test_iterator, device)
lstm_sat_test_auroc

0.38383838383838387

## Fine Tuning

In [8]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20


for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_classifier, sat_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_classifier, sat_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

lstm_tuned_test_auroc = test(lstm_classifier, sat_test_iterator, device)

print(f'| Test AUROC: {lstm_tuned_test_auroc:.5f}')

Epoch: 01 | Time: 0m 0s
	Train Loss: 0.47950
	 Val. Loss: 0.49541
Epoch: 02 | Time: 0m 0s
	Train Loss: 0.46661
	 Val. Loss: 0.49877
Epoch: 03 | Time: 0m 0s
	Train Loss: 0.46235
	 Val. Loss: 0.49688
Epoch: 04 | Time: 0m 0s
	Train Loss: 0.46105
	 Val. Loss: 0.49632
Epoch: 05 | Time: 0m 0s
	Train Loss: 0.48165
	 Val. Loss: 0.50624
Epoch: 06 | Time: 0m 0s
	Train Loss: 0.46324
	 Val. Loss: 0.51241
Epoch: 07 | Time: 0m 0s
	Train Loss: 0.46848
	 Val. Loss: 0.50255
Epoch: 08 | Time: 0m 0s
	Train Loss: 0.46075
	 Val. Loss: 0.50101
Epoch: 09 | Time: 0m 0s
	Train Loss: 0.47238
	 Val. Loss: 0.49961
Epoch: 10 | Time: 0m 0s
	Train Loss: 0.46845
	 Val. Loss: 0.50225
Epoch: 11 | Time: 0m 0s
	Train Loss: 0.45874
	 Val. Loss: 0.50258
Epoch: 12 | Time: 0m 0s
	Train Loss: 0.46244
	 Val. Loss: 0.50454
Epoch: 13 | Time: 0m 0s
	Train Loss: 0.45688
	 Val. Loss: 0.50131
Epoch: 14 | Time: 0m 0s
	Train Loss: 0.46865
	 Val. Loss: 0.50134
Epoch: 15 | Time: 0m 0s
	Train Loss: 0.46426
	 Val. Loss: 0.49735
Epoch: 16 

## LSTM Pooling Classifier

In [9]:
class LSTMPoolingClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, pad_idx):
        super(LSTMPoolingClassifier, self).__init__()
        self.embed_layer = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim, padding_idx=pad_idx)
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.ih2h = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers,
                            bidirectional=True, batch_first=True, dropout=0.5)
        self.pool2o = nn.Linear(2 * hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.embed_layer(x)
        o, _ = self.ih2h(x)
        pool = nn.functional.max_pool1d(o.transpose(1, 2), x.shape[1])
        pool = pool.transpose(1, 2).squeeze()
        pool = self.dropout(pool)
        output = self.sigmoid(self.pool2o(pool))
        return output.squeeze()

## Pretrain with cola dataset

In [10]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_pool_classifier = LSTMPoolingClassifier(
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
_ = lstm_pool_classifier.to(device)

optimizer = torch.optim.Adam(lstm_pool_classifier.parameters())
bce_loss_fn = nn.BCELoss()

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_pool_classifier, cola_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_pool_classifier, cola_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

test_auroc = test(lstm_pool_classifier, cola_test_iterator, device)

print(f'| Test AUROC: {test_auroc:.5f}')

Epoch: 01 | Time: 0m 4s
	Train Loss: 0.60776
	 Val. Loss: 0.61853
Epoch: 02 | Time: 0m 4s
	Train Loss: 0.59857
	 Val. Loss: 0.61682
Epoch: 03 | Time: 0m 4s
	Train Loss: 0.57342
	 Val. Loss: 0.60749
Epoch: 04 | Time: 0m 4s
	Train Loss: 0.53378
	 Val. Loss: 0.62361
Epoch: 05 | Time: 0m 4s
	Train Loss: 0.47542
	 Val. Loss: 0.67913
Epoch: 06 | Time: 0m 4s
	Train Loss: 0.40751
	 Val. Loss: 0.75237
Epoch: 07 | Time: 0m 4s
	Train Loss: 0.34357
	 Val. Loss: 0.85223
Epoch: 08 | Time: 0m 4s
	Train Loss: 0.27945
	 Val. Loss: 0.93242
Epoch: 09 | Time: 0m 4s
	Train Loss: 0.22622
	 Val. Loss: 1.07371
Epoch: 10 | Time: 0m 4s
	Train Loss: 0.18782
	 Val. Loss: 1.20898
Epoch: 11 | Time: 0m 4s
	Train Loss: 0.14294
	 Val. Loss: 1.29697
Epoch: 12 | Time: 0m 4s
	Train Loss: 0.12605
	 Val. Loss: 1.37011
Epoch: 13 | Time: 0m 4s
	Train Loss: 0.11195
	 Val. Loss: 1.43069
Epoch: 14 | Time: 0m 4s
	Train Loss: 0.09254
	 Val. Loss: 1.57049
Epoch: 15 | Time: 0m 4s
	Train Loss: 0.08187
	 Val. Loss: 1.59783
Epoch: 16 

In [11]:
pool_sat_test_auroc = test(lstm_pool_classifier, sat_test_iterator, device)
pool_sat_test_auroc

0.6262626262626263

## Fine Tuning

In [12]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20


for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(lstm_pool_classifier, sat_train_iterator, optimizer, bce_loss_fn, device)
    valid_loss = evaluate(lstm_pool_classifier, sat_valid_iterator, bce_loss_fn, device)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

pool_tuned_test_auroc = test(lstm_pool_classifier, sat_test_iterator, device)

print(f'| Test AUROC: {pool_tuned_test_auroc:.5f}')

Epoch: 01 | Time: 0m 0s
	Train Loss: 0.83396
	 Val. Loss: 0.60393
Epoch: 02 | Time: 0m 0s
	Train Loss: 0.49799
	 Val. Loss: 0.53601
Epoch: 03 | Time: 0m 0s
	Train Loss: 0.49876
	 Val. Loss: 0.49614
Epoch: 04 | Time: 0m 0s
	Train Loss: 0.48081
	 Val. Loss: 0.48205
Epoch: 05 | Time: 0m 0s
	Train Loss: 0.46224
	 Val. Loss: 0.49455
Epoch: 06 | Time: 0m 0s
	Train Loss: 0.42684
	 Val. Loss: 0.45782
Epoch: 07 | Time: 0m 0s
	Train Loss: 0.45990
	 Val. Loss: 0.52117
Epoch: 08 | Time: 0m 0s
	Train Loss: 0.41528
	 Val. Loss: 0.45893
Epoch: 09 | Time: 0m 0s
	Train Loss: 0.39553
	 Val. Loss: 0.46371
Epoch: 10 | Time: 0m 0s
	Train Loss: 0.34969
	 Val. Loss: 0.47822
Epoch: 11 | Time: 0m 0s
	Train Loss: 0.31372
	 Val. Loss: 0.55054
Epoch: 12 | Time: 0m 0s
	Train Loss: 0.26339
	 Val. Loss: 0.55815
Epoch: 13 | Time: 0m 0s
	Train Loss: 0.26382
	 Val. Loss: 0.66740
Epoch: 14 | Time: 0m 0s
	Train Loss: 0.27018
	 Val. Loss: 0.55313
Epoch: 15 | Time: 0m 0s
	Train Loss: 0.31335
	 Val. Loss: 0.56001
Epoch: 16 

In [13]:
lstm_sat_test_auroc, lstm_tuned_test_auroc, pool_sat_test_auroc, pool_tuned_test_auroc

(0.38383838383838387,
 0.4393939393939394,
 0.6262626262626263,
 0.7777777777777778)