## Data preprocessing

In [103]:
import pandas as pd
import numpy as np

In [104]:
df_train = pd.read_csv("data/train.csv")
df_train["combined"] = df_train["paragraph1"].astype(str) + " " + df_train["paragraph2"]

df_val  = pd.read_csv("data/validation.csv")
df_val["combined"] = df_val["paragraph1"].astype(str) + " " + df_val["paragraph2"]

df_test = pd.read_csv("data/test.csv")
df_test["combined"] = df_test["paragraph1"].astype(str) + " " + df_test["paragraph2"]

In [3]:
#Todo remove stopwords

ModuleNotFoundError: No module named 'regex'

In [105]:
import os
import pandas as pd
import spacy
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

spacy_eng = spacy.load("en_core_web_sm")

In [106]:
class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.freq_threshold = freq_threshold
        
    def __len__(self):
        return len(self.itos)
    
    @staticmethod
    def tokenizer_eng(text):
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
    
    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 4
        
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)
        
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]


class CustomDataset(Dataset):
    def __init__(self, data_csv, freq_threshold=5):
        self.df = data_csv

        self.paragraphs = self.df["combined"]
        self.labels = self.df["label"]

        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.paragraphs.tolist())

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        paragraphs = self.paragraphs[idx]
        label = self.labels[idx]

        numericalized_paragraph = [self.vocab.stoi["<SOS>"]]
        numericalized_paragraph += self.vocab.numericalize(paragraphs)
        numericalized_paragraph.append(self.vocab.stoi["<EOS>"])
        
        return torch.tensor(numericalized_paragraph), label
    

class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        paragraphs = [item[0] for item in batch]
        labels = [item[1] for item in batch]
        
        paragraphs = pad_sequence(paragraphs, batch_first=False, padding_value=self.pad_idx)
        
        return paragraphs, torch.tensor(labels)
    
def get_loader(data_csv, batch_size=32, num_workers=4, shuffle=True, pin_memory=True):
    dataset = CustomDataset(data_csv)
    pad_idx = dataset.vocab.stoi["<PAD>"]
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx),
    )
    
    return loader

dataloader = get_loader(df_train)

In [107]:
for idx, (paragraphs, labels) in enumerate(dataloader):
    print(paragraphs.shape)
    print(labels.shape)

torch.Size([259, 32])
torch.Size([32])
torch.Size([235, 32])
torch.Size([32])
torch.Size([249, 32])
torch.Size([32])
torch.Size([252, 32])
torch.Size([32])
torch.Size([248, 32])
torch.Size([32])
torch.Size([210, 32])
torch.Size([32])
torch.Size([284, 32])
torch.Size([32])
torch.Size([272, 32])
torch.Size([32])
torch.Size([233, 32])
torch.Size([32])
torch.Size([333, 32])
torch.Size([32])
torch.Size([224, 32])
torch.Size([32])
torch.Size([254, 32])
torch.Size([32])
torch.Size([232, 32])
torch.Size([32])
torch.Size([233, 32])
torch.Size([32])
torch.Size([539, 32])
torch.Size([32])
torch.Size([253, 32])
torch.Size([32])
torch.Size([242, 32])
torch.Size([32])
torch.Size([182, 32])
torch.Size([32])
torch.Size([298, 32])
torch.Size([32])
torch.Size([189, 32])
torch.Size([32])
torch.Size([281, 32])
torch.Size([32])
torch.Size([184, 32])
torch.Size([32])
torch.Size([289, 32])
torch.Size([32])
torch.Size([242, 32])
torch.Size([32])
torch.Size([280, 32])
torch.Size([32])
torch.Size([202, 32])
tor

In [108]:
train_loader = get_loader(df_train)
valid_loader = get_loader(df_val)
test_loader = get_loader(df_test)

In [109]:
class BinarySequenceClassifier(torch.nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, bidirectional=True)
        self.fc = torch.nn.Linear(hidden_dim*2, output_dim)
        self.dropout = torch.nn.Dropout(dropout)
        self.sigmoid = torch.nn.Sigmoid()
        
    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.rnn(embedded)
        hidden = self.dropout(output[-1, :, :])
        
        return self.sigmoid(self.fc(hidden))

In [110]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_dim = len(dataloader.dataset.vocab)
embedding_dim = 200
hidden_dim = 256
output_dim = 1
n_layers = 2
dropout = 0.5

model = BinarySequenceClassifier(input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout)
model = model.to(device)

In [111]:
from torcheval.metrics import BinaryF1Score

def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(torch.float32).to(device)

        pred = model(X).squeeze()

        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}|{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0

    metric = BinaryF1Score()

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(torch.float32).to(device)
            pred = model(X).squeeze()

            metric.update(pred, y)
            test_loss += loss_fn(pred, y).item()
            correct += (torch.round(pred) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size

    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, F1-score: {metric.compute()}, Avg loss: {test_loss:>8f} \n")


In [112]:
lr=1e-5
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
epochs = 50

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    test(valid_loader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.687704 [  250|51962]
loss: 0.679936 [24139|51962]
loss: 0.692452 [46029|51962]
loss: 0.712233 [90601|51962]
loss: 0.622166 [215738|51962]
loss: 0.691409 [89679|51962]
loss: 0.695736 [128013|51962]
loss: 0.627823 [154921|51962]
loss: 0.723487 [167409|51962]
loss: 0.622638 [238765|51962]
loss: 0.701639 [223223|51962]
loss: 0.669605 [243321|51962]
loss: 0.629686 [309858|51962]
loss: 0.600286 [215966|51962]
loss: 0.745802 [469335|51962]
loss: 0.662443 [373749|51962]
loss: 0.654818 [339412|51962]
Test Error: 
 Accuracy: 59.9%, F1-score: 0.7493436336517334, Avg loss: 0.674640 

Epoch 2
-------------------------------
loss: 0.633675 [  219|51962]
loss: 0.670774 [29694|51962]
loss: 0.700333 [36783|51962]
loss: 0.596019 [54180|51962]
loss: 0.666163 [101854|51962]
loss: 0.652889 [114729|51962]
loss: 0.617303 [133422|51962]
loss: 0.719626 [140901|51962]
loss: 0.696111 [142578|51962]
loss: 0.707457 [162180|51962]
loss: 0.718239 [350350|51962]
loss: 0

In [2]:
from torchtext.vocab import GloVe

