## Data preprocessing

In [2]:
import pandas as pd
import numpy as np

In [3]:
df_train = pd.read_csv("../data/train.csv")
df_train["combined"] = df_train["paragraph1"].astype(str) + " " + df_train["paragraph2"]

df_val  = pd.read_csv("../data/validation.csv")
df_val["combined"] = df_val["paragraph1"].astype(str) + " " + df_val["paragraph2"]

df_test = pd.read_csv("../data/test.csv")
df_test["combined"] = df_test["paragraph1"].astype(str) + " " + df_test["paragraph2"]

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import spacy

spacy_eng = spacy.load("en_core_web_sm")

class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<UNK>"}
        self.stoi = {"<PAD>": 0, "<UNK>": 1}
        self.freq_threshold = freq_threshold
        
    def __len__(self):
        return len(self.itos)
    
    @staticmethod
    def tokenizer_eng(text):
        # remove stopwords and convert to lowercase

        for word in spacy_eng.Defaults.stop_words:
            lexeme = spacy_eng.vocab[word]
            lexeme.is_stop = True


        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]
    
    def build_vocabulary(self, sentence_list):
        frequencies = {}
        idx = 2
        
        for sentence in sentence_list:
            for word in self.tokenizer_eng(sentence):
                if word not in frequencies:
                    frequencies[word] = 1
                else:
                    frequencies[word] += 1
                    
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized_text = self.tokenizer_eng(text)
        
        return [
            self.stoi[token] if token in self.stoi else self.stoi["<UNK>"]
            for token in tokenized_text
        ]


class CustomDataset(Dataset):
    def __init__(self, data_csv, freq_threshold=1):
        self.df = data_csv

        self.paragraph1 = self.df["paragraph1"]
        self.paragraph2 = self.df["paragraph2"]
        self.labels = self.df["label"]

        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.paragraph1.tolist())
        self.vocab.build_vocabulary(self.paragraph2.tolist())

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        paragraph1 = self.paragraph1.iloc[idx]
        paragraph2 = self.paragraph2.iloc[idx]
        label = self.labels.iloc[idx]
        
        paragraph1 = torch.tensor(self.vocab.numericalize(paragraph1))
        paragraph2 = torch.tensor(self.vocab.numericalize(paragraph2))

        return paragraph1, paragraph2, label
    

class MyCollate:
    def __init__(self, pad_idx):
        self.pad_idx = pad_idx
        
    def __call__(self, batch):
        paragraphs1 = [item[0] for item in batch]
        paragraphs2 = [item[1] for item in batch]
        labels = [item[2] for item in batch]
        
        paragraphs1 = pad_sequence(paragraphs1, batch_first=False, padding_value=self.pad_idx)
        paragraphs2 = pad_sequence(paragraphs2, batch_first=False, padding_value=self.pad_idx)
        
        return paragraphs1, paragraphs2, torch.tensor(labels).to(torch.float32)
    
    
def get_loader(data_csv, batch_size=32, num_workers=4, shuffle=True, pin_memory=True):
    dataset = CustomDataset(data_csv)
    pad_idx = dataset.vocab.stoi["<PAD>"]
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx),
    )
    
    return loader

def glove2dict(glove_path):
    with open(glove_path, 'r') as f:
        model = {}
        for line in f:
            splitLine = line.split()
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
        
        model["<PAD>"] = np.zeros(100)
        model["<UNK>"] = np.mean(list(model.values()), axis=0)
        
    return model

def get_pretrained_weights(glove_path, word2idx, embedding_dim):
    glove = glove2dict(glove_path)
    weights_matrix = np.zeros((len(word2idx), embedding_dim))

    for word, idx in word2idx.items():
        try:
            weights_matrix[idx] = glove[word]
        except KeyError:
            pass
    return weights_matrix

dataset = CustomDataset(df_train)
weights_matrix = torch.tensor(get_pretrained_weights("glove.6B.100d.txt", dataset.vocab.stoi, 100))

# train_loader = get_loader(df_train)
# next(iter(train_loader))

In [5]:
import torch.nn as nn

def create_emb_layer(weights_matrix, non_trainable=True):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

class SiameseNetwork(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers):
        super().__init__()
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, True)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=False, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x1, x2):
        x1 = self.embedding(x1)
        x2 = self.embedding(x2)
        x1, _ = self.lstm(x1)
        x2, _ = self.lstm(x2)
        x1 = self.fc(x1[-1, :, :])
        x2 = self.fc(x2[-1, :, :])
        #cosine similarity between x1 and x2
        #x = torch.nn.functional.cosine_similarity(x1, x2, dim=1).unsqueeze(1)
        x = torch.abs(x1 - x2)
        x = self.sigmoid(x)

        return x

In [6]:

def train(model, iterator, optimizer, criterion, device):
    model.train()
    epoch_loss = 0

    for paragraphs1, paragraphs2, labels in iterator:
        paragraphs1, paragraphs2, labels = paragraphs1.to(device), paragraphs2.to(device), labels.to(device)
        optimizer.zero_grad()
        output = model(paragraphs1, paragraphs2).squeeze()
        #print(output.shape, labels.shape)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [7]:
from torcheval.metrics import BinaryF1Score


def evaluate(model, iterator, criterion, device):
    metric = BinaryF1Score()
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for paragraphs1, paragraphs2, labels in iterator:
            paragraphs1, paragraphs2, labels = paragraphs1.to(device), paragraphs2.to(device), labels.to(device)
            output = model(paragraphs1, paragraphs2).squeeze()
            loss = criterion(output, labels)
            epoch_loss += loss.item()
            metric.update(output, labels)


    return epoch_loss / len(iterator), metric.compute()

In [8]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SiameseNetwork(weights_matrix, 64, 4).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

train_loader = get_loader(df_train, batch_size=16)
val_loader = get_loader(df_val, batch_size=16)

In [9]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, device)

    print(f"Epoch: {epoch+1}")
    print(f"Train Loss: {train_loss:.3f}")

Epoch: 1
Train Loss: 0.654
Epoch: 2
Train Loss: 0.585
Epoch: 3
Train Loss: 0.580
Epoch: 4
Train Loss: 0.573
Epoch: 5
Train Loss: 0.582


In [10]:


df_test = pd.read_csv("../data/test.csv")#
df_test["combined"] = df_test["paragraph1"].astype(str) + " " + df_test["paragraph2"]

df_test_easy = df_test[df_test["difficulty"] == "easy"]
df_test_medium = df_test[df_test["difficulty"] == "medium"]
df_test_hard = df_test[df_test["difficulty"] == "hard"]
test_loader_easy = get_loader(df_test_easy)
test_loader_medium = get_loader(df_test_medium)
test_loader_hard = get_loader(df_test_hard)

results_easy = evaluate(model, test_loader_easy, criterion, device)
results_medium = evaluate(model, test_loader_medium, criterion, device)
results_hard = evaluate(model, test_loader_hard, criterion, device)

print(f"Easy Test Loss: {results_easy[0]:.3f}, F1 Score: {results_easy[1]:.7f}")
print(f"Medium Test Loss: {results_medium[0]:.3f}, F1 Score: {results_medium[1]:.7f}")
print(f"Hard Test Loss: {results_hard[0]:.3f}, F1 Score: {results_hard[1]:.7f}")

Easy Test Loss: 0.682, F1 Score: 0.9454340
Medium Test Loss: 0.690, F1 Score: 0.7278308
Hard Test Loss: 0.696, F1 Score: 0.6261510
