# Demo code
In this notebook, the input CSV file is evaluated by loading an already trained model. The approach is
Sentence-Level Evidence Embedding for Claim Verification with Hierarchical Attention Networks, implemented according to [research done by Jing MA, Wei GAO, Shafiq JOTY and Kam-Fai Wong at Singapore Management University](https://ink.library.smu.edu.sg/sis_research/4557/)


Download and import necessary modules

In [45]:
!pip install torch torchvision
!pip install numpy
!pip install pandas
!pip install google.colab
!pip install nltk



In [46]:
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk
import torch.nn as nn
import torch.nn.functional as F
import io
import os
from google.colab import files
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so fetch both; on older releases the first call
# only reports an unknown package and returns False.
nltk.download('punkt_tab')
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

All of the classes and functions below were developed during training. They are used here to load the already trained model, to prepare the data in the correct format, and to apply GloVe embeddings to the test data.

In [47]:
class SequenceAttentionNetwork(nn.Module):
    """GRU encoder whose outputs are pooled by two attention heads and classified.

    Args:
        embedding_dim: Size of each input embedding vector.
        hidden_size: Size of the GRU hidden state.
        num_classes: Number of output logits.
    """

    def __init__(self, embedding_dim, hidden_size, num_classes):
        super().__init__()
        # Attribute names and creation order are kept stable: the names are the
        # keys of the saved state dict that load_state_dict() must match.
        # Recurrent layer over the sequence; batch_first=True puts the batch on axis 0.
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        # Scores every GRU output with a hidden_size-wide vector (one score per feature).
        self.coherence_attention = nn.Linear(hidden_size, hidden_size)
        # Scores every GRU output with a single scalar.
        self.entailment_attention = nn.Linear(hidden_size, 1)
        # Maps the pooled representation to class logits.
        self.classifier = nn.Linear(hidden_size, num_classes)

    @staticmethod
    def _attention_pool(scores, states):
        """Softmax-normalise `scores` along axis 1 and return the weighted sum of `states`."""
        weights = F.softmax(scores, dim=1)
        return (weights * states).sum(dim=1)

    def forward(self, embeddings):
        """Return class logits for a batch of embedded sequences.

        `embeddings` is expected to be (batch, seq_len, embedding_dim), as implied
        by batch_first=True on the GRU.
        """
        states, _ = self.gru(embeddings)

        # Two independent poolings of the same GRU states.
        coherence_context = self._attention_pool(self.coherence_attention(states), states)
        entailment_context = self._attention_pool(self.entailment_attention(states), states)

        # The pooled vectors are summed before classification.
        return self.classifier(coherence_context + entailment_context)

def simple_tokenize(text):
    """Lower-case `text` and split it into tokens with NLTK's word_tokenize."""
    lowered = text.lower()
    return word_tokenize(lowered)

# Converts text into a fixed-length array of embeddings.
def text_to_embeddings(text, embeddings_dict, max_length, embedding_dim=100):
    """Embed `text` token by token into a (max_length, embedding_dim) array.

    Args:
        text: Raw text; it is tokenized with simple_tokenize.
        embeddings_dict: Mapping from token to its embedding vector.
        max_length: Number of rows in the result. Longer texts are truncated,
            shorter ones are padded with zero rows.
        embedding_dim: Length of each embedding vector (defaults to 100, the
            value that was previously hard-coded).

    Returns:
        A float32 numpy array of shape (max_length, embedding_dim). Tokens that
        are missing from `embeddings_dict` are left as zero rows.
    """
    tokens = simple_tokenize(text)[:max_length]
    # Pre-allocating gives a fixed shape and dtype even when the text is empty
    # or every token is unknown; unfilled rows double as padding.
    matrix = np.zeros((max_length, embedding_dim), dtype=np.float32)
    for row, token in enumerate(tokens):
        vector = embeddings_dict.get(token)
        if vector is not None:
            matrix[row] = vector
    return matrix

# Custom class that extends PyTorch's Dataset class.
# It is designed to handle loading and transforming text data for a model
class TextDataset(Dataset):
    """Dataset yielding one embedded "claim + evidence" sequence per DataFrame row.

    Args:
        embeddings_dict: Mapping from token to embedding vector.
        df: DataFrame with 'Claim' and 'Evidence' columns.
        max_length: Number of token positions per example (truncate / zero-pad).
    """

    def __init__(self, embeddings_dict, df, max_length=256):
        self.embeddings_dict = embeddings_dict
        # pandas reads empty CSV cells as NaN (a float); concatenating that with
        # a str in __getitem__ would raise TypeError, so missing cells become "".
        self.claims = df['Claim'].fillna('').astype(str).tolist()
        self.evidences = df['Evidence'].fillna('').astype(str).tolist()
        self.max_length = max_length

    def __len__(self):
        """Return the number of claim/evidence pairs."""
        return len(self.claims)

    def __getitem__(self, idx):
        """Return the float32 embedding tensor for row `idx` (what DataLoader batches)."""
        # The claim and its evidence are joined into one text before embedding.
        text = self.claims[idx] + " " + self.evidences[idx]
        embeddings = text_to_embeddings(text, self.embeddings_dict, self.max_length)
        embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32)

        return embeddings_tensor


Uploads the model and the test set to Google Colab. **Change the names to fit the specific file**

In [48]:
if not os.path.exists('test.csv') or not os.path.exists('best_model_b.pth'):
  uploaded = files.upload()

if not os.path.exists('glove.6B.zip'):
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip glove.6B.zip -d glove.6B

Load csv file, model and GloVe embeddings

In [49]:
def load_glove_embeddings(path):
    """Read a GloVe text file into a dict mapping each word to a float32 vector.

    Each non-blank line is split on whitespace: the first field is the word and
    the remaining fields are the vector components.

    Args:
        path: Path to a UTF-8 GloVe text file such as glove.6B.100d.txt.

    Returns:
        dict of str -> numpy float32 array.
    """
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            # Blank lines have no word field; skip them rather than fail on values[0].
            if not values:
                continue
            word, components = values[0], values[1:]
            embeddings_dict[word] = np.asarray(components, dtype="float32")
    return embeddings_dict

# Rebuilds the network and restores its trained weights.
def load_model(model_path, embedding_dim, hidden_size, num_classes):
    """Build a SequenceAttentionNetwork, load weights from `model_path`, return it in eval mode.

    The three size arguments must match the ones the checkpoint was trained with,
    otherwise load_state_dict() rejects the saved tensors.
    """
    network = SequenceAttentionNetwork(embedding_dim, hidden_size, num_classes)
    # Tensors are mapped onto the CPU regardless of where they were saved from.
    # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
    cpu = torch.device('cpu')
    state_dict = torch.load(model_path, map_location=cpu)
    network.load_state_dict(state_dict)
    # eval() switches the module to inference mode and returns the module itself.
    return network.eval()



# 100-dimensional GloVe vectors; the dimension must match embedding_dim below.
glove_path = 'glove.6B/glove.6B.100d.txt'
glove_embeddings = load_glove_embeddings(glove_path)

# Preparing test data. shuffle=False keeps predictions in the CSV's row order.
test_df = pd.read_csv('dev.csv')
test_dataset = TextDataset(glove_embeddings, test_df)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Use GPU acceleration for prediction when it is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model and move it to `device`: predict() sends each batch there, and
# the model's weights must live on the same device as its inputs.
model = load_model('best_model_b.pth', embedding_dim=100, hidden_size=256, num_classes=2).to(device)


Predicts the labels for the test set.

In [50]:
def predict(model, dataloader):
    """Return the predicted class index for every example yielded by `dataloader`.

    Args:
        model: Module mapping a batch of embeddings to per-class logits.
        dataloader: Iterable yielding batches of embedding tensors.

    Returns:
        A flat list with one predicted class index per example, in iteration order.
    """
    # Send each batch to wherever the model's weights live instead of relying on
    # a global `device`, so inputs and weights can never end up on different devices.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    predictions = []
    # Gradients are not needed for inference; disabling them saves memory and time.
    with torch.no_grad():
        for embeddings in dataloader:
            outputs = model(embeddings.to(model_device))
            # The index of the largest logit along axis 1 is the predicted class.
            predicted = outputs.argmax(dim=1)
            # Bring results back to the CPU and append them to the running list.
            predictions.extend(predicted.cpu().numpy())
    return predictions

# Run inference over the whole loader; it was built with shuffle=False, so the
# predictions come out in the same order as the rows of the CSV.
predictions = predict(model, test_loader)


Saves the predictions to a csv file

In [52]:
# One 'prediction' column, one row per example; the index is not written out.
predictions_df = pd.DataFrame(data={'prediction': predictions})
predictions_df.to_csv('Group_70_b.csv', index=False)