In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
class NLIDataset(Dataset):
    """Map-style dataset that tokenizes premise/hypothesis pairs on access.

    Args:
        premises: Indexable collection of premise texts (e.g. a list or a
            pandas Series).
        hypotheses: Indexable collection of hypothesis texts, aligned
            position-by-position with ``premises``.
        labels: Optional indexable collection of integer class labels. When
            ``None`` (inference), returned items carry no ``'labels'`` entry.
        tokenizer: Object exposing ``encode_plus(text, text_pair, **kwargs)``.
            It is required by ``__getitem__`` even though it defaults to
            ``None`` here.
        max_len: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, premises, hypotheses, labels=None, tokenizer=None, max_len=128):
        self.premises = premises
        self.hypotheses = hypotheses
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples, taken from the premises collection."""
        return len(self.premises)

    @staticmethod
    def _value_at(values, position):
        """Return the element at 0-based ``position`` of ``values``.

        pandas objects are read through ``.iloc`` because plain ``values[i]``
        on a Series is a *label* lookup: with a non-default index (filtered,
        shuffled or concatenated frame) it raises ``KeyError`` or returns a
        different row than the one the sampler asked for.
        """
        positional = getattr(values, 'iloc', None)
        if positional is not None:
            return positional[position]
        return values[position]

    def __getitem__(self, item):
        """Encode the pair at position ``item``.

        Returns:
            dict with 1-D tensors ``'input_ids'`` and ``'attention_mask'``
            and, when labels were supplied, a ``torch.long`` scalar tensor
            under ``'labels'``. Any other field the tokenizer returns is
            not forwarded.
        """
        premise = str(self._value_at(self.premises, item))
        hypothesis = str(self._value_at(self.hypotheses, item))
        encoding = self.tokenizer.encode_plus(
            premise,
            hypothesis,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis; flatten it away so
        # the DataLoader can stack items into (batch, max_len) itself.
        data = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        # create label
        if self.labels is not None:
            label = self._value_at(self.labels, item)
            data['labels'] = torch.tensor(label, dtype=torch.long)

        return data


In [4]:
# Tokenizer used to encode every premise/hypothesis pair.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Read the CSV; only its 'premise' and 'hypothesis' columns are used below.
trial_data = pd.read_csv('./test.csv')

# No labels are passed, so dataset items contain model inputs only.
trial_dataset = NLIDataset(
    premises=trial_data['premise'],
    hypotheses=trial_data['hypothesis'],
    tokenizer=tokenizer,
    max_len=128,
)

# shuffle=False keeps batches in the same order as the CSV rows.
trial_loader = DataLoader(dataset=trial_dataset, batch_size=16, shuffle=False)


In [15]:
#Load model
# Pick the device first so the checkpoint can be mapped straight onto it;
# without map_location a checkpoint saved on GPU fails to load on a CPU-only host.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The file is used directly as a module below, i.e. it holds a whole pickled
# model rather than a state_dict, so weights_only must be False (the default
# became True in PyTorch 2.6 and would reject it).
# NOTE(security): full unpickling can execute arbitrary code - only load
# checkpoints from a trusted source.
model = torch.load('./bert_model.pth', map_location=device, weights_only=False)
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [19]:
# function to output predictions
def generate_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect the arg-max class per example.

    Args:
        model: Callable module invoked as
            ``model(input_ids, attention_mask=attention_mask)`` whose result
            exposes a ``.logits`` tensor; the arg-max is taken over dim 1.
        data_loader: Iterable of batches, each a mapping with tensor entries
            ``'input_ids'`` and ``'attention_mask'``.
        device: Device the batch tensors are moved to. Defaults to the device
            of the model's first parameter (CPU if it has none), so the
            function no longer depends on a notebook-global ``device``.

    Returns:
        list: One predicted class index per example, in loader order
        (elements are NumPy integer scalars).
    """
    if device is None:
        # Follow the model so inputs and weights always share a device.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()  # switch the module to evaluation mode before predicting
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
    return predictions

# Predicted class index for every example yielded by trial_loader.
predictions = generate_predictions(model=model, data_loader=trial_loader)

In [20]:
# Save predictions to a single-column dataframe and write it out as CSV
# (index=False: the row index is not written to the file).
df_predictions = pd.DataFrame({'Prediction': predictions})
df_predictions.to_csv('predictions_test.csv', index=False)


In [21]:
# Read the written file back and print its first five rows as a sanity check.
df_predictions = pd.read_csv('predictions_test.csv')
print(df_predictions.head(n=5))

   Prediction
0           1
1           1
2           1
3           1
4           0
