In [10]:
import torch
import json

from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import f1_score
from tqdm import tqdm

In [11]:
# Check if CUDA is available and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# Load pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Move the model to the device
model.to(device)
print(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [13]:
def predict_answer(para_steps, question):
    # Join the steps into a single text
    text = ' '.join(para_steps)
    
    # Prepare the inputs for the model
    inputs = tokenizer(text, question, return_tensors='pt')

    # Move the inputs to the device
    inputs = {name:tensor.to(device) for name, tensor in inputs.items()}
    
    # Get the model's predictions
    outputs = model(**inputs)

    # The model returns the logits (predictions before activation function)
    # We take the argmax to get the most likely answer label
    preds = torch.argmax(outputs.logits, dim=1)
    
    # Convert tensor to integer
    pred = preds.item()

    # Map predicted label to corresponding text
    label_to_text = {0: 'more', 1: 'less', 2: 'no_effect'}
    predicted_answer = label_to_text[pred]
    
    return predicted_answer

In [14]:
# Load your data
JSONL_PATH = '../datasets/wiqa-dataset-v2-october-2019/test.jsonl'
with open(JSONL_PATH, 'r') as json_file:
    data = list(json_file)

In [15]:
# List to store the true and predicted labels
y_true = []
y_pred = []

# Map text to corresponding label
text_to_label = {'more': 0, 'less': 1, 'no_effect': 2}

In [16]:
for entry in tqdm(data):
    entry = json.loads(entry)
    question = entry['question']['stem']
    para_steps = entry['question']['para_steps']
    true_answer = entry['question']['answer_label']
    predicted_answer = predict_answer(para_steps, question)
    
    y_true.append(true_answer)
    y_pred.append(predicted_answer)

100%|██████████| 3003/3003 [00:27<00:00, 108.17it/s]


In [17]:
# Calculate the F1 score
f1 = f1_score(y_true, y_pred, average='weighted')

print(f'F1 score: {f1}')

F1 score: 0.1372323448933312
