In [1]:
import torch
import json
import numpy as np
import random

from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from collections import Counter
from torch.optim import AdamW

In [2]:
# Define Hyperparameters
EPOCH_NUMBERS = 10
BATCH_SIZE = 16
WIQA_TRAIN = '../datasets/wiqa-dataset-v2-october-2019/train.jsonl'
WIQA_TEST = '../datasets/wiqa-dataset-v2-october-2019/test.jsonl'

In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [4]:
with open(WIQA_TRAIN, 'r') as json_file:
    train_data = [json.loads(line) for line in json_file]

with open(WIQA_TEST, 'r') as json_file:
    test_data = [json.loads(line) for line in json_file]

In [5]:
# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 3, # The number of output labels. Here, it's 3: more, less, no effect.
    output_attentions = False, 
    output_hidden_states = False, 
)

# Extract context (para_steps), questions (stem), and labels (answer_label)
contexts = [". ".join(item['question']['para_steps']) for item in train_data]
questions = [item['question']['stem'][:-1] + "?" for item in train_data]
labels = [item['question']['answer_label'] for item in train_data]

# Convert labels to numerical form ("more": 0, "less": 1, "no effect": 2)
label_dict = {"more": 0, "less": 1, "no_effect": 2}
labels = [label_dict[label] for label in labels]

# Split your data into train and validation sets
train_contexts, val_contexts, train_questions, val_questions, train_labels, val_labels = train_test_split(contexts, questions, labels, test_size=.2)

# Tokenize the context and questions
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

# Convert to PyTorch data types
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(train_labels))
val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']), torch.tensor(val_encodings['attention_mask']), torch.tensor(val_labels))

# Create data loaders
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=BATCH_SIZE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
# Count the number of instances for each label
label_counts = Counter(labels)

# Print the counts
for label, count in label_counts.items():
    print(f"Label: {label}, Count: {count}")
print("As you can see we have balanced classes so we do not need class weighting")

Label: 0, Count: 9936
Label: 2, Count: 9936
Label: 1, Count: 9936
As you can see we have balanced classes so we do not need class weighting


In [7]:
# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(device)

cuda


In [8]:
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define the training function
def train(epoch):
    model.train()
    for _,data in enumerate(train_loader, 0):
        ids = data[0].to(device, dtype = torch.long)
        mask = data[1].to(device, dtype = torch.long)
        targets = data[2].to(device, dtype = torch.long)

        outputs = model(ids, mask, labels = targets)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            
        # Validation step
        if _ % 5000 == 0:  # adjust this to run validation every N steps
            model.eval()
            val_losses = []
            for val_data in val_loader:
                val_ids = val_data[0].to(device, dtype = torch.long)
                val_mask = val_data[1].to(device, dtype = torch.long)
                val_targets = val_data[2].to(device, dtype = torch.long)

                with torch.no_grad():
                    outputs = model(val_ids, val_mask, labels=val_targets)
                    val_loss = outputs[0]
                    val_losses.append(val_loss.item())
                    
            avg_val_loss = sum(val_losses) / len(val_losses)
            print(f'Epoch: {epoch}, Validation Loss: {avg_val_loss}')
                    

            
            # Switch back to train mode
            model.train()

# Train the model
for epoch in range(1, EPOCH_NUMBERS+1):
    train(epoch)

Epoch: 1, Loss:  1.1616610288619995
Epoch: 1, Validation Loss: 1.1415713012378272
Epoch: 2, Loss:  0.49950483441352844
Epoch: 2, Validation Loss: 0.6087750989054866
Epoch: 3, Loss:  0.41587209701538086
Epoch: 3, Validation Loss: 0.5827380743966345
Epoch: 4, Loss:  0.40718644857406616
Epoch: 4, Validation Loss: 0.5942742222915066
Epoch: 5, Loss:  0.4850940406322479
Epoch: 5, Validation Loss: 0.5939471165552216
Epoch: 6, Loss:  0.33411142230033875
Epoch: 6, Validation Loss: 0.6087316660555054
Epoch: 7, Loss:  0.4086441099643707
Epoch: 7, Validation Loss: 0.6254913878025382
Epoch: 8, Loss:  0.5053379535675049
Epoch: 8, Validation Loss: 0.6316750355204692
Epoch: 9, Loss:  0.6028084754943848
Epoch: 9, Validation Loss: 0.6278014159873727
Epoch: 10, Loss:  0.3517250120639801
Epoch: 10, Validation Loss: 0.60191412437857


In [9]:
def predict(para_steps, question):
    # Join the steps into a single text
    context = ' '.join(para_steps)
    
    # Prepare the inputs for the model
    inputs = tokenizer(context, question, return_tensors='pt', max_length=512, padding='max_length', truncation=True)

    # Move the inputs to the device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    
    # Get the model's predictions
    output = model(**inputs)

    _, prediction = torch.max(output.logits, dim=1)

    # Convert numerical prediction back to original label
    reverse_label_dict = {v: k for k, v in label_dict.items()}
    return reverse_label_dict[prediction.item()]

In [None]:
# List to store the true and predicted labels
y_true = []
y_pred = []

# Map text to corresponding label
text_to_label = {'more': 0, 'less': 1, 'no_effect': 2}

for entry in tqdm(test_data):
    question = entry['question']['stem']
    para_steps = entry['question']['para_steps']
    true_answer = entry['question']['answer_label']
    predicted_answer = predict(para_steps, question)
    
    y_true.append(true_answer)
    y_pred.append(predicted_answer)

100%|██████████| 3003/3003 [00:35<00:00, 85.39it/s]


In [None]:
# Calculate the F1 score
f1 = f1_score(y_true, y_pred, average='weighted')

print(f'F1 score: {f1}')

F1 score: 0.6753108120784266
