In [1]:
import torch
import json
import numpy as np
import random
import os

from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AutoModel
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from collections import Counter
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# Define Hyperparameters
EPOCH_NUMBERS = 10
BATCH_SIZE = 16
WIQA_TRAIN = '../datasets/wiqa-dataset-v2-october-2019/train.jsonl'
WIQA_DEV = '../datasets/wiqa-dataset-v2-october-2019/dev.jsonl'
WIQA_TEST = '../datasets/wiqa-dataset-v2-october-2019/test.jsonl'

In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [4]:
with open(WIQA_TRAIN, 'r') as json_file:
    train_data = [json.loads(line) for line in json_file]
    
with open(WIQA_DEV, 'r') as json_file:
    dev_data = [json.loads(line) for line in json_file]

with open(WIQA_TEST, 'r') as json_file:
    test_data = [json.loads(line) for line in json_file]

In [5]:
# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 3,  # The number of output labels: "more", "less", "no_effect".
    output_attentions = False, 
    output_hidden_states = False
)

# Define a mapping for answer options
options = ["more", "less", "no_effect"]
label_dict = {option: i for i, option in enumerate(options)}

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, label_dict):
        self.examples = []
        self.tokenizer = tokenizer
        self.label_dict = label_dict

        for item in data:
            context = " ".join([p.strip() for p in item["question"]["para_steps"] if len(p) > 0])
            question = item['question']['stem'].strip()
            label = item['question']['answer_label'].strip()

            for option in options:
                encoding = self.tokenizer(context, f"{question} {option}", truncation=True, padding='max_length', max_length=512)
                self.examples.append({
                    "input_ids": encoding["input_ids"],
                    "attention_mask": encoding["attention_mask"],
                    "label": self.label_dict[label] if label == option else self.label_dict['no_effect']
                })

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.examples[idx]["input_ids"]),
            "attention_mask": torch.tensor(self.examples[idx]["attention_mask"]),
            "labels": torch.tensor(self.examples[idx]["label"])
        }

    def __len__(self):
        return len(self.examples)


# Assuming your original data is stored in `data`
# train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = CustomDataset(train_data, tokenizer, label_dict)
dev_dataset = CustomDataset(dev_data, tokenizer, label_dict)

train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=16)
dev_loader = DataLoader(dataset=dev_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=16)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

cuda


In [7]:
# Define the directory where you want to save your models
save_directory = 'saved_models'

# Make sure the directory exists
if not os.path.exists(save_directory):
    os.makedirs(save_directory)

In [8]:
# Set the optimizer (Adam is a common choice)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Define the training function
def train_epoch(model, data_loader, optimizer):
    model.train()
    total_loss = 0

    progress = tqdm(data_loader, desc='Training', position=0, leave=True)
    
    for batch in progress:
        # Get the inputs and labels
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Reset gradients
        optimizer.zero_grad()
 
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        # Get the loss from the outputs
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()
        
        total_loss += loss.item()
        
        # Update the progress bar
        progress.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    return total_loss / len(data_loader)


# Define the validation function
def validate_epoch(model, data_loader):
    model.eval()
    total_loss = 0
    total_accuracy = 0

    progress = tqdm(data_loader, desc='Validating', position=0, leave=True)
    
    for batch in progress:
        # Get the inputs and labels
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Forward pass with no gradient computation
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        # Get the loss from the outputs
        loss = outputs.loss
        total_loss += loss.item()

        # Compute accuracy
        preds = torch.argmax(outputs.logits, dim=1)
        total_accuracy += (preds == labels).sum().item()
        
        # Update the progress bar
        progress.set_postfix({'validation_loss': '{:.3f}'.format(loss.item()/len(batch))})

    return total_loss / len(data_loader), total_accuracy / len(data_loader.dataset)

In [9]:
# Define the early stopping criteria
n_no_improve = 0
early_stop_after_n_epochs = 3
best_loss = float('inf')

model_path = os.path.join(save_directory, f'best_model')

if os.path.exists(model_path):
    print(f'Loading model from {model_path}')
    model = BertForSequenceClassification.from_pretrained(model_path)
    model.to(device)
else:
    print('Pretrained model not found. Training a new model.')

    # Training loop
    for epoch in range(EPOCH_NUMBERS):
        print(f'Epoch {epoch+1}/{EPOCH_NUMBERS}')
        train_loss = train_epoch(model, train_loader, optimizer)
        dev_loss, dev_accuracy = validate_epoch(model, dev_loader)
        print(f'Training loss: {train_loss}')
        print(f'Validation loss: {dev_loss}')
        print(f'Validation accuracy: {dev_accuracy}')

        # Check for loss improvement
        if dev_loss < best_loss:
            best_loss = dev_loss
            n_no_improve = 0
            model.save_pretrained(model_path)
        else:
            n_no_improve += 1

        # If the validation loss hasn't improved for early_stop_after_n_epochs epochs, stop training
        if n_no_improve >= early_stop_after_n_epochs:
            print('Early stopping triggered')
            break

Pretrained model not found. Training a new model.
Epoch 1/10


Training: 100%|██████████| 5589/5589 [32:57<00:00,  2.83it/s, training_loss=0.154]
Validating: 100%|██████████| 1293/1293 [02:30<00:00,  8.59it/s, validation_loss=0.050]


Training loss: 0.383632930385915
Validation loss: 0.3764258969483961
Validation accuracy: 0.7710569577410309
Epoch 2/10


Training: 100%|██████████| 5589/5589 [32:54<00:00,  2.83it/s, training_loss=0.123]
Validating: 100%|██████████| 1293/1293 [02:30<00:00,  8.60it/s, validation_loss=0.052]


Training loss: 0.3439787210045773
Validation loss: 0.3696134852113819
Validation accuracy: 0.7635141669084228
Epoch 3/10


Training: 100%|██████████| 5589/5589 [32:54<00:00,  2.83it/s, training_loss=0.067]
Validating: 100%|██████████| 1293/1293 [02:30<00:00,  8.60it/s, validation_loss=0.040]


Training loss: 0.3348071646091283
Validation loss: 0.3606078447104192
Validation accuracy: 0.7778261290010637
Epoch 4/10


Training: 100%|██████████| 5589/5589 [32:54<00:00,  2.83it/s, training_loss=0.132]
Validating: 100%|██████████| 1293/1293 [02:30<00:00,  8.60it/s, validation_loss=0.012]


Training loss: 0.31097746046896463
Validation loss: 0.37403982880612446
Validation accuracy: 0.829803694033459
Epoch 5/10


Training: 100%|██████████| 5589/5589 [32:56<00:00,  2.83it/s, training_loss=0.030]
Validating: 100%|██████████| 1293/1293 [02:30<00:00,  8.60it/s, validation_loss=0.002]


Training loss: 0.20200832101375119
Validation loss: 0.38699030586588934
Validation accuracy: 0.8503046127067014
Epoch 6/10


Training: 100%|██████████| 5589/5589 [32:56<00:00,  2.83it/s, training_loss=0.002]
Validating: 100%|██████████| 1293/1293 [02:30<00:00,  8.60it/s, validation_loss=0.001]

Training loss: 0.1255892313758785
Validation loss: 0.47013822328482957
Validation accuracy: 0.8475002417561164
Early stopping triggered





In [10]:
def predict(para_steps, question):
    # Join the steps into a single text
    context = ' '.join(para_steps)
    
    # Prepare the inputs for the model
    inputs = tokenizer(context, question, return_tensors='pt', max_length=512, padding='max_length', truncation=True)

    # Move the inputs to the device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    
    # Get the model's predictions
    output = model(**inputs)

    _, prediction = torch.max(output.logits, dim=1)

    # Convert numerical prediction back to original label
    reverse_label_dict = {v: k for k, v in label_dict.items()}
    return reverse_label_dict[prediction.item()]

In [11]:
# List to store the true and predicted labels
y_true = []
y_pred = []

# Map text to corresponding label
label_dict = {"more": 0, "less": 1, "no_effect": 2}


for entry in tqdm(test_data):
    question = entry['question']['stem']
    para_steps = entry['question']['para_steps']
    true_answer = entry['question']['answer_label']
    predicted_answer = predict(para_steps, question)
    
    y_true.append(true_answer)
    y_pred.append(predicted_answer)

100%|██████████| 3003/3003 [00:33<00:00, 89.50it/s]


In [12]:
# Calculate the F1 score
f1 = f1_score(y_true, y_pred, average='weighted')

print(f'F1 score: {f1}')

F1 score: 0.71407537422356023
