In [1]:
import torch
import json
import numpy as np
import random
import hanlp

from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from collections import Counter
from torch.optim import AdamW

In [2]:
# Define Hyperparameters
EPOCH_NUMBERS = 20
BATCH_SIZE = 16
WIQA_TRAIN = '../datasets/wiqa-dataset-v2-october-2019/train.jsonl'
WIQA_TEST = '../datasets/wiqa-dataset-v2-october-2019/test.jsonl'

In [3]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [4]:
with open(WIQA_TRAIN, 'r') as json_file:
    train_data = [json.loads(line) for line in json_file]

with open(WIQA_TEST, 'r') as json_file:
    test_data = [json.loads(line) for line in json_file]

In [5]:
# Load the BERT tokenizer and model
HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 3, # The number of output labels. Here, it's 3: more, less, no effect.
    output_attentions = False, 
    output_hidden_states = False, 
)

# Extract context (para_steps), questions (stem), and labels (answer_label)
contexts=[]
questions = []
labels = []

for item in train_data:
    para = " ".join([p.strip() for p in item["question"]["para_steps"] if len(p) > 0])
    contexts.append(para)
    questions.append(item['question']['stem'].strip())
    labels.append(item['question']['answer_label'].strip())

                                             

In [6]:
# Create separate lists for each answer option
options = ["more", "less", "no_effect"]
labels_option = [options.index(label) for label in labels]

# Tokenization Process with Constituency Parsing
def encode_example(context, question, option):
    hanlp_parser = HanLP([question])
    parser_question = hanlp_parser['con']
    
    # Join together the context, question, and answer option using BERT's special tokens
    encoded = tokenizer.encode_plus(f"[CLS] {context} [SEP] {parser_question} [SEP] {option}", truncation=True, padding='max_length', max_length=512)
    return encoded["input_ids"], encoded["attention_mask"]

train_encodings = []
for i in tqdm(range(len(contexts))):
    context = contexts[i]
    question = questions[i]
    for option in options:
        encoding = encode_example(context, question, option)
        train_encodings.append(encoding)

train_labels_option = []
for label in labels_option:
    for _ in options:
        train_labels_option.append(label)

100%|██████████| 29808/29808 [00:11<00:00,  8.71it/s]


In [7]:
# Convert to PyTorch data types
train_dataset = TensorDataset(torch.tensor([item[0] for item in train_encodings]), torch.tensor([item[1] for item in train_encodings]), torch.tensor(train_labels_option))

label_dict = {"more": 0, "less": 1, "no_effect": 2}

# Create data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [8]:
# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(device)

cuda


In [9]:
optimizer = AdamW(model.parameters(), lr=2e-5)

# Adjust model forward pass
def train(epoch):
    model.train()
    total_loss = 0
    for _,data in enumerate(train_loader, 0):
        ids = data[0].to(device, dtype = torch.long)
        mask = data[1].to(device, dtype = torch.long)
        targets = data[2].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        logits = outputs.logits

        # Compute the cross-entropy loss
        loss_function = torch.nn.CrossEntropyLoss()
        loss = loss_function(logits.view(-1, 3), targets.view(-1))
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
    print(f'Average loss in epoch {epoch}: {total_loss/len(train_loader)}')

# Train the model
for epoch in range(1, EPOCH_NUMBERS+1):
    train(epoch)

Average loss in epoch 1: 1.102248163599717
Average loss in epoch 2: 1.0745071862873279
Average loss in epoch 3: 1.022660704035508
Average loss in epoch 4: 0.9097064670763517
Average loss in epoch 5: 0.9007854367557325
Average loss in epoch 6: 0.7459529214783719
Average loss in epoch 7: 0.6374646142909401
Average loss in epoch 8: 0.5237770519758526
Average loss in epoch 9: 0.3616815044691688
Average loss in epoch 10: 0.2635155759359661
Average loss in epoch 11: 0.24531049712708122
Average loss in epoch 12: 0.15274907805417715
Average loss in epoch 13: 0.12143663297358312
Average loss in epoch 14: 0.09793458271183465
Average loss in epoch 15: 0.0798404565767238
Average loss in epoch 16: 0.09473790316597412
Average loss in epoch 17: 0.04080364059068655
Average loss in epoch 18: 0.04383237743260045
Average loss in epoch 19: 0.03539923622616028
Average loss in epoch 20: 0.031377225231967475


In [10]:
def predict(para_steps, question):
    # Join the steps into a single text
    context = ' '.join(para_steps)
    
    # Prepare the inputs for the model
    inputs = tokenizer(context, question, return_tensors='pt', max_length=512, padding='max_length', truncation=True)

    # Move the inputs to the device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    
    # Get the model's predictions
    output = model(**inputs)

    _, prediction = torch.max(output.logits, dim=1)

    # Convert numerical prediction back to original label
    reverse_label_dict = {v: k for k, v in label_dict.items()}
    return reverse_label_dict[prediction.item()]

In [11]:
# List to store the true and predicted labels
y_true = []
y_pred = []

# Map text to corresponding label
label_dict = {"more": 0, "less": 1, "no_effect": 2}


for entry in tqdm(test_data):
    question = entry['question']['stem']
    para_steps = entry['question']['para_steps']
    true_answer = entry['question']['answer_label']
    predicted_answer = predict(para_steps, question)
    
    y_true.append(true_answer)
    y_pred.append(predicted_answer)

100%|██████████| 3003/3003 [00:34<00:00, 86.34it/s]


In [12]:
# Calculate the F1 score
f1 = f1_score(y_true, y_pred, average='weighted')

print(f'F1 score: {f1}')

F1 score: 0.7325377661080531
