In [None]:
import torch
import numpy as np
import contextGenerator
import utils
import pandas as pd
import matplotlib.pyplot as plt
from optuna.storages import RDBStorage
import re
from time import perf_counter_ns
import itertools
from datasets import load_from_disk, load_dataset, DatasetDict
from transformers import (
    TrainingArguments, Trainer, BatchEncoding,
    DistilBertTokenizerFast, DefaultDataCollator, DistilBertForQuestionAnswering, 
    EarlyStoppingCallback
)


checkpoint = 'distilbert-base-cased-distilled-squad'
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
contextGen = contextGenerator.LuceneRetrieval()


try:
    ds1 = load_from_disk('../res/data/QANTA-IgnoreIMP')
    ds2 = load_from_disk('../res/data/QANTA-IncludeNA')
    ds3 = load_from_disk('../res/data/guess_train')
    
except:
    ds = load_dataset("community-datasets/qanta", "mode=first,char_skip=25")


# Preprocessing the data 
Given how BERT is a extractive model it will attempt to highlight its prediction in the provided context. In other words our task is to fine tune the model to predict the start and end positions of the answer in the context.  
#### 1. Retreive context
For each question we will need a relevent document where the answer may exist. 

In [2]:
try: 
    ds['guesstest']['context']
except:
    ds = ds.map(lambda x: {'context':  contextGen(x['full_question'], 1)[0]})

In [4]:
def reformat_context(row):
    context = row['context']
    contents = re.sub('\n', ' ', context['contents'])
    new = contextGen.remove_adj_dup(contents)
    row['context']['contents'] = new
    return row['context']

ds = ds.map(lambda x: {'context':  reformat_context(x)})

Map: 100%|██████████| 96221/96221 [00:17<00:00, 5483.00 examples/s]
Map: 100%|██████████| 16706/16706 [00:03<00:00, 5076.99 examples/s]
Map: 100%|██████████| 1055/1055 [00:00<00:00, 5606.27 examples/s]
Map: 100%|██████████| 1161/1161 [00:00<00:00, 5665.19 examples/s]
Map: 100%|██████████| 2151/2151 [00:00<00:00, 4463.02 examples/s]
Map: 100%|██████████| 1953/1953 [00:00<00:00, 5404.71 examples/s]
Map: 100%|██████████| 1145/1145 [00:00<00:00, 5758.19 examples/s]


#### 2. Find the start and end postions
The contexts and questions are just strings to so we need to find the positions for the answers in the context. 

In [5]:
try: 
    ds['test']['char_pos']
except:
    ds = ds.map(lambda x: {'char_pos':  utils.term_char_index(x['answer'], x['context']['contents'])})

Map: 100%|██████████| 96221/96221 [00:18<00:00, 5141.64 examples/s]
Map: 100%|██████████| 16706/16706 [00:03<00:00, 5461.68 examples/s]
Map: 100%|██████████| 1055/1055 [00:00<00:00, 5314.49 examples/s]
Map: 100%|██████████| 1161/1161 [00:00<00:00, 5898.43 examples/s]
Map: 100%|██████████| 2151/2151 [00:00<00:00, 4473.73 examples/s]
Map: 100%|██████████| 1953/1953 [00:00<00:00, 6093.68 examples/s]
Map: 100%|██████████| 1145/1145 [00:00<00:00, 5938.83 examples/s]


### 2. Tokenize context/question pair and find the token positions
Ensure the context comes first in the pair to align the character index with the token index. BERT limits the combined token count of context and question to 512. Since the context is capped at 400 words, this won’t cause issues, but we’ll use padding and truncation for consistency and edge cases.

In [224]:
unpack = lambda x, y, z: {"start_positions": x, "end_positions": y, "encodings": z}

def tokenize_row(row: dict, tokenizer) -> dict[str, BatchEncoding]:
    try: 
        encoding =  tokenizer(
            text = row['context']['contents'], 
            text_pair = row['full_question'], 
            padding = 'max_length', 
            truncation = 'only_first', 
            max_length = 512, 
            return_tensors = 'pt', 
            padding_side = 'right',
            return_length = False
            )
    except:
        cleaned = utils.clean_text(row['full_question'])
        encoding =  tokenizer(
            text = row['context']['contents'], 
            text_pair = cleaned, 
            padding = 'max_length', 
            truncation = 'only_first', 
            max_length = 512, 
            return_tensors = 'pt', 
            padding_side = 'right',
            return_length = False
            )
    start_pos = []
    end_pos = []
    for (x, y) in row['char_pos']:
        st = encoding.char_to_token(x)
        try:
            ed = encoding.char_to_token(y-1)
        except:
            ed = encoding.char_to_token(y)

        if st != None and ed != None:
            start_pos.append(st)
            end_pos.append(ed)
    if len(start_pos) == 0:
        # # no answer set to the [CLS] token
        start_pos.append(0)
        end_pos.append(0)        
        # no answer set to invalid
        # start_pos.append(-1)
        # end_pos.append(-1)
    encoding.update({'start_positions': start_pos, 'end_positions': end_pos})
    return {"encodings": encoding}

try: 
    ds['test']['encodings']
except:
    ds = ds.map(lambda x: tokenize_row(x, tokenizer))
    
    

Map: 100%|██████████| 96221/96221 [04:07<00:00, 388.22 examples/s] 
Map: 100%|██████████| 16706/16706 [00:40<00:00, 414.56 examples/s]
Map: 100%|██████████| 1055/1055 [00:01<00:00, 559.13 examples/s]
Map: 100%|██████████| 1161/1161 [00:02<00:00, 559.82 examples/s]
Map: 100%|██████████| 2151/2151 [00:03<00:00, 555.78 examples/s]
Map: 100%|██████████| 1953/1953 [00:03<00:00, 563.91 examples/s]
Map: 100%|██████████| 1145/1145 [00:01<00:00, 590.28 examples/s]


In [9]:
def equ_len_pad(encs): 
    # I want to find the largest list of start positions, from this pa all of rest to meet this size
    longest_len = max([len(x['start_positions']) for x in encs])
    for x in encs: 
        x_len = len(x['start_positions'])
        x['start_positions'] = x['start_positions'] + ([-1 for x in range(longest_len - x_len)])
        x['end_positions'] = x['end_positions'] + ([-1 for x in range(longest_len - x_len)])
        x['input_ids'] =  x['input_ids'][0]
        x['attention_mask'] = x['attention_mask'][0]
    return encs

train = equ_len_pad(ds2['guesstrain']['encodings'])
val = equ_len_pad(ds2['guessdev']['encodings'])
test = equ_len_pad(ds2['guesstest']['encodings'])


In [8]:
train_2 = equ_len_pad(ds1['guesstrain']['encodings'])
val_2 = equ_len_pad(ds1['guessdev']['encodings'])
test_2 = equ_len_pad(ds1['guesstest']['encodings'])


In [227]:
guessTrain = DatasetDict({
    'train': ds2['guesstrain'],
    'val': ds2['guessdev'],
    'test': ds2['guesstest'],
})
guessTrain.save_to_disk('../res/data/QANTA-IncludeNA')

Saving the dataset (3/3 shards): 100%|██████████| 96221/96221 [00:03<00:00, 29042.09 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1055/1055 [00:00<00:00, 113858.34 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2151/2151 [00:00<00:00, 177398.35 examples/s]


# Custom Training Class

In [10]:
from typing import Optional, Union
from datasets import Dataset
import time
class BartTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # generate model's guess
        outputs = model(input_ids = inputs['input_ids'] , attention_mask = inputs['attention_mask'])
        # find the model's predictions 
        start_yhat = torch.argmax(outputs['start_logits'], dim= 1)
        end_yhat = torch.argmax(outputs['end_logits'], dim= 1)
        # check if this is a possible target
        # if it is set it as the target, else choose a random valid target 
        start_target = []
        end_target = []

        valid_start_targets = []
        valid_end_targets = []

        for idx, x in enumerate(inputs['start_positions']): 
            cur_start_targ = []
            cur_end_targ = []

            for idx2, y in enumerate(x): 
                if y != -1:
                    cur_start_targ.append(y)
                    cur_end_targ.append(inputs['end_positions'][idx][idx2])

                else: 
                    break 
                
                
            valid_start_targets.append(cur_start_targ if cur_start_targ else [-1])
            valid_end_targets.append(cur_end_targ if cur_end_targ else [-1])          
            
        for x in range(len(inputs['input_ids'])):
            if start_yhat[x] in valid_start_targets[x]:
                start_target.append(start_yhat[x])                
                end_target.append(end_yhat[x])
            else: 
                ran_int = np.random.randint(len(valid_start_targets[x]))
                s_rand = valid_start_targets[x][ran_int]
                e_rand = valid_end_targets[x][ran_int]
     
                start_target.append(s_rand)
                end_target.append(e_rand)

        
        device = outputs['start_logits'].device
        start_target = torch.tensor(start_target, dtype=torch.long, device=device)
        end_target = torch.tensor(end_target, dtype=torch.long, device=device)

        # Compute loss
        loss_fct = torch.nn.CrossEntropyLoss()
        start_loss = loss_fct(outputs['start_logits'], start_target)
        end_loss = loss_fct(outputs['end_logits'], end_target)
        
        total_loss = (start_loss + end_loss) / 2
        return (total_loss, outputs) if return_outputs else total_loss
    
    
def model_init():
    return (DistilBertForQuestionAnswering.from_pretrained(checkpoint))


In [5]:
# Formatting the results and logs for analysis
def reformat(all_tests, names):
    formated = []
    final = []
    # join a, a+1 rows excluding last
    for idx, cur in enumerate(all_tests): 
        try: 
            cur['test']
        except:
            cur.insert(0, "test", names[idx])
        test_output = test_output = cur.iloc[[-1]].dropna(axis='columns')
        final.append(test_output)
        test_train_log = pd.DataFrame(cur.iloc[lambda x: x.index % 2 == 0])
        test_train_log = test_train_log.drop(test_train_log.index[-1]).dropna(axis='columns')
        test_eval_log = pd.DataFrame(cur.iloc[lambda x: x.index % 2 != 0]).dropna(axis='columns')
        formated.append(test_train_log.merge(test_eval_log, on = ['test', 'epoch', 'step']))
    final = pd.concat(final)
    return final, formated

# Fine Tuning

In [6]:
from sklearn.model_selection import train_test_split


DISTILBERT_DROPOUT = 0.4
DISTILBERT_ATT_DROPOUT = 0.4

mini_train = train[:7500]
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
earlyStop = EarlyStoppingCallback(early_stopping_patience= 5, early_stopping_threshold=.1)
    
def model_init():
    return (DistilBertForQuestionAnswering.from_pretrained(checkpoint, dropout=DISTILBERT_DROPOUT, attention_dropout=DISTILBERT_ATT_DROPOUT))

# If regression or non-stratified case
X_train_subset, _ = train_test_split(train,test_size=0.8)

In [None]:
def optuna_hp_space(trial):
    return {
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [2, 4, 8, 16]),
        "gradient_accumulation_steps": trial.suggest_categorical("gradient_accumulation_steps", [1, 2, 4, 8]),
        "per_device_eval_batch_size": trial.suggest_categorical("per_device_eval_batch_size", [2, 4, 8, 16]),
        "eval_accumulation_steps": trial.suggest_categorical("eval_accumulation_steps", [1, 2, 4, 8]),
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-2, log=True),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.1, 0.6, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 0.01, 0.1, log=True), 
        "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "cosine_with_restarts", "inverse_sqrt"]),
}

def model_init(trial):
    if trial is not None:
        DISTILBERT_DROPOUT = trial.suggest_float("dropout", 0.1, 0.6, log=True)
        DISTILBERT_ATT_DROPOUT = trial.suggest_float("dropout", 0.1, 0.6, log=True)
    else:
        DISTILBERT_DROPOUT = 0
        DISTILBERT_ATT_DROPOUT = 0
        
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    else:
            mps_device = torch.device("mps")

    model = DistilBertForQuestionAnswering.from_pretrained(checkpoint, dropout=DISTILBERT_DROPOUT, attention_dropout=DISTILBERT_ATT_DROPOUT)
    model.to(mps_device)

    return model

training_args = TrainingArguments(
    logging_steps=128,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=128,
    save_steps=512,
    push_to_hub=False,
)

trainer = BartTrainer(
    model_init=model_init,
    model=None,
    args=training_args,
    train_dataset=X_train_subset,
    eval_dataset=val,
    data_collator=DefaultDataCollator(),
    processing_class=tokenizer,
)

storage = RDBStorage(url="sqlite:///QBAM_study.db")
    
    
# best_trial = trainer.hyperparameter_search(
#     backend="optuna",
#     hp_space=optuna_hp_space,
#     n_trials=0,
#     load_if_exists=True,
#     study_name="QBAM_study",
#     storage=storage,
# )

In [None]:
def model_init():

    DISTILBERT_DROPOUT = 0.4
    DISTILBERT_ATT_DROPOUT = 0.4
    
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    else:
            mps_device = torch.device("mps")

    model = DistilBertForQuestionAnswering.from_pretrained(checkpoint, dropout=DISTILBERT_DROPOUT, attention_dropout=DISTILBERT_ATT_DROPOUT)
    model.to(mps_device)

    return model

training_args = TrainingArguments(
    # General Training Settings
    output_dir="../res/models/optuna_QBAM",
    num_train_epochs=2,
    logging_steps=128,
    
    # Batch Size & Accumulation
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    eval_accumulation_steps=4,
    
    # Learning Rate and Scheduler
    learning_rate= 0.0001295,
    warmup_ratio=0.2,
    lr_scheduler_type="inverse_sqrt",
    
    # Weight Decay & Regularization
    weight_decay=0.012,
    
    # Checkpoints & Saving
    eval_strategy="steps",
    save_strategy="best",
    eval_steps=128,
    save_steps=512,
    load_best_model_at_end=True,
    
    # Metrics & Evaluation
    include_for_metrics=['loss'],
    metric_for_best_model="loss",
    
    # Hub
    push_to_hub=False,
)

# Trainer Initialization
trainer = BartTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    data_collator=DefaultDataCollator(),
    processing_class=tokenizer,
)

# Training and Model Saving
trainer.train(resume_from_checkpoint=True)
trainer.save_model("../res/models/optuna_QBAM")

In [11]:
def model_init():

    DISTILBERT_DROPOUT = 0.4
    DISTILBERT_ATT_DROPOUT = 0.4
    
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    else:
            mps_device = torch.device("mps")

    model = DistilBertForQuestionAnswering.from_pretrained(checkpoint, dropout=DISTILBERT_DROPOUT, attention_dropout=DISTILBERT_ATT_DROPOUT)
    model.to(mps_device)

    return model

training_args = TrainingArguments(
    # General Training Settings
    output_dir="../res/models/optuna_IgnoreIMP",
    num_train_epochs=2,
    logging_steps=128,
    
    # Batch Size & Accumulation
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    eval_accumulation_steps=4,
    
    # Learning Rate and Scheduler
    learning_rate= 0.0001295,
    warmup_ratio=0.2,
    lr_scheduler_type="inverse_sqrt",
    
    # Weight Decay & Regularization
    weight_decay=0.012,
    
    # Checkpoints & Saving
    eval_strategy="steps",
    save_strategy="best",
    eval_steps=128,
    save_steps=512,
    load_best_model_at_end=True,
    
    # Metrics & Evaluation
    include_for_metrics=['loss'],
    metric_for_best_model="loss",
    
    # Hub
    push_to_hub=False,
)

# Trainer Initialization
trainer = BartTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_2,
    eval_dataset=val_2,
    data_collator=DefaultDataCollator(),
    processing_class=tokenizer,
)

# Training and Model Saving
trainer.train(resume_from_checkpoint=True)
trainer.save_model("../res/models/optuna_IgnoreIMP")

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss
1664,0.6929,0.749141
1792,0.6766,0.762972
1920,0.6717,0.716213
2048,0.679,0.706112
2176,0.6319,0.651514
2304,0.6115,0.692766
2432,0.6273,0.649843
2560,0.6351,0.646807
2688,0.5725,0.649976
2816,0.6157,0.635251


In [9]:
def model_init():

    DISTILBERT_DROPOUT = 0.4
    DISTILBERT_ATT_DROPOUT = 0.4
    
    if not torch.backends.mps.is_available():
        if not torch.backends.mps.is_built():
            print("MPS not available because the current PyTorch install was not "
                "built with MPS enabled.")
        else:
            print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
    else:
            mps_device = torch.device("mps")

    model = DistilBertForQuestionAnswering.from_pretrained(checkpoint, dropout=DISTILBERT_DROPOUT, attention_dropout=DISTILBERT_ATT_DROPOUT)
    model.to(mps_device)

    return model

training_args = TrainingArguments(
    # General Training Settings
    output_dir="../res/models/optuna_IncludeNA",
    num_train_epochs=2,
    logging_steps=128,
    
    # Batch Size & Accumulation
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    eval_accumulation_steps=4,
    
    # Learning Rate and Scheduler
    learning_rate= 0.0001295,
    warmup_ratio=0.2,
    lr_scheduler_type="inverse_sqrt",
    
    # Weight Decay & Regularization
    weight_decay=0.012,
    
    # Checkpoints & Saving
    eval_strategy="steps",
    save_strategy="best",
    eval_steps=128,
    save_steps=512,
    load_best_model_at_end=True,
    
    # Metrics & Evaluation
    include_for_metrics=['loss'],
    metric_for_best_model="loss",
    
    # Hub
    push_to_hub=False,
)

# Trainer Initialization
trainer = BartTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    data_collator=DefaultDataCollator(),
    processing_class=tokenizer,
)

# Training and Model Saving
trainer.train(resume_from_checkpoint=True)
trainer.save_model("../res/models/optuna_IncludeNA")

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss
2048,0.9311,0.96134
2176,0.8986,1.032816
2304,0.8818,0.924747
2432,0.8901,0.908818
2560,0.885,0.952159
2688,0.8395,0.96115
2816,0.8745,0.976212
2944,0.7844,0.95621


In [250]:

def extract_final_answer(text): 
    words = {}
    final = []
    for word in text.split(" "):
        if word not in words:
            words[word] = word
            final.append(word)
        else: 
            if final[-1] == "The" or final[-1] == "A":
                final.pop(-1)
            break
    return "_".join(final)



def valid_spans(st_pos, ed_pos, k):
        top_k_idx_start = np.argpartition(st_pos, range(-k, 0, 1), None)[-k:]
        top_k_idx_end = np.argpartition(ed_pos, range(-k, 0, 1), None)[-k:]
        zeroes = None
        if 0 in top_k_idx_start or 0 in top_k_idx_end:
            top_k_idx_start = np.delete(top_k_idx_start, np.where(top_k_idx_start == 0))
            top_k_idx_end = np.delete(top_k_idx_end, np.where(top_k_idx_end == 0))
            zeroes = [(0,0)]
            
        try:
            pair_matrix = list(itertools.product(top_k_idx_start, top_k_idx_end)) + zeroes
        except:
             pair_matrix = list(itertools.product(top_k_idx_start, top_k_idx_end))
             
        for x in pair_matrix: 
            st, ed = x
            if st > ed: 
                pair_matrix.remove(x)
        score_matrix = np.full(len(pair_matrix), np.NINF)

        for i, pair in enumerate(pair_matrix):
            start, end = pair
            score_matrix[i] = st_pos[0,start] + ed_pos[0,end]
        
        lst = ([pair_matrix[x] for x in np.argpartition(score_matrix, range(-k, 0, 1), None)[-k:]])
        lst.reverse()
        
        return lst
    


$$ 
\text{recall} = \frac{\text{\# of matches}}{\text{\# of terms in ground truth}}
$$
$$
\text{precision} = \frac{\text{\# of matches}}{\text{\# of terms in pred}}
$$

In [265]:
def eval(model, k, tokenizer, data):
    exact_match_with_ans = 0
    exact_match_no_ans = 0
    num_imp = 0
    num_no_ans = 0
    num_with_ans = 0
    F1 = []
    time = []
    top_k_acc = []
    precsion = []
    recall = []

    for x in data:
        test_question = x
        s_time = perf_counter_ns()
        question, text = (
            test_question["full_question"],
            test_question["context"]["contents"],
        )
        try:
            inputs = tokenizer(
                text=text,
                text_pair=question,
                padding="max_length",
                truncation="only_first",
                max_length=512,
                return_tensors="pt",
                padding_side="right",
            )
        except:
            cleaned = utils.clean_text(question)
            inputs = tokenizer(
                text=text,
                text_pair=cleaned,
                padding="max_length",
                truncation="only_first",
                max_length=512,
                return_tensors="pt",
                padding_side="right",
                return_length=True,
            )
        outputs = model(**inputs)
        e_time = perf_counter_ns()
        time.append(e_time - s_time)

        top_k = valid_spans(
            outputs.start_logits.detach(), outputs.end_logits.detach(), k
        )
        answer_start_index, answer_end_index = top_k[0]
        start_indexes = x["encodings"]["start_positions"]
        end_indexes = x["encodings"]["end_positions"]


        # exact match
        if start_indexes[0] == 0:
            if (answer_start_index in start_indexes) and (end_indexes[start_indexes.index(answer_start_index)] == answer_end_index):
                exact_match_no_ans += 1
        elif (answer_start_index in start_indexes) and (end_indexes[start_indexes.index(answer_start_index)] == answer_end_index):
                exact_match_with_ans += 1
            
        if start_indexes[0] == -1:
            num_imp += 1  
        elif start_indexes[0] == 0:
            num_no_ans += 1
        else: 
            num_with_ans += 1


        pred_span = set(range(answer_start_index, (answer_end_index + 1)))

        matches = 0
        for i, curr_ans in enumerate(start_indexes):
            if curr_ans == -1:
                continue

            if answer_start_index > answer_end_index:
                cur_matches = 0
            else:
                cur_span = set(range(curr_ans, (end_indexes[i] + 1)))
                cur_matches = len(pred_span & cur_span)

            if cur_matches > matches:
                matches = cur_matches

        prec = matches / (answer_end_index - answer_start_index + 1)
        if np.isnan(prec):
            print(matches, answer_end_index, answer_start_index)
        rec = matches / (end_indexes[0] - start_indexes[0] + 1)

        F1.append((2 * prec * rec) / (prec + rec)) if (prec + rec) != 0 else 0
        precsion.append(prec)
        recall.append(rec)

        acc = 0
        for start, end in top_k:
            if (start in start_indexes) and (end in end_indexes):
                acc = 1
        
        if start_indexes[0] != -1:
            top_k_acc.append(acc)

    if num_no_ans == 0:
        no_ans_acc = 0
    else:
        no_ans_acc = exact_match_no_ans / num_no_ans

    scores = {
        "exact_match_no_ans": no_ans_acc,
        "exact_match_with_ans": exact_match_with_ans / num_with_ans,
        "num_no_ans": num_no_ans,
        "num_with_ans": num_with_ans,
        "num_imp": num_imp,
        "F1": np.mean(F1),
        "time(ns)": np.mean(time),
        f"top_{k}_acc": np.mean(top_k_acc),
        "recall": np.mean(recall),
        "precision": np.mean(precsion),
    }

    return scores


In [272]:
optuna_IgnoreIMP = DistilBertForQuestionAnswering.from_pretrained("../res/models/optuna_IgnoreIMP")
optuna_IgnoreIMP_tokenizer = DistilBertTokenizerFast.from_pretrained("../res/models/optuna_IgnoreIMP")

eval(optuna_IgnoreIMP, 5, optuna_IgnoreIMP_tokenizer, ds1['guesstest'])

{'exact_match_no_ans': 0,
 'exact_match_with_ans': 0.6703601108033241,
 'num_no_ans': 0,
 'num_with_ans': 1444,
 'num_imp': 707,
 'F1': 0.90692353,
 'time(ns)': 70900623.83170618,
 'top_5_acc': 0.7901662049861495,
 'recall': 0.5611749426254308,
 'precision': 0.5089982}

This model was trained on a dataset that excludes any questions where the answer does not appear within the pre-generated context. As a result, the model only needs to learn to identify the most likely answer tokens. It achieved an accuracy of around 67%. However, this figure does not reflect real-world performance, as it assumes the context always contains the correct answer. In practice, this is not guaranteed. For instance, in the test set, 33% of the contexts did not include the correct answer, though the overall average context hit rate is about 83%. The model’s top-5 accuracy was around 80%, meaning that in most cases, at least one of the top five predicted spans was an exact match with the correct answer.

In [267]:
optuna_IncludeNA = DistilBertForQuestionAnswering.from_pretrained("../res/models/optuna_IncludeNA")
optuna_IncludeNA_tokenizer = DistilBertTokenizerFast.from_pretrained("../res/models/optuna_IncludeNA")
eval(optuna_IncludeNA, 5, optuna_IncludeNA_tokenizer, ds2['guesstest'])

{'exact_match_no_ans': 0.9533239038189534,
 'exact_match_with_ans': 0.036011080332409975,
 'num_no_ans': 707,
 'num_with_ans': 1444,
 'num_imp': 0,
 'F1': 0.9438061129263381,
 'time(ns)': 62142272.69688517,
 'top_5_acc': 0.7754532775453278,
 'recall': 0.35977064931039826,
 'precision': 0.3400126411593621}

This model was trained on a dataset that, instead of ignoring unanswerable questions, required the model to either find the most likely answer span or determine that the context did not contain the answer. This increased the task complexity, making the decision process more difficult. As a result, the model struggled to perform both tasks effectively.

It achieved 95% accuracy in identifying contexts with no answer, which is a strong result. However, its accuracy for correctly answering questions with an answer present was only 3%. This suggests that the model is either defaulting to predicting “no answer” for most questions or is correctly identifying unanswerable cases but failing to extract correct answer spans when the answer is present.

Both precision and recall were low, indicating poor overlap between the predicted spans and the true answer spans. This generally reflects poor span prediction performance. While the model’s top-5 accuracy was comparable to that of the first model, it’s important to note that this includes unanswerable questions, which make up about a third of the dataset. When considering only answerable questions, the effective top-5 accuracy drops to around 50%. Even so, identifying the unanswerable questions is an improtant task, and if the model can effectivly make this distinction It can be used as a intermediary model that identifies if the context needs to be regenerated, improving overall performance. 

In [268]:
optuna_QBAM = DistilBertForQuestionAnswering.from_pretrained("../res/models/optuna_QBAM")
optuna_QBAM_tokenizer = DistilBertTokenizerFast.from_pretrained("../res/models/optuna_QBAM")
eval(optuna_QBAM, 5, optuna_QBAM_tokenizer, ds3['test'])

{'exact_match_no_ans': 0,
 'exact_match_with_ans': 0.27303523035230354,
 'num_no_ans': 0,
 'num_with_ans': 1476,
 'num_imp': 675,
 'F1': 0.69548494,
 'time(ns)': 63985076.74012087,
 'top_5_acc': 0.7947154471544715,
 'recall': 0.5808254558603234,
 'precision': 0.35171145}

This model was trained using the same techniques and methodology as the first model, which ignored unanswerable questions. However, the dataset used here had a duplication issue: the title and the first term of the body text were often identical, and placed directly adjacent to each other. This caused confusion for the model, making it difficult to determine the correct start and end positions for answer spans. As a result, it produced redundant predictions such as “Texas, The state Texas”.

This duplication affected precision more than recall. Since the ground truth and predicted spans often included similar tokens, the recall remained relatively stable. However, precision suffered because the predictions contained excessive tokens resulting in the precsion essentially halved, given how precision is a function of matches over prediciton span.

In [269]:
squad_QBAM = DistilBertForQuestionAnswering.from_pretrained(checkpoint)
squad_QBAM_tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)
eval(squad_QBAM, 5, squad_QBAM_tokenizer, ds3['test'])

{'exact_match_no_ans': 0,
 'exact_match_with_ans': 0.0,
 'num_no_ans': 0,
 'num_with_ans': 1476,
 'num_imp': 675,
 'F1': 0.29166666,
 'time(ns)': 62484182.24918643,
 'top_5_acc': 0.09688346883468835,
 'recall': 0.0005578800557880056,
 'precision': 0.0005313143418146698}

Finally, when evaluating a general-purpose pretrained model not designed for pyramid-style questions, the performance was very poor—it almost never predicted any of the correct answer spans. This highlights the importance of training models specifically for the task at hand. The improvement seen with the specialized model suggests that further specialization could yield even better results. However, given the nature of the Quiz Bowl format, where questions gradually reveal information and vary significantly in structure, developing highly specialized models may not be practical or scalable.

In [193]:
import spacy 
nlp = spacy.load("en_core_web_lg")

def rand_example(k:int, data, model, tokenizer):
    examples = [data[int(x)] for x in (np.random.default_rng().integers(low=0, high=(len(data) - 1), size=k))]
    for quest in examples:
        question, text = quest['full_question'], quest['context']['contents']
        try:
            inputs = tokenizer(
                text = text, 
                text_pair=question, 
                padding = 'max_length', 
                truncation = 'only_first', 
                max_length = 512, 
                return_tensors = 'pt', 
                padding_side = 'right'
                )
        except:
            cleaned = utils.clean_text(question)
            inputs =  tokenizer(
                text = text,
                text_pair = cleaned, 
                padding = 'max_length', 
                truncation = 'only_first', 
                max_length = 512, 
                return_tensors = 'pt', 
                padding_side = 'right',
                return_length = True
                )
        outputs = model(**inputs)
        answer_start_index = None
        answer_end_index = None
        
        top_k = valid_spans(outputs.start_logits.detach(), outputs.end_logits.detach(), k)
        answer_start_index, answer_end_index= top_k[0]
        
        if answer_start_index == None : 
            answer_start_index = 0
            answer_end_index = 0                    
        decoded_str = tokenizer.decode(inputs['input_ids'][0,answer_start_index:answer_end_index+ 1])


        # pred = extract_final_answer(decoded_str)
  
        print("Question: " + quest["first_sentence"])
        print("Answer: " + quest['answer'])
        print("Final Prediction: "+ decoded_str)
        print(f"Score: {outputs.start_logits[0,answer_start_index]+ outputs.end_logits[0,answer_end_index]}\n")

In [280]:
optuna_IgnoreIMP = DistilBertForQuestionAnswering.from_pretrained("../res/models/optuna_IgnoreIMP")
optuna_IgnoreIMP_tokenizer = DistilBertTokenizerFast.from_pretrained("../res/models/optuna_IgnoreIMP")

rand_example(5, ds1['guesstrain'], optuna_IgnoreIMP, optuna_IgnoreIMP_tokenizer)

Question: When this compound is added to proteins with aromatic rings, they turn yellow; that test makes use of the xanthoproteic reaction.
Answer: Nitric_acid
Final Prediction: Nitric acid
Score: 20.90878677368164

Question: This novel gave rise to a namesake genre of books, one of which by Philip Cozans is named after a figure from this novel who has an aunt named Ophelia from Vermont and who teaches a girl named Topsy about God.
Answer: Uncle_Tom's_Cabin
Final Prediction: Uncle Tom ' s Cabin Cabin
Score: 23.21484375

Question: Pulsed amperometry is combined with this technique in analysis of sugars.
Answer: High-performance_liquid_chromatography
Final Prediction: Liquid chromatography – mass spectrometry
Score: 12.616650581359863

Question: In Chilean mythology, throwing the Trauco's staff in a fire will cause it to produce this substance which can be used to heal people who were hurt by the Trauco's curses.  
Answer: Oil
Final Prediction: silken mailcoat ” BULLET : : : : - Babr - e