The code for pre-processing was obtained from HuggingFace: https://huggingface.co/docs/transformers/tasks/question_answering
<br><br>
The code for post-processing was also obtained from HuggingFace, via the notebook linked from the page above: https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/videos/qa_postprocessing_pt.ipynb
<br><br>
The code for hyperparameter optimisation was obtained from Weights & Biases: 
https://wandb.ai/matt24/vit-snacks-sweeps/reports/Hyperparameter-Search-for-HuggingFace-Transformer-Models--VmlldzoyMTUxNTg0#exploring-hyperparameter-combinations-with-sweeps

## Setting the seeds

In [None]:
import transformers
import torch

# Single seed shared by the whole notebook; it is reused further down for the
# train/test split and for the Trainer arguments.
seed = 362935
torch.manual_seed(seed) # for torch
transformers.set_seed(seed) # for transformers

## Define base and target model path/name

In [None]:
# base_model: pretrained checkpoint that the tokenizer and the model are loaded from.
# target_model: name given to the sweep and to the folder the fine-tuned model is saved in.
base_model = "bert-base-uncased"
target_model = "BERT SQuAD V1"

## WandB setup

In [None]:
import wandb

In [None]:
# Log in to Weights & Biases; replace the bracketed placeholder with a real API key.
# NOTE(review): a key typed here is stored in the notebook file — avoid committing it.
! wandb login [API KEY GOES HERE]

In [None]:
# Search space explored by the sweep. `epochs`, `weight_decay` and `dropout`
# are picked from fixed value lists; `learning_rate` is sampled from a
# log-uniform distribution between the given bounds.
parameters_dict = {
    'epochs': {'values': [1, 2, 3, 4, 5]},
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 2e-5,
    },
    'weight_decay': {'values': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]},
    'dropout': {'values': [0.1, 0.15, 0.20, 0.25, 0.3]},
}

# Bayesian search, named after the target model, that minimises the
# evaluation loss reported by the Trainer.
sweep_config = {
    'method': 'bayes',
    'name': target_model,
    'metric': {'name': 'eval/loss', 'goal': 'minimize'},
    'parameters': parameters_dict,
}

# Register the sweep under the 'fyp' project; the returned id is what the
# agent and the API lookup use later on.
sweep_id = wandb.sweep(sweep_config, project='fyp')

## Loading the training dataset

In [None]:
from datasets import load_dataset, load_metric, Dataset

In [None]:
# Load the training split of the `squad` dataset. The commented line is an
# alternative that reads the same data from a local Arrow file.
# squad = Dataset.from_file("train.arrow")
squad = load_dataset("squad", split="train")

## Pre processing the dataset

In [None]:
# Hold out 10% of the training split (shuffled with the notebook seed); the
# resulting "test" part is used as the evaluation set during the sweep.
squad = squad.train_test_split(test_size=0.1, shuffle=True, seed=seed)

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the base checkpoint; used by preprocess_function below.
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of SQuAD examples and attach answer-span token labels.

    Questions are stripped and paired with their contexts, then truncated
    (context only) and padded to 384 tokens. For every example the first
    listed answer is converted from character offsets into start/end token
    indices. Answers that are not completely contained in the (possibly
    truncated) context are labelled ``(0, 0)``.

    Args:
        examples: Batch dict with ``"question"``, ``"context"`` and
            ``"answers"`` columns. Each answers entry must provide
            ``"answer_start"`` and ``"text"`` lists; only index 0 is used.

    Returns:
        The tokenizer output without ``offset_mapping`` and with
        ``start_positions`` / ``end_positions`` lists added.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context (tokens with sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends are checked: an answer cut off by truncation would
        # otherwise be labelled with a special-token index just outside
        # the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions:
            # last token starting at or before the answer start ...
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # ... and first token ending at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize both splits in batches and drop the raw text columns so only the
# model inputs and the span labels remain.
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

## Fine tuning

In [None]:
from transformers import DefaultDataCollator

# Collator handed to the Trainer; the features are already padded to a fixed
# length by preprocess_function.
data_collator = DefaultDataCollator()

In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForQuestionAnswering, AutoConfig

def train(config=None):
    """Run one sweep trial: fine-tune the base model with sampled hyperparameters.

    Opens a W&B run, reads the hyperparameters chosen for this trial from
    ``wandb.config`` (``epochs``, ``learning_rate``, ``weight_decay``,
    ``dropout``), fine-tunes ``base_model`` on ``tokenized_squad["train"]``
    and evaluates on ``tokenized_squad["test"]`` once per epoch. Checkpoints
    are written to ``checkpoints-run-<run id>``.

    Args:
        config: Optional default configuration forwarded to ``wandb.init``.
    """
    with wandb.init(config=config):
        # From here on `config` holds the values selected for this run.
        config = wandb.config

        training_args = TrainingArguments(
            output_dir=f'checkpoints-run-{wandb.run.id}',
            report_to='wandb',
            num_train_epochs=config.epochs,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            save_strategy='epoch',
            evaluation_strategy='epoch',
            logging_strategy='epoch',
            run_name=f"run-{wandb.run.id}",
            seed=seed
        )

        # Apply the swept dropout rate to both the hidden and the attention
        # dropout of the model.
        model_config = AutoConfig.from_pretrained(base_model)
        model_config.hidden_dropout_prob = config.dropout
        model_config.attention_probs_dropout_prob = config.dropout
        # Pass the *model* configuration here; passing the W&B config object
        # would leave the dropout settings above unused.
        model = AutoModelForQuestionAnswering.from_pretrained(base_model, config=model_config)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_squad["train"],
            eval_dataset=tokenized_squad["test"],
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        trainer.train()
        # Release cached GPU memory before the next trial starts.
        torch.cuda.empty_cache()

In [None]:
# Run 10 trials of the sweep, each one calling train() with freshly sampled hyperparameters.
wandb.agent(sweep_id, train, count=10)

In [None]:
# Close any W&B run that is still open once the sweep is done.
wandb.finish()

## Save best model

In [None]:
import os

def get_last_checkpoint(folder_path):
    """Return the path of the highest-numbered ``checkpoint-<step>`` entry.

    Entries of ``folder_path`` whose name starts with ``checkpoint-`` are
    ranked by the integer that follows the first dash. Entries whose suffix
    is not an integer (for example ``checkpoint-best``) are ignored instead
    of aborting the lookup.

    Args:
        folder_path: Directory that contains the checkpoint entries.

    Returns:
        ``os.path.join(folder_path, <entry>)`` for the entry with the largest
        step, or ``None`` when there is no usable checkpoint entry.
    """
    best_step = -1
    best_name = None
    for name in os.listdir(folder_path):
        if not name.startswith("checkpoint-"):
            continue
        try:
            step = int(name.split("-")[1])
        except ValueError:
            # Not a numbered checkpoint; skip it.
            continue
        # ">=" keeps the later entry on ties, like taking the last item of a
        # stable sort.
        if step >= best_step:
            best_step, best_name = step, name

    if best_name is None:
        return None
    return os.path.join(folder_path, best_name)

In [None]:
# Look the finished sweep up through the W&B public API and take the run it
# reports as best, together with that run's hyperparameters.
api = wandb.Api()
sweep_path = f"andreaborg/fyp/sweeps/{sweep_id}"
sweep = api.sweep(sweep_path)
best_run = sweep.best_run()
best_params = best_run.config

# Reload the last checkpoint written by that run and export the model and
# the tokenizer side by side into the target folder.
best_run_dir = f"checkpoints-run-{best_run.id}"
best_checkpoint_path = get_last_checkpoint(best_run_dir)
best_checkpoint = AutoModelForQuestionAnswering.from_pretrained(best_checkpoint_path)
best_checkpoint.save_pretrained(target_model)
tokenizer.save_pretrained(target_model)

In [None]:
import json
import os
import shutil

# Store the winning hyperparameters next to the exported model. The file is
# written straight into the target folder, so re-running this cell simply
# overwrites it (a write-then-move fails once the destination file exists).
os.makedirs(target_model, exist_ok=True)
best_params_path = os.path.join(target_model, f"Best Hyperparameters for {target_model}.json")

with open(best_params_path, 'w') as json_file:
    json.dump(best_params, json_file, indent=4)

## Post-processing

In [None]:
# Load the validation split of the `squad` dataset. The commented line is an
# alternative that reads it from a local Arrow file.
# valid = Dataset.from_file("valid.arrow")
valid = load_dataset("squad", split="validation")

In [None]:
def find_labels(offsets, answer_start, answer_end, sequence_ids):
    """Map an answer's character span to start/end token indices of a feature.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs of the feature.
        answer_start: Character index where the answer starts in the context.
        answer_end: Character index just past the end of the answer.
        sequence_ids: Per-token sequence ids; context tokens are marked ``1``.
            At least one context token is expected.

    Returns:
        ``(start_position, end_position)`` token indices, or ``(0, 0)`` when
        the answer is not completely covered by this feature's context.
    """
    # Locate the first and last context token.
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    # Bounded so a sequence that ends on a context token does not overrun.
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie fully inside the context window. Checking both ends
    # prevents a partially covered answer from being mapped onto a
    # special-token index just outside the context.
    if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
        return (0, 0)

    # Last context token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= answer_start:
        idx += 1
    start_position = idx - 1

    # First context token that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= answer_end:
        idx -= 1
    end_position = idx + 1

    return start_position, end_position

In [None]:
def preprocess_validation_examples(examples):
    """Tokenize validation examples into overlapping features for inference.

    Long contexts are split into several features (``stride=128``,
    ``return_overflowing_tokens=True``), so one example can produce more than
    one feature. For every feature this records the id of the example it
    came from, blanks the offsets of non-context tokens with ``None`` and
    computes start/end token labels with ``find_labels``.

    Args:
        examples: Batch dict with ``"id"``, ``"question"``, ``"context"``,
            ``"answer_start"`` and ``"answer_end"`` columns.

    Returns:
        The tokenizer output extended with ``start_positions``,
        ``end_positions`` and ``example_id``, and with a masked
        ``offset_mapping``.
    """
    # Strip the questions, as the training preprocessing does, and make sure
    # the stripped text is what actually gets tokenized.
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    offset_mapping = inputs["offset_mapping"]
    # Index of the source example for each produced feature.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    inputs["start_positions"] = []
    inputs["end_positions"] = []
    inputs["example_id"] = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        inputs["example_id"].append(examples["id"][sample_idx])
        sequence_ids = inputs.sequence_ids(i)
        # Keep offsets only for context tokens so post-processing can reject
        # spans that touch the question or special tokens.
        offset_mapping[i] = [(o if s == 1 else None) for o, s in zip(offset, sequence_ids)]
        # Labels are computed from the original, unmasked offsets.
        start, end = find_labels(
            offset, examples["answer_start"][sample_idx], examples["answer_end"][sample_idx], sequence_ids
        )

        inputs["start_positions"].append(start)
        inputs["end_positions"].append(end)

    return inputs

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Reload the tokenizer that was saved alongside the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained(target_model)

# The "title" column is dropped before the validation features are built.
valid = valid.remove_columns(["title"])

def prepare_data(example):
    """Add flat ``answer_start`` / ``answer_end`` fields for the first answer.

    ``answer_end`` is the start offset plus the length of the first answer
    text. The example is updated in place and returned.
    """
    gold = example["answers"]
    answer_length = len(gold["text"][0])
    first_start = gold["answer_start"][0]
    example["answer_start"] = first_start
    example["answer_end"] = first_start + answer_length
    return example

# Flatten the first gold answer of every example into answer_start/answer_end
# columns and drop the nested "answers" column.
validation_set = valid.map(prepare_data, remove_columns=["answers"])

# Tokenize into (possibly several) features per example; only the columns
# produced by preprocess_validation_examples are kept.
validation_features = validation_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation_set.column_names,
)

# Number of examples vs. number of features produced from them.
len(validation_set), len(validation_features)

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForQuestionAnswering, default_data_collator

# Load the exported fine-tuned model and move it to the GPU when one is available.
model = AutoModelForQuestionAnswering.from_pretrained(target_model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Batches for inference. "example_id" and "offset_mapping" are removed here
# because they are only needed later, during post-processing.
dataloader = DataLoader(
    validation_features.remove_columns(["example_id", "offset_mapping"]),
    batch_size=64,
    collate_fn=default_data_collator
)

In [None]:
from tqdm.auto import tqdm

# Collect the start/end logits of every validation feature, batch by batch,
# with gradient tracking switched off.
start_chunks, end_chunks = [], []

with torch.no_grad():
    for batch in tqdm(dataloader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # Move the logits back to the CPU so they can be turned into numpy arrays.
        start_chunks.append(outputs.start_logits.cpu())
        end_chunks.append(outputs.end_logits.cpu())

# One row of logits per feature, in dataloader order.
start_logits = torch.cat(start_chunks, dim=0).numpy()
end_logits = torch.cat(end_chunks, dim=0).numpy()

In [None]:
import collections

# Map every example id to the indices of the features created from it.
# Only the "example_id" column is read, so the remaining columns of each
# feature do not have to be loaded just to build this index.
example_to_feature = collections.defaultdict(list)
for idx, example_id in enumerate(validation_features["example_id"]):
    example_to_feature[example_id].append(idx)

In [None]:
import numpy as np

# Worked example on the first feature: score candidate answer spans built
# from its 20 highest start and end logits.
start_logit, end_logit = start_logits[0], end_logits[0]
offsets = validation_features[0]["offset_mapping"]

context = validation_set[0]["context"]

# Token indices ordered from highest to lowest logit, top 20 only.
start_indexes = np.argsort(start_logit)[::-1][:20].tolist()
end_indexes = np.argsort(end_logit)[::-1][:20].tolist()

answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        span_score = start_logit[start_index] + end_logit[end_index]

        # Predicting (0, 0) means no answer.
        if start_index == 0 and end_index == 0:
            answers.append({"text": "", "logit_score": span_score})
            continue

        # Skip answers that are not fully in the context.
        if offsets[start_index] is None or offsets[end_index] is None:
            continue

        # Skip answers with a length that is either < 0 or > max_answer_length.
        span_length = end_index - start_index + 1
        if end_index < start_index or span_length > 30:
            continue

        char_from = offsets[start_index][0]
        char_to = offsets[end_index][1]
        answers.append({"text": context[char_from:char_to], "logit_score": span_score})

In [None]:
# Compare the best-scoring candidate of the worked example with the gold
# answer, which is recovered from its character offsets in the context.
predicted_answer = max(answers, key = lambda x: x["logit_score"])
print(f"Predicted answer: {predicted_answer}")

answer_start = validation_set[0]["answer_start"]
answer_end = validation_set[0]["answer_end"]
right_answer = context[answer_start: answer_end]
print(f"Theoretical answer: {right_answer}")

In [None]:
# Number of top start/end logits considered per feature, and the longest
# answer (in tokens) that is accepted.
n_best = 10
max_answer_length = 30

# For every example, gather candidate spans from all of its features and keep
# the text of the highest-scoring one.
predicted_answers = {}
for example in tqdm(validation_set):
    example_id = example["id"]
    context = example["context"]
    answers = []

    for feature_index in example_to_feature[example_id]:
        start_logit = start_logits[feature_index]
        end_logit = end_logits[feature_index]
        offsets = validation_features[feature_index]["offset_mapping"]

        start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
        end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
        for start_index in start_indexes:
            for end_index in end_indexes:
                # Predicting (0, 0) means no answer.
                if start_index == 0 and end_index == 0:
                    answers.append({"text": "", "logit_score": start_logit[start_index] + end_logit[end_index]})
                # Skip answers that are not fully in the context.
                elif offsets[start_index] is None or offsets[end_index] is None:
                    continue
                # Skip answers with a length that is either < 0 or > max_answer_length.
                elif end_index < start_index or end_index - start_index + 1 > max_answer_length:
                    continue
                else:
                    answers.append({
                        "text": context[offsets[start_index][0]: offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    })

    # Every candidate can be filtered out; fall back to an empty prediction
    # in that case rather than calling max() on an empty list.
    if answers:
        best_answer = max(answers, key=lambda x: x["logit_score"])
        predicted_answers[example_id] = best_answer["text"]
    else:
        predicted_answers[example_id] = ""

In [None]:
# Load the metric registered under the name "squad"; it is applied to the predictions below.
# NOTE(review): newer `datasets` releases are said to replace load_metric with
# the separate `evaluate` package — confirm against the installed version.
metric = load_metric("squad")

In [None]:
# Gold answers come from the untouched validation split; predictions are the
# best answer texts chosen above. Both are keyed by example id.
references = [
    {"id": gold["id"], "answers": gold["answers"]}
    for gold in valid
]
predictions = [
    {'id': qid, 'prediction_text': answer_text}
    for qid, answer_text in predicted_answers.items()
]
metrics = metric.compute(predictions=predictions, references=references)

In [None]:
# Show the computed scores as the cell output.
metrics

In [None]:
import json
import os
import shutil

# Store the evaluation scores next to the exported model. The file is written
# straight into the target folder, so re-running this cell simply overwrites
# it (a write-then-move fails once the destination file exists).
os.makedirs(target_model, exist_ok=True)
metrics_path = os.path.join(target_model, f"Metrics for {target_model}.json")

with open(metrics_path, 'w') as json_file:
    json.dump(metrics, json_file)