In [None]:
# Transformers installation
! pip install transformers datasets evaluate

In [17]:
# (model checkpoint, tokenizer checkpoint) pairs to evaluate, in run order.
_CHECKPOINT_PAIRS = (
    ('buddhilive/bert-finetuned-squad', 'bert-base-uncased'),
    ('buddhilive/albert-finetuned-squad', 'albert-base-v2'),
    ('buddhilive/distilbert-finetuned-squad', 'distilbert-base-uncased'),
    ('buddhilive/mobilebert-finetuned-squad', 'google/mobilebert-uncased'),
    ('buddhilive/roberta-finetuned-squad', 'roberta-base'),
    ('buddhilive/bert-nopretrained-squad', 'bert-base-uncased'),
)

# Each entry names the checkpoint to score and the tokenizer to load for it.
model_meta_data = [
    {"model": model_name, "tokenizer": tokenizer_name}
    for model_name, tokenizer_name in _CHECKPOINT_PAIRS
]

In [None]:
# Load the SQuAD evaluation data
from datasets import load_dataset

# Split expression: the validation split, sliced to its first 2000 rows.
SQUAD_SPLIT = "validation[:2000]"

squad = load_dataset("squad", split=SQUAD_SPLIT)

In [None]:
import evaluate
from transformers import (
    AutoTokenizer,
    DefaultDataCollator,
    TFAutoModelForQuestionAnswering,
)

# Data collator configured to return TensorFlow tensors; used when building
# the prediction dataset.
data_collator = DefaultDataCollator(return_tensors="tf")

# SQuAD metric object; its compute() is called with predictions and references.
metric = evaluate.load("squad")

In [5]:
# Load Tokenizers
def loadTokenizer(tokenizer_name):
    """Return the tokenizer that ``AutoTokenizer.from_pretrained`` resolves for ``tokenizer_name``.

    Args:
        tokenizer_name: Checkpoint identifier passed straight to ``from_pretrained``.
    """
    return AutoTokenizer.from_pretrained(tokenizer_name)

In [6]:
# Load Models for Inference
def loadModel(model_name):
    """Return the question-answering model loaded for ``model_name``.

    Args:
        model_name: Checkpoint identifier passed straight to
            ``TFAutoModelForQuestionAnswering.from_pretrained``.
    """
    return TFAutoModelForQuestionAnswering.from_pretrained(model_name)

In [7]:
# validation preprocess
def preprocess_validation_examples(examples, tokenizer, max_length=384, stride=0):
    """Tokenize a batch of question/context pairs into evaluation features.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and ``"id"``
            sequences of equal length.
        tokenizer: Callable tokenizer. Its result must support ``pop``, item
            access/assignment and ``sequence_ids(i)``.
        max_length: Length every feature is truncated/padded to. Defaults to
            384, the value that was previously hard-coded.
        stride: Forwarded to the tokenizer as ``stride``. The default of 0 is
            the same as not passing it, which is what the code did before.

    Returns:
        The tokenizer output with ``"overflow_to_sample_mapping"`` removed, an
        ``"example_id"`` list added (one id per feature), and every
        ``"offset_mapping"`` entry that does not belong to sequence 1 replaced
        by ``None``.
    """
    questions = [question.strip() for question in examples["question"]]
    features = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One example can yield several features; this maps each feature back to
    # the index of the example it was produced from.
    sample_map = features.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]
    example_ids = []

    for feature_idx in range(len(features["input_ids"])):
        example_ids.append(source_ids[sample_map[feature_idx]])

        # Keep offsets only for tokens of the second sequence (the context);
        # None marks every other position so it can be skipped later.
        sequence_ids = features.sequence_ids(feature_idx)
        features["offset_mapping"][feature_idx] = [
            span if sequence_ids[token_idx] == 1 else None
            for token_idx, span in enumerate(features["offset_mapping"][feature_idx])
        ]

    features["example_id"] = example_ids
    return features

In [8]:
def getValidationData(tokenizer):
    """Tokenize the notebook-level ``squad`` dataset with ``tokenizer``.

    Runs ``preprocess_validation_examples`` over ``squad`` in batches and drops
    all of the original columns, so only the columns produced by the
    preprocessing function remain.

    Args:
        tokenizer: Tokenizer handed to ``preprocess_validation_examples``.

    Returns:
        The mapped dataset returned by ``squad.map``.
    """
    # fn_kwargs forwards the tokenizer to the preprocessing function directly,
    # so no wrapper lambda is needed.
    return squad.map(
        preprocess_validation_examples,
        batched=True,
        remove_columns=squad.column_names,
        fn_kwargs={"tokenizer": tokenizer},
    )

In [9]:
def getTFDataset(model, validation_dataset):
    """Build the dataset that is fed to the model for prediction.

    Delegates to ``model.prepare_tf_dataset`` with shuffling turned off, a
    batch size of 16 and the notebook-level ``data_collator`` as collate
    function.

    Args:
        model: Object exposing ``prepare_tf_dataset``.
        validation_dataset: Tokenized features to wrap.

    Returns:
        Whatever ``model.prepare_tf_dataset`` returns.
    """
    return model.prepare_tf_dataset(
        validation_dataset,
        collate_fn=data_collator,
        batch_size=16,
        shuffle=False,
    )

In [10]:
# evaluate Function
from tqdm.auto import tqdm
import collections
import numpy as np

# How many of the highest-scoring start positions and end positions
# compute_metrics considers for each feature.
n_best = 20
# Longest span (end_index - start_index + 1) compute_metrics accepts as an answer.
max_answer_length = 30
# NOTE(review): compute_metrics assigns its own local list with this same name,
# so this module-level list is not filled by it — confirm whether it is still needed.
predicted_answers = []

def compute_metrics(start_logits, end_logits, features, examples):
    """Turn start/end logits into answer strings and score them with ``metric``.

    For each example, every feature derived from it contributes candidate
    spans built from its ``n_best`` highest start and end logits. The candidate
    with the highest summed logit wins; an example without any valid candidate
    gets an empty prediction.

    Args:
        start_logits: Per-feature start scores, indexable by feature position.
        end_logits: Per-feature end scores, indexable by feature position.
        features: Indexable/iterable features, each providing ``"example_id"``
            and ``"offset_mapping"`` (``None`` at positions that must not be
            used as answer boundaries).
        examples: Iterable of examples providing ``"id"``, ``"context"`` and
            ``"answers"``. It is iterated twice.

    Returns:
        The result of ``metric.compute`` for the predictions and references.
    """
    # Feature positions grouped by the example each feature was cut from.
    feature_rows_by_example = collections.defaultdict(list)
    for row, feature in enumerate(features):
        feature_rows_by_example[feature["example_id"]].append(row)

    predictions = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        for row in feature_rows_by_example[example_id]:
            start_scores = start_logits[row]
            end_scores = end_logits[row]
            offsets = features[row]["offset_mapping"]

            # Indices of the n_best largest logits, best first.
            top_starts = np.argsort(start_scores)[-1 : -n_best - 1 : -1].tolist()
            top_ends = np.argsort(end_scores)[-1 : -n_best - 1 : -1].tolist()

            for start in top_starts:
                # A start without an offset can never form a valid span.
                if offsets[start] is None:
                    continue
                for end in top_ends:
                    if offsets[end] is None:
                        continue
                    # Reject reversed spans and spans longer than allowed.
                    span_length = end - start + 1
                    if span_length < 1 or span_length > max_answer_length:
                        continue

                    candidates.append(
                        {
                            "text": context[offsets[start][0] : offsets[end][1]],
                            "logit_score": start_scores[start] + end_scores[end],
                        }
                    )

        # Highest combined logit wins; fall back to an empty answer.
        if candidates:
            best = max(candidates, key=lambda candidate: candidate["logit_score"])
            predicted_text = best["text"]
        else:
            predicted_text = ""
        predictions.append({"id": example_id, "prediction_text": predicted_text})

    references = [
        {"id": example["id"], "answers": example["answers"]} for example in examples
    ]
    return metric.compute(predictions=predictions, references=references)

In [11]:
#predict
def getPrediction(model, tf_validation_set):
    """Run ``model.predict`` on ``tf_validation_set`` and return its output unchanged.

    Args:
        model: Object exposing ``predict``.
        tf_validation_set: Dataset handed to ``predict``.
    """
    return model.predict(tf_validation_set)

In [12]:
def evaluateData(predictions, validation_dataset):
    """Score model output against the notebook-level ``squad`` examples.

    Args:
        predictions: Mapping providing ``"start_logits"`` and ``"end_logits"``.
        validation_dataset: Tokenized features the predictions were made on.

    Returns:
        The value returned by ``compute_metrics``.
    """
    start_logits = predictions["start_logits"]
    end_logits = predictions["end_logits"]
    return compute_metrics(start_logits, end_logits, validation_dataset, squad)

In [None]:
# Metric dicts collected per model; each one also gets a "model_name" entry below.
eval_results = []

for x in tqdm(model_meta_data):
  # Tokenizer checkpoint paired with this model in model_meta_data
  _base_tokenizer = loadTokenizer(x["tokenizer"])
  # Question-answering checkpoint under evaluation
  _base_model = loadModel(x["model"])
  # Tokenize the SQuAD subset with this model's tokenizer
  _base_val_data = getValidationData(_base_tokenizer)
  # Wrap the tokenized features in the dataset used for prediction
  _base_dataset = getTFDataset(_base_model, _base_val_data)
  # Run inference; evaluateData reads "start_logits" and "end_logits" from the result
  _base_prediction = getPrediction(_base_model, _base_dataset)
  # Score the predictions against the SQuAD examples
  _base_result = evaluateData(_base_prediction, _base_val_data)
  print(x["model"], _base_result)

  # Tag the metrics with the checkpoint name before storing them
  _base_result["model_name"] = x["model"]
  eval_results.append(_base_result)

In [22]:
import pandas as pd

# One row per evaluated model, built from the metric dicts in eval_results.
eval_df = pd.DataFrame(eval_results)

def getEvalDF(df, decimals=2):
    """Return a copy of ``df`` with the score columns rounded.

    The input frame is left untouched; rounding is applied to a copy so a
    caller that keeps a reference to ``df`` still sees the original values.

    Args:
        df: DataFrame containing ``'f1'`` and ``'exact_match'`` columns.
        decimals: Number of decimal places to round to. Defaults to 2.

    Returns:
        A new DataFrame with the same columns, where ``'f1'`` and
        ``'exact_match'`` are rounded to ``decimals`` places.

    Raises:
        KeyError: If ``'f1'`` or ``'exact_match'`` is missing from ``df``.
    """
    rounded = df.copy()
    for column in ("f1", "exact_match"):
        rounded[column] = rounded[column].round(decimals)
    return rounded

# Round on a copy so the unrounded frame object is never modified.
eval_df = getEvalDF(eval_df.copy())

# Display the rounded results table.
eval_df

Unnamed: 0,exact_match,f1,model_name
0,62.5,71.73,buddhilive/bert-finetuned-squad
1,78.3,85.52,buddhilive/albert-finetuned-squad
2,58.4,67.47,buddhilive/distilbert-finetuned-squad
3,2.8,9.73,buddhilive/mobilebert-finetuned-squad
4,79.6,86.12,buddhilive/roberta-finetuned-squad
5,2.75,9.19,buddhilive/bert-nopretrained-squad
