<a href="https://colab.research.google.com/github/FatemehArabzadeh/nlp-qa/blob/main/evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from tqdm import tqdm



class AnswerPredictor:
    """Run an extractive question-answering model and decode its best answer span.

    The tokenizer is asked for offset mappings and the decoder reads
    ``sequence_ids`` from the returned encoding, so a "fast" Hugging Face
    tokenizer is expected.
    """

    def __init__(self, model, tokenizer, device='cuda', n_best=10, max_length=512, stride=256, no_answer=False):
        """
        Args:
            model: Question-answering model. It is called with ``input_ids``,
                ``attention_mask`` and ``token_type_ids`` and must return an
                object exposing ``start_logits`` and ``end_logits``.
            tokenizer: Tokenizer matching ``model``.
            device (str): Device the model and the input batches are moved to.
            n_best (int): Number of top start / end positions combined into
                candidate spans.
            max_length (int): Tokenizer ``max_length``; longer contexts are
                truncated (``truncation="only_second"``).
            stride (int): Tokenizer ``stride``.
                NOTE(review): overflowing windows are not requested from the
                tokenizer, so this presumably has no effect - confirm.
            no_answer (bool): If True, an empty answer is returned whenever the
                best span scores below the null (first-token) score.
        """
        self.model = model.eval().to(device)
        self.tokenizer = tokenizer
        self.device = device
        self.max_length = max_length
        self.stride = stride
        self.no_answer = no_answer
        self.n_best = n_best

    def model_predictions(self, questions, contexts, batch_size=1):
        """Tokenize all (question, context) pairs and run the model in batches.

        Args:
            questions (list[str]): One question per sample.
            contexts (list[str]): One context per sample.
            batch_size (int): Samples per forward pass. The number of samples
                does not have to be a multiple of it; the last batch may be
                smaller.

        Returns:
            tuple: ``(tokens, start_logits, end_logits)`` where ``tokens`` is
            the tokenizer output and both logit tensors have shape
            ``(len(contexts), sequence_length)``.

        Raises:
            ValueError: If ``batch_size`` is not a positive integer.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        n = len(contexts)

        # All samples are padded together, so every batch shares one sequence length.
        tokens = self.tokenizer(questions, contexts, add_special_tokens=True,
                                return_token_type_ids=True, return_tensors="pt", padding=True,
                                return_offsets_mapping=True, truncation="only_second",
                                max_length=self.max_length, stride=self.stride)

        start_logits, end_logits = [], []
        for i in tqdm(range(0, n, batch_size)):
            batch = slice(i, i + batch_size)
            with torch.no_grad():
                # Keyword arguments keep the call independent of the positional
                # order of the model's forward() signature.
                out = self.model(input_ids=tokens['input_ids'][batch].to(self.device),
                                 attention_mask=tokens['attention_mask'][batch].to(self.device),
                                 token_type_ids=tokens['token_type_ids'][batch].to(self.device))
            start_logits.append(out.start_logits)
            end_logits.append(out.end_logits)

        # Concatenating along the batch axis also handles a smaller final batch.
        return tokens, torch.cat(start_logits, dim=0), torch.cat(end_logits, dim=0)

    def __call__(self, questions, contexts, batch_size=1, answer_max_len=100):
        """Predict the best answer span for every (question, context) pair.

        Args:
            questions (list[str]): One question per sample.
            contexts (list[str]): One context per sample.
            batch_size (int): Samples per forward pass.
            answer_max_len (int): Longest allowed answer, counted in tokens.

        Returns:
            dict: Maps the sample index to ``{"text": str, "score": float}``.
            ``text`` is empty when no valid span exists or, with
            ``no_answer=True``, when the null score beats the best span.
        """
        if len(contexts) == 0:
            return {}

        tokens, starts, ends = self.model_predictions(questions, contexts, batch_size=batch_size)
        # Plain Python ints: safe for list indexing whatever device the logits live on.
        start_indexes = starts.argsort(dim=-1, descending=True)[:, :self.n_best].tolist()
        end_indexes = ends.argsort(dim=-1, descending=True)[:, :self.n_best].tolist()

        preds = {}
        for i, context in enumerate(contexts):
            min_null_score = (starts[i][0] + ends[i][0]).item()  # position 0 is the CLS token
            # sequence id 1 marks context tokens; question tokens are 0 and
            # special / padding tokens are None.
            sequence_ids = tokens.sequence_ids(i)
            offset = tokens['offset_mapping'][i].tolist()

            best_answer = None
            for start_index in start_indexes[i]:
                # An answer must start inside the context, never in the question
                # or on a special / padding token.
                if sequence_ids[start_index] != 1:
                    continue
                for end_index in end_indexes[i]:
                    if sequence_ids[end_index] != 1:
                        continue
                    # Reject reversed spans and spans longer than answer_max_len tokens.
                    if end_index < start_index or (end_index - start_index + 1) > answer_max_len:
                        continue
                    score = (starts[i][start_index] + ends[i][end_index]).item()
                    # Strict ">" keeps the first candidate on ties.
                    if best_answer is None or score > best_answer["score"]:
                        start_char = offset[start_index][0]
                        end_char = offset[end_index][1]
                        best_answer = {"score": score, "text": context[start_char:end_char]}

            if best_answer is None:
                best_answer = {"text": "", "score": min_null_score}
            if self.no_answer and best_answer["score"] < min_null_score:
                best_answer = {"text": "", "score": min_null_score}
            preds[i] = best_answer
        return preds

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
from datasets import load_dataset
datasets  = load_dataset("SajjadAyoubi/persian_qa")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/165k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9008 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/930 [00:00<?, ? examples/s]

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_metric
from collections import Counter
import re

# Fine-tuned multilingual BERT checkpoint for extractive QA.
model_name = "Farabzadeh/qa-bert-base-multilingual-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Index the validation split once instead of once per column.
validation = datasets["validation"]
questions, contexts, answers = validation['question'], validation['context'], validation['answers']

# Use the GPU when one is present, but keep the notebook runnable on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#  predictions
predictor = AnswerPredictor(model, tokenizer, device=device)
preds = predictor(questions, contexts, batch_size=10)

# cleaner function
def cleaner(text):
    """Swap every zero-width non-joiner (U+200C) for a space and trim the ends."""
    zwnj = re.compile('\u200c')
    spaced = zwnj.sub(" ", text)
    return spaced.strip()


# squad_v2 HuggingFace metrics
# NOTE(review): `datasets.load_metric` emits a deprecation warning when this cell runs;
# the `evaluate` package installed by this notebook appears to be its replacement
# (`evaluate.load("squad_v2")`) - confirm before migrating.
metric = load_metric("squad_v2")

# The predictor provides no no-answer probability, so a constant 0.0 is reported.
formatted_preds = [{"id": str(k),
                    "prediction_text": cleaner(v['text']),
                    "no_answer_probability": 0.0}
                    for k, v in preds.items()]

# Ids are the sample positions, matching the integer keys of `preds`.
references = [{"id": str(i),
               "answers": {'answer_start': a['answer_start'],
                           # Build a real list: a bare `map` object is a one-shot
                           # iterator and would be empty after its first traversal.
                           'text': [cleaner(t) for t in a['text']]}}
              for i, a in enumerate(answers)]

print(metric.compute(predictions=formatted_preds, references=references))


100%|██████████| 93/93 [00:22<00:00,  4.15it/s]
  metric = load_metric("squad_v2")


Downloading builder script:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

The repository for squad_v2 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/squad_v2.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y
{'exact': 29.35483870967742, 'f1': 48.88644788455928, 'total': 930, 'HasAns_exact': 41.935483870967744, 'HasAns_f1': 69.83778269222755, 'HasAns_total': 651, 'NoAns_exact': 0.0, 'NoAns_f1': 0.0, 'NoAns_total': 279, 'best_exact': 30.967741935483872, 'best_exact_thresh': 0.0, 'best_f1': 49.20902852972062, 'best_f1_thresh': 0.0}
