# Evaluate models on testset

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
!pip install -q transformers
!pip install -q datasets
!pip install -q sentencepiece

In [16]:
from datasets import load_dataset, load_from_disk, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm import tqdm
from IPython.display import clear_output
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu
model_checkpoint = "HooshvareLab/bert-fa-base-uncased"
DRIVE_PATH = "/content/drive/MyDrive/PQA/"
max_length = 512 # The maximum length of a feature (question and context)
doc_stride = 256 # The authorized overlap between two part of the context when splitting it is needed.

# Load and prepare Test set

In [17]:
test_dataset = load_from_disk(DRIVE_PATH + "test2.hf").shuffle(seed=42)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



# EM and F1 and BLEU parameters 

In [18]:
def compute_f1(prediction, answer):
    pred_tokens = prediction.split()
    answer_tokens = answer.split()
    
    if len(pred_tokens) == 0 or len(answer_tokens) == 0:
        return int(pred_tokens == answer_tokens)
    
    common_tokens = set(pred_tokens) & set(answer_tokens)
    
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(answer_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

def compute_exact_match(prediction, answer):
    return int(prediction == answer)

def bleu(prediction, answer) : 
  reference = [answer.split(' ')]
  candidate = pred.split(' ')
  BLEU = sentence_bleu(reference, candidate)
  BLEU1 = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
  BLEU4 = sentence_bleu(reference, candidate, weights=(0, 0, 0, 1))

  return BLEU, BLEU1, BLEU4


# Prediction Class

In [None]:
class AnswerPredictor:
  def __init__(self, model, tokenizer, device='cuda', n_best=10, max_length=512, stride=256, no_answer=False):
      """Initializes PyTorch Question Answering Prediction
      It's best to leave use the default values.
      Args:
          model: Fine-tuned torch model
          tokenizer: Transformers tokenizer
          device (torch.device): Running device
          n_best (int): Number of best possible answers
          max_length (int): Tokenizer max length
          stride (int): Tokenizer stride
          no_answer (bool): If True, model can return "no answer"
      """
      self.model = model.eval().to(device)
      self.tokenizer = tokenizer
      self.device = device
      self.max_length = max_length
      self.stride = stride
      self.no_answer = no_answer
      self.n_best = n_best


  def model_pred(self, questions, contexts, batch_size=1):
      n = len(contexts)
      if n%batch_size!=0:
          raise Exception("batch_size must be divisible by sample length")

      tokens = self.tokenizer(questions, contexts, add_special_tokens=True, 
                              return_token_type_ids=True, return_tensors="pt", padding=True, 
                              return_offsets_mapping=True, truncation="only_second", 
                              max_length=self.max_length, stride=self.stride)

      start_logits, end_logits = [], []
      for i in tqdm(range(0, n-batch_size+1, batch_size)):
          with torch.no_grad():
              out = self.model(tokens['input_ids'][i:i+batch_size].to(self.device), 
                          tokens['attention_mask'][i:i+batch_size].to(self.device), 
                          tokens['token_type_ids'][i:i+batch_size].to(self.device))

              start_logits.append(out.start_logits)
              end_logits.append(out.end_logits)

      return tokens, torch.stack(start_logits).view(n, -1), torch.stack(end_logits).view(n, -1)


  def __call__(self, questions, contexts, batch_size=1, answer_max_len=100):
      """Creates model prediction
      
      Args: 
          questions (list): Question strings
          contexts (list): Contexts strings
          batch_size (int): Batch size
          answer_max_len (int): Sets the longests possible length for any answer
        
      Returns:
          dict: The best prediction of the model
              (e.g {0: {"text": str, "score": int}})
      """
      tokens, starts, ends = self.model_pred(questions, contexts, batch_size=batch_size)
      start_indexes = starts.argsort(dim=-1, descending=True)[:, :self.n_best]
      end_indexes = ends.argsort(dim=-1, descending=True)[:, :self.n_best]

      preds = {}
      for i, (c, q) in enumerate(zip(contexts, questions)):  
          min_null_score = starts[i][0] + ends[i][0] # 0 is CLS Token
          start_context = tokens['input_ids'][i].tolist().index(self.tokenizer.sep_token_id)
          
          offset = tokens['offset_mapping'][i]
          valid_answers = []
          for start_index in start_indexes[i]:
              # Don't consider answers that are in questions
              if start_index<start_context:
                  continue
              for end_index in end_indexes[i]:
                  # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                  # to part of the input_ids that are not in the context.
                  if (start_index >= len(offset) or end_index >= len(offset)
                      or offset[start_index] is None or offset[end_index] is None):
                      continue
                  # Don't consider answers with a length that is either < 0 or > max_answer_length.
                  if end_index < start_index or (end_index-start_index+1) > answer_max_len:
                      continue

                  start_char = offset[start_index][0]
                  end_char = offset[end_index][1]
                  valid_answers.append({"score": (starts[i][start_index] + ends[i][end_index]).item(),
                                        "text": c[start_char: end_char]})
                  
          if len(valid_answers) > 0:
              best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
          else:
              best_answer = {"text": "", "score": min_null_score}

          if self.no_answer:
              preds[i] = best_answer if best_answer["score"] >= min_null_score else {"text": "", "score": min_null_score}
          else:
              preds[i] = best_answer

      return preds


# ParsBERT Evaluation

In [None]:
DRIVE_PATH = "/content/drive/MyDrive/PQA/"
model_path = DRIVE_PATH + f"checkpoints/checkpoint-2548/"  ## load model trained for 2 epochs
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
predictor = AnswerPredictor(model, tokenizer, device='cuda', n_best=10, no_answer=True)

EH = F1 = BLEU = BLEU1 = BLEU4 = 0

for example in test_dataset: 

  if example['answers']['text'] == [] :
    context = example['context'] 
    question = example['question'] 
    preds = predictor([question], [context], batch_size=1)
    pred = preds[0]['text'].strip()
    if len(pred) == 0 : 
      EH += 1
      F1 += 1
      BLEU += 1
      BLEU1 += 1
      BLEU4 += 1

    continue

  context = example['context'] 
  question = example['question'] 
  preds = predictor([question], [context], batch_size=1)
  pred = preds[0]['text'].strip()

  ######### find best match #######################
  index = 0
  max_score = 0
  for m in range(len(example['answers']['text'])) : 
    temp_ans = example['answers']['text'][m]
    temp_f1 = compute_f1(pred, temp_ans)
    if temp_f1 > max_score : 
      index = m
      max_score = temp_f1
  ##################################################

  answer = example['answers']['text'][index]


  EH += compute_exact_match(pred, answer)
  F1 += compute_f1(pred, answer)
  b, b1, b4 = bleu(pred, answer)
  BLEU += b
  BLEU1 += b1
  BLEU4 += b4

EH /= len(test_dataset)
F1 /= len(test_dataset)
BLEU /= len(test_dataset)
BLEU1 /= len(test_dataset)
BLEU4 /= len(test_dataset)

In [None]:
print('Exact match of Pars BERT on testset : ' + str(EH))
print('F1 score of Pars BERT on testset : ' + str(F1))
print('BLEU score of Pars BERT on testset : ' + str(BLEU))
print('BLEU1 score of Pars BERT on testset : ' + str(BLEU1))
print('BLEU4 score of Pars BERT on testset : ' + str(BLEU4))

Exact match of Pars BERT on testset : 0.46153846153846156
F1 score of Pars BERT on testset : 0.6146364364558041
BLEU score of Pars BERT on testset : 0.4131725420582979
BLEU1 score of Pars BERT on testset : 0.5793761765330291
BLEU4 score of Pars BERT on testset : 0.4074068024567045


#Evaluation of ParsT5

In [None]:
! gdown 1Lcs5eGTIhy0JUY9FW2pn-80m3CHyVtvQ
! unzip ParsT5.zip
!pip install transformers

In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
model_path = '/content/content/drive/MyDrive/parsT5_QA/model_4'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

In [None]:
EH = F1 = BLEU = BLEU1 = BLEU4 

for example in test_dataset: 

  context = example['context'] 
  question = example['question']
  
  input = 'متن: ' + context + '، پرسش: ' + question
  input_ids = tokenizer.encode(input, return_tensors='pt')
  output_ids = model.generate(input_ids, max_length=150, num_beams=2, repetition_penalty=2.5, length_penalty=1.0, early_stopping=True)
  output = ' '.join([tokenizer.decode(id) for id in output_ids])
  pred = output.replace('<pad>', '').replace('</s>', '').strip()

  
  if example['answers']['text'] == [] :
    if pred == 'بدون پاسخ' : 
      EH += 1
      F1 += 1
      BLEU += 1
      BLEU1 += 1
      BLEU4 += 1
    continue

   ######### find best match #######################
  index = 0
  max_score = 0
  for m in range(len(example['answers']['text'])) : 
    temp_ans = example['answers']['text'][m]
    temp_f1 = compute_f1(pred, temp_ans)
    if temp_f1 > max_score : 
      index = m
      max_score = temp_f1
  ##################################################

  answer = example['answers']['text'][index]


  EH += compute_exact_match(pred, answer)
  F1 += compute_f1(pred, answer)
  b, b1, b4 = bleu(pred, answer)
  BLEU += b
  BLEU1 += b1
  BLEU4 += b4

EH /= len(test_dataset)
F1 /= len(test_dataset)
BLEU /= len(test_dataset)
BLEU1 /= len(test_dataset)
BLEU4 /= len(test_dataset)

In [None]:
print('Exact match of ParsT5 on testset : ' + str(EH))
print('F1 score of ParsT5 on testset : ' + str(F1))
print('BLEU score of ParsT5 on testset : ' + str(BLEU))
print('BLEU1 score of ParsT5 on testset : ' + str(BLEU1))
print('BLEU4 score of ParsT5 on testset : ' + str(BLEU4))

Exact match of ParsT5 on testset : 0.45327524038461536
F1 score of ParsT5 on testset : 0.4815071358518649
BLEU score of ParsT5 on testset : 0.3314958137059677
BLEU1 score of ParsT5 on testset : 0.47223412366086
BLEU4 score of ParsT5 on testset : 0.33133417034113266


# ALBERT Evaluation

In [None]:
DRIVE_PATH = "/content/drive/MyDrive/PQA/checkpoints/model/"
model_path = DRIVE_PATH + f"checkpoint-1699/" 
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
device = 'cuda'
model.eval().to(device)

In [None]:
predictor = AnswerPredictor(model, tokenizer, device='cuda', n_best=10, no_answer=True)

EH = F1 = BLEU = BLEU1 = BLEU4 = 0

for example in test_dataset: 

  if example['answers']['text'] == [] :
    context = example['context'] 
    question = example['question'] 
    preds = predictor([question], [context], batch_size=1)
    pred = preds[0]['text'].strip()
    if len(pred) == 0 : 
      EH += 1
      F1 += 1
      BLEU += 1
      BLEU1 += 1
      BLEU4 += 1

    continue

  context = example['context'] 
  question = example['question'] 
  preds = predictor([question], [context], batch_size=1)
  pred = preds[0]['text'].strip()

  ######### find best match #######################
  index = 0
  max_score = 0
  for m in range(len(example['answers']['text'])) : 
    temp_ans = example['answers']['text'][m]
    temp_f1 = compute_f1(pred, temp_ans)
    if temp_f1 > max_score : 
      index = m
      max_score = temp_f1
  ##################################################

  answer = example['answers']['text'][index]


  EH += compute_exact_match(pred, answer)
  F1 += compute_f1(pred, answer)
  b, b1, b4 = bleu(pred, answer)
  BLEU += b
  BLEU1 += b1
  BLEU4 += b4

EH /= len(test_dataset)
F1 /= len(test_dataset)
BLEU /= len(test_dataset)
BLEU1 /= len(test_dataset)
BLEU4 /= len(test_dataset)

In [None]:
print('Exact match of ALBERT on testset : ' + str(EH))
print('F1 score of ALBERT on testset : ' + str(F1))
print('BLEU score of ALBERT on testset : ' + str(BLEU))
print('BLEU1 score of ALBERT on testset : ' + str(BLEU1))
print('BLEU4 score of ALBERT on testset : ' + str(BLEU4))

Exact match of ALBERT on testset : 0.4737079326923077
F1 score of ALBERT on testset : 0.5291084943589603
BLEU score of ALBERT on testset : 0.3697263492063661
BLEU1 score of ALBERT on testset : 0.5156283202327573
BLEU4 score of ALBERT on testset : 0.3680442027180467


# mBERT Evaluation

In [19]:
DRIVE_PATH = "/content/drive/MyDrive/PQA/"
model_path = DRIVE_PATH + f"checkpoints/checkpoint-17000/"  ## load model trained for 2 epochs
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
predictor = AnswerPredictor(model, tokenizer, device='cuda', n_best=10, no_answer=True)

EH = F1 = BLEU = BLEU1 = BLEU4 = 0
cnt = 0
for example in test_dataset: 
  cnt+=1
  print(cnt)

  if example['answers']['text'] == [] :
    context = example['context'] 
    question = example['question'] 
    preds = predictor([question], [context], batch_size=1)
    pred = preds[0]['text'].strip()
    if len(pred) == 0 : 
      EH += 1
      F1 += 1
      BLEU += 1
      BLEU1 += 1
      BLEU4 += 1

    continue

  context = example['context'] 
  question = example['question'] 
  preds = predictor([question], [context], batch_size=1)
  pred = preds[0]['text'].strip()

  ######### find best match #######################
  index = 0
  max_score = 0
  for m in range(len(example['answers']['text'])) : 
    temp_ans = example['answers']['text'][m]
    temp_f1 = compute_f1(pred, temp_ans)
    if temp_f1 > max_score : 
      index = m
      max_score = temp_f1
  ##################################################

  answer = example['answers']['text'][index]


  EH += compute_exact_match(pred, answer)
  F1 += compute_f1(pred, answer)
  b, b1, b4 = bleu(pred, answer)
  BLEU += b
  BLEU1 += b1
  BLEU4 += b4

EH /= len(test_dataset)
F1 /= len(test_dataset)
BLEU /= len(test_dataset)
BLEU1 /= len(test_dataset)
BLEU4 /= len(test_dataset)

In [21]:
print('Exact match of mBERT on testset : ' + str(EH))
print('F1 score of mBERT on testset : ' + str(F1))
print('BLEU score of mBERT on testset : ' + str(BLEU))
print('BLEU1 score of mBERT on testset : ' + str(BLEU1))
print('BLEU4 score of mBERT on testset : ' + str(BLEU4))

Exact match of mBERT on testset : 0.6484375
F1 score of mBERT on testset : 0.6926880638353844
BLEU score of mBERT on testset : 0.42744129904426637
BLEU1 score of mBERT on testset : 0.6832573296203045
BLEU4 score of mBERT on testset : 0.4261544478449596
