- Fine-tuning ParsBERT on PersianQA and ParSQUAD Datasets 

[Model and Datasets Path](https://drive.google.com/drive/folders/1_lgylP6jWCZGiqmvJWO-u7NOoqpD7LUp?usp=sharing)

# Setup

In [1]:
!pip install -q transformers
!pip install -q datasets

[K     |████████████████████████████████| 4.7 MB 30.4 MB/s 
[K     |████████████████████████████████| 596 kB 55.4 MB/s 
[K     |████████████████████████████████| 6.6 MB 65.8 MB/s 
[K     |████████████████████████████████| 101 kB 13.4 MB/s 
[K     |████████████████████████████████| 365 kB 33.4 MB/s 
[K     |████████████████████████████████| 115 kB 75.8 MB/s 
[K     |████████████████████████████████| 212 kB 45.8 MB/s 
[K     |████████████████████████████████| 141 kB 66.0 MB/s 
[K     |████████████████████████████████| 127 kB 71.1 MB/s 
[?25h

In [2]:
import json
import pandas as pd

In [3]:
from datasets import load_dataset, load_from_disk, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
model_checkpoint = "HooshvareLab/bert-fa-base-uncased"
max_length = 512 # The maximum length of a feature (question and context)
doc_stride = 256 # The authorized overlap between two part of the context when splitting it is needed.
batch_size = 8
lr = 3e-5
epoch = 2
DRIVE_PATH = "/content/drive/MyDrive/NLP_Final/"


# Load and Process Data

In [7]:
train_path = DRIVE_PATH + 'Datasets/train_merged.json'
dev_path = DRIVE_PATH + 'Datasets/dev_merged.json'
test_path = DRIVE_PATH + 'Datasets/test_merged.json'


In [8]:
with open(train_path,'r',encoding='utf-8') as f:
  train_data = json.load(f)

with open(dev_path,'r',encoding='utf-8') as f:
  dev_data = json.load(f)

with open(test_path,'r',encoding='utf-8') as f:
  test_data = json.load(f)


In [9]:
train_df = pd.DataFrame(columns=['context','question','answers'])
for row in train_data:
  for par in row['paragraphs']:
    context = par['context']
    qas = par['qas']
    for qa in qas:
      question = qa['question']
      starts = []
      texts = []
      for ans in qa['answers']:
        if len(ans) != 0:
          answer_start = ans['answer_start']
          answer_text = ans['text']
        else:
          answer_start = ''
          answer_text = ''
        starts.append(answer_start)
        texts.append(answer_text)
      train_df = train_df.append({'context': context, 'question': question, 'answers': {'answer_start': starts, 'text': texts}}, ignore_index=True)


In [10]:
dev_df = pd.DataFrame(columns=['context','question','answers'])
for row in dev_data:
  for par in row['paragraphs']:
    context = par['context']
    qas = par['qas']
    for qa in qas:
      question = qa['question']
      starts = []
      texts = []
      for ans in qa['answers']:
        if len(ans) != 0:
          answer_start = ans['answer_start']
          answer_text = ans['text']
        else:
          answer_start = ''
          answer_text = ''
        starts.append(answer_start)
        texts.append(answer_text)
      dev_df = dev_df.append({'context': context, 'question': question, 'answers': {'answer_start': starts, 'text': texts}}, ignore_index=True)


In [11]:
test_df = pd.DataFrame(columns=['context','question','answers'])
for row in test_data:
  for par in row['paragraphs']:
    context = par['context']
    qas = par['qas']
    for qa in qas:
      question = qa['question']
      starts = []
      texts = []
      for ans in qa['answers']:
        if len(ans) != 0:
          answer_start = ans['answer_start']
          answer_text = ans['text']
        else:
          answer_start = ''
          answer_text = ''
        starts.append(answer_start)
        texts.append(answer_text)
      test_df = test_df.append({'context': context, 'question': question, 'answers': {'answer_start': starts, 'text': texts}}, ignore_index=True)


In [13]:
print(train_df.shape)

(68010, 3)


In [17]:
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

In [25]:
train_df = train_df.to_csv(DRIVE_PATH + 'Datasets/train.csv')
dev_df = dev_df.to_csv(DRIVE_PATH + 'Datasets/dev.csv')
test_df = test_df.to_csv(DRIVE_PATH + 'Datasets/test.csv')

In [18]:
train_dataset.save_to_disk("train.hf")
dev_dataset.save_to_disk("dev.hf")
test_dataset.save_to_disk("test.hf")

In [22]:
!cp -r '/content/train.hf' "/content/drive/MyDrive/NLP_Final/Datasets/train.hf"
!cp -r '/content/test.hf' "/content/drive/MyDrive/NLP_Final/Datasets/test.hf"
!cp -r '/content/dev.hf' "/content/drive/MyDrive/NLP_Final/Datasets/dev.hf"

# Load Datasets

In [6]:
train_dataset = load_from_disk(DRIVE_PATH + "Datasets/train.hf").shuffle(seed=42)
dev_dataset = load_from_disk(DRIVE_PATH + "Datasets/dev.hf").shuffle(seed=42)
test_dataset = load_from_disk(DRIVE_PATH + "Datasets/test.hf").shuffle(seed=42)



In [7]:
print(len(train_dataset))

68010


# Tokenize Datasets

In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

In [9]:
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,)
    
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [10]:
# the datasets library does cashing itself, batched is multitreading for fast-tokenizer
tokenized_train = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
tokenized_dev = dev_dataset.map(prepare_train_features, batched=True, remove_columns=dev_dataset.column_names)
tokenized_test = test_dataset.map(prepare_train_features, batched=True, remove_columns=test_dataset.column_names)

  0%|          | 0/69 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

# Training 

In [11]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading pytorch_model.bin:   0%|          | 0.00/624M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized 

In [12]:
args = TrainingArguments(
    DRIVE_PATH + f"checkpoints2",
    save_strategy = 'epoch',
    evaluation_strategy = "epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.0001) 

In [13]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer)

In [14]:
# start training
trainer.train()

***** Running training *****
  Num examples = 68086
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 17022


Epoch,Training Loss,Validation Loss
1,1.3827,1.5298


***** Running Evaluation *****
  Num examples = 5959
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-8511
Configuration saved in /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-8511/config.json
Model weights saved in /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-8511/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-8511/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-8511/special_tokens_map.json


Epoch,Training Loss,Validation Loss
1,1.3827,1.5298
2,0.8224,1.609336


***** Running Evaluation *****
  Num examples = 5959
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-17022
Configuration saved in /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-17022/config.json
Model weights saved in /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-17022/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-17022/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-17022/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=17022, training_loss=1.2338444949905203, metrics={'train_runtime': 8082.9396, 'train_samples_per_second': 16.847, 'train_steps_per_second': 2.106, 'total_flos': 1.88476616843136e+16, 'train_loss': 1.2338444949905203, 'epoch': 2.0})

# Test Model

In [15]:
!pip install -q sentencepiece

from tqdm import tqdm
from IPython.display import clear_output
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer


[?25l[K     |▎                               | 10 kB 27.4 MB/s eta 0:00:01[K     |▌                               | 20 kB 35.0 MB/s eta 0:00:01[K     |▊                               | 30 kB 40.6 MB/s eta 0:00:01[K     |█                               | 40 kB 42.0 MB/s eta 0:00:01[K     |█▎                              | 51 kB 44.6 MB/s eta 0:00:01[K     |█▌                              | 61 kB 49.2 MB/s eta 0:00:01[K     |█▉                              | 71 kB 33.7 MB/s eta 0:00:01[K     |██                              | 81 kB 33.7 MB/s eta 0:00:01[K     |██▎                             | 92 kB 35.5 MB/s eta 0:00:01[K     |██▋                             | 102 kB 37.1 MB/s eta 0:00:01[K     |██▉                             | 112 kB 37.1 MB/s eta 0:00:01[K     |███                             | 122 kB 37.1 MB/s eta 0:00:01[K     |███▍                            | 133 kB 37.1 MB/s eta 0:00:01[K     |███▋                            | 143 kB 37.1 MB/s eta 0:

In [17]:
DRIVE_PATH = "/content/drive/MyDrive/NLP_Final/"
model_path = DRIVE_PATH + f"checkpoints2/checkpoint-17022/"  ## load model trained for 2 epochs
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


loading configuration file /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-17022/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-17022/",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 100000
}

loading weights file /content/drive/MyDrive/NLP_Final/checkpoints2/checkpoint-17022/pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionAnswering.

Al

In [18]:
class AnswerPredictor:
  def __init__(self, model, tokenizer, device='cuda', n_best=10, max_length=512, stride=256, no_answer=False):
      """Initializes PyTorch Question Answering Prediction
      It's best to leave use the default values.
      Args:
          model: Fine-tuned torch model
          tokenizer: Transformers tokenizer
          device (torch.device): Running device
          n_best (int): Number of best possible answers
          max_length (int): Tokenizer max length
          stride (int): Tokenizer stride
          no_answer (bool): If True, model can return "no answer"
      """
      self.model = model.eval().to(device)
      self.tokenizer = tokenizer
      self.device = device
      self.max_length = max_length
      self.stride = stride
      self.no_answer = no_answer
      self.n_best = n_best


  def model_pred(self, questions, contexts, batch_size=1):
      n = len(contexts)
      if n%batch_size!=0:
          raise Exception("batch_size must be divisible by sample length")

      tokens = self.tokenizer(questions, contexts, add_special_tokens=True, 
                              return_token_type_ids=True, return_tensors="pt", padding=True, 
                              return_offsets_mapping=True, truncation="only_second", 
                              max_length=self.max_length, stride=self.stride)

      start_logits, end_logits = [], []
      for i in tqdm(range(0, n-batch_size+1, batch_size)):
          with torch.no_grad():
              out = self.model(tokens['input_ids'][i:i+batch_size].to(self.device), 
                          tokens['attention_mask'][i:i+batch_size].to(self.device), 
                          tokens['token_type_ids'][i:i+batch_size].to(self.device))

              start_logits.append(out.start_logits)
              end_logits.append(out.end_logits)

      return tokens, torch.stack(start_logits).view(n, -1), torch.stack(end_logits).view(n, -1)


  def __call__(self, questions, contexts, batch_size=1, answer_max_len=100):
      """Creates model prediction
      
      Args: 
          questions (list): Question strings
          contexts (list): Contexts strings
          batch_size (int): Batch size
          answer_max_len (int): Sets the longests possible length for any answer
        
      Returns:
          dict: The best prediction of the model
              (e.g {0: {"text": str, "score": int}})
      """
      tokens, starts, ends = self.model_pred(questions, contexts, batch_size=batch_size)
      start_indexes = starts.argsort(dim=-1, descending=True)[:, :self.n_best]
      end_indexes = ends.argsort(dim=-1, descending=True)[:, :self.n_best]

      preds = {}
      for i, (c, q) in enumerate(zip(contexts, questions)):  
          min_null_score = starts[i][0] + ends[i][0] # 0 is CLS Token
          start_context = tokens['input_ids'][i].tolist().index(self.tokenizer.sep_token_id)
          
          offset = tokens['offset_mapping'][i]
          valid_answers = []
          for start_index in start_indexes[i]:
              # Don't consider answers that are in questions
              if start_index<start_context:
                  continue
              for end_index in end_indexes[i]:
                  # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                  # to part of the input_ids that are not in the context.
                  if (start_index >= len(offset) or end_index >= len(offset)
                      or offset[start_index] is None or offset[end_index] is None):
                      continue
                  # Don't consider answers with a length that is either < 0 or > max_answer_length.
                  if end_index < start_index or (end_index-start_index+1) > answer_max_len:
                      continue

                  start_char = offset[start_index][0]
                  end_char = offset[end_index][1]
                  valid_answers.append({"score": (starts[i][start_index] + ends[i][end_index]).item(),
                                        "text": c[start_char: end_char]})
                  
          if len(valid_answers) > 0:
              best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
          else:
              best_answer = {"text": "", "score": min_null_score}

          if self.no_answer:
              preds[i] = best_answer if best_answer["score"] >= min_null_score else {"text": "", "score": min_null_score}
          else:
              preds[i] = best_answer

      return preds


In [20]:
predictor = AnswerPredictor(model, tokenizer, device='cuda', n_best=10, no_answer=True)


In [28]:
test_dataset[9]

{'context': 'BSkyB سرویس HDTV خود را با نام Sky + HD در 22 مه 2006 راه اندازی کرد. قبل از راه اندازی ، BSkyB ادعا کرد که 40،000 نفر برای دریافت سرویس HD ثبت نام کرده اند. در هفته قبل از راه اندازی ، شایعاتی مبنی بر اینکه BSkyB در تهیه جعبه بالا (STB) خود از تولید کننده تامسون مشکل دارد ، آشکار شد. در روز پنجشنبه 18 مه 2006 ، و در آخر هفته قبل از راه اندازی ، مردم گزارش دادند که BSkyB نصب آن را لغو کرده یا دوباره برنامه ریزی کرده است. سرانجام ، بی بی سی گزارش داد که 17000 مشتری هنوز به دلیل تحویل ناموفق ، خدمات دریافت نکرده اند. در تاریخ 31 مارس 2012 ، آسمان تعداد کل خانه های دارای Sky + HD را 4222000 اعلام کرد.',
 'question': 'چه زمانی BSkyB سرویس HDTV خود را راه اندازی کرد؟',
 'answers': {'answer_start': [43, 40, 43],
  'text': ['22 مه 2006', 'در 22 مه 2006', '22 مه 2006']}}

In [29]:
context = test_dataset[9]['context'] 
question = test_dataset[9]['question'] 
answer = test_dataset[9]['answers']['text']
print(answer)


preds = predictor([question], [context], batch_size=1)
pred = preds[0]['text'].strip()
print('\n\n Model Prediction: ', preds[0]['text'].strip())


# print(compute_exact_match(answer, pred))
# print(compute_f1(answer, pred))

['22 مه 2006', 'در 22 مه 2006', '22 مه 2006']


100%|██████████| 1/1 [00:00<00:00, 14.38it/s]



 Model Prediction:  22 مه 2006





In [None]:
print(len(test_dataset))

1144


In [30]:
def compute_f1(prediction, answer):
    pred_tokens = prediction.split()
    answer_tokens = answer.split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(answer_tokens) == 0:
        return int(pred_tokens == answer_tokens)
    
    common_tokens = set(pred_tokens) & set(answer_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(answer_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

def compute_exact_match(prediction, answer):
    return int(prediction == answer)

In [23]:
def compute_exact_match(prediction, answer):
    return int(prediction == answer)

In [None]:
EH = 0
F1 = 0
pred_df = pd.DataFrame(columns = ('answer', 'pred'))

for example in test_dataset: 

  #print(len(example['answers']))
  #if len(example['answers']) != 2: 
    #print(example['answers'])
  if example['answers']['text'] == [] :
    context = example['context'] 
    question = example['question'] 
    preds = predictor([question], [context], batch_size=1)
    pred = preds[0]['text'].strip()
    # if pred == [] : 
    #   EH += 1
    #   F1 += 1

    continue

  context = example['context'] 
  question = example['question'] 
  answer = example['answers']['text'][0]
  preds = predictor([question], [context], batch_size=1)
  pred = preds[0]['text'].strip()
  pred_df = pred_df.append({'answer':answer, 'pred': pred}, ignore_index=True)
  # EH += compute_exact_match(pred, answer)
  # F1 += compute_f1(pred, answer)

# EH /= len(test_dataset)
# F1 /= len(test_dataset)


# print(EH)
# print(F1)

In [33]:
pred_df


Unnamed: 0,answer,pred
0,هفت,
1,2018,2018
2,مصرف,مصرف
3,1788,1788
4,22 مه 2006,22 مه 2006
...,...,...
283,1624,1624
284,کربن موجود در پوشش گیاهی,
285,50٪ تا 60٪,50
286,رودخانه چارلز,رودخانه چارلز


In [None]:
predictor = AnswerPredictor(model, tokenizer, device='cuda', n_best=10, no_answer=True)

EH = 0
F1 = 0

for example in test_dataset: 

  #print(len(example['answers']))
  #if len(example['answers']) != 2: 
    #print(example['answers'])
  if example['answers']['text'] == [] :
    context = example['context'] 
    question = example['question'] 
    preds = predictor([question], [context], batch_size=1)
    pred = preds[0]['text'].strip()
    if pred == [] : 
      EH += 1
      F1 += 1

    continue

  context = example['context'] 
  question = example['question'] 
  answer = example['answers']['text'][0]
  preds = predictor([question], [context], batch_size=1)
  pred = preds[0]['text'].strip()

  EH += compute_exact_match(pred, answer)
  F1 += compute_f1(pred, answer)

EH /= len(test_dataset)
F1 /= len(test_dataset)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
100%|██████████| 1/1 [00:00<00:00, 67.92it/s]
100%|██████████| 1/1 [00:00<00:00, 68.44it/s]
100%|██████████| 1/1 [00:00<00:00, 62.69it/s]
100%|██████████| 1/1 [00:00<00:00, 57.91it/s]
100%|██████████| 1/1 [00:00<00:00, 57.09it/s]
100%|██████████| 1/1 [00:00<00:00, 86.96it/s]
100%|██████████| 1/1 [00:00<00:00, 79.58it/s]
100%|██████████| 1/1 [00:00<00:00, 63.47it/s]
100%|██████████| 1/1 [00:00<00:00, 62.58it/s]
100%|██████████| 1/1 [00:00<00:00, 59.32it/s]
100%|██████████| 1/1 [00:00<00:00, 52.65it/s]
100%|██████████| 1/1 [00:00<00:00, 75.82it/s]
100%|██████████| 1/1 [00:00<00:00, 65.89it/s]
100%|██████████| 1/1 [00:00<00:00, 52.27it/s]
100%|██████████| 1/1 [00:00<00:00, 84.69it/s]
100%|██████████| 1/1 [00:00<00:00, 85.12it/s]
100%|██████████| 1/1 [00:00<00:00, 72.62it/s]
100%|██████████| 1/1 [00:00<00:00, 79.92it/s]
100%|██████████| 1/1 [00:00<00:00, 60.23it/s]
100%|██████████| 1/1 [00:00<00:00, 70.04it/s]
100%|██████████

In [35]:
print('Exact match of Pars BERT on testset : ' + str(EH))
print('F1 score of Pars BERT on testset : ' + str(F1))

Exact match of Pars BERT on testset : 0.22295673076923078
F1 score of Pars BERT on testset : 0.2890025996589516
