In [None]:
! pip install datasets
! pip install evaluate
! pip install rouge_score

In [101]:
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("Skratch99/bert-pretrained", from_tf = True)
model = TFAutoModelForQuestionAnswering.from_pretrained("Skratch99/finetuned-bert-squadv2")

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at Skratch99/finetuned-bert-squadv2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [102]:
from datasets import load_dataset, load_metric, concatenate_datasets

raw_datasets = load_dataset("squad_v2")

In [103]:
def remove_no_answers(row):
    if len(row["answers"]["text"]) > 0:
        return True
    else:
        return False


def remove_no_start(row):
    if len(row["answers"]["answer_start"]) > 0:
        return True
    else:
        return False

raw_datasets = raw_datasets.filter(remove_no_answers)
raw_datasets = raw_datasets.filter(remove_no_start)

In [104]:
t1 = raw_datasets["train"]
t2 = raw_datasets["validation"]

raw_datasets = concatenate_datasets([t1, t2])

split_dataset = raw_datasets.class_encode_column("title").train_test_split(
    test_size=0.2, stratify_by_column="title", seed=1
)

In [105]:
test_dataset = split_dataset["test"]

In [106]:
from transformers import pipeline

qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer
)

In [107]:
from tqdm import tqdm
import concurrent.futures

predictions = []
references = []

def process_example(example):
    context = example["context"]
    question = example["question"]


    # Check if all expected answers are the same
    # for expected_answer in set(expected_answer_list):
        # Your question answering model pipeline or function (replace qa_pipeline with your actual function)
    predicted_answer = qa_pipeline(question=question, context=context)

        # Append predictions in the required format
    predictions.append({'prediction_text': predicted_answer['answer'], 'id': example['id'], 'no_answer_probability': 0.})

    references.append({"id": example["id"], "answers": example["answers"]})

# Number of parallel processes, adjust as needed
num_processes = 5

# Using concurrent.futures to parallelize the processing
with concurrent.futures.ThreadPoolExecutor(max_workers=num_processes) as executor:
    list(tqdm(executor.map(process_example, test_dataset), total=len(test_dataset)))


100%|██████████| 1855/1855 [24:28<00:00,  1.26it/s]


In [111]:
import pickle

with open('p9.pkl', 'wb') as f:
  pickle.dump(predictions, f)

with open('r9.pkl', 'wb') as f:
  pickle.dump(references, f)

In [123]:
import evaluate

metric = evaluate.load("squad_v2")
vals = metric.compute(predictions=predictions, references=references)
print(vals)

metric = load_metric("rouge")
rouge = metric.compute(predictions=predictions, references=references)
print("Rouge: ", rouge)

print("F1 Score: ", vals['f1'])
print("Exact Match: ", vals['exact'])



metric = load_metric("meteor")
meteor = metric.compute(predictions=predictions, references=references)
print("Meteor: ",meteor['meteor']*100)


preds = []
theo = []
for i,j in zip(predictions,references):
  preds.append(i['prediction_text'])
  theo.append(j['answers']['text'])
metric = evaluate.load("bleu")
bleu =metric.compute(predictions=preds, references=theo)

print(bleu)



{'exact': 10.878706199460916, 'f1': 18.90159908670889, 'total': 18550, 'HasAns_exact': 10.878706199460916, 'HasAns_f1': 18.90159908670889, 'HasAns_total': 18550, 'best_exact': 10.878706199460916, 'best_exact_thresh': 0.0, 'best_f1': 18.90159908670889, 'best_f1_thresh': 0.0}
Rouge:  {'rouge1': AggregateScore(low=Score(precision=0.3880581739816478, recall=0.4550323675149823, fmeasure=0.40900635426960424), mid=Score(precision=0.38935330249693045, recall=0.4567322690085111, fmeasure=0.4103497252640948), high=Score(precision=0.3906760301519545, recall=0.45862590291124583, fmeasure=0.4116893438014193)), 'rouge2': AggregateScore(low=Score(precision=0.12528948929063083, recall=0.150201841445336, fmeasure=0.13317084526081496), mid=Score(precision=0.12632324748328383, recall=0.1514989027008819, fmeasure=0.13426731369454203), high=Score(precision=0.12755253063395253, recall=0.15298924635620328, fmeasure=0.13557166179535804)), 'rougeL': AggregateScore(low=Score(precision=0.27524800173164926, recal

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Meteor:  34.42349882500762
{'bleu': 0.07474411444487124, 'precisions': [0.16116218190673323, 0.08790346524154896, 0.055196789710064045, 0.03991398129744876], 'brevity_penalty': 1.0, 'length_ratio': 1.0560288019544184, 'translation_length': 65704, 'reference_length': 62218}
