In [None]:
!pip install datasets rank_bm25 transformers nltk pandas


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Import required libraries
from collections import Counter

import nltk
import pandas as pd
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
from transformers import pipeline

# Download NLTK data
# NOTE(review): 'punkt_tab' is presumably the tokenizer resource that
# word_tokenize (imported above) needs at runtime — confirm against the
# installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Define functions
def create_qa_dataframe(ds, split):
    """Flatten the extractive QA annotations of one split into a DataFrame.

    One output row is produced for every answer annotation that has at least
    one extractive span; annotations without extractive spans are skipped.
    Only the first extractive span of an annotation is kept as the answer.

    Args:
        ds: Mapping from split name to an iterable of paper records. Each
            record is read through ``record['title']`` and ``record['qas']``,
            where ``qas['question']`` and ``qas['answers']`` are parallel
            sequences and ``qas['answers'][i]['answer']`` is a sequence of
            annotations exposing ``'extractive_spans'`` and ``'evidence'``.
        split: Key of the split to read from ``ds``.

    Returns:
        pandas.DataFrame with the columns ``title``, ``question``, ``answer``
        and ``evidence`` (one row per kept annotation).
    """
    titles, questions, answers, evidence = [], [], [], []
    # Walk each record exactly once. Re-indexing ``ds[split][row]`` (or the
    # whole ``ds[split]['title']`` column) for every single field access
    # repeats the row/column lookup many times per annotation.
    for paper in ds[split]:
        title = paper['title']
        qas = paper['qas']
        for question, annotation_group in zip(qas['question'], qas['answers']):
            for annotation in annotation_group['answer']:
                spans = annotation['extractive_spans']
                if len(spans) > 0:
                    titles.append(title)
                    evidence.append(annotation['evidence'])
                    questions.append(question)
                    answers.append(spans[0])
    return pd.DataFrame({'title': titles, 'question': questions, 'answer': answers, 'evidence': evidence})

def extract_full_papers(ds, split):
    """Collect the non-blank paragraphs of every paper in a split.

    Records lacking a ``'title'`` or ``'full_text'`` key are skipped. For the
    remaining records, the paragraphs of all sections listed under
    ``full_text['paragraphs']`` are concatenated in order, dropping any
    paragraph that is empty or whitespace-only.

    Args:
        ds: Mapping from split name to an iterable of paper records.
        split: Key of the split to read from ``ds``.

    Returns:
        pandas.DataFrame built from one ``{'title', 'paragraphs'}`` record per
        kept paper, where ``paragraphs`` is a flat list of strings.
    """
    records = []
    for doc in ds[split]:
        has_required_fields = 'title' in doc and 'full_text' in doc
        if not has_required_fields:
            continue
        record = {'title': doc['title'], 'paragraphs': []}
        sections = doc['full_text'].get('paragraphs', [])
        for section in sections:
            for text in section:
                # Keep only paragraphs with visible content.
                if text.strip():
                    record['paragraphs'].append(text)
        records.append(record)
    return pd.DataFrame(records)

def combine_questions_and_papers(df, papers_df):
    """Attach each paper's paragraph list to its question rows.

    The ``paragraphs`` column of ``papers_df`` is exposed as ``full_paper`` in
    the result. Rows are matched on ``title`` with an inner join, so question
    rows without a matching paper are dropped. Neither input is modified.

    Args:
        df: Question/answer DataFrame with a ``title`` column.
        papers_df: Paper DataFrame with ``title`` and ``paragraphs`` columns.

    Returns:
        The merged pandas.DataFrame.
    """
    papers_with_full_text = papers_df.rename(columns={'paragraphs': 'full_paper'})
    return pd.merge(
        left=df,
        right=papers_with_full_text,
        how='inner',
        on='title',
    )


In [None]:
def create_qa_pipeline(model_checkpoint, device=-1):
    """Build a ``question-answering`` pipeline for a model checkpoint.

    Args:
        model_checkpoint: Model identifier or path, forwarded to
            ``transformers.pipeline`` as ``model``.
        device: Device argument forwarded to ``transformers.pipeline``. The
            default ``-1`` preserves the previous hard-coded behaviour (CPU
            per the transformers convention); pass e.g. ``0`` to use the
            first GPU.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    return pipeline("question-answering", model=model_checkpoint, device=device)

def process_bm25_qa(df_row, k=5):
    """Rank a paper's paragraphs against a question with BM25.

    Both the paragraphs and the question are lower-cased and tokenised with
    ``word_tokenize`` before scoring.

    Args:
        df_row: Mapping/row providing ``'question'`` (str) and ``'full_paper'``
            (sequence of paragraph strings).
        k: Maximum number of paragraphs to return.

    Returns:
        List of ``(paragraph, bm25_score)`` tuples, highest score first, with
        at most ``k`` entries. An empty list is returned when the paper has no
        paragraphs.
    """
    question = df_row['question']
    paper_paragraphs = df_row['full_paper']
    # BM25Okapi divides by the corpus size when computing the average document
    # length, so an empty paper would raise ZeroDivisionError. There is
    # nothing to rank in that case anyway.
    if len(paper_paragraphs) == 0:
        return []
    tokenized_paragraphs = [word_tokenize(paragraph.lower()) for paragraph in paper_paragraphs]
    bm25 = BM25Okapi(tokenized_paragraphs)
    tokenized_question = word_tokenize(question.lower())
    scores = bm25.get_scores(tokenized_question)
    # sorted() is stable, so paragraphs with equal scores keep document order.
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    return [(paper_paragraphs[i], scores[i]) for i in ranked_indices]

def process_qa(df_row, qa_pipeline, top_paragraphs):
    """Query the reader on each candidate paragraph and keep its most confident answer.

    Args:
        df_row: Mapping/row providing the ``'question'`` text.
        qa_pipeline: Callable invoked as ``qa_pipeline(question=..., context=...)``
            that returns a mapping with ``'score'`` and ``'answer'``.
        top_paragraphs: Iterable of ``(paragraph, retrieval_score)`` pairs; the
            retrieval score is not used for the selection.

    Returns:
        Dict with the keys ``question``, ``answer``, ``confidence`` and
        ``context``. When no candidate beats the initial confidence of ``-1``
        (e.g. no paragraphs were given), ``answer`` and ``context`` are
        ``None`` and ``confidence`` is ``-1``.
    """
    question = df_row['question']
    top_answer, top_confidence, top_context = None, -1, None
    for context, _retrieval_score in top_paragraphs:
        prediction = qa_pipeline(question=question, context=context)
        confidence = prediction['score']
        # Strict comparison: on a tie the earlier paragraph wins.
        if confidence > top_confidence:
            top_answer = prediction['answer']
            top_confidence = confidence
            top_context = context
    return dict(
        question=question,
        answer=top_answer,
        confidence=top_confidence,
        context=top_context,
    )

def compute_exact_match(prediction, ground_truth):
    """Return 1 when both answers are equal after trimming and lower-casing, else 0.

    Any non-string argument (for example ``None`` when no answer was produced)
    yields 0.
    """
    both_are_text = isinstance(prediction, str) and isinstance(ground_truth, str)
    if not both_are_text:
        return 0
    normalized_prediction = prediction.strip().lower()
    normalized_truth = ground_truth.strip().lower()
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(prediction, ground_truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are trimmed, lower-cased and split on whitespace. The overlap
    is counted as a multiset intersection, so a token that occurs several
    times on both sides is credited once per matching occurrence. Counting the
    overlap with plain sets while dividing by the full token counts would
    under-score answers with repeated tokens (identical answers such as
    "a a" would not reach 1.0).

    Args:
        prediction: Predicted answer text.
        ground_truth: Reference answer text.

    Returns:
        The F1 score in ``[0, 1]``; 0 when either argument is not a string or
        when no token is shared.
    """
    if not isinstance(prediction, str) or not isinstance(ground_truth, str):
        return 0
    pred_tokens = prediction.strip().lower().split()
    truth_tokens = ground_truth.strip().lower().split()
    # Counter & Counter keeps the minimum count of each shared token.
    num_common = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if num_common == 0:
        # Also covers empty token lists, so the divisions below are safe.
        return 0
    precision = num_common / len(pred_tokens)
    recall = num_common / len(truth_tokens)
    return 2 * (precision * recall) / (precision + recall)

In [None]:
def evaluate_results(results_df, ground_truth_df):
    evaluation_df = pd.merge(results_df, ground_truth_df[['question', 'answer']], on='question', suffixes=('_predicted', '_ground_truth'))
    exact_matches = []
    f1_scores = []
    for _, row in evaluation_df.iterrows():
        pred_answer = row['answer_predicted']
        true_answer = row['answer_ground_truth']
        exact_matches.append(compute_exact_match(pred_answer, true_answer))
        f1_scores.append(compute_f1(pred_answer, true_answer))
    evaluation_df['exact_match'] = exact_matches
    evaluation_df['f1_score'] = f1_scores
    average_exact_match = sum(exact_matches) / len(exact_matches) * 100
    average_f1_score = sum(f1_scores) / len(f1_scores) * 100
    return average_exact_match, average_f1_score, evaluation_df

In [None]:
# Load Qasper dataset
ds = load_dataset("allenai/qasper")
# Split used for every step below.
split = 'test'

# Prepare DataFrames
# One row per answer annotation that has an extractive span.
df = create_qa_dataframe(ds, split)
# One row per paper with its non-blank paragraphs.
papers_df = extract_full_papers(ds, split)
# `df` is re-bound here: from this point on it also carries a `full_paper`
# column, which the retrieval step reads. Re-running only this line without
# the two above would merge the already-merged frame again.
df = combine_questions_and_papers(df, papers_df)

# Load models
# One question-answering pipeline per fine-tuned checkpoint; the names below
# are referenced by the evaluation cell.
roberta_pipeline = create_qa_pipeline("ImanAndrea/roberta-finetuned-paperQA")
bert_pipeline = create_qa_pipeline("ImanAndrea/bert-finetuned-paperQA")
distilbert_pipeline = create_qa_pipeline("ImanAndrea/distilbert-finetuned-paperQA")

In [None]:
# Evaluate models
TOP_K = 5  # number of BM25-ranked paragraphs handed to each reader model

# BM25 retrieval depends only on the question and the paper, not on the reader
# model, so it is computed once per row and reused for every model instead of
# being repeated inside the model loop.
retrieved_paragraphs = [process_bm25_qa(row, k=TOP_K) for _, row in df.iterrows()]

qa_pipelines = {
    "roberta": roberta_pipeline,
    "bert": bert_pipeline,
    "distilbert": distilbert_pipeline,
}

results = {'model': [], 'exact_match': [], 'f1_score': []}

for model_name, qa_pipeline in qa_pipelines.items():
    model_results = []
    # Rows and their retrieved paragraphs are aligned by position.
    for (_, row), top_paragraphs in zip(df.iterrows(), retrieved_paragraphs):
        qa_result = process_qa(row, qa_pipeline, top_paragraphs)
        model_results.append(qa_result)
    results_df = pd.DataFrame(model_results)
    # evaluation_df holds the per-row scores of the model evaluated last.
    em, f1, evaluation_df = evaluate_results(results_df, df)
    results['model'].append(model_name)
    results['exact_match'].append(em)
    results['f1_score'].append(f1)

results_summary = pd.DataFrame(results)
print("Summary:")
print(results_summary)

Summary:
        model  exact_match   f1_score
0     roberta     3.777994  12.371602
1        bert     3.855891  12.265795
2  distilbert     4.148004  12.120928
