In [None]:
!pip install nltk
!pip install rouge

import nltk
nltk.download('punkt')

In [None]:
import os
import json
import nltk
from rouge import Rouge
from tqdm.notebook import tqdm, trange

In [None]:
# Directory holding the input files. The trailing slash matters: the path is
# concatenated directly with "final_<name>.json" when the inputs are read.
DATA_PATH = "../data/"
# Directory that receives the upper-bound result files (also concatenated
# directly with the output file names, hence the trailing slash).
RESULT_PATH = "../data/baseline_results/rouge_upper_bound/"
# Dataset names to process; each one is interpolated into the input and
# output file names.
FILES = ['train', 'dev']

# Create the result directory up front; exist_ok avoids an error on re-runs.
os.makedirs(RESULT_PATH, exist_ok=True)

In [None]:
# Shared scorer instance, created with the library defaults. compute_upper_bound
# reads the 'rouge-2' and 'rouge-l' F-scores from its get_scores() results.
rouge = Rouge()

In [None]:
def compute_upper_bound(entries, dataset_name, verbose=True):
    """
    Compute the best possible extractive answer for each entry using ROUGE.

    The context of every entry is split into sentences with NLTK, each
    sentence is scored against the entry's answer, and the sentence with the
    highest ROUGE-2 F-score and the one with the highest ROUGE-L F-score are
    kept. On ties the earliest sentence wins.

    Args:
        entries: iterable of dicts providing the keys "id", "title", "text"
            and "answer".
        dataset_name: label used in the progress-bar description.
        verbose: when True (the default), print the selected sentences and
            their scores for every entry. Pass False to keep the notebook
            output small on large datasets.

    Returns:
        A list with one dict per entry, holding the keys "id", "question",
        "context", "answer", "extractive_answer_rouge_2",
        "extractive_answer_rouge_l", "extractive_answer_rouge_2_f1" and
        "extractive_answer_rouge_l_f1". When no sentence could be scored the
        extractive answers are "-" and the scores are -1.
    """
    samples = []
    for entry in tqdm(entries, desc=f"Finding best extractive answers for {dataset_name}"):
        text = entry["text"]
        # startswith() is safe on an empty context, whereas text[0] raises
        # IndexError.
        if text.startswith("."):
            text = text[1:].strip()
        original_answer = entry["answer"]
        sentences = nltk.tokenize.sent_tokenize(text, language='english')

        # Sentinels reported when the context yields no scorable sentence.
        best_answer_2, rouge_2_f1 = "-", -1
        best_answer_l, rouge_l_f1 = "-", -1

        for s in sentences:
            # Skip sentences with no content besides dots and whitespace
            # ("", ".", "...", ". . ." and so on); they cannot be a useful answer.
            # NOTE(review): the rouge package appears to reject inputs that
            # are empty once split on "." - presumably why "..." was skipped
            # originally; confirm against the library.
            if not s.replace(".", "").strip():
                continue

            # NOTE(review): the answer is passed first and the candidate
            # sentence second; only the F-score is read below. Confirm this
            # argument order matches the hypothesis/reference order that
            # get_scores expects.
            raw_scores = rouge.get_scores(original_answer, s)
            if not raw_scores:
                continue

            scores = raw_scores[0]
            score_2 = scores['rouge-2']['f']
            score_l = scores['rouge-l']['f']

            # Strict comparison keeps the first sentence among equal scores.
            if score_2 > rouge_2_f1:
                best_answer_2, rouge_2_f1 = s, score_2
            if score_l > rouge_l_f1:
                best_answer_l, rouge_l_f1 = s, score_l

        if verbose:
            print(f"{original_answer} -> {best_answer_2} (R2: {rouge_2_f1}); {best_answer_l} (R-L: {rouge_l_f1})")
        samples.append({
            "id": entry["id"],
            "question": entry["title"],
            # The context is stored after the leading-dot clean-up above.
            "context": text,
            "answer": original_answer,
            "extractive_answer_rouge_2": best_answer_2,
            "extractive_answer_rouge_l": best_answer_l,
            "extractive_answer_rouge_2_f1": rouge_2_f1,
            "extractive_answer_rouge_l_f1": rouge_l_f1,
        })
    return samples

In [None]:
# For every dataset: load the entries, compute the ROUGE upper bound and write
# two result files. Files are opened explicitly as UTF-8 because the JSON is
# dumped with ensure_ascii=False, so non-ASCII characters are written verbatim
# and must not depend on the platform's default encoding.
for f in FILES:
    with open(f"{DATA_PATH}final_{f}.json", "r", encoding="utf-8") as entry_file:
        samples_with_upper_bound = compute_upper_bound(json.load(entry_file), f)

    # Full record per entry: context, answer, both extractive answers and scores.
    with open(f"{RESULT_PATH}{f}_upperbound.json", "w", encoding="utf-8") as result_file:
        json.dump(samples_with_upper_bound, result_file, indent=2, ensure_ascii=False)

    # Compact id/answer file that uses the ROUGE-2 best sentence as the answer.
    with open(f"{RESULT_PATH}{f}.json", "w", encoding="utf-8") as result_file:
        json.dump(
            [{"id": s["id"], "answer": s["extractive_answer_rouge_2"]} for s in samples_with_upper_bound],
            result_file,
            indent=2,
            ensure_ascii=False,
        )