# Imports

In [1]:
import nltk
nltk.download('punkt')
import pandas as pd
import torch

from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

from answer_extraction import NER_Extractor
from controller import Controller
from distraction_generation import DistractionFinder
from qa_evaluation import QA_Evaluator
from question_generation.question_gen_en import QuestionGenerator
from translation import Translator

[nltk_data] Downloading package punkt to C:\Users\Will
[nltk_data]     Blanton\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


# Constants

In [2]:
# Relative paths to the English and Spanish XQuAD JSON files read by the notebook
ENGLISH_JSON = "data/xquad.en.json"
SPANISH_JSON = "data/xquad.es.json"

# Helper Functions

In [3]:
def load_qa_data(json_file):
    """
    Read a QA dataset JSON file and return the entries of its "data" field as a list.

    The file is parsed with `pd.read_json`; each element of the resulting
    "data" column becomes one item of the returned list.
    """
    frame = pd.read_json(json_file)
    return list(frame["data"])

In [4]:
def bleu_comparison(original: str, translated: str):
    """
    Score a translated text against its reference text with sentence-level BLEU.

    Both strings are lower-cased and word-tokenized; `original` is used as the
    single reference and `translated` as the hypothesis. Scores below 0.0001
    are returned as 0.
    """
    reference = word_tokenize(original.lower())
    hypothesis = word_tokenize(translated.lower())

    score = sentence_bleu([reference], hypothesis)

    # anything that does not reach the threshold is reported as exactly 0
    if score >= .0001:
        return score
    return 0

In [5]:
def score_qa_pair(controller, english: list, spanish: list, display: bool = False, max_articles=1):
    """
    Translate Spanish paragraphs to English, generate QA pairs from them, and
    collect the results in a DataFrame.

    For every paragraph of the selected English articles, the Spanish context at
    the same article/paragraph index is passed to `controller.gen_qa_pairs`,
    which returns the translated context and a sequence of
    (question, answer, distractions) triples. The translated context is scored
    against the English context with `bleu_comparison`, and each generated
    question is passed to `QA_Evaluator.answer_question` together with the
    English context.

    Parameters
    ----------
    controller
        Object exposing `gen_qa_pairs(spanish_context)`.
    english, spanish
        Parallel sequences of article dicts; each article has a "paragraphs"
        list whose items carry a "context" string.
    display
        When True, print the contexts, BLEU score and generated QA pairs, and
        stop after the first paragraph of each article.
    max_articles
        Number of leading articles to process (default 1, the previous
        hard-coded behaviour). Pass None to process every article.

    Returns
    -------
    pandas.DataFrame
        One row per generated question.
    """

    results = []

    evaluator = QA_Evaluator()

    # iterate through the articles and paragraphs
    for i, article in enumerate(english[:max_articles]):
        for j, paragraph in enumerate(article["paragraphs"]):
            # display mode is a preview: only the first paragraph is shown
            if display and j >= 1:
                break

            # get the text to translate
            spanish_context = spanish[i]["paragraphs"][j]["context"]

            translated_context, qa_pairs = controller.gen_qa_pairs(spanish_context)

            # get the correct translation
            target_context = paragraph["context"]

            bleu_score = bleu_comparison(target_context, translated_context)

            if display:
                print("--------------------------------------------------\n")
                print(f"Spanish Context: {spanish_context}\n")
                print(f"English Context: {target_context}\n")
                print(f"Translated Context: {translated_context}\n")
                print(f"BLEU Score: {bleu_score}\n")
                print("--------------------------------------------------\n")

            for gen_q, extr_a, extr_dist in qa_pairs:

                # NOTE(review): this call previously passed an undefined name
                # (`answer`), which raised NameError. The extracted answer is the
                # only answer in scope -- confirm against
                # QA_Evaluator.answer_question's signature.
                gen_a = evaluator.answer_question(gen_q, target_context, extr_a)

                results.append({
                    "article_index": i,
                    "paragraph_index": j,
                    "spanish_context": spanish_context,
                    "target_context": target_context,
                    "translated_context": translated_context,
                    "bleu_score": bleu_score,
                    "generated_question": gen_q,
                    "extracted_answer": extr_a,
                    "distractions": extr_dist,
                    # keep the evaluator's output so the call is not wasted
                    "evaluator_answer": gen_a
                })

                if display:
                    print("--------------------------------------------------\n")
                    print(f"Generated-Q: {gen_q}\n")
                    print(f"Extracted-A: {extr_a}\n")
                    print(f"Distractions: {extr_dist}\n")
                    print("--------------------------------------------------\n")

    return pd.DataFrame(results)

# Initialize Models

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assemble the pipeline; only the Translator is given the device explicitly,
# the other components are built with their defaults
controller = Controller(
    Translator(device=device),
    QuestionGenerator(),
    NER_Extractor(),
    DistractionFinder()
)

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can s

# Load in Data

In [7]:
# Load both language versions as lists of article entries; score_qa_pair
# pairs them up by article and paragraph index
english_qa = load_qa_data(ENGLISH_JSON)
spanish_qa = load_qa_data(SPANISH_JSON)

# Evaluate Language-Learning QA-Pair System

In [None]:
# display=True prints each stage and stops after the first paragraph of each
# processed article; the returned DataFrame is shown as the cell output
score_qa_pair(controller, english_qa, spanish_qa, display=True)