# Proyecto - SQuAD

* Benjamín Farías
* Juan Hernández
* Benjamín Lepe

In [2]:
import torch
import numpy as np
import collections
from pprint import pprint
from torch.utils.data import random_split
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset

# Use GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Reproducibility
SEED = 1
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
squad_dataset = load_dataset('squad_v2')

Downloading: 5.26kB [00:00, 5.28MB/s]                   
Downloading: 2.40kB [00:00, 1.81MB/s]                   


Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to C:\Users\benja\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\de2e67b822b2ef3f4b137148d0758f48075e3892c359c50271ef6c9add7e794a...


Downloading: 42.1MB [00:04, 9.81MB/s]
Downloading: 4.37MB [00:00, 69.4MB/s]                  
                                           

Dataset squad_v2 downloaded and prepared to C:\Users\benja\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\de2e67b822b2ef3f4b137148d0758f48075e3892c359c50271ef6c9add7e794a. Subsequent calls will reuse this data.




In [9]:
# Split into training/validation
train_set, val_set = random_split(squad_dataset['train'], [117287, 13032])
test_set = squad_dataset['validation']

117287
13032
11873


In [3]:
# Generate some maps to help us identify examples of interest
qid_to_example_index = {example.qas_id: i for i, example in enumerate(test_examples)}
qid_to_has_answer = {example.qas_id: bool(example.answers) for example in test_examples}
answer_qids = [qas_id for qas_id, has_answer in qid_to_has_answer.items() if has_answer]
no_answer_qids = [qas_id for qas_id, has_answer in qid_to_has_answer.items() if not has_answer]

In [4]:
# Display information for specific example
def display_example(qid, examples):
    idx = qid_to_example_index[qid]
    q = examples[idx].question_text
    c = examples[idx].context_text
    a = [answer['text'] for answer in examples[idx].answers]
    print(f'Example {idx} of {len(examples)}\n---------------------')
    print(f'Q: {q}\n')
    print('Context:')
    pprint(c)
    print(f'\nTrue Answers:\n{a}')

In [5]:
# Show example with true answers
display_example(answer_qids[1300], test_examples)

Example 2548 of 11873
---------------------
Q: Where on Earth is free oxygen found?

Context:
("Free oxygen also occurs in solution in the world's water bodies. The "
 'increased solubility of O\n'
 '2 at lower temperatures (see Physical properties) has important implications '
 'for ocean life, as polar oceans support a much higher density of life due to '
 'their higher oxygen content. Water polluted with plant nutrients such as '
 'nitrates or phosphates may stimulate growth of algae by a process called '
 'eutrophication and the decay of these organisms and other biomaterials may '
 'reduce amounts of O\n'
 '2 in eutrophic water bodies. Scientists assess this aspect of water quality '
 "by measuring the water's biochemical oxygen demand, or the amount of O\n"
 '2 needed to restore it to a normal concentration.')

True Answers:
['water', "in solution in the world's water bodies", "the world's water bodies"]


In [6]:
# Show example with no answer
display_example(no_answer_qids[1254], test_examples)

Example 2564 of 11873
---------------------
Q: What happened 3.7-2 billion years ago?

Context:
("Free oxygen gas was almost nonexistent in Earth's atmosphere before "
 'photosynthetic archaea and bacteria evolved, probably about 3.5 billion '
 'years ago. Free oxygen first appeared in significant quantities during the '
 'Paleoproterozoic eon (between 3.0 and 2.3 billion years ago). For the first '
 'billion years, any free oxygen produced by these organisms combined with '
 'dissolved iron in the oceans to form banded iron formations. When such '
 'oxygen sinks became saturated, free oxygen began to outgas from the oceans '
 '3–2.7 billion years ago, reaching 10% of its present level around 1.7 '
 'billion years ago.')

True Answers:
[]


In [None]:
# Normalize text
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

# Exact match evaluation metric
def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

# F1 score evaluation metric
def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # If either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # If there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

# Retrieve possible answers
def get_gold_answers(example):
    """helper function that retrieves all possible true answers from a squad2.0 example"""

    gold_answers = [answer['text'] for answer in example.answers if answer['text']]

    # If gold_answers doesn't exist it's because this is a negative example -
    # the only correct answer is an empty string
    if not gold_answers:
        gold_answers = ['']

    return gold_answers

In [None]:
# Obtain prediction for a specific question & context
def get_prediction(model, example, tokenizer, nbest=10, null_threshold=1.0):
    inputs = get_qa_inputs(example, tokenizer).to(device)
    tokens = to_list(inputs['input_ids'])[0]
    with torch.no_grad():
        start_logits, end_logits = model(**inputs).values()  # Forward pass

    # Get sensible preliminary predictions, sorted by score
    prelim_preds = preliminary_predictions(start_logits, end_logits, inputs['input_ids'], nbest, tokenizer.sep_token_id)

    # Narrow that down to the top nbest predictions
    nbest_preds = best_predictions(prelim_preds, nbest, tokenizer, tokens, to_list(start_logits)[0], to_list(end_logits)[0])

    # Compute the probability of each prediction
    probabilities = prediction_probabilities(nbest_preds)

    # Compute score difference
    score_difference = compute_score_difference(nbest_preds)

    # If score difference > threshold, return the null answer (for questions with no answer)
    if score_difference > null_threshold:
        return '', probabilities[-1]
    else:
        return nbest_preds[0].text, probabilities[0]

# ----------------- Helper functions for get_prediction ----------------- #

# Load the example, convert to inputs, get tokenized info
def get_qa_inputs(example, tokenizer):
    question = example.question_text
    context = example.context_text
    return tokenizer.encode_plus(question, context, return_tensors='pt', truncation=True, max_length=512)

# Clean raw text
def get_clean_text(tokens, tokenizer):
    text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(tokens))
    text = text.strip()
    text = ' '.join(text.split())
    return text

# Calculate probabilities for each prediction
def prediction_probabilities(predictions):
    def softmax(x):
        """Compute softmax values for each sets of scores in x."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    all_scores = [pred.start_logit + pred.end_logit for pred in predictions]
    return softmax(np.array(all_scores))

# Convert tensor to list
def to_list(tensor):
    return tensor.detach().cpu().tolist()

# Get preliminary predictions
def preliminary_predictions(start_logits, end_logits, input_ids, nbest, sep_token_id):
    # Convert tensors to lists
    start_logits = to_list(start_logits)[0]
    end_logits = to_list(end_logits)[0]
    tokens = to_list(input_ids)[0]

    # Sort our start and end logits from largest to smallest, keeping track of the index
    start_idx_and_logit = sorted(enumerate(start_logits), key=lambda x: x[1], reverse=True)
    end_idx_and_logit = sorted(enumerate(end_logits), key=lambda x: x[1], reverse=True)
    start_indexes = [idx for idx, logit in start_idx_and_logit[:nbest]]
    end_indexes = [idx for idx, logit in end_idx_and_logit[:nbest]]

    # Question tokens are between the CLS token (101, at position 0) and first SEP (102) token
    question_indexes = [i + 1 for i, token in enumerate(tokens[1 : tokens.index(sep_token_id)])]

    # Keep track of all preliminary predictions
    PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        'PrelimPrediction', ['start_index', 'end_index', 'start_logit', 'end_logit']
    )
    prelim_preds = []
    for start_index in start_indexes:
        for end_index in end_indexes:
            # Throw out invalid predictions
            if start_index in question_indexes:
                continue
            if end_index in question_indexes:
                continue
            if end_index < start_index:
                continue
            prelim_preds.append(
                PrelimPrediction(
                    start_index=start_index,
                    end_index=end_index,
                    start_logit=start_logits[start_index],
                    end_logit=end_logits[end_index]
                )
            )
    # Sort prelim_preds in descending score order
    prelim_preds = sorted(prelim_preds, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
    return prelim_preds

# Filter the nbest predictions
def best_predictions(prelim_preds, nbest, tokenizer, tokens, start_logits, end_logits):
    # This will be the pool from which answer probabilities are computed
    BestPrediction = collections.namedtuple(
        'BestPrediction', ['text', 'start_logit', 'end_logit']
    )
    nbest_predictions = []
    seen_predictions = []
    for pred in prelim_preds:
        if len(nbest_predictions) >= nbest:
            break
        if pred.start_index > 0: # Non-null answers
            toks = tokens[pred.start_index : pred.end_index + 1]
            text = get_clean_text(toks, tokenizer)

            # If this text has been seen already - skip it
            if text in seen_predictions:
                continue

            # Flag text as being seen
            seen_predictions.append(text)

            # Add this text to a pruned list of the top nbest predictions
            nbest_predictions.append(
                BestPrediction(
                    text=text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit
                )
            )

    # Add the null prediction
    nbest_predictions.append(
        BestPrediction(
            text='',
            start_logit=start_logits[0],
            end_logit=end_logits[0]
        )
    )
    return nbest_predictions

# Calculate score to check if answer should be null
def compute_score_difference(predictions):
    """ Assumes that the null answer is always the last prediction """
    score_null = predictions[-1].start_logit + predictions[-1].end_logit
    score_non_null = predictions[0].start_logit + predictions[0].end_logit
    return score_null - score_non_null

In [None]:
# Evaluate a single example
def evaluate(model, example, tokenizer, nbest=10, null_threshold=-3.767639636993408):
    model.to(device)
    model.eval()
    prediction = get_prediction(model, example, tokenizer, nbest=nbest, null_threshold=null_threshold)
    gold_answers = get_gold_answers(example)
    em_score = max((compute_exact_match(prediction[0], answer)) for answer in gold_answers)
    f1_score = max((compute_f1(prediction[0], answer)) for answer in gold_answers)
    print(f'Context: {example.context_text}\n')
    print(f'Question: {example.question_text}')
    print(f'Prediction: {prediction[0] if prediction[0] else "NO ANSWER"}')
    print(f'True Answers: {gold_answers}')
    print(f'EM: {em_score} \t F1: {f1_score}')

# Evaluate on the SQuAD dev set
def run_testing(model, examples, tokenizer, nbest=10, null_threshold=-3.767639636993408):
    model.to(device)
    model.eval()
    em_score_total = 0
    f1_score_total = 0
    for example in examples:
        prediction = get_prediction(model, example, tokenizer, nbest=nbest, null_threshold=null_threshold)
        gold_answers = get_gold_answers(example)
        em_score_total += max((compute_exact_match(prediction[0], answer)) for answer in gold_answers)
        f1_score_total += max((compute_f1(prediction[0], answer)) for answer in gold_answers)
    em_score_avg = round(100 * (em_score_total / len(examples)), 2)
    f1_score_avg = round(100 * (f1_score_total / len(examples)), 2)
    print(f'Avg EM: {em_score_avg}% \t Avg F1: {f1_score_avg}%')

In [10]:
# Load pre-trained model
def get_model(name):
    name_map = {
        'bert': 'deepset/bert-base-cased-squad2',
        'bert_wwm': 'deepset/bert-large-uncased-whole-word-masking-squad2',
        'roberta': 'deepset/roberta-base-squad2',
        'sapbert': 'bigwiz83/sapbert-from-pubmedbert-squad2'
    }
    tokenizer = AutoTokenizer.from_pretrained(name_map[name])
    model = AutoModelForQuestionAnswering.from_pretrained(name_map[name])
    return model, tokenizer

In [13]:
# Tokenize sets for training & evaluation
def tokenize_set(tokenizer):
    return

# Train on the SQuAD training set
def run_training(model, tokenizer, train_set, eval_set, args):
    model.to(device)
    trainer = Trainer(
        model,
        args,
        train_dataset=train_set,
        eval_dataset=eval_set,
        tokenizer=tokenizer
    )
    trainer.train()
    #trainer.save_model('squad')

In [15]:
training_args = TrainingArguments(
    'checkpoints',
    evaluation_strategy='epoch',
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2
)

In [11]:
# Models: bert, bert_wwm, roberta, sapbert
model, tokenizer = get_model('bert')

In [16]:
run_training(model, tokenizer, train_set, val_set, training_args)

***** Running training *****
  Num examples = 117287
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 29322
  0%|          | 0/29322 [00:00<?, ?it/s]

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['id', 'title', 'context', 'question', 'answers']

In [41]:
# Positive example evaluation
example = test_examples[qid_to_example_index[answer_qids[1300]]]
evaluate(model, example, tokenizer)

Context: Free oxygen also occurs in solution in the world's water bodies. The increased solubility of O
2 at lower temperatures (see Physical properties) has important implications for ocean life, as polar oceans support a much higher density of life due to their higher oxygen content. Water polluted with plant nutrients such as nitrates or phosphates may stimulate growth of algae by a process called eutrophication and the decay of these organisms and other biomaterials may reduce amounts of O
2 in eutrophic water bodies. Scientists assess this aspect of water quality by measuring the water's biochemical oxygen demand, or the amount of O
2 needed to restore it to a normal concentration.

Question: Where on Earth is free oxygen found?
Prediction: water bodies
True Answers: ['water', "in solution in the world's water bodies", "the world's water bodies"]
EM: 0 	 F1: 0.8


In [42]:
# Negative example evaluation
example = test_examples[qid_to_example_index[no_answer_qids[1254]]]
evaluate(model, example, tokenizer)

Context: Free oxygen gas was almost nonexistent in Earth's atmosphere before photosynthetic archaea and bacteria evolved, probably about 3.5 billion years ago. Free oxygen first appeared in significant quantities during the Paleoproterozoic eon (between 3.0 and 2.3 billion years ago). For the first billion years, any free oxygen produced by these organisms combined with dissolved iron in the oceans to form banded iron formations. When such oxygen sinks became saturated, free oxygen began to outgas from the oceans 3–2.7 billion years ago, reaching 10% of its present level around 1.7 billion years ago.

Question: What happened 3.7-2 billion years ago?
Prediction: NO ANSWER
True Answers: ['']
EM: 1 	 F1: 1


In [43]:
# Run testing over SQuAD dev set
run_testing(model, test_examples, tokenizer)

Avg EM: 74.34% 	 Avg F1: 78.34%
