# Proyecto - SQuAD

* Benjamín Farías
* Juan Hernández
* Benjamín Lepe

In [1]:
import torch
import numpy as np
import collections
from random import random
from tqdm import tqdm
from pprint import pprint
from torch import nn
from torch.utils.data import random_split, Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from datasets import load_dataset

# Use GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Reproducibility
SEED = 1999
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [2]:
# Load SQuAD 2.0 dataset
squad_dataset = load_dataset('squad_v2')

# Split into training/validation (from the training set)
train_set, val_set = random_split(squad_dataset['train'], [117287, 13032])
print(f'Training Set: {len(train_set)} examples')
print(f'Validation Set: {len(val_set)} examples')

# Testing set (in this case we use the dev set)
test_set = squad_dataset['validation']
print(f'Testing Set: {len(test_set)} examples')

Reusing dataset squad_v2 (C:\Users\benja\.cache\huggingface\datasets\squad_v2\squad_v2\2.0.0\de2e67b822b2ef3f4b137148d0758f48075e3892c359c50271ef6c9add7e794a)


Training Set: 117287 examples
Validation Set: 13032 examples
Testing Set: 11873 examples


In [3]:
# Display information for specific example
def display_example(example):
    q = example['question']
    c = example['context']
    a = example['answers']['text']
    print(f'Q: {q}\n')
    print('Context:')
    pprint(c)
    print(f'\nTrue Answers:\n{a}')

In [4]:
# Show example from evaluation set
display_example(test_set[0])

Q: In what country is Normandy located?

Context:
('The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the '
 'people who in the 10th and 11th centuries gave their name to Normandy, a '
 'region in France. They were descended from Norse ("Norman" comes from '
 '"Norseman") raiders and pirates from Denmark, Iceland and Norway who, under '
 'their leader Rollo, agreed to swear fealty to King Charles III of West '
 'Francia. Through generations of assimilation and mixing with the native '
 'Frankish and Roman-Gaulish populations, their descendants would gradually '
 'merge with the Carolingian-based cultures of West Francia. The distinct '
 'cultural and ethnic identity of the Normans emerged initially in the first '
 'half of the 10th century, and it continued to evolve over the succeeding '
 'centuries.')

True Answers:
['France', 'France', 'France', 'France']


In [5]:
# Extract info from dataset
def get_info(dataset):
    contexts = []
    questions = []
    answers = []
    for example in dataset:
        question = example['question']
        context = example['context']
        answer = {'text': '', 'answer_start': 0}
        if not example['answers']['text']:
            contexts.append(context)
            questions.append(question)
            answers.append({'text': '', 'answer_start': 0})
        for ans_idx in range(len(example['answers']['text'])):
            contexts.append(context)
            questions.append(question)
            answer = {'text': example['answers']['text'][ans_idx], 'answer_start': example['answers']['answer_start'][ans_idx]}
            answers.append(answer)
    return contexts, questions, answers

train_contexts, train_questions, train_answers = get_info(train_set)
val_contexts, val_questions, val_answers = get_info(val_set)

In [6]:
# Add index where each answer ends
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Sometimes squad answers are off by a character or two – fix this
        if context[start_idx : end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx - 1 : end_idx - 1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx - 2 : end_idx - 2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [7]:
# Add token positions to encodings
def add_token_positions(encodings, answers, tokenizer):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        if answers[i]['answer_end']:
            start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        else:
            start_positions.append(0)
            end_positions.append(0)

        # If start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Tokenize the data
def tokenize_data(contexts, questions, answers, tokenizer):
    encodings = tokenizer(contexts, questions, truncation=True, padding='max_length', max_length=384)
    add_token_positions(encodings, answers, tokenizer)
    return encodings

In [8]:
# SQuAD dataset features
class SquadDataset(Dataset):
    def __init__(self, contexts, questions, answers, tokenizer):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        encoding = tokenize_data([self.contexts[idx]], [self.questions[idx]], [self.answers[idx]], self.tokenizer)
        return {key: torch.tensor(val[0]) for key, val in encoding.items()}

    def __len__(self):
        return len(self.contexts)

# Tokenize datasets
def prepare_features(tokenizer):
    train_dataset = SquadDataset(train_contexts, train_questions, train_answers, tokenizer)
    val_dataset = SquadDataset(val_contexts, val_questions, val_answers, tokenizer)
    return train_dataset, val_dataset

In [9]:
# Dict collate
def dict_collate(batch):
    group_dict = {key: [] for key in batch[0].keys()}
    for item in batch:
        for key, val in item.items():
            if not val.dim():
                group_dict[key].append(val.unsqueeze(0))
            else:
                group_dict[key].append(val)
    return {key: torch.stack(val) for key, val in group_dict.items()}

# Run training
def run_training(model, train_set, val_set, args):
    train_loader = DataLoader(train_set, batch_size=args['batch_size'], shuffle=True, collate_fn=dict_collate)
    val_loader = DataLoader(val_set, batch_size=args['batch_size'], shuffle=True, collate_fn=dict_collate)
    optim = AdamW(model.parameters(), lr=args['lr'])
    history = {
        'training': {'loss': []},
        'validation': {'loss': []}
    }
    # Train for n_epochs
    best_loss = float('inf')
    for epoch in range(1, args['n_epochs'] + 1):
        train_epoch_loss = run_epoch('train', model, train_loader, optimizer=optim, epoch=epoch, total_epoch=args['n_epochs'])
        val_epoch_loss = run_epoch('val', model, val_loader, optimizer=optim, epoch=epoch, total_epoch=args['n_epochs'])

        # Save loss/accuracy values for each epoch
        history['training']['loss'].append(train_epoch_loss)
        history['validation']['loss'].append(val_epoch_loss)

        # Save model state if needed
        if val_epoch_loss < best_loss:
            best_loss = val_epoch_loss
            torch.save(model.state_dict(), 'squad.pt')
    return history

# Run a single epoch
def run_epoch(phase, model, loader, optimizer=None, epoch=0, total_epoch=0):
    if phase == 'train':
        model.train()
    elif phase == 'val':
        model.eval()
    agg_loss = 0.0
    with tqdm(loader, unit='batch', position=0, leave=True) as tepoch:
        for n_batch, batch in enumerate(tepoch, start=1):
            if phase == 'train': # Clean gradients on training
                optimizer.zero_grad()
                tepoch.set_description(f'Epoch {epoch}/{total_epoch}')
            elif phase == 'val':
                tepoch.set_description('Validating')

            # Forward pass
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            if phase == 'val':
                with torch.no_grad():
                    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
            else:
                outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)

            print(outputs)
            loss = outputs[0]
            agg_loss += loss.item()

            # Update params
            if phase == 'train':
                loss.backward() # Backpropagation only while training
                optimizer.step() # Update weights only while training
            current_agg_loss = agg_loss / n_batch
            tepoch.set_postfix(Loss=current_agg_loss)
    epoch_loss = float(agg_loss / n_batch)
    return epoch_loss

In [10]:
# Load pre-trained model
def get_model(name):
    name_map = {
        'bert': ['deepset/bert-base-cased-squad2', 'deepset/bert-base-cased-squad2'],
        'roberta': ['deepset/roberta-base-squad2', 'deepset/roberta-base-squad2'],
        'albert': ['twmkn9/albert-base-v2-squad2', 'albert-base-v2'],
    }
    model = AutoModelForQuestionAnswering.from_pretrained(name_map[name][0])
    tokenizer = AutoTokenizer.from_pretrained(name_map[name][1])
    model.to(device)
    return model, tokenizer

In [11]:
# ----------------- Metric functions for evaluation ----------------- #

# Normalize text
def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

# Exact match evaluation metric
def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

# F1 score evaluation metric
def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # If either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # If there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

# Retrieve possible answers
def get_gold_answers(example):
    """helper function that retrieves all possible true answers from a squad2.0 example"""

    gold_answers = [answer for answer in example['answers']['text'] if example['answers']['text']]

    # If gold_answers doesn't exist it's because this is a negative example -
    # the only correct answer is an empty string
    if not gold_answers:
        gold_answers = ['']

    return gold_answers

In [38]:
# Obtain prediction for a specific question & context
def get_prediction(model, example, tokenizer, nbest=10, null_threshold=1.0):
    inputs = get_qa_inputs(example, tokenizer).to(device)
    tokens = to_list(inputs['input_ids'])[0]
    with torch.no_grad():
        start_logits, end_logits = model(**inputs).values()  # Forward pass

    # Get sensible preliminary predictions, sorted by score
    prelim_preds = preliminary_predictions(start_logits, end_logits, inputs['input_ids'], nbest, tokenizer.sep_token_id)

    # Narrow that down to the top nbest predictions
    nbest_preds = best_predictions(prelim_preds, nbest, tokenizer, tokens, to_list(start_logits)[0], to_list(end_logits)[0])

    # Compute the probability of each prediction
    probabilities = prediction_probabilities(nbest_preds)

    # Compute score difference
    score_difference = compute_score_difference(nbest_preds)

    # If score difference > threshold, return the null answer (for questions with no answer)
    if score_difference > null_threshold:
        return '', probabilities[-1]
    else:
        return nbest_preds[0].text, probabilities[0]

# ----------------- Helper functions for get_prediction ----------------- #

# Load the example, convert to inputs, get tokenized info
def get_qa_inputs(example, tokenizer):
    question = example['question']
    context = example['context']
    return tokenizer.encode_plus(question, context, return_tensors='pt', truncation=True, max_length=384)

# Clean raw text
def get_clean_text(tokens, tokenizer):
    text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(tokens))
    text = text.strip()
    text = ' '.join(text.split())
    return text

# Calculate probabilities for each prediction
def prediction_probabilities(predictions):
    def softmax(x):
        """Compute softmax values for each sets of scores in x."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    all_scores = [pred.start_logit + pred.end_logit for pred in predictions]
    return softmax(np.array(all_scores))

# Convert tensor to list
def to_list(tensor):
    return tensor.detach().cpu().tolist()

# Get preliminary predictions
def preliminary_predictions(start_logits, end_logits, input_ids, nbest, sep_token_id):
    # Convert tensors to lists
    start_logits = to_list(start_logits)[0]
    end_logits = to_list(end_logits)[0]
    tokens = to_list(input_ids)[0]

    # Sort our start and end logits from largest to smallest, keeping track of the index
    start_idx_and_logit = sorted(enumerate(start_logits), key=lambda x: x[1], reverse=True)
    end_idx_and_logit = sorted(enumerate(end_logits), key=lambda x: x[1], reverse=True)
    start_indexes = [idx for idx, logit in start_idx_and_logit[:nbest]]
    end_indexes = [idx for idx, logit in end_idx_and_logit[:nbest]]

    # Question tokens are between the CLS token (101, at position 0) and first SEP (102) token
    question_indexes = [i + 1 for i, token in enumerate(tokens[1 : tokens.index(sep_token_id)])]

    # Keep track of all preliminary predictions
    PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        'PrelimPrediction', ['start_index', 'end_index', 'start_logit', 'end_logit']
    )
    prelim_preds = []
    for start_index in start_indexes:
        for end_index in end_indexes:
            # Throw out invalid predictions
            if start_index in question_indexes:
                continue
            if end_index in question_indexes:
                continue
            if end_index < start_index:
                continue
            prelim_preds.append(
                PrelimPrediction(
                    start_index=start_index,
                    end_index=end_index,
                    start_logit=start_logits[start_index],
                    end_logit=end_logits[end_index]
                )
            )
    # Sort prelim_preds in descending score order
    prelim_preds = sorted(prelim_preds, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
    return prelim_preds

# Filter the nbest predictions
def best_predictions(prelim_preds, nbest, tokenizer, tokens, start_logits, end_logits):
    # This will be the pool from which answer probabilities are computed
    BestPrediction = collections.namedtuple(
        'BestPrediction', ['text', 'start_logit', 'end_logit']
    )
    nbest_predictions = []
    seen_predictions = []
    for pred in prelim_preds:
        if len(nbest_predictions) >= nbest:
            break
        if pred.start_index > 0: # Non-null answers
            toks = tokens[pred.start_index : pred.end_index + 1]
            text = get_clean_text(toks, tokenizer)

            # If this text has been seen already - skip it
            if text in seen_predictions:
                continue

            # Flag text as being seen
            seen_predictions.append(text)

            # Add this text to a pruned list of the top nbest predictions
            nbest_predictions.append(
                BestPrediction(
                    text=text,
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit
                )
            )

    # Add the null prediction
    nbest_predictions.append(
        BestPrediction(
            text='',
            start_logit=start_logits[0],
            end_logit=end_logits[0]
        )
    )
    return nbest_predictions

# Calculate score to check if answer should be null
def compute_score_difference(predictions):
    """ Assumes that the null answer is always the last prediction """
    score_null = predictions[-1].start_logit + predictions[-1].end_logit
    score_non_null = predictions[0].start_logit + predictions[0].end_logit
    return score_null - score_non_null

In [13]:
# ----------------- Evaluation ----------------- #

# Evaluate a single example
def evaluate(model, example, tokenizer, nbest=10, null_threshold=-3.767639636993408):
    model.eval()
    prediction = get_prediction(model, example, tokenizer, nbest=nbest, null_threshold=null_threshold)
    gold_answers = get_gold_answers(example)
    em_score = max((compute_exact_match(prediction[0], answer)) for answer in gold_answers)
    f1_score = max((compute_f1(prediction[0], answer)) for answer in gold_answers)
    print(f'Context: {example["context"]}\n')
    print(f'Question: {example["question"]}')
    print(f'Prediction: {prediction[0] if prediction[0] else "NO ANSWER"}')
    print(f'True Answers: {gold_answers}')
    print(f'EM: {em_score} \t F1: {f1_score}')

# Evaluate on the SQuAD dev set
def run_testing(model, examples, tokenizer, nbest=10, null_threshold=-3.767639636993408):
    model.eval()
    em_score_total = 0
    f1_score_total = 0
    for example in examples:
        prediction = get_prediction(model, example, tokenizer, nbest=nbest, null_threshold=null_threshold)
        gold_answers = get_gold_answers(example)
        em_score_total += max((compute_exact_match(prediction[0], answer)) for answer in gold_answers)
        f1_score_total += max((compute_f1(prediction[0], answer)) for answer in gold_answers)
    em_score_avg = round(100 * (em_score_total / len(examples)), 2)
    f1_score_avg = round(100 * (f1_score_total / len(examples)), 2)
    print(f'Avg EM: {em_score_avg}% \t Avg F1: {f1_score_avg}%')

In [14]:
# RoBERTa model
roberta_model, roberta_tokenizer = get_model('roberta')

# ALBERT model
albert_model, albert_tokenizer = get_model('albert')

In [15]:
# Example evaluation
example = test_set[0]
evaluate(roberta_model, example, roberta_tokenizer)
evaluate(albert_model, example, albert_tokenizer)

Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.

Question: In what country is Normandy located?
Prediction: France
True Answers: ['France', 'France', 'France', 'France']
EM: 1 	 F1: 1.0
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th an

In [16]:
# Run testing with both models over SQuAD dev set
#run_testing(roberta_model, test_set, roberta_tokenizer)
#run_testing(albert_model, test_set, albert_tokenizer)

In [39]:
# Alberta ensemble model
class AlbertaEnsemble(nn.Module):
    def __init__(self, roberta_model, albert_model):
        super().__init__()

        # RoBERTa model
        self.roberta = roberta_model

        # ALBERT model
        self.albert = albert_model

        # 1) En base a los input_ids y attention_mask elegir uno especializado
        # 2) Pasar por los 2 modelos y decidir cuál usar con el cls (cls recibe outputs? o probabilidades?)

    def forward(
        self,
        roberta_input_ids=None,
        roberta_attention_mask=None,
        roberta_start_positions=None,
        roberta_end_positions=None,
        albert_input_ids=None,
        albert_attention_mask=None,
        albert_token_type_ids=None,
        albert_start_positions=None,
        albert_end_positions=None,
    ):
        with torch.no_grad():
            roberta_outputs = self.roberta(
                input_ids=roberta_input_ids, attention_mask=roberta_attention_mask,
                start_positions=roberta_start_positions, end_positions=roberta_end_positions
            )

            albert_outputs = self.albert(
                input_ids=albert_input_ids, attention_mask=albert_attention_mask, token_type_ids=albert_token_type_ids,
                start_positions=albert_start_positions, end_positions=albert_end_positions
            )

        chosen = random()
        if chosen > 0.5:
            return roberta_outputs, 'roberta'
        return albert_outputs, 'albert'

In [18]:
# Run training for ensemble
def run_training_ensemble(model, train_set_roberta, val_set_roberta, train_set_albert, val_set_albert, args):
    # Dataloaders for Roberta
    train_loader_roberta = DataLoader(train_set_roberta, batch_size=args['batch_size'], shuffle=False, collate_fn=dict_collate)
    val_loader_roberta = DataLoader(val_set_roberta, batch_size=args['batch_size'], shuffle=False, collate_fn=dict_collate)

    # Dataloaders for Albert
    train_loader_albert = DataLoader(train_set_albert, batch_size=args['batch_size'], shuffle=False, collate_fn=dict_collate)
    val_loader_albert = DataLoader(val_set_albert, batch_size=args['batch_size'], shuffle=False, collate_fn=dict_collate)

    # Optimizer
    optim = AdamW(model.parameters(), lr=args['lr'])

    # History for epoch loss
    history = {
        'training': {'loss': []},
        'validation': {'loss': []}
    }

    # Train for n_epochs
    best_loss = float('inf')
    for epoch in range(1, args['n_epochs'] + 1):
        train_epoch_loss = run_epoch_ensemble('train', model, train_loader_roberta, train_loader_albert, optimizer=optim, epoch=epoch, total_epoch=args['n_epochs'])
        val_epoch_loss = run_epoch_ensemble('val', model, val_loader_roberta, val_loader_albert, optimizer=optim, epoch=epoch, total_epoch=args['n_epochs'])

        # Save loss/accuracy values for each epoch
        history['training']['loss'].append(train_epoch_loss)
        history['validation']['loss'].append(val_epoch_loss)

        # Save model state if needed
        if val_epoch_loss < best_loss:
            best_loss = val_epoch_loss
            torch.save(model.state_dict(), 'alberta.pt')
    return history

# Run a single epoch for ensemble
def run_epoch_ensemble(phase, model, roberta_loader, albert_loader, optimizer=None, epoch=0, total_epoch=0):
    if phase == 'train':
        model.train()
    elif phase == 'val':
        model.eval()
    agg_loss = 0.0
    with tqdm(zip(roberta_loader, albert_loader), unit='batch', position=0, leave=True, total=len(roberta_loader)) as tepoch:
        for n_batch, (roberta_batch, albert_batch) in enumerate(tepoch, start=1):
            if phase == 'train': # Clean gradients on training
                optimizer.zero_grad()
                tepoch.set_description(f'Epoch {epoch}/{total_epoch}')
            elif phase == 'val':
                tepoch.set_description('Validating')

            # Forward pass
            roberta_input_ids = roberta_batch['input_ids'].to(device)
            roberta_attention_mask = roberta_batch['attention_mask'].to(device)
            roberta_start_pos = roberta_batch['start_positions'].to(device)
            roberta_end_pos = roberta_batch['end_positions'].to(device)
            albert_input_ids = albert_batch['input_ids'].to(device)
            albert_attention_mask = albert_batch['attention_mask'].to(device)
            albert_tokens = albert_batch['attention_mask'].to(device)
            albert_start_pos = albert_batch['token_type_ids'].to(device)
            albert_end_pos = albert_batch['end_positions'].to(device)
            if phase == 'val':
                with torch.no_grad():
                    outputs = model(
                        roberta_input_ids=roberta_input_ids, roberta_attention_mask=roberta_attention_mask,
                        roberta_start_positions=roberta_start_pos, roberta_end_positions=roberta_end_pos,
                        albert_input_ids=albert_input_ids, albert_attention_mask=albert_attention_mask,
                        albert_token_type_ids=albert_tokens,
                        albert_start_positions=albert_start_pos, albert_end_positions=albert_end_pos,
                    )
            else:
                outputs = model(
                    roberta_input_ids=roberta_input_ids, roberta_attention_mask=roberta_attention_mask,
                    roberta_start_positions=roberta_start_pos, roberta_end_positions=roberta_end_pos,
                    albert_input_ids=albert_input_ids, albert_attention_mask=albert_attention_mask,
                    albert_token_type_ids=albert_tokens,
                    albert_start_positions=albert_start_pos, albert_end_positions=albert_end_pos,
                )

            # Loss
            loss = outputs[0]
            agg_loss += loss.item()

            # Update params
            if phase == 'train':
                loss.backward() # Backpropagation only while training
                optimizer.step() # Update weights only while training
            current_agg_loss = agg_loss / n_batch
            tepoch.set_postfix(Loss=current_agg_loss)

            # Save temporal checkpoints
            if not (n_batch % 100):
                torch.save(model.state_dict(), 'temp.pt')

    epoch_loss = float(agg_loss / n_batch)
    return epoch_loss

In [34]:
# ----------------- Ensemble Evaluation ----------------- #

# Obtain prediction for a specific question & context (ensemble)
def get_prediction_ensemble(model, example, roberta_tokenizer, albert_tokenizer, nbest=10, null_threshold=1.0):
    roberta_inputs = get_qa_inputs(example, roberta_tokenizer).to(device)
    albert_inputs = get_qa_inputs(example, albert_tokenizer).to(device)
    input_map = {'roberta': roberta_inputs, 'albert': albert_inputs}
    tokenizer_map = {'roberta': roberta_tokenizer, 'albert': albert_tokenizer}
    with torch.no_grad():
        output, chosen = model(
                        roberta_input_ids=roberta_inputs['input_ids'], roberta_attention_mask=roberta_inputs['attention_mask'],
                        albert_input_ids=albert_inputs['input_ids'], albert_attention_mask=albert_inputs['attention_mask'],
                        albert_token_type_ids=albert_inputs['token_type_ids'],
                    )
    start_logits, end_logits = output.values()
    tokens = to_list(input_map[chosen]['input_ids'])[0]
    sep_token_id = tokenizer_map[chosen].sep_token_id

    # Get sensible preliminary predictions, sorted by score
    prelim_preds = preliminary_predictions(start_logits, end_logits, input_map[chosen]['input_ids'], nbest, sep_token_id)

    # Narrow that down to the top nbest predictions
    nbest_preds = best_predictions(prelim_preds, nbest, tokenizer_map[chosen], tokens, to_list(start_logits)[0], to_list(end_logits)[0])

    # Compute the probability of each prediction
    probabilities = prediction_probabilities(nbest_preds)

    # Compute score difference
    score_difference = compute_score_difference(nbest_preds)

    # If score difference > threshold, return the null answer (for questions with no answer)
    if score_difference > null_threshold:
        return '', probabilities[-1]
    else:
        return nbest_preds[0].text, probabilities[0]

# Evaluate a single example for ensemble
def evaluate_ensemble(model, example, roberta_tokenizer, albert_tokenizer, nbest=10, null_threshold=-3.767639636993408):
    model.eval()
    prediction = get_prediction_ensemble(model, example, roberta_tokenizer, albert_tokenizer, nbest=nbest, null_threshold=null_threshold)
    gold_answers = get_gold_answers(example)
    em_score = max((compute_exact_match(prediction[0], answer)) for answer in gold_answers)
    f1_score = max((compute_f1(prediction[0], answer)) for answer in gold_answers)
    print(f'Context: {example["context"]}\n')
    print(f'Question: {example["question"]}')
    print(f'Prediction: {prediction[0] if prediction[0] else "NO ANSWER"}')
    print(f'True Answers: {gold_answers}')
    print(f'EM: {em_score} \t F1: {f1_score}')

# Evaluate on the SQuAD dev set for ensemble
def run_testing_ensemble(model, examples, roberta_tokenizer, albert_tokenizer, nbest=10, null_threshold=-3.767639636993408):
    model.eval()
    em_score_total = 0
    f1_score_total = 0
    for example in examples:
        prediction = get_prediction_ensemble(model, example, roberta_tokenizer, albert_tokenizer, nbest=nbest, null_threshold=null_threshold)
        gold_answers = get_gold_answers(example)
        em_score_total += max((compute_exact_match(prediction[0], answer)) for answer in gold_answers)
        f1_score_total += max((compute_f1(prediction[0], answer)) for answer in gold_answers)
    em_score_avg = round(100 * (em_score_total / len(examples)), 2)
    f1_score_avg = round(100 * (f1_score_total / len(examples)), 2)
    print(f'Avg EM: {em_score_avg}% \t Avg F1: {f1_score_avg}%')


In [20]:
alberta_ensemble = AlbertaEnsemble(roberta_model, albert_model).to(device)

In [21]:
train_set_roberta, val_set_roberta = prepare_features(roberta_tokenizer)
train_set_albert, val_set_albert = prepare_features(albert_tokenizer)

In [22]:
training_args = {
    'batch_size': 2,
    'lr': 3e-5,
    'n_epochs': 1,
}

#loss_history = run_training_ensemble(alberta_ensemble, train_set_roberta, val_set_roberta, train_set_albert, val_set_albert, training_args)

In [40]:
final_alberta = AlbertaEnsemble(roberta_model, albert_model).to(device)
#final_alberta.load_state_dict(torch.load('temp.pt'))

In [42]:
# Example evaluation
example = test_set[0]
evaluate_ensemble(final_alberta, example, roberta_tokenizer, albert_tokenizer)

Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.

Question: In what country is Normandy located?
Prediction: france
True Answers: ['France', 'France', 'France', 'France']
EM: 1 	 F1: 1.0


In [43]:
run_testing_ensemble(final_alberta, test_set, roberta_tokenizer, albert_tokenizer)

Avg EM: 78.48% 	 Avg F1: 81.33%
