In [6]:
#import nlp
import torch
import datasets

# ATTENTION: re-running the command below removes the cached TriviaQA dataset completely.
#!rm -rf /.cache/

In [2]:
# https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb#scrollTo=wyDYG4YDXFV7
!ls ../data
!mkdir ../data/trivia_qa

trivia_qa  wikitext-103-raw
mkdir: cannot create directory '../data/trivia_qa': File exists


In [None]:
%%time
# Load the first 5% of the validation split of the "rc" configuration of trivia_qa,
# using /workspace/data/trivia_qa as the cache directory.
validation_dataset = datasets.load_dataset("trivia_qa", "rc", split="validation[:5%]", cache_dir="/workspace/data/trivia_qa")

In [5]:
# List the installed packages and their versions (output recorded below).
!pip freeze


absl-py==0.11.0
apex==0.1
argon2-cffi==20.1.0
asn1crypto==0.24.0
async-generator==1.10
attrs==20.3.0
backcall==0.2.0
bleach==3.2.1
cached-property==1.5.2
cachetools==4.1.1
certifi==2020.11.8
cffi==1.14.4
chardet==3.0.4
click==7.1.2
cloudpickle==1.6.0
colorama==0.4.4
contextvars==2.4
cryptography==2.1.4
cycler==0.10.0
Cython==0.29.21
dask==2.30.0
dataclasses==0.8
datasets==1.1.3
decorator==4.4.2
defusedxml==0.6.0
dill==0.3.3
distributed==2.30.1
dnspython==2.0.0
docopt==0.6.2
entrypoints==0.3
filelock==3.0.12
future==0.18.2
gitdb==4.0.5
GitPython==3.1.11
google-auth==1.23.0
google-auth-oauthlib==0.4.2
graphviz==0.15
grpcio==1.33.2
h5py==3.1.0
HeapDict==1.0.1
hiddenlayer==0.3
idna==2.6
immutables==0.14
importlib-metadata==3.1.0
intel-openmp==2020.0.133
ipykernel==5.3.4
ipython==7.16.1
ipython-genutils==0.2.0
ipywidgets==7.5.1
jedi==0.17.2
Jinja2==2.11.2
joblib==0.17.0
json5==0.9.5
jsonpickle==1.4.1
jsonschema==3.2.0
jupyter-client==6.1.7
jupyter-core==4.7.0
jupyterlab==2.2.9
jupyterlab-py

In [None]:
# define the mapping function
def format_dataset(example):
    """Flatten one TriviaQA example into the fields used for evaluation.

    Adds three entries to ``example`` (in place) and returns it:

    * ``"context"``     - all ``entity_pages["wiki_context"]`` strings merged
      into one string, with every newline replaced by a single space.
    * ``"targets"``     - ``answer["aliases"]``.
    * ``"norm_target"`` - ``answer["normalized_value"]``.
    """
    # The context may consist of several wiki pages => merge them here and
    # turn every line break (between and inside pages) into a space.
    wiki_pages = example["entity_pages"]["wiki_context"]
    example["context"] = "\n".join(wiki_pages).replace("\n", " ")

    gold = example["answer"]
    example["targets"] = gold["aliases"]
    example["norm_target"] = gold["normalized_value"]
    return example

# map the dataset and throw out all unnecessary columns
# NOTE(review): the result is assigned back to `validation_dataset`, and "entity_pages" / "answer"
# (both read by `format_dataset`) are among the removed columns, so this cell is presumably not
# re-runnable without reloading the dataset first — confirm.
validation_dataset = validation_dataset.map(format_dataset, remove_columns=["search_results", "question_source", "entity_pages", "answer", "question_id"])

In [None]:
# Display the example at index 8 to inspect the formatted columns.
validation_dataset[8]

In [None]:
# Keep only the examples whose merged context is non-empty.
validation_dataset = validation_dataset.filter(lambda x: len(x["context"]) > 0)
# check out how many samples are left
validation_dataset

In [None]:
print("\n\nLength for each example")
print(30 * "=")

# length (in characters) of the question and context of each example;
# iterate over the dataset directly instead of calling `.map` purely for its print side effect
for i, x in enumerate(validation_dataset):
    print(f"Id: {i} - Question Length: {len(x['question'])} - context Length: {len(x['context'])}")
print(30 * "=")

print("\n")
# The dataset displayed below holds the examples that are *shorter* than the threshold.
print("Num examples smaller than 4 * 4096 characters: ")
# keep only the examples whose question + context is shorter than 4 * 4096 characters
short_validation_dataset = validation_dataset.filter(lambda x: (len(x['question']) + len(x['context'])) < 4 * 4096)
short_validation_dataset

In [None]:
# EVAL

In [None]:
from transformers import LongformerTokenizerFast, LongformerForQuestionAnswering

# `tokenizer` and `model` are module-level names that `evaluate` (defined below) reads directly.
tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")

# download the 1.7 GB pretrained model. It might take ~1min
model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
# Move the model to the GPU; `evaluate` sends its input tensors to "cuda" as well.
model.to("cuda")

def evaluate(example):
    """Answer one example with the QA model and record whether it is correct.

    Reads the module-level ``tokenizer`` and ``model``; tensors are sent to "cuda".

    Args:
        example: mapping with "question", "context", "targets" and
            "norm_target" entries.

    Returns:
        The same mapping with two entries added: "output" (the decoded
        predicted answer) and "match" (True when the output is one of the
        targets or equals the normalized target).
    """
    def get_answer(question, context):
        """Return the decoded answer span predicted for `question` in `context`."""
        # encode question and context so that they are separated by a tokenizer.sep_token and cut at max_length
        encoding = tokenizer.encode_plus(question, context, return_tensors="pt", max_length=4096, truncation=True)
        input_ids = encoding["input_ids"].to("cuda")
        attention_mask = encoding["attention_mask"].to("cuda")

        # the forward method will automatically set global attention on question tokens
        # The scores for the possible start token and end token of the answer are retrieved
        # wrap the function in torch.no_grad() to save memory
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Index positionally instead of tuple-unpacking: unpacking a dict-like
        # ModelOutput would yield its keys rather than the score tensors, whereas
        # positional indexing works for both a plain tuple and a ModelOutput.
        start_scores, end_scores = outputs[0], outputs[1]

        # Let's take the most likely token using `argmax` and retrieve the answer
        all_tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())
        answer_tokens = all_tokens[torch.argmax(start_scores): torch.argmax(end_scores)+1]
        answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))[1:].replace('"', '')  # remove space prepending space token and remove unnecessary '"'

        return answer

    # save the model's output here
    example["output"] = get_answer(example["question"], example["context"])

    # save if it's a match or not
    example["match"] = (example["output"] in example["targets"]) or (example["output"] == example["norm_target"])

    return example


In [None]:
# Run `evaluate` on every example of the short subset only.
results_short = short_validation_dataset.map(evaluate)

In [None]:
# Summing the boolean "match" column counts the correct predictions.
print(f"\nNum Correct examples: {sum(results_short['match'])}/{len(results_short)}")
# Keep only the examples whose "match" entry is exactly False.
wrong_results = results_short.filter(lambda x: x['match'] is False)
print(f"\nWrong examples: ")
# `.map` is used here only for the print side effect of the lambda.
wrong_results.map(lambda x, i: print(f"{i} - Output: {x['output']} - Target: {x['norm_target']}"), with_indices=True)

In [None]:
# Run `evaluate` on the whole validation subset; inside `evaluate` the tokenizer truncates inputs at max_length=4096.
results = validation_dataset.map(evaluate)

In [None]:
# Number of examples flagged as a match out of all evaluated examples.
print(f"Correct examples: {sum(results['match'])}/{len(results)}")

# TriviaQA JSON to SQuAD-format dataloader

In [1]:
import json
from pathlib import Path

def read_squad_files(path: str):
    """Read a SQuAD-format JSON file into three parallel lists.

    The file is expected to hold ``data -> paragraphs -> qas -> answers``.
    One row is produced per answer, so a context and a question are repeated
    once for each of their answers.

    Args:
        path: location of the JSON file.

    Returns:
        ``(contexts, questions, answers)`` - three lists of equal length, where
        ``answers`` holds the answer objects exactly as stored in the file.
    """
    with Path(path).open('rb') as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in payload['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                prompt = qa['question']
                gold_answers = qa['answers']
                # Repeat the passage and the question once per gold answer.
                contexts.extend([passage_text] * len(gold_answers))
                questions.extend([prompt] * len(gold_answers))
                answers.extend(gold_answers)

    return contexts, questions, answers
    

# Read the train and dev files into parallel (contexts, questions, answers) lists.
# NOTE(review): both paths are absolute and hard-coded under /workspace/data/trivia_squad.
train_contexts, train_questions, train_answers = read_squad_files('/workspace/data/trivia_squad/squad-wikipedia-train-4096.json')
val_contexts, val_questions, val_answers = read_squad_files('/workspace/data/trivia_squad/squad-wikipedia-dev-4096.json')

In [2]:
## Add answer end indices and fix misaligned answer start indices

In [3]:
def add_end_idx(answers, contexts):
    """Add an ``'answer_end'`` character index to every answer, in place.

    For each (answer, context) pair the span ``[answer_start, answer_start +
    len(text))`` is compared case-insensitively with the answer text. If it does
    not match, the span is tried shifted one and then two characters to the
    left; on a match ``'answer_start'`` and ``'answer_end'`` are set to the
    shifted span.

    ``'answer_end'`` is always set: when no alignment matches it falls back to
    the unshifted ``answer_start + len(text)``, so later code that reads
    ``answer['answer_end']`` does not fail with ``KeyError``.

    Args:
        answers: list of dicts with ``'text'`` and ``'answer_start'`` entries.
        contexts: list of context strings, parallel to ``answers``.

    Returns:
        None. The answer dicts are modified in place.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)
        # Lowercase both sides: the context slice is lowercased below, so a gold
        # text containing upper-case characters could otherwise never match.
        gold_lower = gold_text.lower()

        # Fallback so that 'answer_end' exists even when no alignment matches.
        answer['answer_end'] = end_idx

        # sometimes squad answers are off by a character or two – fix this
        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            if shifted_start < 0:
                # A negative start would make the slice wrap around to the end of the context.
                break
            if context[shifted_start:end_idx - shift].lower() == gold_lower:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - shift
                break

# Annotate the answer dicts of both splits in place.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [4]:
## Tokenize results

In [None]:
from transformers import RobertaTokenizerFast
# NOTE(review): this rebinds the module-level name `tokenizer`, which `evaluate` reads and which
# previously held the Longformer tokenizer; evaluation cells run after this one would use this tokenizer.
# NOTE(review): BERT-style tokenizers spell this option `do_lower_case`; `do_lowercase` presumably
# has no effect here — confirm against the tokenizer documentation.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lowercase=True)

# Encode (context, question) pairs — the context is passed as the first sequence — with truncation and padding enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)

In [None]:
# Encode the validation (context, question) pairs with the same settings as the training pairs.
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [None]:
### convert start-end pos to token start/end pos

In [None]:
def add_token_positions(encodings, answers):
    """Attach token-level answer positions to ``encodings`` (in place).

    For every answer, the character offsets ``answer_start`` and
    ``answer_end - 1`` are converted with ``encodings.char_to_token``. When the
    conversion yields ``None`` (the answer passage has been truncated), the
    position is replaced by the module-level ``tokenizer.model_max_length``.

    The resulting lists are stored under ``'start_positions'`` and
    ``'end_positions'`` via ``encodings.update``. Nothing is returned.
    """
    starts, ends = [], []
    for row, answer in enumerate(answers):
        start_token = encodings.char_to_token(row, answer['answer_start'])
        # 'answer_end' is exclusive, so the last answer character sits at answer_end - 1.
        end_token = encodings.char_to_token(row, answer['answer_end'] - 1)

        # if None, the answer passage has been truncated
        starts.append(tokenizer.model_max_length if start_token is None else start_token)
        ends.append(tokenizer.model_max_length if end_token is None else end_token)

    encodings.update({'start_positions': starts, 'end_positions': ends})

# Add 'start_positions' / 'end_positions' to both sets of encodings (in place).
# Each answer dict must already carry 'answer_end', which `add_token_positions` reads.
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
### Dataloader

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

class SquadDataset(torch.utils.data.Dataset):
    """Dataset that serves tokenizer encodings one example at a time.

    ``encodings`` must provide ``.items()`` yielding ``(name, column)`` pairs
    where each column is indexable by example, and an ``input_ids`` attribute
    whose length is the number of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples equals the number of input_ids rows.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one tensor per encoding field for the requested example.
        item = {}
        for field, column in self.encodings.items():
            item[field] = torch.tensor(column[idx])
        return item

# Wrap the encodings (which now include the start/end positions) as datasets.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)



In [None]:
# Serve the training set in shuffled batches of 16.
# NOTE(review): this rebinds `train_dataset` from the SquadDataset to the DataLoader, so running
# this cell a second time would pass a DataLoader as the `dataset` argument.
train_dataset = DataLoader(train_dataset, batch_size=16, shuffle=True)