In [None]:
# From
#https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb

!pip install tqdm==4.52.0
!pip install --upgrade torch torchvision
!pip install --upgrade transformers datasets
!pip install --upgrade pyarrow

# From Huggingface

### Load datasets and configure

In [None]:
# TODO make AutoModel and AutoTokenizer

from __future__ import print_function
import argparse
from collections import Counter
import dataclasses
from dataclasses import dataclass, field
import logging
import json
import os
from pprint import pprint
import re
import string
import sys
from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union
from typing import TypeVar
from typing import Dict, List, Optional

import datasets
from datasets import logging as DSlogging
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from tqdm.auto import tqdm

from transformers import (
    HfArgumentParser,
    DataCollator,
    Trainer,
    TrainingArguments,
    set_seed,
)



# Notebook-wide logger; the `datasets` library is limited to warning-level output.
logger = logging.getLogger(__name__)
DSlogging.set_verbosity_warning()

# Earlier tokenizer choice, kept for reference:
#tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
# Local checkpoint directory that both the tokenizer and the QA model are loaded from.
MODEL_NAME = "/workspace/models/RoBERTa_Long_seed_1337/RoBERTa_Long_seed_1337-4096-lm"

# A fast tokenizer is requested because the feature conversion below calls `char_to_token`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

In [None]:

def get_correct_alignement(context: str, answer) -> Tuple[int, int]:
    """Locate the gold answer span in ``context``, tolerating small offset errors.

    Some original examples in SQuAD have indices wrong by 1 or 2 characters. The
    annotated start is tried first, then the start shifted left by one and by two
    characters.

    Args:
        context: Passage text the answer was taken from.
        answer: Mapping with parallel ``'text'`` and ``'answer_start'`` sequences;
            only the first entry of each is used.

    Returns:
        ``(start_idx, end_idx)`` with ``context[start_idx:end_idx] == answer['text'][0]``
        (``end_idx`` is exclusive).

    Raises:
        ValueError: If the answer text is not found at any of the tried offsets.
    """
    gold_text = answer['text'][0]
    annotated_start = answer['answer_start'][0]
    for shift in (0, 1, 2):
        start_idx = annotated_start - shift
        if start_idx < 0:
            # A negative start would silently wrap around to the end of the string.
            break
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx] == gold_text:
            return start_idx, end_idx
    raise ValueError(
        f"Answer text {gold_text!r} not found within 2 characters of "
        f"annotated start {annotated_start}"
    )

        
# Tokenize our training dataset
def _encode_qa_example(example, tokenizer, max_length=512):
    """Tokenize one SQuAD example and attach the answer's token positions.

    The question/context pair is padded and truncated to ``max_length``. The answer
    span is located in the context by character offset, mapped to context tokens and
    then shifted into the coordinates of the paired encoding.

    Args:
        example: Mapping with ``'question'``, ``'context'`` and ``'answers'`` entries.
        tokenizer: Fast tokenizer (``char_to_token`` is required) with a ``sep_token_id``.
        max_length: Fixed length of the paired encoding.

    Returns:
        The pair encoding, extended with ``'start_positions'`` and ``'end_positions'``.
        Both are 0 when the answer cannot be mapped to a token or does not fit inside
        the truncated window.
    """
    # Tokenize contexts and questions (as pairs of inputs)
    encodings = tokenizer.encode_plus(
        example['question'],
        example['context'],
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        max_length=max_length,
        truncation=True,
    )
    # The context alone (untruncated) gives the character -> token mapping.
    context_encodings = tokenizer.encode_plus(example['context'])

    # Position of the answer span in the context text, then in context tokens.
    start_idx, end_idx = get_correct_alignement(example['context'], example['answers'])
    start_positions_context = context_encodings.char_to_token(start_idx)
    end_positions_context = context_encodings.char_to_token(end_idx - 1)

    start_positions, end_positions = 0, 0
    # char_to_token may return None; adding an int to None would raise TypeError.
    if start_positions_context is not None and end_positions_context is not None:
        # The example is encoded as <s> question</s></s> context</s>.
        # Adding the index of the first sep token + 1 (+1 because there are two sep
        # tokens) moves a context-only token index into the paired encoding.
        sep_idx = encodings['input_ids'].index(tokenizer.sep_token_id)
        start_candidate = start_positions_context + sep_idx + 1
        end_candidate = end_positions_context + sep_idx + 1
        # Valid indices are 0..max_length-1 and the last used slot always holds the
        # closing sep token, so a context token can sit at max_length-2 at most.
        if end_candidate < max_length - 1:
            start_positions, end_positions = start_candidate, end_candidate

    encodings.update({'start_positions': start_positions,
                      'end_positions': end_positions})
    return encodings


# Tokenize our training dataset
def convert_to_features(example):
    """Encode one example with the notebook-level ``tokenizer`` (for ``Dataset.map``)."""
    return _encode_qa_example(example, tokenizer)


def convert_to_features_map(example, tokenizer):
    """Encode one example with an explicitly supplied tokenizer."""
    return _encode_qa_example(example, tokenizer)


def convert_to_features_loop(dataset, tokenizer):
    """Encode every example of ``dataset`` and return the encodings as a list.

    The earlier version returned from inside the loop, so only the first example
    was ever processed (and ``None`` was returned for an empty dataset).
    """
    return [_encode_qa_example(example, tokenizer) for example in dataset]

In [None]:
%%time
#tokenizer = LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')

# Load the train and validation splits of SQuAD.
train_dataset  = datasets.load_dataset('squad', split='train')
valid_dataset = datasets.load_dataset('squad', split='validation')

# Encode every example (one example per call) with the notebook-level tokenizer.
train_dataset = train_dataset.map(convert_to_features)
valid_dataset = valid_dataset.map(convert_to_features)

In [None]:
# Have both datasets return torch tensors, restricted to the columns used for training.
columns = ['input_ids', 'attention_mask', 'start_positions', 'end_positions']

train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

In [None]:
# Number of examples in the train and validation datasets.
len(train_dataset), len(valid_dataset)



## Transform datasets into tensors for training

Set the tensor type and the columns which the dataset should return  
This converts the format of the dataset to `torch`, `tf` or `pandas`  
This allows the format of the model to be used when training a torch model  
  
- `type` define the return type for our dataset `__getitem__` method and is one of `[None, 'numpy', 'pandas', 'torch', 'tensorflow']` (`None` means return python objects), and
- `columns` define the columns returned by `__getitem__` and takes the name of a column in the dataset or a list of columns to return (`None` means return all columns).  
  
  
**NOTE**: Features are not removed from the dataset, just not passed when calling `__getitem__`.
To go back to the dataset format (Needed when evaluating or done training):
``` python
from pprint import pprint

from transformers import AutoTokenizer
import datasets

tokenizer = AutoTokenizer.from_pretrained('roberta-base', use_fast=True)

# Dataset to Tensor
def convert_to_features(batch):
    # Tokenize contexts and questions (as pairs of inputs)
    input_pairs = list(zip())
    encodings = tokenizer(batch['context'], batch['question'], truncation=True)

    # Compute start and end tokens for labels
    start_positions, end_positions = [], []
    for i, answer in enumerate(batch['answers']):
        first_char = answer['answer_start'][0]
        last_char = first_char + len(answer['text'][0]) - 1
        start_positions.append(encodings.char_to_token(i, first_char))
        end_positions.append(encodings.char_to_token(i, last_char))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

data = datasets.load_dataset('squad', split="validation")
data = data.map(convert_to_features, batched=True)
print("column_names", data.column_names)
print("start_positions", data[:5]['start_positions'])

# Convert to tensor format
columns_to_return = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
data.set_format(type='torch', columns=columns_to_return)

# Our dataset indexing output is now ready for being used in a pytorch dataloader
pprint(data[1], compact=True)

# Show that features are still there, just not directly callable
print(data.column_names)

# Convert back from Tensor to Dataset: `.reset_format()` 
# or call `.set_format()` with no arguments
data.reset_format()
pprint(data[1], compact=True)
```
=========================================================

In [None]:
# Cache the datasets so they can be loaded directly for training.
# NOTE(review): the code below is wrapped in a string literal, so it does not run.
# The training cell later calls torch.load on train_data.pt / valid_data.pt,
# so confirm those files already exist before running it.
"""
SQUAD_DIR = "/workspace/data/SQuAD"
if not os.path.isdir(SQUAD_DIR):
    os.mkdir(SQUAD_DIR)
    print(f"Creating data dir: {SQUAD_DIR}")

torch.save(train_dataset, f'{SQUAD_DIR}/train_data.pt')
torch.save(valid_dataset, f'{SQUAD_DIR}/valid_data.pt')
"""

# Write Training script

In [None]:
# load from here if saved dataset is already made

In [None]:
class DummyDataCollator:
    """Collator that stacks already-tensorized QA examples field by field."""

    def __call__(self, batch):
        """
        Take a list of samples from a Dataset and collate them into a batch.

        Every field is stacked along a new leading dimension.

        Returns:
            A dictionary of tensors keyed by 'input_ids', 'start_positions',
            'end_positions' and 'attention_mask'.
        """
        def stack_field(field_name):
            return torch.stack([sample[field_name] for sample in batch])

        # Stacking order is kept so that a missing key surfaces for the same field.
        ids = stack_field('input_ids')
        mask = stack_field('attention_mask')
        starts = stack_field('start_positions')
        ends = stack_field('end_positions')

        return dict(
            input_ids=ids,
            start_positions=starts,
            end_positions=ends,
            attention_mask=mask,
        )
    

class DataCollector():
    """Batch collator that holds a tokenizer reference and stacks example tensors."""

    def __init__(self, tokenizer):
        # Kept on the instance; collation itself does not consult it.
        self.tokenizer = tokenizer

    def __call__(self, batch: List) -> Dict[str, torch.Tensor]:
        """Stack each field of the examples in ``batch`` into one tensor per field."""
        stacking_order = ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
        stacked = {}
        for key in stacking_order:
            stacked[key] = torch.stack([item[key] for item in batch])

        # The returned dictionary lists the label positions before the attention mask.
        output_order = ('input_ids', 'start_positions', 'end_positions', 'attention_mask')
        return {key: stacked[key] for key in output_order}

In [None]:
# Three fixed-width status banners; only the title row and the trailing newline differ.
banner_rule = "=" * 50
banner_blank = "=" + "\t" * 6 + " ="
banner_specs = (
    ("\tStarting Training and loading data\t", ""),
    ("\tInitialization" + "\t" * 4, "\n"),
    ("\tStarting preparing data \t\t", "\n"),
)

for banner_title, banner_tail in banner_specs:
    print(banner_rule)
    print(banner_blank)
    print("=" + banner_title + " =")
    print(banner_blank)
    print(banner_rule + banner_tail)

## Define Training args

In [None]:
# TODO add args
# Directory expected to hold the cached train/validation datasets.
SQUAD_DIR = "/workspace/data/SQuAD"

# Arguments written to args.json and parsed back into ModelArguments,
# DataTrainingArguments and TrainingArguments by the training cell.
# NOTE(review): "n_gpu" and the "per_gpu_*" keys must be accepted by the installed
# transformers version's TrainingArguments / HfArgumentParser — confirm.
args_dict = {
  "n_gpu": 1,
  "model_name_or_path": f"{MODEL_NAME}",    # previously 'allenai/longformer-base-4096'
    "train_file_path": f"{SQUAD_DIR}/train_data.pt",
    "val_file_path": f"{SQUAD_DIR}/valid_data.pt",
  "max_len": 512 ,
  "output_dir": './models',
  "overwrite_output_dir": True,
  "per_gpu_train_batch_size": 1, # previously 8
  "per_gpu_eval_batch_size": 1, # previously 8
  "gradient_accumulation_steps": 2, # previously 16
  "learning_rate": 1e-4,
  "num_train_epochs": 3,
  "do_train": True,
  "max_steps": 10,                            # presumably a short smoke-test run — TODO confirm
}


"""
seed = 404
# "evaluation_strategy": "steps",
# "overwrite_output_dir": True,
SQUAD_DIR = "/workspace/data/SQuAD"
MODEL_DIR = "/workspace/models"
LOG_DIR = "/workspace/logs"
args_dict = {
    "output_dir": f'{MODEL_DIR}/Longformer-4094-squad_seed_{seed}',
    "logging_dir": f'{LOG_DIR}/Longformer-4094-squad_seed_{seed}',
    "train_file_path": f"{SQUAD_DIR}/train_data.pt",
    "val_file_path": f"{SQUAD_DIR}/valid_data.pt",
    "seed": seed,
    "fp16": True,
    "evaluate_during_training": True,
    "overwrite_output_dir": True, 
    "logging_steps": 10,
    "eval_steps": 10,
    "do_eval": True,
    "do_train": True,
    "n_gpu": 1,
    "model_name_or_path": 'allenai/longformer-base-4096',
    "max_len": 512 ,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "gradient_accumulation_steps": 16,
    "learning_rate": 1e-4,
    "max_steps": 3,
}

"""


# Persist the arguments; the training cell reads this file back with parse_json_file.
with open('args.json', 'w') as f:
    json.dump(args_dict, f)

## Main and Train

In [None]:
# TODO add logging to file


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.

    Only ``model_name_or_path`` is required; the tokenizer name and the cache
    directory both default to ``None``.
    """
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )
    tokenizer_name: Optional[str] = field(
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
        default=None,
    )
    cache_dir: Optional[str] = field(
        metadata=dict(help="Where do you want to store the pretrained models downloaded from s3"),
        default=None,
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Every field has a default, so the class can be built without arguments.
    """
    train_file_path: Optional[str] = field(
        metadata=dict(help="Path for cached train dataset"),
        default='train_data.pt',
    )
    val_file_path: Optional[str] = field(
        metadata=dict(help="Path for cached valid dataset"),
        default='valid_data.pt',
    )
    max_len: Optional[int] = field(
        metadata=dict(help="Max input length for the source text"),
        default=512,
    )

In [None]:
# TODO wrap this cell in a function:
#def main():

# End-to-end fine-tuning script: parse args.json, set up logging and the seed,
# load tokenizer/model and the cached datasets, then train and optionally evaluate.
# See all possible arguments in src/transformers/training_args.py

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

# The arguments are loaded from a json file;
# make sure the arguments have been saved at ./args.json
# TODO replace

model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath('args.json'))

# Refuse to train into a non-empty output directory unless overwriting was requested.
if (
    os.path.exists(training_args.output_dir)
    and os.listdir(training_args.output_dir)
    and training_args.do_train
    and not training_args.overwrite_output_dir
):
    raise ValueError(
        f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
    )

# Setup logging: INFO when local_rank is -1 or 0, WARN otherwise.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    training_args.local_rank,
    training_args.device,
    training_args.n_gpu,
    bool(training_args.local_rank != -1),
    training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)

# Set seed
set_seed(training_args.seed)

# Load pretrained model and tokenizer
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

# Falls back to the model path when no separate tokenizer name was given.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    use_fast=True
)
model = AutoModelForQuestionAnswering.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)

# TODO Tested to remove
# Get datasets
# NOTE(review): these files are only written by the caching cell, whose code is
# currently inside a string literal — confirm they exist before running.
print('loading data')
train_dataset  = torch.load(data_args.train_file_path)
valid_dataset = torch.load(data_args.val_file_path)
print('loading done')

# Initialize our Trainer
# NOTE(review): `prediction_loss_only` here, `model_path` in train() and
# `is_world_master()` below look like an older Trainer API — confirm they are
# supported by the transformers version installed by the first cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=DummyDataCollator(),
    prediction_loss_only=True,
)

# Training
if training_args.do_train:
    # The model path is passed on only when it is a local directory.
    trainer.train(
        model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
    )
    trainer.save_model()
    # For convenience, we also re-save the tokenizer to the same directory,
    # so that you can share your model easily on huggingface.co/models =)
    if trainer.is_world_master():
        tokenizer.save_pretrained(training_args.output_dir)

# Evaluation (only when do_eval is set and local_rank is -1 or 0)
results = {}
if training_args.do_eval and training_args.local_rank in [-1, 0]:
    logger.info("*** Evaluate ***")

    eval_output = trainer.evaluate()

    # Write one "key = value" line per metric, sorted by key, and log the same lines.
    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(eval_output.keys()):
            logger.info("  %s = %s", key, str(eval_output[key]))
            writer.write("%s = %s\n" % (key, str(eval_output[key])))

    results.update(eval_output)


# Once this cell is wrapped in main():
#return results

In [None]:
%%time
#main()

# Eval

In [None]:
## SQuAD evaluation script. Modified slightly for this notebook

# https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py
# https://github.com/huggingface/transformers/tree/master/examples/question-answering


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    lowered = s.lower()
    # Drop every ASCII punctuation character.
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # Replace standalone articles with a space so neighbouring words stay apart.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
    # Collapse whitespace runs and trim both ends.
    return ' '.join(without_articles.split())


def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted and a gold answer, after normalization.

    Returns 0 when the two answers share no token.
    """
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(ground_truth).split()
    # Multiset intersection: a shared token counts as often as it occurs in both.
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0
    precision = 1.0 * overlap / len(pred_tokens)
    recall = 1.0 * overlap / len(gold_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """Return True when both answers are identical after normalization."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score ``prediction`` against every reference and return the best score.

    ``metric_fn`` is called as ``metric_fn(prediction, ground_truth)`` once per
    reference, in order.
    """
    return max([metric_fn(prediction, reference) for reference in ground_truths])


def evaluate(gold_answers, predictions):
    """Compute SQuAD exact-match and F1 percentages over paired answers.

    Args:
        gold_answers: Iterable with, per example, the list of acceptable answer texts.
        predictions: Iterable of predicted answer strings, aligned with ``gold_answers``.

    Returns:
        ``{'exact_match': float, 'f1': float}``, both on a 0-100 scale. When there is
        nothing to score, both values are 0.0.
    """
    f1 = exact_match = total = 0

    for ground_truths, prediction in zip(gold_answers, predictions):
        total += 1
        # Each example is credited with its best score over all references.
        exact_match += metric_max_over_ground_truths(
                    exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(
                    f1_score, prediction, ground_truths)

    if total == 0:
        # No pairs at all: report zeros instead of raising ZeroDivisionError.
        return {'exact_match': 0.0, 'f1': 0.0}

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

### Load the trained model

In [None]:
# Reload the fine-tuned checkpoint from the training output directory.
# The Auto* classes are already imported at the top of the notebook and pick the
# architecture from the saved config; the Longformer-specific classes used here
# before were never imported ahead of this cell and raised NameError.
tokenizer = AutoTokenizer.from_pretrained('./models', use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained('./models')
# The prediction loop below moves its inputs to the GPU, so the model goes there too.
model = model.cuda()
model.eval()

#### Set up dataloader for batched evaluation

In [None]:
# valid_dataset = torch.load('/workspace/data/SQuAD/valid_data.pt')

In [None]:
%%time
# Batched, in-order iteration over the torch-formatted validation set. The order
# matters because predictions are zipped with `valid_dataset` afterwards.
dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=16)

predicted_answers = []
with torch.no_grad():
    for batch in tqdm(dataloader):
        outputs = model(input_ids=batch['input_ids'].cuda(),
                        attention_mask=batch['attention_mask'].cuda())
        # Index rather than tuple-unpack: this works both for a plain tuple and for
        # a ModelOutput (unpacking a ModelOutput would yield its keys).
        start_scores, end_scores = outputs[0], outputs[1]
        for i in range(start_scores.shape[0]):
            # Most likely start/end token; the end index is inclusive.
            start_token = int(torch.argmax(start_scores[i]))
            end_token = int(torch.argmax(end_scores[i]))
            answer_ids = batch['input_ids'][i][start_token:end_token + 1].tolist()
            predicted_answers.append(tokenizer.decode(answer_ids))

#### Make predictions
**Important!**: Change the valid_dataset back from tensor format to the default Python dataset format

In [None]:
# Undo set_format so that indexing returns all columns again (the next cell reads 'answers').
valid_dataset.reset_format()

In [None]:
# Pair every validation example's gold answer texts with the predicted answer,
# then score the pairs with the SQuAD metrics.
# valid_dataset = nlp.load_dataset('squad', split='validation')
references, predictions = [], []
for gold_example, predicted_text in zip(valid_dataset, predicted_answers):
    references.append(gold_example['answers']['text'])
    predictions.append(predicted_text)

evaluate(references, predictions)

In [None]:
# trainer.predict(valid_dataset)
# output:
# PredictionOutput(predictions=None, label_ids=None, metrics={})

In [15]:
import os
os.path.dirname('/workspace/data/SQuAD/train-v1.1.json')

'/workspace/data/SQuAD'

Old way to convert text to Model predictions.  
Given tokenizer and pretrained QA-model

``` python
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]

for question in questions:
    inputs = tokenizer.encode_plus(question, text, 
                                   add_special_tokens=True, 
                                   return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer_start_scores, answer_end_scores = model(**inputs)

    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")
```

#### Manual way to make answer prediction
This is mainly to illustrate how the basic prediction is made:  
``` python
input_ids = valid_dataset[0]['input_ids'].tolist()
start = valid_dataset['start_positions'][0]
end   = valid_dataset['end_positions'][0]
print(start, end)

# Old way - May not work anymore
prediction = tok.convert_tokens_to_string(tok.convert_ids_to_tokens(input_ids[start:end]))

# New way
# Cleaning tokens need to be made. Left in for illustration and easier separation.
prediction = tok.decode(input_ids[start:end]) # But!!! includes unwanted spaces
print(prediction)
```

# Using A fine-tuned model in action 🚀
The trained model is available on Huggingface hub if you want to play with it   
You can find the model [here](https://huggingface.co/valhalla/longformer-base-4096-finetuned-squadv1)

In [None]:
import torch
from transformers import LongformerTokenizer, LongformerForQuestionAnswering

# Load the published fine-tuned checkpoint and answer one question with it.
tokenizer = LongformerTokenizer.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")
model = LongformerForQuestionAnswering.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")

text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
question = "What has Huggingface done ?"
encoding = tokenizer.encode_plus(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]

# default is local attention everywhere
# the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"]

outputs = model(input_ids, attention_mask=attention_mask)
# Index rather than tuple-unpack: this works both for a plain tuple and for a
# ModelOutput (unpacking a ModelOutput would yield its keys).
start_scores, end_scores = outputs[0], outputs[1]
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

# Tokens from the most likely start through the most likely end (inclusive).
answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
# output => democratized NLP

# Or use Huggingface pipeline

In [None]:
### Load a pretrained QA model for SQuAD through the Huggingface pipeline
# https://huggingface.co/deepset/roberta-base-squad2
from transformers.pipelines import pipeline
# Import from the top-level package: the flat `transformers.modeling_auto` and
# `transformers.tokenization_auto` module paths are not available in transformers 4.x.
from transformers import AutoModelForQuestionAnswering, RobertaForMaskedLM
from transformers import AutoTokenizer

#model_name = "deepset/roberta-base-squad2"
Pipeline_model_name = "valhalla/longformer-base-4096-finetuned-squadv1"

# The same checkpoint name is used for both the model and its tokenizer.
nlp = pipeline('question-answering', model=Pipeline_model_name, tokenizer=Pipeline_model_name)

context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the `run_squad.py`.
"""

# yields very low scores compared to Huggingfaces own QA pipeline
print(nlp(question="What is extractive question answering?", context=context))
print(nlp(question="What is a good example of a question answering dataset?", context=context))

# Extra

In [None]:
# https://github.com/geblanco/mc_transformers/blob/master/mc_transformers/mc_transformers.py
def pair_predictions_with_ids(results, data_collator):
    """Bundle a prediction result with the example ids held by the collator.

    Copies ``predictions``, ``label_ids`` and ``metrics`` from ``results`` and takes
    ``example_ids`` from ``data_collator``.
    """
    make_output = PredictionOutputWithIds
    predictions, label_ids = results.predictions, results.label_ids
    example_ids = data_collator.example_ids
    metrics = results.metrics
    return make_output(
        predictions=predictions,
        label_ids=label_ids,
        example_ids=example_ids,
        metrics=metrics,
    )



# Reference snippet taken from the mc_transformers script linked above.
# NOTE(review): `eval_dataset`, `test_dataset`, `data_collator`, `save_results`,
# `processor`, `all_args` and `Split` are not defined in the visible part of this
# notebook — confirm where they come from before running this cell.
if training_args.do_eval:
    logger.info("*** Evaluate ***")
    result = trainer.predict(eval_dataset)
    if trainer.is_world_master():
        # Attach example ids to the predictions, save them, then clear the ids.
        result = pair_predictions_with_ids(result, data_collator)
        save_results(
            processor, result, all_args, split=Split.dev
        )
        results['eval'] = result
        data_collator.drop_ids()

if training_args.do_predict:
    logger.info("*** Test ***")
    result = trainer.predict(test_dataset)
    if trainer.is_world_master():
        # Same post-processing as for the dev split, stored under 'test'.
        result = pair_predictions_with_ids(result, data_collator)
        save_results(
            processor, result, all_args, split=Split.test
        )
        results['test'] = result
        data_collator.drop_ids()

#return results

In [None]:
# Huggingfaces predict


# Reference snippet of a SQuAD prediction loop.
# NOTE(review): `args`, `device`, `read_squad_examples`, `convert_examples_to_features`,
# `TensorDataset`, `SequentialSampler`, `DataLoader`, `RawResult` and `write_predictions`
# are not defined or imported in the visible part of this notebook — confirm
# before running this cell.
if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Read the raw examples and convert them into model input features.
        eval_examples = read_squad_examples(
            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        # One tensor per feature field, plus a running index used to map each
        # batch row back to its entry in eval_features.
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
            # Progress is logged whenever the result count is a multiple of 1000.
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            # Collect the per-feature logits as plain Python lists on the CPU.
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
        # Output files are placed in args.output_dir.
        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
                          args.version_2_with_negative, args.null_score_diff_threshold)

## Using the Datasets Library

In [None]:
# Let's import the library. We typically only need at most four methods:
from datasets import list_datasets, list_metrics, load_dataset, load_metric
# Let's import a fast tokenizer that can work on batched inputs
# (the 'Fast' tokenizers in HuggingFace)
import datasets
from datasets import logging as dataset_logging
from pprint import pprint
#dataset_logging.set_verbosity_info()
dataset_logging.set_verbosity_warning()

import torch 
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering, logging as transformers_logging

transformers_logging.set_verbosity_warning()




# Build the fast BERT tokenizer, then fetch SQuAD both as the full collection
# of splits and as standalone train / validation datasets.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
dataset = load_dataset('squad')
train_dataset, valid_dataset = (
    load_dataset('squad', split=split_name) for split_name in ("train", "validation")
)


def get_correct_alignement(context, answer):
    """Locate the gold answer span inside `context`, tolerating small offset errors.

    Some original examples in SQuAD have indices wrong by 1 or 2 characters, so
    the annotated start is tried first, followed by the positions one and two
    characters to its left.

    Args:
        context: The passage text.
        answer: Mapping with 'text' and 'answer_start' sequences; only the
            first element of each is used.

    Returns:
        A `(start_idx, end_idx)` tuple of character offsets such that
        `context[start_idx:end_idx]` equals the gold answer text.

    Raises:
        ValueError: If the gold text is not found at any of the tried offsets.
    """
    gold_text = answer['text'][0]
    annotated_start = answer['answer_start'][0]
    for shift in (0, 1, 2):
        start_idx = annotated_start - shift
        if start_idx < 0:
            # A negative start would make the slice wrap around to the end of
            # the context and could report a bogus negative span.
            break
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx] == gold_text:
            return start_idx, end_idx
    raise ValueError(
        f"Answer {gold_text!r} not found within 2 characters of offset "
        f"{annotated_start} in the context."
    )

# Tokenize our training dataset
def convert_to_features(example_batch):
    """Tokenize a batch of SQuAD examples and attach token-level answer labels.

    Args:
        example_batch: Batch mapping with 'context', 'question' and 'answers'
            columns (one list entry per example).

    Returns:
        The tokenizer encodings for the (context, question) pairs, extended
        with 'start_positions' and 'end_positions' lists (one int per example).
    """
    # Tokenize contexts and questions (as pairs of inputs)
    encodings = tokenizer(example_batch['context'], example_batch['question'], truncation=True)

    # `char_to_token` yields None when the character has no token, e.g. because
    # the answer was cut off by truncation. None cannot be stored as an integer
    # label, so fall back to `model_max_length`, a position past the end of
    # every truncated sequence.
    truncated_position = tokenizer.model_max_length

    # Compute start and end tokens for labels using Transformers's fast tokenizers alignement methods.
    start_positions, end_positions = [], []
    for i, (context, answer) in enumerate(zip(example_batch['context'], example_batch['answers'])):
        start_idx, end_idx = get_correct_alignement(context, answer)
        start_token = encodings.char_to_token(i, start_idx)
        # `end_idx` is exclusive, so the last answer character is at end_idx - 1.
        end_token = encodings.char_to_token(i, end_idx - 1)
        start_positions.append(truncated_position if start_token is None else start_token)
        end_positions.append(truncated_position if end_token is None else end_token)
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
    return encodings

# Encode every split. `map` returns a new dataset instead of modifying the
# receiver, so each result has to be assigned; otherwise the label and token
# columns selected below would not exist on `train_dataset` / `valid_dataset`.
encoded_dataset = dataset.map(convert_to_features, batched=True)
train_dataset = train_dataset.map(convert_to_features, batched=True)
valid_dataset = valid_dataset.map(convert_to_features, batched=True)

# Format our dataset to outputs torch.Tensor to train a pytorch model
columns = ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']
encoded_dataset.set_format(type='torch', columns=columns)
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

# Instantiate a PyTorch Dataloader around our dataset
# Let's do dynamic batching (pad on the fly with our own collate_fn)
def collate_fn(examples):
    """Pad the given encoded examples with the tokenizer and return them as PyTorch tensors."""
    padded_batch = tokenizer.pad(examples, return_tensors='pt')
    return padded_batch
dataloader = torch.utils.data.DataLoader(encoded_dataset['train'], collate_fn=collate_fn, batch_size=8)


# The model class and the tokenizer above are both BERT, so load the matching
# 'bert-base-cased' weights rather than a DistilBERT checkpoint.
model = BertForQuestionAnswering.from_pretrained('bert-base-cased', return_dict=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


from datasets import load_metric
# Load the SQuAD metric; it is not used by the short demo loop below.
squad_metric = datasets.load_metric('squad')


# Now let's train our model
device = 'cuda' if torch.cuda.is_available() else 'cpu'


model.train().to(device)
for i, batch in enumerate(dataloader):
    # Move the padded batch to the same device as the model.
    batch = batch.to(device)
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    model.zero_grad()
    print(f'Step {i} - loss: {loss.item():.3}')

    pprint(outputs)
    # Demo only: stop after three steps (i = 0, 1, 2).
    if i > 1:
        break

# How others have used padding and truncation with datasets


In [None]:
import sys
from unittest.mock import patch

from transformers import BertTokenizer, EncoderDecoderModel
from transformers.file_utils import is_datasets_available
from transformers.testing_utils import (
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
    require_torch_non_multi_gpu_but_fix_me,
    slow,
)
from transformers.trainer_callback import TrainerState
from transformers.trainer_utils import set_seed




set_seed(42)
# Model identifiers passed to `run_trainer` by the tests below.
MARIAN_MODEL = "sshleifer/student_marian_en_ro_6_1"
# Referenced by `test_finetune_trainer`; it was missing, which raised a NameError.
MBART_TINY = "sshleifer/tiny-mbart"


class TestFinetuneTrainer(TestCasePlus):
    def test_finetune_trainer(self):
        """Run a short fine-tuning with MBART_TINY and check that BLEU is logged at the first evaluation."""
        run_dir = self.run_trainer(1, "12", MBART_TINY, 1)
        state_file = os.path.join(run_dir, "trainer_state.json")
        history = TrainerState.load_from_json(state_file).log_history
        # Only log entries that carry an evaluation loss are evaluation records.
        eval_entries = [entry for entry in history if "eval_loss" in entry]
        assert "eval_bleu" in eval_entries[0]

    @slow
    def test_finetune_trainer_slow(self):
        """Train MARIAN_MODEL for 10 epochs; check that BLEU improves and prediction files are written."""
        # There is a missing call to __init__process_group somewhere
        run_dir = self.run_trainer(eval_steps=2, max_len="128", model_name=MARIAN_MODEL, num_train_epochs=10)

        # Check metrics: compare the first and last evaluation records
        state_file = os.path.join(run_dir, "trainer_state.json")
        history = TrainerState.load_from_json(state_file).log_history
        eval_entries = [entry for entry in history if "eval_loss" in entry]
        earliest, latest = eval_entries[0], eval_entries[-1]

        assert earliest["eval_bleu"] < latest["eval_bleu"]  # model learned nothing
        assert isinstance(latest["eval_bleu"], float)

        # test if do_predict saves generations and metrics
        produced = {os.path.basename(name) for name in os.listdir(run_dir)}
        assert "test_generations.txt" in produced
        assert "test_results.json" in produced

    @slow
    @require_torch_non_multi_gpu_but_fix_me
    def test_finetune_bert2bert(self):
        """Fine-tune a BERT-to-BERT encoder-decoder on a small CNN/DailyMail sample.

        Builds the model from two "prajjwal1/bert-tiny" checkpoints, encodes 32
        training and 16 validation examples, and trains with a `Seq2SeqTrainer`
        that computes ROUGE-2 every 2 steps. Returns immediately when
        `is_datasets_available()` is false.
        """
        if not is_datasets_available():
            return

        import datasets

        tiny_bert = "prajjwal1/bert-tiny"
        batch_size = 4

        bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(tiny_bert, tiny_bert)
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        # Align the shared config with the encoder vocabulary and the tokenizer's special tokens.
        model_config = bert2bert.config
        model_config.vocab_size = model_config.encoder.vocab_size
        model_config.eos_token_id = tokenizer.sep_token_id
        model_config.decoder_start_token_id = tokenizer.cls_token_id
        model_config.max_length = 128

        train_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]").select(range(32))
        val_dataset = datasets.load_dataset("cnn_dailymail", "3.0.0", split="validation[:1%]").select(range(16))

        rouge = datasets.load_metric("rouge")

        def _map_to_encoder_decoder_inputs(batch):
            # Tokenizer will automatically set [BOS] <text> [EOS]
            article_enc = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512)
            summary_enc = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=128)

            batch["input_ids"] = article_enc.input_ids
            batch["attention_mask"] = article_enc.attention_mask
            batch["decoder_input_ids"] = summary_enc.input_ids
            # Labels are the target ids with every padding token replaced by -100.
            batch["labels"] = [
                [-100 if token == tokenizer.pad_token_id else token for token in target_ids]
                for target_ids in summary_enc.input_ids
            ]
            batch["decoder_attention_mask"] = summary_enc.attention_mask

            assert all(len(ids) == 512 for ids in article_enc.input_ids)
            assert all(len(ids) == 128 for ids in summary_enc.input_ids)

            return batch

        def _compute_metrics(pred):
            # all unnecessary tokens are removed
            decoded_preds = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
            decoded_refs = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)

            rouge_scores = rouge.compute(predictions=decoded_preds, references=decoded_refs, rouge_types=["rouge2"])
            rouge2_mid = rouge_scores["rouge2"].mid

            return {
                "rouge2_precision": round(rouge2_mid.precision, 4),
                "rouge2_recall": round(rouge2_mid.recall, 4),
                "rouge2_fmeasure": round(rouge2_mid.fmeasure, 4),
            }

        def _encode_split(split):
            # Tokenize one split, drop the raw text columns and expose torch tensors.
            encoded = split.map(
                _map_to_encoder_decoder_inputs,
                batched=True,
                batch_size=batch_size,
                remove_columns=["article", "highlights"],
            )
            encoded.set_format(
                type="torch",
                columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
            )
            return encoded

        # map train dataset, then the same for validation dataset
        train_dataset = _encode_split(train_dataset)
        val_dataset = _encode_split(val_dataset)

        output_dir = self.get_auto_remove_tmp_dir()

        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            predict_with_generate=True,
            evaluation_strategy="steps",
            do_train=True,
            do_eval=True,
            warmup_steps=0,
            eval_steps=2,
            logging_steps=2,
        )

        # instantiate trainer and start training
        Seq2SeqTrainer(
            model=bert2bert,
            args=training_args,
            compute_metrics=_compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        ).train()

    def run_trainer(self, eval_steps: int, max_len: str, model_name: str, num_train_epochs: int):
        """Run `finetune_trainer.py` on the `wmt_en_ro` test data and return the output directory.

        With more than one GPU the script is started as a subprocess through
        `torch.distributed.launch`; otherwise `main()` is called in-process
        with `sys.argv` patched to the same arguments.

        Args:
            eval_steps: Value passed for both `--save_steps` and `--eval_steps`.
            max_len: Value passed for the source, target and validation-target max lengths.
            model_name: Value passed as `--model_name_or_path`.
            num_train_epochs: Value passed as `--num_train_epochs`.

        Returns:
            The temporary output directory handed to the script as `--output_dir`.
        """
        data_dir = self.examples_dir / "seq2seq/test_data/wmt_en_ro"
        output_dir = self.get_auto_remove_tmp_dir()
        cli_args = f"""
            --model_name_or_path {model_name}
            --data_dir {data_dir}
            --output_dir {output_dir}
            --overwrite_output_dir
            --n_train 8
            --n_val 8
            --max_source_length {max_len}
            --max_target_length {max_len}
            --val_max_target_length {max_len}
            --do_train
            --do_eval
            --do_predict
            --num_train_epochs {str(num_train_epochs)}
            --per_device_train_batch_size 4
            --per_device_eval_batch_size 4
            --learning_rate 3e-3
            --warmup_steps 8
            --evaluation_strategy steps
            --predict_with_generate
            --logging_steps 0
            --save_steps {str(eval_steps)}
            --eval_steps {str(eval_steps)}
            --sortish_sampler
            --label_smoothing 0.1
            --adafactor
            --task translation
            --tgt_lang ro_RO
            --src_lang en_XX
        """.split()
        # --eval_beams  2

        n_gpu = get_gpu_count()
        if n_gpu <= 1:
            # 0 or 1 gpu: run the script's entry point in this process
            with patch.object(sys, "argv", ["finetune_trainer.py"] + cli_args):
                main()
            return output_dir

        # Several GPUs: spawn one process per GPU through the distributed launcher
        launcher_args = f"""
                -m torch.distributed.launch
                --nproc_per_node={n_gpu}
                {self.test_file_dir}/finetune_trainer.py
            """.split()
        execute_subprocess_async([sys.executable] + launcher_args + cli_args, env=self.get_env())
        return output_dir