In [68]:
# Test Huggingface Basics
import warnings
warnings.filterwarnings('ignore', 'FutureWarning')

In [3]:
%debug

In [32]:
from transformers import RobertaTokenizer

In [12]:
MODEL_NAME = 'distilroberta-base'
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

seq = "Testing a simple sequence running on RoBERTa base model on Peltarion remote server"
print(tokenizer.encode(seq))

[0, 47446, 10, 2007, 13931, 878, 15, 3830, 11126, 38495, 1542, 1421, 15, 221, 6607, 271, 1499, 6063, 10228, 2]


In [17]:
# Where tokenizer files are stored
tokenizer.pretrained_vocab_files_map

{'vocab_file': {'roberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json',
  'roberta-large': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json',
  'roberta-large-mnli': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json',
  'distilroberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json',
  'roberta-base-openai-detector': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json',
  'roberta-large-openai-detector': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json'},
 'merges_file': {'roberta-base': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt',
  'roberta-large': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt',
  'roberta-large-mnli': 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt',
  'distilroberta-base':

In [16]:
tokenizer.convert_ids_to_tokens(tokenizer.encode(seq))

['<s>',
 'Testing',
 'Ġa',
 'Ġsimple',
 'Ġsequence',
 'Ġrunning',
 'Ġon',
 'ĠRo',
 'BER',
 'Ta',
 'Ġbase',
 'Ġmodel',
 'Ġon',
 'ĠP',
 'elt',
 'ar',
 'ion',
 'Ġremote',
 'Ġserver',
 '</s>']

In [24]:
tokenized_sequence = tokenizer.tokenize(seq)
encoded_input = tokenizer(seq)
print("Printing sentance: \n", tokenized_sequence)
print("\nPrinting encoding\n", encoded_input)
print("\nOnly input_ids:\n", encoded_input['input_ids'])

Printing sentance: 
 ['Testing', 'Ġa', 'Ġsimple', 'Ġsequence', 'Ġrunning', 'Ġon', 'ĠRo', 'BER', 'Ta', 'Ġbase', 'Ġmodel', 'Ġon', 'ĠP', 'elt', 'ar', 'ion', 'Ġremote', 'Ġserver']

Printing encoding
 {'input_ids': [0, 47446, 10, 2007, 13931, 878, 15, 3830, 11126, 38495, 1542, 1421, 15, 221, 6607, 271, 1499, 6063, 10228, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Only input_ids:
 [0, 47446, 10, 2007, 13931, 878, 15, 3830, 11126, 38495, 1542, 1421, 15, 221, 6607, 271, 1499, 6063, 10228, 2]


In [34]:
##import dataclasses
#import json
import random
#from dataclasses import dataclass
#from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union

import numpy as np
import torch


def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf``
    (if installed).
    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

In [35]:
# Encoding multiple sentances

In [44]:
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
encoded_sequence_b = tokenizer(sequence_b)["input_ids"]

In [47]:
# Since different lengths, we need to padd each sentance
len(encoded_sequence_a), len(encoded_sequence_b)

# Easiest is to truncate / Padd  automaticlly with the tokenizer
# Send in the text sentances and add PADDING
padded_seq = tokenizer([sequence_a, sequence_b], padding=True)

In [57]:
padded_input_ids = padded_seq['input_ids']
padded_attention_mask = padded_seq['attention_mask']

# Notice that the shorter sentance has additional padding, with mask of 0 to indixate that those are simply paddings
for idx, sentance in enumerate(padded_input_ids):
    print(f"Sentance {idx}: \n", padded_input_ids[idx])
    print(f"With Mask: \n", padded_attention_mask[idx])
    print()


Sentance 0: 
 [0, 713, 16, 10, 765, 13931, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
With Mask: 
 [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Sentance 1: 
 [0, 713, 16, 10, 1195, 251, 13931, 4, 85, 16, 23, 513, 1181, 87, 5, 13931, 83, 4, 2]
With Mask: 
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]



### Task specific and unique tokens

Some transformer models encode words and tokens differently   
For instance BERT [CLS], [SEP], [UNK], [MASK] tokens and more   
   
Notice how BERT and RoBERTa uses different tokens

In [79]:
from transformers import BertTokenizer
# Show how RoBERTa/BERT encodes sentances into multiple ones
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
encoded_seq = tokenizer(sequence_a, sequence_b)
decoded_seq = tokenizer.decode(encoded_seq['input_ids'])
print(decoded_seq)
# In BERT, each sentance also a token indicating which sentance it belongs to, since modeling next sentance prediction during pretraining
print("\nToken types: \n", encoded_seq['token_type_ids'])

[CLS] This is a short sequence. [SEP] This is a rather long sequence. It is at least longer than the sequence A. [SEP]

Token types: 
 [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [90]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
encoded_seq = tokenizer(sequence_a, sequence_b)
decoded_seq = tokenizer.decode(encoded_seq['input_ids'])
print(decoded_seq)
print(encoded_seq)

<s>This is a short sequence.</s></s>This is a rather long sequence. It is at least longer than the sequence A.</s>
{'input_ids': [0, 713, 16, 10, 765, 13931, 4, 2, 2, 713, 16, 10, 1195, 251, 13931, 4, 85, 16, 23, 513, 1181, 87, 5, 13931, 83, 4, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [116]:
from transformers import XLMRobertaTokenizer

# Show how RoBERTa encodes sentances into multiple ones
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
sequence_sv = "Det här är en mening på Svenska!"

"""
xlm-roberta-base, xlm-roberta-large, xlm-roberta-large-finetuned-conll02-dutch, xlm-roberta-large-finetuned-conll02-spanish, xlm-roberta-large-finetuned-conll03-english, xlm-roberta-large-finetuned-conll03-german
"""
XLM_NAME = 'xlm-roberta-base'
tokenizer = XLMRobertaTokenizer.from_pretrained(XLM_NAME)
encoded_seq = tokenizer(sequence_b, sequence_sv)
decoded_seq = tokenizer.decode(encoded_seq['input_ids'])
# Notice that much larger tokens are used to encode the sentance!
print(decoded_seq)
print(encoded_seq)

<s> This is a rather long sequence. It is at least longer than the sequence A.</s></s> Det här är en mening på Svenska!</s>
{'input_ids': [0, 3293, 83, 10, 43257, 4989, 40, 944, 3956, 5, 1650, 83, 99, 19713, 51713, 3501, 70, 40, 944, 3956, 62, 5, 2, 2, 579, 2496, 369, 22, 26213, 109, 46062, 38, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [92]:
from transformers import XLMRobertaConfig

In [141]:
[print(f"{key}: \t {value}") for (key, value) in XLMRobertaConfig.get_config_dict(XLM_NAME)[0].items()]

# easier
config_easy = XLMRobertaConfig.from_pretrained(XLM_NAME)

config = XLMRobertaConfig.get_config_dict(XLM_NAME)[0]
config

architectures: 	 ['XLMRobertaForMaskedLM']
attention_probs_dropout_prob: 	 0.1
bos_token_id: 	 0
eos_token_id: 	 2
hidden_act: 	 gelu
hidden_dropout_prob: 	 0.1
hidden_size: 	 768
initializer_range: 	 0.02
intermediate_size: 	 3072
layer_norm_eps: 	 1e-05
max_position_embeddings: 	 514
model_type: 	 xlm-roberta
num_attention_heads: 	 12
num_hidden_layers: 	 12
output_past: 	 True
pad_token_id: 	 1
type_vocab_size: 	 1
vocab_size: 	 250002


{'architectures': ['XLMRobertaForMaskedLM'],
 'attention_probs_dropout_prob': 0.1,
 'bos_token_id': 0,
 'eos_token_id': 2,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'layer_norm_eps': 1e-05,
 'max_position_embeddings': 514,
 'model_type': 'xlm-roberta',
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'output_past': True,
 'pad_token_id': 1,
 'type_vocab_size': 1,
 'vocab_size': 250002}

In [133]:
print("MAX Position embeddings XLM-R: ", config['max_position_embeddings'])
print("XLM-R embedding range: [0, {}]".format(int(config['max_position_embeddings']) - 1))

MAX Position embeddings XLM-R:  514
XLM-R embedding range: [0, 513]


In [147]:
from transformers import XLMRobertaForSequenceClassification
config_easy
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', config=config_easy)
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
output = model(**inputs)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [155]:
from transformers import RobertaTokenizer, RobertaForMultipleChoice
import torch

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMultipleChoice.from_pretrained('roberta-base', return_dict=True)

prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels)  # batch size is 1

# the linear classifier still needs to be trained
loss = outputs.loss
logits = outputs.logits

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultipleChoice: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predi

In [156]:
encoding.items()

dict_items([('input_ids', tensor([[    0,  1121,  2627,     6,  9366,  1665,    11,  4828,  9629,     6,
           215,    25,    23,    10,  2391,     6,    16,  2633,  9977,  5895,
           196,     4,     2,     2,  1121,  2627,     6,  9366,  1665,    11,
          4828,  9629,     6,   215,    25,    23,    10,  2391,     6,    16,
          2633,  9977,  5895,   196,     4,     2],
        [    0,   243,    16, 18804,    19,    10, 20935,     8,    10,  7023,
             4,     2,     2,   243,    16, 18804,   150,   547,    11,     5,
           865,     4,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1]])), ('attention_mask', tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [157]:
{k: v for k,v in encoding.items()}

{'input_ids': tensor([[    0,  1121,  2627,     6,  9366,  1665,    11,  4828,  9629,     6,
            215,    25,    23,    10,  2391,     6,    16,  2633,  9977,  5895,
            196,     4,     2,     2,  1121,  2627,     6,  9366,  1665,    11,
           4828,  9629,     6,   215,    25,    23,    10,  2391,     6,    16,
           2633,  9977,  5895,   196,     4,     2],
         [    0,   243,    16, 18804,    19,    10, 20935,     8,    10,  7023,
              4,     2,     2,   243,    16, 18804,   150,   547,    11,     5,
            865,     4,     2,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [158]:
{k: v.unsqueeze(0) for k,v in encoding.items()}

{'input_ids': tensor([[[    0,  1121,  2627,     6,  9366,  1665,    11,  4828,  9629,     6,
             215,    25,    23,    10,  2391,     6,    16,  2633,  9977,  5895,
             196,     4,     2,     2,  1121,  2627,     6,  9366,  1665,    11,
            4828,  9629,     6,   215,    25,    23,    10,  2391,     6,    16,
            2633,  9977,  5895,   196,     4,     2],
          [    0,   243,    16, 18804,    19,    10, 20935,     8,    10,  7023,
               4,     2,     2,   243,    16, 18804,   150,   547,    11,     5,
             865,     4,     2,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
               1,     1,     1,     1,     1,     1]]]),
 'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# Roberta QA

In [None]:
# XLM-R QA
# https://huggingface.co/deepset/roberta-base-squad2

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# https://huggingface.co/deepset/xlm-roberta-large-squad2
tokenizer = AutoTokenizer.from_pretrained("a-ware/xlmroberta-QA")

model = AutoModelForQuestionAnswering.from_pretrained("a-ware/xlmroberta-QA")

In [None]:
# Fine tuned for XQuAD?
# https://huggingface.co/deepset/xlm-roberta-large-squad2

In [7]:
from datasets import list_datasets

dataset_list = list_datasets()
print(dataset_list)

['aeslc', 'ag_news', 'ai2_arc', 'allocine', 'anli', 'arcd', 'art', 'billsum', 'biomrc', 'blended_skill_talk', 'blimp', 'blog_authorship_corpus', 'bookcorpus', 'boolq', 'break_data', 'c4', 'cfq', 'civil_comments', 'clue', 'cmrc2018', 'cnn_dailymail', 'coarse_discourse', 'com_qa', 'common_gen', 'commonsense_qa', 'compguesswhat', 'conll2000', 'conll2003', 'coqa', 'cornell_movie_dialog', 'cos_e', 'cosmos_qa', 'crd3', 'crime_and_punish', 'csv', 'daily_dialog', 'definite_pronoun_resolution', 'discofuse', 'docred', 'doqa', 'drop', 'eli5', 'emo', 'emotion', 'empathetic_dialogues', 'eraser_multi_rc', 'esnli', 'event2Mind', 'fever', 'flores', 'fquad', 'gap', 'germeval_14', 'gigaword', 'glue', 'guardian_authorship', 'hans', 'hansards', 'hellaswag', 'hotpot_qa', 'hyperpartisan_news_detection', 'imdb', 'iwslt2017', 'jeopardy', 'json', 'kilt_tasks', 'kilt_wikipedia', 'kor_nli', 'lc_quad', 'librispeech_lm', 'lince', 'lm1b', 'math_dataset', 'math_qa', 'matinf', 'mlqa', 'mlsum', 'movie_rationales', 'ms

In [31]:
# Load SQuAD
"""
[ "NAME", "DATA" ]

{
    "id":"string"
    "title":"string"
    "context":"string"
    "question":"string"
        "answers":{
        "[]":{
            "text":"string"
            "answer_start":"int32"
        }
    }
}
"""

from datasets import load_dataset

data_train, data_val = load_dataset('squad').items()
data_train = data_train[1]
data_val = data_val[1]

Reusing dataset squad (/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)


In [32]:
print(data_train)
print()
print(data_val)

Dataset(features: {'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}, num_rows: 87599)

Dataset(features: {'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}, num_rows: 10570)


In [45]:
"""
id, title, context, question, answers, 
"""
print(data_train['id'][0])
print(data_train['context'][0])

5733be284776f41900661182
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.


# QA models

### Load via pipeline

In [2]:
### Load Roberta pretrained model for Squad
# https://huggingface.co/deepset/roberta-base-squad2
from transformers.pipelines import pipeline
from transformers.modeling_auto import AutoModelForQuestionAnswering
from transformers.tokenization_auto import AutoTokenizer

model_name = "deepset/roberta-base-squad2"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)
# nlp = pipeline('question-answering')

context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the `run_squad.py`.
"""

# yields very low scores compared to Huggingfaces own QA pipeline
print(nlp(question="What is extractive question answering?", context=context))
print(nlp(question="What is a good example of a question answering dataset?", context=context))

KeyError: 0

In [71]:
nlp = pipeline('question-answering')
print(nlp(question="What is extractive question answering?", context=context))
print(nlp(question="What is a good example of a question answering dataset?", context=context))



{'score': 0.6222440004348755, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}




{'score': 0.5115299820899963, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}


### Load via models

In [72]:
# # b) Load model & tokenizer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]

for question in questions:
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer_start_scores, answer_end_scores = model(**inputs)

    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…


Question: How many pretrained models are available in Transformers?
Answer: over 32 +

Question: What does Transformers provide?
Answer: general - purpose architectures

Question: Transformers provides interoperability between which frameworks?
Answer: tensorflow 2 . 0 and pytorch



In [1]:
# # b) Load model & tokenizer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

"""
Here is an example of question answering using a model and a tokenizer. The process is the following:
Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it with the weights stored in the checkpoint.
Define a text and a few questions.
Iterate over the questions and build a sequence from the text and the current question, with the correct model-specific separators token type ids and attention masks
Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and text), for both the start and end positions.
Compute the softmax of the result to get probabilities over the tokens
Fetch the tokens from the identified start and stop values, convert those tokens to a string.
Print the results
"""


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
    "How many pretrained models are available in Transformers?",
    "What does Transformers provide?",
    "Transformers provides interoperability between which frameworks?",
]

for question in questions:
    inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]

    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # Equivalent to defining first a vector of the inputs, how long and where the sep token is located / called
    answer_start_scores, answer_end_scores = model(**inputs)

    answer_start = torch.argmax(
        answer_start_scores
    )  # Get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

    print(f"Question: {question}")
    print(f"Answer: {answer}\n")
    


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=473.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=213450.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=260793700.0), HTML(value='')))


Question: How many pretrained models are available in Transformers?
Answer: over 32 +

Question: What does Transformers provide?
Answer: general - purpose architectures

Question: Transformers provides interoperability between which frameworks?
Answer: TensorFlow 2 . 0 and PyTorch



In [5]:
inputs = tokenizer.encode_plus("How many pretrained models are available in Transformers?", text, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]

text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
# Equivalent to defining first a vector of the inputs, how long and where the sep token is located / called
answer_start_scores, answer_end_scores = model(**inputs)

answer_start = torch.argmax(
    answer_start_scores
)  # Get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
answer
tokenizer.convert_ids_to_tokens(input_ids)

['[CLS]',
 'How',
 'many',
 'pre',
 '##tra',
 '##ined',
 'models',
 'are',
 'available',
 'in',
 'Transformers',
 '?',
 '[SEP]',
 '[UNK]',
 'Transformers',
 '(',
 'formerly',
 'known',
 'as',
 'p',
 '##yt',
 '##or',
 '##ch',
 '-',
 'transform',
 '##ers',
 'and',
 'p',
 '##yt',
 '##or',
 '##ch',
 '-',
 'pre',
 '##tra',
 '##ined',
 '-',
 'be',
 '##rt',
 ')',
 'provides',
 'general',
 '-',
 'purpose',
 'architecture',
 '##s',
 '(',
 'B',
 '##ER',
 '##T',
 ',',
 'GP',
 '##T',
 '-',
 '2',
 ',',
 'R',
 '##o',
 '##BE',
 '##RT',
 '##a',
 ',',
 'X',
 '##LM',
 ',',
 'Di',
 '##st',
 '##il',
 '##B',
 '##ert',
 ',',
 'X',
 '##L',
 '##Net',
 '…',
 ')',
 'for',
 'Natural',
 'Language',
 'Understanding',
 '(',
 'NL',
 '##U',
 ')',
 'and',
 'Natural',
 'Language',
 'Generation',
 '(',
 'NL',
 '##G',
 ')',
 'with',
 'over',
 '32',
 '+',
 'pre',
 '##tra',
 '##ined',
 'models',
 'in',
 '100',
 '+',
 'languages',
 'and',
 'deep',
 'inter',
 '##oper',
 '##ability',
 'between',
 'Ten',
 '##sor',
 '##F',
 '##

In [7]:
tokenizer.decode(inputs["input_ids"].tolist()[0])

'[CLS] How many pretrained models are available in Transformers? [SEP] [UNK] Transformers ( formerly known as pytorch - transformers and pytorch - pretrained - bert ) provides general - purpose architectures ( BERT, GPT - 2, RoBERTa, XLM, DistilBert, XLNet … ) for Natural Language Understanding ( NLU ) and Natural Language Generation ( NLG ) with over 32 + pretrained models in 100 + languages and deep interoperability between TensorFlow 2. 0 and PyTorch. [SEP]'

### To fine-tune on SQuAD, run

[https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_squad.py](https://github.com/tensorflow/models/blob/master/official/nlp/bert/run_squad.py)

  [https://github.com/google-research/bert/blob/master/run_squad.py](https://github.com/google-research/bert/blob/master/run_squad.py)




In [88]:
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer
import torch

tokenizer = DistilBertTokenizer.from_pretrained('bert-base-cased')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad', return_dict=True)

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
inputs = tokenizer(question, text, return_tensors='pt')
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])

outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
loss = outputs.loss
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [94]:
answer_start_scores

tensor([[-2.6795, -3.8380, -6.4174, -5.7717, -7.4343, -7.6056, -6.1904, -5.6268,
         -5.3795, -6.5965, -4.1058, -4.5886, -1.8644, -1.0924, -3.5250, -3.4141,
         -5.5243, -6.4417,  0.0673, -4.3859, -5.0666, -3.7565, -4.6029, -3.2577,
         -4.7540, -5.2575,  0.7586, -4.6094, -5.3793, -4.1280, -4.7538, -2.8453,
         -5.5077, -5.1051, -5.2773, -4.2016, -5.1009, -3.2573, -2.8114, -1.0467,
         -5.5897, -4.6258, -3.3316, -4.9376, -0.7163,  1.3645, -4.3429, -4.1639,
         -4.4820, -1.4840, -4.5714, -5.3030, -3.9374, -4.9646, -2.2356, -5.0848,
         -5.1845, -5.7979, -5.0689, -4.9632, -3.2077, -5.3158, -5.2520, -3.7048,
         -6.3534, -5.7309, -6.1014, -5.6059, -5.4818, -2.5921, -5.5769, -5.3902,
         -3.8664, -4.3630, -3.3337,  0.5424, -4.2209, -3.5774, -3.7752, -1.7529,
         -5.3554, -3.5998, -4.4433,  0.7994, -3.8337, -3.7183, -3.5382, -1.4980,
         -5.4940, -4.1144, -3.2679, -2.1713, -2.1325, -4.0288, -3.1973, -6.4203,
         -5.8919, -4.0308, -

In [91]:
a, b = model(**inputs)

In [93]:
b

'end_logits'