In [None]:
#importing the dataset
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset and pull out its 'data' and 'target' entries.
# NOTE(review): x and y are not referenced by any other cell visible here.
data = load_breast_cancer()
x, y = data['data'], data['target']

In [None]:
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# Tokenizer and question-answering model are loaded from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

question = "Who was Jim Henson?"
text = "Jim Henson was a nice puppet"

# Encode the (question, text) pair as PyTorch tensors and run a forward
# pass with gradient tracking disabled (inference only).
inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Positions with the highest start / end logit.
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

# Slice the first sequence's token ids from start to end (inclusive) and
# decode them back to text; the decoded string is the cell's output.
predict_answer_tokens = inputs["input_ids"][0][answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

In [None]:
# Display the current value of `inputs` (the tokenizer's encoded output).
inputs

In [None]:

# Map the ids of the first encoded sequence back to token strings,
# explicitly keeping special tokens in the result.
tokenizer.convert_ids_to_tokens(inputs["input_ids"][0], skip_special_tokens=False)

In [None]:
from transformers import AutoTokenizer
# NOTE: this rebinds `tokenizer`, replacing the BertTokenizer created for the
# "deepset/bert-base-cased-squad2" checkpoint in the earlier cell. Cells that
# use `tokenizer` therefore behave differently depending on execution order.
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Toy example: a short question and a context long enough to be split into
# several windows by the tokenizer call that follows.
question = "Who was Jim Henson?"
context = "Jim Henson was a nice puppet. Jim Henson was a nice puppet. Jim Henson was a nice puppet. Yaya."

In [75]:
# Encode the (question, context) pair into fixed-length, overlapping windows.
inputs = tokenizer(
    question,
    context,
    # Window size: every feature is truncated/padded to exactly 20 tokens.
    max_length=20,
    padding="max_length",
    # Only the second sequence (the context) may be truncated.
    truncation="only_second",
    # Emit the truncated remainder as extra features, with consecutive
    # windows sharing 7 context tokens.
    return_overflowing_tokens=True,
    stride=7,
    # Also return each token's (start, end) character offsets.
    return_offsets_mapping=True,
)

In [76]:
# List the fields present in the encoded batch.
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [None]:
# Show the tokens of the first feature (index 0) of the encoded batch,
# explicitly keeping special tokens in the result.
tokenizer.convert_ids_to_tokens(inputs["input_ids"][0], skip_special_tokens=False)

In [None]:
# Pull the per-feature offsets and the feature -> sample mapping out of the encoding.
# NOTE(review): sample_map is not used by any cell visible here.
sample_map = inputs["overflow_to_sample_mapping"]
offset_mapping = inputs["offset_mapping"]

# Character span of the answer inside `context`: context[0:10] is "Jim Henson".
start_char, end_char = 0, 10

In [74]:
# For every feature (window), print its decoded text and the token positions
# of the answer span, or (0, 0) when the answer is not fully inside the window.
for feature_idx, offsets in enumerate(offset_mapping):
    sequence_ids = inputs.sequence_ids(feature_idx)

    # Advance to the first token whose sequence id is 1 (the context), then
    # keep going until the run of 1s stops; step back once so that
    # context_end is the last context token (inclusive).
    context_start = 0
    while sequence_ids[context_start] != 1:
        context_start += 1
    context_end = context_start
    while sequence_ids[context_end] == 1:
        context_end += 1
    context_end -= 1

    # Show the decoded feature split on the separator token.
    decoded = tokenizer.decode(inputs["input_ids"][feature_idx])
    print(decoded.split("[SEP]"))

    # The answer is usable only if the window's context starts at or before
    # the answer's first character and ends at or after its last one.
    answer_inside = (
        offsets[context_start][0] <= start_char
        and offsets[context_end][1] >= end_char
    )

    if answer_inside:
        # Scan right past every token starting at or before start_char;
        # the token just before the stopping point is the answer's first token.
        cursor = context_start
        while cursor <= context_end and offsets[cursor][0] <= start_char:
            cursor += 1
        start_position = cursor - 1

        # Scan left past every token ending at or after end_char;
        # the token just after the stopping point is the answer's last token.
        cursor = context_end
        while cursor >= context_start and offsets[cursor][1] >= end_char:
            cursor -= 1
        end_position = cursor + 1
    else:
        # Answer not fully contained in this window: label is (0, 0).
        start_position, end_position = 0, 0

    print(start_position, end_position)

['[CLS] who was jim henson? ', ' jim henson was a nice puppet. jim henson was a nice ', '']
7 8
['[CLS] who was jim henson? ', ' puppet. jim henson was a nice puppet. jim henson was ', '']
0 0
['[CLS] who was jim henson? ', ' a nice puppet. jim henson was a nice puppet. ya ', '']
0 0
['[CLS] who was jim henson? ', ' henson was a nice puppet. yaya. ', ' [PAD] [PAD] [PAD]']
0 0
