## Tokenization

In [None]:
import torch
# Load the `bert-base-cased` tokenizer through the huggingface/pytorch-transformers torch.hub entry point.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased')

In [None]:
# Sentence pair used throughout the notebook: a statement and a question about it.
text_1 = "I understand equations, both the simple and quadratical."
text_2 = "What kind of equations do I understand?"

# Tokenized input with special tokens around it (for BERT: [CLS] at the beginning and [SEP] at the end)
# Both texts are passed together, so they are encoded as one sequence pair.
indexed_tokens = tokenizer.encode(text_1, text_2, add_special_tokens=True)
indexed_tokens

In [None]:
# Show the token behind each id. The ids are already ints, which is what
# convert_ids_to_tokens expects, so they are passed through unchanged.
tokenizer.convert_ids_to_tokens(indexed_tokens)

In [None]:
# Convert the ids back into a single string, special tokens included.
tokenizer.decode(indexed_tokens)

## Segmenting Text

In [None]:
# Hard-coded ids of the [CLS] and [SEP] special tokens.
# NOTE(review): presumably these equal tokenizer.cls_token_id / tokenizer.sep_token_id
# for the BERT checkpoints used here — confirm if the checkpoint is ever changed.
cls_token = 101
sep_token = 102

In [None]:
def get_segment_ids(indexed_tokens, sep_token_id=None):
    """Build BERT segment (token type) ids for an encoded sequence or pair.

    Each token, including the separator that closes a segment, is labelled
    with the current segment number; the counter advances only *after* a
    separator. ``[CLS] a [SEP] b [SEP]`` therefore maps to ``0 0 0 1 1``.

    Args:
        indexed_tokens: List of integer token ids, special tokens included.
            May be empty.
        sep_token_id: Id of the separator token. When omitted, the
            notebook-level ``sep_token`` constant is used.

    Returns:
        Tuple ``(segments_tensor, tokens_tensor)`` of ``torch.long`` tensors,
        each of shape ``(1, len(indexed_tokens))``.
    """
    if sep_token_id is None:
        sep_token_id = sep_token  # fall back to the notebook-level constant

    segment_ids = []
    segment_id = 0
    for token in indexed_tokens:
        # Label first, then advance: a separator belongs to the segment it
        # terminates, so no after-the-fact correction of the last id is needed.
        segment_ids.append(segment_id)
        if token == sep_token_id:
            segment_id += 1

    # Explicit dtype keeps the tensors integer-typed even for empty input.
    return (
        torch.tensor([segment_ids], dtype=torch.long),
        torch.tensor([indexed_tokens], dtype=torch.long),
    )

In [None]:
# get_segment_ids returns (segment ids, token ids), each wrapped as a one-row tensor.
segments_tensors, tokens_tensor = get_segment_ids(indexed_tokens)
segments_tensors

## Text Masking

In [None]:
# Display the tokenizer's mask token (its id is shown in the next cell).
tokenizer.mask_token

In [None]:
# Id that is written into indexed_tokens below to hide a token from the model.
tokenizer.mask_token_id

In [None]:
# Position within indexed_tokens that will be replaced by the mask token.
masked_index = 5

In [None]:
# Overwrite the chosen position with the mask id; this edits indexed_tokens in place.
indexed_tokens[masked_index] = tokenizer.mask_token_id
# Rebuild the one-row token tensor so it reflects the masked ids.
tokens_tensor = torch.tensor([indexed_tokens])
# Decode to confirm where the mask landed in the text.
tokenizer.decode(indexed_tokens)

In [None]:
# Load the `bert-base-cased` masked-language-model variant from the same torch.hub entry point.
masked_lm_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForMaskedLM', 'bert-base-cased')

In [None]:
# Display the model object to inspect its structure.
masked_lm_model

In [None]:
# The word-embedding layer's weight is its first (and, for an embedding
# layer, presumably only) parameter: one learned vector per vocabulary entry.
word_embedding_layer = masked_lm_model.bert.embeddings.word_embeddings
embedding_table = word_embedding_layer.weight
embedding_table

In [None]:
# Dimensions of the embedding table.
embedding_table.shape

In [None]:
# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    # The segment ids are supplied as token_type_ids alongside the masked token ids.
    predictions = masked_lm_model(tokens_tensor, token_type_ids=segments_tensors)
predictions

In [None]:
# Shape of the first element of the model output (presumably the prediction scores — confirm).
predictions[0].shape

In [None]:
# Get the predicted token: take the scores at the masked position of the
# single input and keep the index of the highest one.
predicted_index = predictions[0][0][masked_index].argmax(dim=-1).item()
predicted_index

In [None]:
# A single id maps straight to a single token, so no list wrapping is needed.
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
predicted_token

In [None]:
# Re-display the masked input so the predicted token can be compared against its context.
tokenizer.decode(indexed_tokens)

In [None]:
# Same sentence pair as earlier, re-declared at the start of the question-answering example.
text_1 = "I understand equations, both the simple and quadratical."
text_2 = "What kind of equations do I understand?"

# The SQuAD-fine-tuned checkpoint is loaded with its own tokenizer, and the pair is re-encoded with it.
question_answering_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-large-uncased-whole-word-masking-finetuned-squad')
indexed_tokens = question_answering_tokenizer.encode(text_1, text_2, add_special_tokens=True)
# NOTE(review): get_segment_ids relies on the notebook-level sep_token; presumably the
# same separator id applies to this tokenizer's vocabulary — confirm.
segments_tensors, tokens_tensor = get_segment_ids(indexed_tokens)

In [None]:
# Load the question-answering model for the same checkpoint name as the tokenizer above.
question_answering_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', 'bert-large-uncased-whole-word-masking-finetuned-squad')

In [None]:
# Predict the start and end positions logits
# (inference only, so gradient tracking is switched off)
with torch.no_grad():
    out = question_answering_model(tokens_tensor, token_type_ids=segments_tensors)
out

In [None]:
# Logits for the answer's start position.
out.start_logits

In [None]:
# Logits for the answer's end position.
out.end_logits

In [None]:
# The answer span runs from the highest-scoring start position through the
# highest-scoring end position, inclusive.
answer_start = int(torch.argmax(out.start_logits))
answer_end = int(torch.argmax(out.end_logits))
answer_sequence = indexed_tokens[answer_start:answer_end + 1]
answer_sequence

In [None]:
# Show the individual tokens that make up the extracted answer span.
question_answering_tokenizer.convert_ids_to_tokens(answer_sequence)

In [None]:
# Join the answer ids back into readable text.
question_answering_tokenizer.decode(answer_sequence)