In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Example sentence with a masked token
sentence = "The cat [MASK] on the mat."

# Tokenize the sentence
inputs = tokenizer(sentence, return_tensors="pt")

# Forward pass through the model to get predictions
outputs = model(**inputs)

# Get the predictions for the masked token
predictions = outputs.logits

# Identify the index of the masked token in the input
masked_token_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)

# Get the probabilities for each vocabulary token at the masked position
probs = torch.nn.functional.softmax(predictions[0, masked_token_index], dim=-1)

# Get the top k predicted tokens
top_k = 5
top_k_tokens = torch.topk(probs, k=top_k)
top_k_ids = top_k_tokens.indices.tolist()
top_k_probabilities = top_k_tokens.values.tolist()

# Decode the token IDs to words
predicted_words = tokenizer.batch_decode(top_k_ids)

# Display the results
print(f"Original Sentence: {sentence}")
print(f"Masked Token: {predicted_words[0]}")
print("Top Predictions:")
for word, prob in zip(predicted_words, top_k_probabilities):
    print(f"{word}: {prob:.4f}")



Original Sentence: The cat [MASK] on the mat.
Masked Token: sat
Top Predictions:
sat: 0.1680
lay: 0.0828
was: 0.0623
landed: 0.0533
collapsed: 0.0499


In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-medium")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2-medium")

In [6]:
raw_inputs = ["I love deep learning!", "I like Artificial Intelligence"]

tokenizer.pad_token = tokenizer.eos_token  # For example, using the end-of-sequence token as the padding token


inputs = tokenizer(raw_inputs, padding = True, truncation = True,return_tensors="pt")
# inputs = tokenizer(raw_inputs, return_tensors="pt")

print(inputs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': tensor([[   40,  1842,  2769,  4673,     0],
        [   40,   588, 35941,  9345, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0]])}


In [7]:
print('Tokenizer output for "I love deep learning!"')
print(f"Input ids: {inputs['input_ids'][0]}")
print(f"Attention Mask: {inputs['attention_mask'][0]}")
print("-"*30)
print('Tokenizer output for "I hate this so much!"')
print(f"Input ids: {inputs['input_ids'][1]}")
print(f"Attention Mask: {inputs['attention_mask'][1]}")

Tokenizer output for "I love deep learning!"
Input ids: tensor([  40, 1842, 2769, 4673,    0])
Attention Mask: tensor([1, 1, 1, 1, 1])
------------------------------
Tokenizer output for "I hate this so much!"
Input ids: tensor([   40,   588, 35941,  9345, 50256])
Attention Mask: tensor([1, 1, 1, 1, 0])


## Tokenizers Under the Hood

In [8]:
tokenizer("I love Machine learning")

{'input_ids': [40, 1842, 10850, 4673], 'attention_mask': [1, 1, 1, 1]}

In [9]:
tokens = tokenizer.tokenize("I love Machine learning")
tokens

['I', 'Ġlove', 'ĠMachine', 'Ġlearning']

In [10]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids

[40, 1842, 10850, 4673]

In [11]:
decoded_tokens = tokenizer.decode(token_ids)
decoded_tokens

'I love Machine learning'

In [12]:
model_prepped_ids = tokenizer.prepare_for_model(token_ids)
model_prepped_ids

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [40, 1842, 10850, 4673], 'attention_mask': [1, 1, 1, 1]}

# Models

## Pipeline