# Encoder-style Models

## Use Case 1: Fill in the missing token

In [2]:
from transformers import BertTokenizer, BertModel, BertForMaskedLM

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
text = "I want to [MASK] a new car, roychowdhury."

In [5]:
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text)

['i', 'want', 'to', '[MASK]', 'a', 'new', 'car', ',', 'roy', '##cho', '##wd', '##hur', '##y', '.']


In [6]:
masked_index = tokenized_text.index('[MASK]')
print(masked_index)

3


In [7]:
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print(indexed_tokens)

[1045, 2215, 2000, 103, 1037, 2047, 2482, 1010, 6060, 9905, 21724, 24572, 2100, 1012]


In [8]:
import torch
tokens_tensor = torch.tensor([indexed_tokens])
print(tokens_tensor)

tensor([[ 1045,  2215,  2000,   103,  1037,  2047,  2482,  1010,  6060,  9905,
         21724, 24572,  2100,  1012]])


In [9]:
outputs = model(tokens_tensor)
predictions = outputs[0] # look at the first element in the batch
print(predictions)
print(predictions.shape)

# print(predictions[0, masked_index])
print(predictions[0, masked_index].shape)

tensor([[[ -7.5251,  -7.4327,  -7.4601,  ...,  -6.8418,  -6.7214,  -4.5217],
         [ -8.1979,  -8.0036,  -8.2781,  ...,  -7.4715,  -6.7217,  -6.1903],
         [-11.0837, -10.7895, -11.1458,  ...,  -9.9987,  -9.3693,  -7.2951],
         ...,
         [ -5.5479,  -4.9952,  -5.2698,  ...,  -3.6761,  -5.6037,  -0.3025],
         [ -9.3287,  -9.0934,  -9.2260,  ...,  -7.7051,  -8.5031,  -5.5208],
         [-13.0553, -12.8476, -13.2015,  ..., -12.3488, -11.0748, -10.5593]]],
       grad_fn=<ViewBackward0>)
torch.Size([1, 14, 30522])
torch.Size([30522])


In [10]:
torch.argmax(predictions[0, 0])

tensor(1012)

In [11]:
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_index, predicted_token)

2022 be


In [12]:
tokenized_text[masked_index] = predicted_token

In [13]:
predicted_sentence = ' '.join(tokenized_text).replace(' ##', '')
print("Predicted Sentence:", predicted_sentence)

Predicted Sentence: i want to be a new car , roychowdhury .


## Use Case 2: Get the token vectors


In [None]:
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
text = "QCBio Colab is the real deal."
tokens = tokenizer.tokenize(text)
print(tokens)

['qc', '##bio', 'cola', '##b', 'is', 'the', 'real', 'deal', '.']


In [None]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

[25196, 26282, 15270, 2497, 2003, 1996, 2613, 3066, 1012]


In [None]:
import torch
outputs = model(torch.tensor([input_ids]))
print(outputs.keys())

odict_keys(['last_hidden_state', 'pooler_output'])


In [None]:
last_hidden_states = outputs.last_hidden_state

In [None]:
print(last_hidden_states.shape)

torch.Size([1, 9, 768])


## Challenge! Lets fill out the function to generate the correct token for MASK.

In [31]:
def unmask(text:str = "The [MASK] you give me is difficult"):
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')

    # Tokenizing the text
    tokenized_text = tokenizer.tokenize(text)
    print(tokenized_text)
    
    # Identify the mask token
    masked_index = tokenized_text.index('[MASK]')
    print(masked_index)

    # Convert the tokens into token ids
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    print(indexed_tokens)

    # Putting indexed tokens into a torch tensor
    import torch
    tokens_tensor = torch.tensor([indexed_tokens])
    print(tokens_tensor)    

    # Passing the tensor into the model
    outputs = model(tokens_tensor)
    predictions = outputs[0] # look at the first element in the batch
    print(predictions)
    print(predictions.shape)
    print(predictions[0, masked_index].shape)
    torch.argmax(predictions[0, 0])
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_index, predicted_token)

    tokenized_text[masked_index] = predicted_token
    # For the masked token find the output token with the highest probability
    #print("Just the masked output: ", masked_output.shape)

    predicted_sentence = ' '.join(tokenized_text).replace(' ##', '')
    print("Predicted Sentence:", predicted_sentence)

    return predicted_sentence

In [32]:

unmask()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['the', '[MASK]', 'you', 'give', 'me', 'is', 'difficult']
1
[1996, 103, 2017, 2507, 2033, 2003, 3697]
tensor([[1996,  103, 2017, 2507, 2033, 2003, 3697]])
tensor([[[ -7.4526,  -7.4113,  -7.4881,  ...,  -6.8182,  -7.1236,  -4.7793],
         [ -8.7925,  -8.6144,  -8.9970,  ...,  -7.6588,  -7.9244,  -6.7575],
         [-12.9916, -12.7997, -12.9415,  ..., -10.5085, -11.7863,  -8.9920],
         ...,
         [ -9.8546,  -9.7412,  -9.9184,  ...,  -8.1045,  -9.5773,  -8.8084],
         [-11.1741, -11.0256, -11.4264,  ...,  -9.4551, -10.0672,  -8.3335],
         [ -9.9531,  -9.9532, -10.3589,  ...,  -7.5861,  -9.2030,  -9.3564]]],
       grad_fn=<ViewBackward0>)
torch.Size([1, 7, 30522])
torch.Size([30522])
2054 what
Predicted Sentence: the what you give me is difficult


'the what you give me is difficult'

In [None]:
samples = ["I want to predict [MASK] for the next day", "I want to understand [MASK] now"]

# Decoder-style Models

## Use case 1: Single Token Generation

In [33]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [34]:
model_name = 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [47]:
sentence = "The man stand on the"
input_ids = tokenizer.encode(sentence, return_tensors='pt')
print(input_ids)

tensor([[ 464,  582, 1302,  319,  262]])


In [48]:
import torch
outputs = model(input_ids=input_ids)
predictions = outputs.logits

In [49]:
predicted_index = torch.argmax(predictions[0, -1, :]).item()
predicted_token = tokenizer.decode([predicted_index])

In [50]:
print("Predicted token:", predicted_token)

Predicted token:  edge


## Multi-token Generation

In [51]:
# Generate multiple tokens using beam search
beam_outputs = model.generate(
    input_ids,
    max_length=50,  # Maximum length of the generated sequence
    num_beams=5,    # Number of beams in beam search
    num_return_sequences=3,  # Number of sequences to generate
    early_stopping=True,     # Stop generation when all beams reach the end
    temperature=5.0,         # Temperature for sampling
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,  # Padding token ID
    eos_token_id=tokenizer.eos_token_id,  # End-of-sequence token ID
    bos_token_id=tokenizer.bos_token_id,  # Beginning-of-sequence token ID
    length_penalty=0.7,       # Length penalty for beam search
)

# Decode and print the generated sequences
for output in beam_outputs:
    generated_sequence = tokenizer.decode(output, skip_special_tokens=False)
    print(generated_sequence)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The man stand on the bridge. We can make contact with the other ship and see if we gain any advantage or lose that ship, but on the ship I was assigned to was destroyed before I arrived. It seems like there was plenty of time left
The man stand on the bridge. We can make contact with the other ship and see if we gain any advantage or lose that ship, but on the ship I was assigned to was destroyed before I arrived. It seems like there was plenty of time where
The man stand on the bridge. We can make contact with the other ship and see if we gain any advantage or lose that ship, but on the ship I was assigned to was destroyed before I arrived. It seems like there was plenty of time ahead


## Challenge: Given part of a sentence,
"I want a piece of"

Generate the next token two ways:
1. Use Encoder-style BERT to predict "I want a piece of [MASK]"
2. Use Decoder-style GPT to predict "I want a piece of" -> TOKEN.

Are both the results the same, different? Let's discuss.

In [52]:
def generate_predict_gpt_bert(text:str = "I want a piece of the"):
    # feel free to copy code from the above examples.

    pass
    return (bert_predicted_sentence, gpt_sentence)

In [None]:
generate_predict_gpt_bert()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


I want a piece of the


('i want a piece of the "', 'I want a piece of the action')

## Experimental: A small State-of-the-Art Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

prompt = "Give me a short introduction to the field of bioinformatics."
messages = [
    {"role": "system", "content": "You are a graduate students in bioinformatics."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 6c964c0f-2188-41bb-8b6d-9a17c5f1627d)')' thrown while requesting HEAD https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct/resolve/main/config.json
Retrying in 1s [Retry 1/5].


ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`