In [1]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import torch.nn.functional as F

In [3]:
# The tokenizer and the masked-LM weights are loaded from the same checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForMaskedLM.from_pretrained(checkpoint)

# Switch to evaluation mode (dropout off) for inference. As the last expression
# of the cell, this call also echoes the module structure in the cell output.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [4]:
def predict_mask(sentence, top_k=5):
    """Return the most likely fill-ins for the single [MASK] token in ``sentence``.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        sentence: Text containing exactly one mask token.
        top_k: Number of candidates to return, most probable first.

    Returns:
        A list of ``(token, probability)`` tuples of length ``top_k``, where
        ``token`` is the decoded, whitespace-stripped vocabulary entry and
        ``probability`` is a Python float taken from the softmax over the
        vocabulary at the mask position.

    Raises:
        ValueError: If ``sentence`` does not contain exactly one mask token.
    """
    # Calling the tokenizer directly is the documented replacement for the
    # deprecated ``encode_plus``.
    inputs = tokenizer(sentence, return_tensors="pt")

    # Locate the mask; fail with a clear message instead of the opaque
    # tensor-to-scalar error that a bare ``.item()`` produces for 0 or 2+ hits.
    mask_positions = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0]
    if mask_positions.numel() != 1:
        raise ValueError(
            f"Expected exactly one mask token in the sentence, found {mask_positions.numel()}."
        )
    mask_index = mask_positions.item()

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Only the mask position is needed, so normalise that single row rather
    # than every position of the sequence.
    mask_word_probs = F.softmax(logits[0, mask_index], dim=-1)
    top_k_weights, top_k_indices = torch.topk(mask_word_probs, top_k, sorted=True)

    # ``tolist()`` yields plain ints/floats, so ``decode`` receives ordinary
    # token ids rather than 0-d tensors.
    return [
        (tokenizer.decode([token_id]).strip(), prob)
        for token_id, prob in zip(top_k_indices.tolist(), top_k_weights.tolist())
    ]


In [5]:
# Worked example: print the ranked candidates for one masked sentence.
masked_sentence = "The capital of France is [MASK]."
print("Masked Sentence:", masked_sentence)

top_predictions = predict_mask(masked_sentence)
# Ranks are shown 1-based.
for rank, (word, prob) in enumerate(top_predictions, start=1):
    print(f"{rank}: {word} ({prob:.4f})")


Masked Sentence: The capital of France is [MASK].
1: paris (0.4168)
2: lille (0.0714)
3: lyon (0.0634)
4: marseille (0.0444)
5: tours (0.0303)


In [6]:
# (masked sentence, expected fill-in) pairs used as a small sanity check.
test_cases = [
    ("The capital of France is [MASK].", "paris"),
    ("The [MASK] wrote a novel.", "author"),
    ("He went to the [MASK] to buy food.", "store"),
]

# A case counts as a hit when the expected word appears anywhere among the
# candidates returned by predict_mask (called with its default top_k).
correct = 0
for sentence, expected in test_cases:
    candidates = [word.lower() for word, _ in predict_mask(sentence)]
    print(f"Sentence: {sentence}")
    print(f"Top-5: {candidates} | Truth: {expected}")
    correct += int(expected.lower() in candidates)

accuracy = correct / len(test_cases)
print(f"\nTop-5 Accuracy on test cases: {accuracy * 100:.2f}%")


Sentence: The capital of France is [MASK].
Top-5: ['paris', 'lille', 'lyon', 'marseille', 'tours'] | Truth: paris
Sentence: The [MASK] wrote a novel.
Top-5: ['couple', 'author', 'family', 'writer', 'two'] | Truth: author
Sentence: He went to the [MASK] to buy food.
Top-5: ['store', 'market', 'supermarket', 'bank', 'kitchen'] | Truth: store

Top-5 Accuracy on test cases: 100.00%
