# The celebrated [BERT architecture](https://arxiv.org/pdf/1810.04805)
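Simple token embeddings (summed with position and segment embeddings) feeding a classic Transformer encoder stack. \
Pre-trained using $\texttt{MLM}$ (masked language modelling) together with next-sentence prediction.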

In [1]:
import warnings

import torch

# Blanket suppression keeps the demo output clean, but it also hides
# deprecation notices. It is installed before the transformers import,
# presumably so that import-time warnings are silenced as well.
warnings.filterwarnings("ignore")

from transformers import BertModel, BertTokenizer

# Single source of truth for the checkpoint: the tokenizer vocabulary and the
# model weights must come from the same checkpoint, so the name is written once.
MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

KeyboardInterrupt: 

In [2]:
# Sum the element counts of every parameter tensor, then report in millions.
total_params = sum(p.numel() for p in model.parameters())
print(f"Param count: {total_params/1e6:.2f}M parameters")

Param count: 109.48M parameters


In [3]:

# Example sentence used as the single input for this demo
text = "BERT is great for natural language processing!"

# Tokenize input; return_tensors="pt" requests PyTorch tensors so the result
# can be unpacked straight into the model call below.
inputs = tokenizer(text, return_tensors="pt")
# The `=` specifier echoes the expression text next to its value.
# NOTE(review): the sequence length presumably includes the tokenizer's
# special tokens in addition to the word pieces — confirm if it matters.
print(f'{inputs["input_ids"].shape=}')

# Forward pass (no gradient calculation needed for inference)
with torch.no_grad():
    # Unpack the tokenizer output as keyword arguments to the model.
    outputs = model(**inputs)

# Extract embeddings: one hidden vector per input token from the final layer
last_hidden_state = outputs.last_hidden_state  # Shape: [batch_size, seq_len, hidden_dim]

print(f'{last_hidden_state.shape=}')

inputs["input_ids"].shape=torch.Size([1, 10])
last_hidden_state.shape=torch.Size([1, 10, 768])
