# Models (PyTorch)

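Install the Transformers, Datasets, and Evaluate libraries to run this notebook. PyTorch itself must already be available in the environment: the cells below import `torch`, and the install command does not include it.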

In [None]:
# Install required libraries for Transformers, datasets, and evaluation
!uv pip install datasets evaluate transformers[sentencepiece]

In [None]:
# Create a model from scratch in two steps: build a configuration, then build
# the model from that configuration.
# - BertConfig holds the hyperparameters that define the model architecture.
# - BertModel instantiated directly from a config starts with random weights.
from transformers import BertConfig, BertModel

# BertConfig() with no arguments uses the default (BERT-base) parameters.
config = BertConfig()

# The architecture follows `config`; the weights are randomly initialized.
model = BertModel(config)

In [None]:
# Print the configuration object to inspect the model architecture parameters.
# Key parameters include:
# - hidden_size: dimensionality of each token's hidden-state vector (768 for BERT-base)
# - num_hidden_layers: number of transformer layers (12 for BERT-base)
# - num_attention_heads: number of attention heads per layer (12 for BERT-base)
# - intermediate_size: inner size of each layer's feed-forward network (3072 for BERT-base)
# - max_position_embeddings: maximum sequence length (512 for BERT)
print(config)

In [None]:
# Reminder: a model built directly from a config has random weights.
# It has not been trained, so its outputs are meaningless until it is trained.
# This is still useful for research, experimentation, or training from scratch.
from transformers import BertConfig, BertModel

config = BertConfig()
model = BertModel(config)

# `model` is randomly initialized here: train it before using it in production.

In [None]:
# Load a pre-trained model: from_pretrained() downloads trained weights from the
# Hub instead of starting from random ones.
# "bert-base-cased" is a BERT model trained on large text corpora, so it has
# already learned meaningful representations and is ready for use.
# It is the case-sensitive version (it distinguishes "Apple" from "apple").
from transformers import BertModel

# Rebinds `model`, replacing the randomly initialized model built earlier.
model = BertModel.from_pretrained("bert-base-cased")

In [None]:
# Save the model locally: save_pretrained() stores both the config and the
# weights in the given directory, which contains:
# - config.json: model configuration
# - pytorch_model.bin: model weights
#   NOTE(review): recent Transformers releases appear to write model.safetensors
#   by default instead of pytorch_model.bin — confirm against the installed version.
# The saved directory allows offline usage and custom model distribution.
model.save_pretrained("directory_on_my_computer")

In [None]:
# Raw example sentences used by the following cells.
# They are plain strings and still have to be tokenized before a model can
# consume them.
sequences = [
    "Hello!",
    "Cool.",
    "Nice!",
]

In [None]:
# Token IDs for the example sequences, written out by hand instead of being
# produced by a tokenizer.
# Every row is wrapped in BERT's special tokens:
# - 101: [CLS] (classification token), first ID of each row
# - 102: [SEP] (separator token), last ID of each row
# The IDs in between stand for the word and its punctuation mark:
# - 7592: "Hello", 4658: "Cool", 3835: "Nice"
# - 999: "!", 1012: "."
# NOTE(review): these IDs look like bert-base-uncased vocabulary entries
# (lowercased "hello", "cool", "nice"), while this notebook loads
# "bert-base-cased" — confirm with the matching tokenizer before relying on
# the word-to-ID mapping above.
encoded_sequences = [
    [101, *inner_ids, 102]
    for inner_ids in ([7592, 999], [4658, 1012], [3835, 999])
]

In [None]:
# Convert the nested list of token IDs into a PyTorch tensor, because models
# expect tensors as input, not Python lists.
# All three rows hold four IDs, so the tensor shape is
# [batch_size, sequence_length] = [3, 4].
import torch

model_inputs = torch.tensor(encoded_sequences)

In [None]:
# Pass the tensor through the model to get contextualized embeddings for each token.
# No attention mask is passed: every row of model_inputs has the same length,
# so there is no padding that would need to be masked out.
# torch.no_grad() turns off gradient tracking; this is inference only, so no
# autograd graph needs to be built or kept in memory.
with torch.no_grad():
    output = model(model_inputs)

# output.last_hidden_state contains the final layer embeddings
# Shape: [batch_size, sequence_length, hidden_size] = [3, 4, 768]