# Word Embeddings
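Embeddings are a way to turn words (or other data) into vectors of numbers so that a neural network can process them. In LLMs, the input text is first split into tokens, each token is assigned an integer ID from a fixed vocabulary, and an embedding layer (a trainable lookup table with one row per token ID) converts each ID into a vector the model can work with.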

In [10]:
import torch
import torch.nn as nn

# Embedding table sizes, named once so the layer and the messages printed
# below cannot drift apart.
vocab_size = 10000   # vocabulary size (how many unique tokens/words we can represent)
embedding_dim = 768  # dimension of each embedding vector (how many numbers represent each word)

# Create an embedding layer: a trainable lookup table with one row per token ID.
# The weights are randomly initialised here; they only become meaningful
# "learned features" once the model has been trained.
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

print(f"Total embedding parameters: {embedding.num_embeddings * embedding.embedding_dim:,}")

# Example token indices - these represent positions in our vocabulary
# Think of them as "word IDs" - each number corresponds to a specific word.
# Every ID must be smaller than vocab_size, otherwise the lookup fails.
token_ids = torch.tensor([12, 45, 78])
print(f"\nInput token IDs: {token_ids}")
print("These might represent words like: ['the', 'cat', 'sat']")

# Convert token IDs to dense vectors (embeddings)
# Each token ID selects one embedding_dim-long row of the table
embedded = embedding(token_ids)

# Report the counts from the actual result rather than repeating the literals.
num_tokens, vector_size = embedded.shape

print(f"\nOutput shape: {embedded.shape}")
print(f"    - {num_tokens} tokens, each represented by {vector_size} numbers")
print("    - Each number in the embedding captures some learned feature about the token")
print("    - The embeddings are trained along with the model")

Total embedding parameters: 7,680,000

Input token IDs: tensor([12, 45, 78])
These might represent words like: ['the', 'cat', 'sat']

Output shape: torch.Size([3, 768])
    - 3 tokens, each represented by 768 numbers
    - Each number in the embedding captures some learned feature about the token
    - The embeddings are trained along with the model


# Positional Embeddings
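Transformer-based LLMs look at all the tokens of a sequence at once, and the attention mechanism by itself has no notion of which token came first, so the model needs to be told the order of the words explicitly. Positional embeddings are added to the word embeddings to give the model information about the position of each word in the sequence. This helps the model understand the context and relationships between words based on their order (for example, "dog bites man" vs. "man bites dog").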

In [None]:
import torch
import torch.nn as nn

max_len = 512  # Context length of 512 tokens
# This determines how much text the model can "see" at once

vocab_size = 10000   # number of distinct token IDs
embedding_dim = 768  # shared by both tables so their outputs can be added element-wise

# from above (re-created so this cell runs on its own; being a fresh layer,
# its random initial weights differ from the earlier cell's)
token_ids = torch.tensor([12, 45, 78])
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
# One trainable vector per position 0 .. max_len - 1
pos_embedding = nn.Embedding(num_embeddings=max_len, embedding_dim=embedding_dim)
embedded = embedding(token_ids)

print(f"Positional embedding parameters: {pos_embedding.num_embeddings * pos_embedding.embedding_dim:,}")

# Create position indices for our sequence
# Position 0, 1, 2 for our 3 tokens
sequence_length = len(token_ids)
if sequence_length > max_len:
    # The table has no row for positions >= max_len; fail with a clear
    # message instead of an index error from inside the lookup.
    raise ValueError(
        f"Sequence of {sequence_length} tokens exceeds the context length of {max_len}"
    )
position_ids = torch.arange(sequence_length)
print(f"\nPosition IDs: {position_ids}")
print("These represent positions: [0th, 1st, 2nd] in the sequence")

# Get positional embeddings
pos_embedded = pos_embedding(position_ids)

# Report the counts from the actual result rather than repeating the literals.
num_positions, vector_size = pos_embedded.shape

print(f"\nPositional embedding shape: {pos_embedded.shape}")
print(f"    - Same shape as word embeddings: {num_positions} tokens × {vector_size} dimensions")

# Combine word embeddings with positional embeddings
# This gives the model both semantic (what the word means) and positional (where it is) information
# Element-wise sum: both tensors are (sequence_length, embedding_dim)
final_embeddings = embedded + pos_embedded

print(f"\nFinal combined embeddings shape: {final_embeddings.shape}")
print("    - Word meaning + position information for each token")
print("    - Ready to be processed by transformer layers")

Positional embedding parameters: 393,216

Position IDs: tensor([0, 1, 2])
These represent positions: [0th, 1st, 2nd] in the sequence

Positional embedding shape: torch.Size([3, 768])
    - Same shape as word embeddings: 3 tokens × 768 dimensions

Final combined embeddings shape: torch.Size([3, 768])
    - Word meaning + position information for each token
    - Ready to be processed by transformer layers
