<h3>Encoder-only models (e.g., BERT/DeBERTa)</h3>

<h3>Encoder–Decoder models (e.g., T5) — both encoder embeddings and decoder embeddings</h3>

<h3>Decoder-only / CausalLM models (e.g., GPT-2 / Llama-style)</h3>

<b>A. Core utilities (pooling + device)</b>

In [3]:
import torch

def get_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    backend = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(backend)

def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    last_hidden_state: (B, T, H)
    attention_mask:    (B, T) with 1 for real tokens, 0 for padding
    returns:           (B, H)
    """
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)  # (B, T, 1)
    summed = (last_hidden_state * mask).sum(dim=1)                   # (B, H)
    counts = mask.sum(dim=1).clamp(min=1e-9)                         # (B, 1)
    return summed / counts

def cls_pool(last_hidden_state: torch.Tensor) -> torch.Tensor:
    """
    Pool by taking the hidden state at sequence position 0.

    last_hidden_state: (B, T, H)
    returns:           (B, H)
    """
    first_token_state = last_hidden_state[:, 0, :]
    return first_token_state


<b>B. 1) Encoder-only embeddings (BERT/DeBERTa/etc.)</b>

In [4]:
from transformers import AutoTokenizer, AutoModel

@torch.inference_mode()
def encoder_embeddings(
    texts,
    model_id="microsoft/deberta-base",
    pooling="mean",      # "mean" or "cls"
    max_length=256,
    batch_size=8,
):
    """
    Embed texts with an encoder-only model and return L2-normalized vectors.

    Args:
        texts: a single string or an iterable of strings.
        model_id: checkpoint name passed to AutoTokenizer / AutoModel.
        pooling: "cls" takes the first token's vector; any other value
            uses the attention-mask-weighted mean.
        max_length: truncation length handed to the tokenizer.
        batch_size: number of texts tokenized and run per forward pass.

    Returns:
        CPU tensor of shape (N, H). For empty input the shape is (0, H).
    """
    # A bare string would otherwise be sliced character-by-character below,
    # silently yielding one embedding per `batch_size` characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        # Materialize so generators and other non-sliceable iterables work.
        texts = list(texts)

    device = get_device()
    tok = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id).to(device).eval()

    if not texts:
        # torch.cat([]) raises; return an empty (0, H) tensor instead.
        return torch.empty((0, getattr(model.config, "hidden_size", 0)))

    all_embs = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tok(
            batch, padding=True, truncation=True, max_length=max_length,
            return_tensors="pt"
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        outputs = model(**inputs, return_dict=True)  # last_hidden_state: (B,T,H)
        h = outputs.last_hidden_state

        if pooling == "cls":
            emb = cls_pool(h)
        else:
            emb = mean_pool(h, inputs["attention_mask"])

        # Unit-length rows make a dot product equal to cosine similarity.
        emb = torch.nn.functional.normalize(emb, p=2, dim=-1)
        all_embs.append(emb.cpu())

    return torch.cat(all_embs, dim=0)  # (N, H)

# Example: embed two sentences with DeBERTa using mean pooling
texts = [
    "NVIDIA Triton serves models at scale.",
    "Attention uses QKV projections.",
]
E = encoder_embeddings(
    texts,
    model_id="microsoft/deberta-base",
    pooling="mean",
)
print(E.shape)  # (2, hidden_size)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([2, 768])


<b>C. 2) Encoder–Decoder embeddings (T5/etc.) — both sides</b>

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def _pad_and_cat_decoder_states(chunks):
    """Zero-pad (B_i, Ty_i, H) chunks on the right to a common Ty and concatenate on dim 0."""
    longest = max(chunk.size(1) for chunk in chunks)
    padded = [
        # The pad spec runs from the last dim backwards: (H_left, H_right, T_left, T_right).
        torch.nn.functional.pad(chunk, (0, 0, 0, longest - chunk.size(1)))
        for chunk in chunks
    ]
    return torch.cat(padded, dim=0)


@torch.inference_mode()
def seq2seq_encoder_decoder_embeddings(
    inputs_texts,
    targets_texts=None,                 # if provided, we extract decoder embeddings too
    model_id="t5-small",
    enc_pooling="mean",                 # encoder pooling: "mean" or "first"
    max_length=256,
    batch_size=4,
):
    """
    Extract pooled encoder embeddings and, optionally, token-level decoder states.

    Args:
        inputs_texts: a single string or an iterable of source strings.
        targets_texts: None, a single string, or an iterable of target strings
            with the same length as `inputs_texts`. When given, the target token
            ids are fed to the decoder and its last-layer states are returned.
        model_id: checkpoint name passed to AutoTokenizer / AutoModelForSeq2SeqLM.
        enc_pooling: "first" takes the first encoder position; any other value
            uses the attention-mask-weighted mean.
        max_length: truncation length for both source and target tokenization.
        batch_size: number of examples per forward pass.

    Returns:
        (enc_embs, dec_token_embs):
            enc_embs is a CPU tensor (N, H), L2-normalized.
            dec_token_embs is None when `targets_texts` is None, otherwise a CPU
            tensor (N, Ty_max, H). Batches with shorter targets are zero-padded
            on the right to Ty_max.

    Raises:
        ValueError: if `targets_texts` is given with a different length than
            `inputs_texts`.
    """
    inputs_texts = [inputs_texts] if isinstance(inputs_texts, str) else list(inputs_texts)
    want_decoder = targets_texts is not None
    if want_decoder:
        targets_texts = [targets_texts] if isinstance(targets_texts, str) else list(targets_texts)
        if len(targets_texts) != len(inputs_texts):
            raise ValueError(
                f"inputs_texts and targets_texts must have the same length "
                f"(got {len(inputs_texts)} and {len(targets_texts)})"
            )

    device = get_device()
    tok = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device).eval()

    if not inputs_texts:
        # torch.cat([]) raises; return empty tensors of the right rank instead.
        hidden = getattr(model.config, "hidden_size", 0)
        empty_dec = torch.empty((0, 0, hidden)) if want_decoder else None
        return torch.empty((0, hidden)), empty_dec

    enc_embs_all = []
    dec_token_embs_all = []  # token-level decoder embeddings (no pooling by default)

    for start in range(0, len(inputs_texts), batch_size):
        x_batch = inputs_texts[start:start + batch_size]
        x = tok(
            x_batch, padding=True, truncation=True, max_length=max_length,
            return_tensors="pt"
        )
        x = {k: v.to(device) for k, v in x.items()}

        if want_decoder:
            y_batch = targets_texts[start:start + batch_size]
            y = tok(
                y_batch, padding=True, truncation=True, max_length=max_length,
                return_tensors="pt"
            )
            # NOTE(review): target ids are passed to the decoder as-is (no start
            # token / right shift) — confirm this is the intended alignment.
            outputs = model(
                **x,
                decoder_input_ids=y["input_ids"].to(device),
                decoder_attention_mask=y["attention_mask"].to(device),
                output_hidden_states=True,    # needed for decoder hidden states
                return_dict=True
            )
            enc_h = outputs.encoder_last_hidden_state  # (B, Tx, H)
            # decoder_hidden_states is a tuple of per-layer outputs; the last
            # entry is the final decoder layer.
            dec_last = outputs.decoder_hidden_states[-1]  # (B, Ty, H)
            dec_token_embs_all.append(dec_last.cpu())
        else:
            # Without targets there is nothing to feed the decoder, and calling
            # the full model with decoder_input_ids=None fails for T5. Run only
            # the encoder stack.
            enc_out = model.get_encoder()(
                input_ids=x["input_ids"],
                attention_mask=x["attention_mask"],
                return_dict=True,
            )
            enc_h = enc_out.last_hidden_state  # (B, Tx, H)

        # ---- Encoder embeddings (token-level -> pooled) ----
        if enc_pooling == "first":
            enc_emb = enc_h[:, 0, :]
        else:
            enc_emb = mean_pool(enc_h, x["attention_mask"])
        enc_emb = torch.nn.functional.normalize(enc_emb, p=2, dim=-1)
        enc_embs_all.append(enc_emb.cpu())

    enc_embs = torch.cat(enc_embs_all, dim=0)  # (N, H)
    if not want_decoder:
        return enc_embs, None

    # Each batch is padded only to its own longest target, so Ty can differ
    # between batches; align them before concatenating.
    dec_token_embs = _pad_and_cat_decoder_states(dec_token_embs_all)  # (N, Ty_max, H)
    return enc_embs, dec_token_embs

# Example: one translation pair through t5-small
inputs_texts = [
    "translate English to German: The cat sits on the mat.",
]
targets_texts = [
    "Die Katze sitzt auf der Matte.",
]

enc_emb, dec_tokens = seq2seq_encoder_decoder_embeddings(
    inputs_texts,
    targets_texts,
    model_id="t5-small",
)
print("Encoder pooled:", enc_emb.shape)      # (1, H)
print("Decoder token embs:", dec_tokens.shape)  # (1, Ty, H)


Encoder pooled: torch.Size([1, 512])
Decoder token embs: torch.Size([1, 11, 512])


<b>D. 3) Decoder-only / CausalLM embeddings (GPT2/Llama-style)</b>

In [6]:
# Embedding variants for decoder-only models:
#   - token embeddings (one vector per position)
#   - pooled embedding (mean over tokens) for rough semantic use (not always the best choice)
#   - last-token embedding for next-token analysis

from transformers import AutoTokenizer, AutoModelForCausalLM

@torch.inference_mode()
def causal_lm_embeddings(
    prompts,
    model_id="gpt2",         # swap to Llama model_id if you have access
    pooling="mean",          # "mean" or "last_token"
    max_length=256,
    batch_size=4,
):
    """
    Embed prompts with a decoder-only (causal LM) model and return L2-normalized vectors.

    Args:
        prompts: a single string or an iterable of strings.
        model_id: checkpoint name passed to AutoTokenizer / AutoModelForCausalLM.
        pooling: "last_token" takes the hidden state of the last attended token
            of each sequence; any other value uses the attention-mask-weighted mean.
        max_length: truncation length handed to the tokenizer.
        batch_size: number of prompts per forward pass.

    Returns:
        CPU tensor of shape (N, H). For empty input the shape is (0, H).
    """
    # A bare string would otherwise be sliced character-by-character below.
    prompts = [prompts] if isinstance(prompts, str) else list(prompts)

    device = get_device()
    tok = AutoTokenizer.from_pretrained(model_id)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token  # important for batching in decoder-only

    model = AutoModelForCausalLM.from_pretrained(model_id).to(device).eval()

    if not prompts:
        # torch.cat([]) raises; return an empty (0, H) tensor instead.
        return torch.empty((0, getattr(model.config, "hidden_size", 0)))

    all_embs = []
    for start in range(0, len(prompts), batch_size):
        batch = prompts[start:start + batch_size]
        x = tok(
            batch, padding=True, truncation=True, max_length=max_length,
            return_tensors="pt"
        )
        x = {k: v.to(device) for k, v in x.items()}

        outputs = model(**x, output_hidden_states=True, return_dict=True)
        h = outputs.hidden_states[-1]  # (B, T, H)
        mask = x["attention_mask"]     # (B, T)

        if pooling == "last_token":
            # Position of the last 1 in each mask row. `mask.sum(dim=1) - 1`
            # is only right for right-padded batches; scanning from the end
            # also works when the tokenizer pads on the left.
            # argmax returns the first maximal index, i.e. the first 1 in the
            # reversed row.
            from_end = mask.flip(dims=[1]).argmax(dim=1)          # (B,)
            last_idx = mask.size(1) - 1 - from_end                # (B,)
            emb = h[torch.arange(h.size(0), device=h.device), last_idx]  # (B, H)
        else:
            emb = mean_pool(h, mask)

        emb = torch.nn.functional.normalize(emb, p=2, dim=-1)
        all_embs.append(emb.cpu())

    return torch.cat(all_embs, dim=0)  # (N, H)

# Example: mean-pooled GPT-2 embeddings for two prompts
prompts = [
    "Explain self-attention briefly.",
    "What is KV cache?",
]
D = causal_lm_embeddings(
    prompts,
    model_id="gpt2",
    pooling="mean",
)
print(D.shape)  # (2, hidden_size)


Using pad_token, but it is not set yet.


torch.Size([2, 768])


In [7]:
import torch
from transformers import AutoTokenizer, AutoModel

# Step 1 - load the base model (no task head).
# 'bert-base-uncased' is used here as the standard encoder example.
model_id = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)  # AutoModel loads the raw base (no classification head)

text = "NVIDIA TensorRT optimizes inference latency."

# Step 2 - tokenize into PyTorch tensors.
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Step 3 - forward pass; gradients are not needed for extraction.
with torch.no_grad():
    outputs = model(**inputs)

# Step 4 - extract embeddings.
# last_hidden_state has shape [Batch_Size, Sequence_Length, Hidden_Dim],
# e.g. [1, 8, 768].
last_hidden_states = outputs.last_hidden_state

# Option A: the [CLS] token embedding (the "representative" vector).
# In BERT, position 0 is always [CLS].
cls_embedding = last_hidden_states[:, 0, :]

# Option B: mean pooling (average of all tokens - often more accurate for semantic search).
# Padding tokens must be excluded from the average, so weight by the attention mask.
attention_mask = inputs["attention_mask"].unsqueeze(-1).expand_as(last_hidden_states).float()
sum_embeddings = (last_hidden_states * attention_mask).sum(dim=1)
sum_mask = attention_mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
mean_embedding = sum_embeddings / sum_mask

print(f"Encoder Output Shape (CLS): {cls_embedding.shape}")
# Output: torch.Size([1, 768])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Encoder Output Shape (CLS): torch.Size([1, 768])


In [8]:
from transformers import AutoTokenizer, AutoModel

# Step 1 - load a decoder model.
# 'gpt2' is used here (meta-llama/Llama-2-7b-hf would be an alternative).
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no default pad token
model = AutoModel.from_pretrained(model_id)

text = "Generative AI requires massive compute."

# Step 2 - tokenize into PyTorch tensors.
inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

# Step 3 - forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)

# Step 4 - extract embeddings.
last_hidden_states = outputs.last_hidden_state

# CRITICAL DIFFERENCE: here we want the embedding of the LAST token.
# Sequences in a batch can have different lengths (padding), so index -1 is not
# reliable; derive the last real token's index from the attention mask instead.
# The (length - 1) formula assumes real tokens start at position 0 (right padding).
attention_mask = inputs["attention_mask"]
last_token_indices = attention_mask.sum(dim=1) - 1

# Pick, for every sequence in the batch, the vector at its computed index.
batch_size = last_hidden_states.size(0)
decoder_embedding = last_hidden_states[torch.arange(batch_size), last_token_indices, :]

print(f"Decoder Output Shape (Last Token): {decoder_embedding.shape}")
# Output: torch.Size([1, 768])

Decoder Output Shape (Last Token): torch.Size([1, 768])
