# Create Embeddings out of an LLM

### Problem Statement
Your mission, should you choose to accept it, is to extract **meaningful sentence-level embeddings** using a pre-trained **causal language model (SmolLM2-135M)** on Amazon Reviews.

You're working with a **generative language model**, but you’re not here to generate Shakespeare. Instead, you’ll tap into its **hidden states** to get semantic embeddings that capture the essence of a review — the good, the bad, and the brutally honest.

---

### Requirements

1. **Load and Tokenize Text**
   - Use the `McAuley-Lab/Amazon-Reviews-2023` dataset (subset: `raw_review_All_Beauty`).
   - Load ~10 sample reviews for testing.
   - Tokenize them using `"HuggingFaceTB/SmolLM2-135M"` tokenizer.

2. **Extract Embeddings**
   - Run the tokenized batch through the model with `output_hidden_states=True`.
   - Access the **last hidden layer** from `outputs.hidden_states[-1]`.

3. **Compute Sentence Embeddings**
   - Options:
     - If the model uses a classification token (e.g., `[CLS]`), extract its embedding.
     - For causal models (which typically don’t), **average the token embeddings** from the final layer, **excluding padding tokens**.

4. **Find the cosine similarity for a given keyword** 
   - Compute the cosine similarity between the average embeddings of the reviews and a keyword.

---

### Constraints

- ❌ Do **not** use sentence-transformers or pre-built embedding tools like `bert-as-service`.
- ❌ Do **not** generate text (no `.generate()`).
- ✅ Use only Hugging Face's `AutoModelForCausalLM` and `AutoTokenizer`.
- ✅ Exclude padding tokens when computing average embeddings.
- ✅ Ensure everything runs on `cuda` if available.

---

<details>
  <summary>💡 Hint</summary>

```python
# Run model with hidden states
outputs = model(**tokenized_inputs, output_hidden_states=True, return_dict=True)

# Get the last hidden layer (batch_size, seq_len, hidden_dim)
last_hidden = outputs.hidden_states[-1]

# Use the attention mask to avoid averaging over padding
attention_mask = tokenized_inputs['attention_mask']  # (batch_size, seq_len)

# Compute masked average: zero out padding tokens
masked_embeddings = last_hidden * attention_mask.unsqueeze(-1)  # broadcast mask
summed = masked_embeddings.sum(dim=1)  # sum across tokens
count = attention_mask.sum(dim=1, keepdim=True)  # count of non-padding tokens

# Final sentence-level embeddings
sentence_embeddings = summed / count  # (batch_size, hidden_dim)


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

In [8]:
# Load Amazon Reviews dataset
from datasets import load_dataset
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
reviews = dataset['full'][:1000] # first 1000 reviews

In [12]:
reviews

{'rating': [5.0,
  4.0,
  5.0,
  1.0,
  5.0,
  4.0,
  5.0,
  3.0,
  5.0,
  5.0,
  3.0,
  5.0,
  3.0,
  5.0,
  5.0,
  3.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  3.0,
  5.0,
  5.0,
  5.0,
  3.0,
  1.0,
  5.0,
  2.0,
  2.0,
  5.0,
  1.0,
  5.0,
  3.0,
  5.0,
  5.0,
  4.0,
  5.0,
  1.0,
  2.0,
  4.0,
  5.0,
  5.0,
  5.0,
  5.0,
  1.0,
  5.0,
  4.0,
  3.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  1.0,
  5.0,
  2.0,
  4.0,
  4.0,
  4.0,
  4.0,
  5.0,
  5.0,
  4.0,
  5.0,
  4.0,
  5.0,
  4.0,
  4.0,
  4.0,
  5.0,
  4.0,
  4.0,
  4.0,
  4.0,
  4.0,
  4.0,
  4.0,
  5.0,
  4.0,
  4.0,
  4.0,
  4.0,
  4.0,
  4.0,
  4.0,
  4.0,
  3.0,
  4.0,
  4.0,
  4.0,
  4.0,
  5.0,
  5.0,
  4.0,
  3.0,
  4.0,
  5.0,
  5.0,
  5.0,
  4.0,
  5.0,
  5.0,
  5.0,
  4.0,
  4.0,
  4.0,
  5.0,
  4.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  4.0,
  5.0,
  5.0,
  5.0,
  1.0,
  2.0,
  4.0,
  5.0,
  4.0,
  5.0,
  5.0,
  5.0,
  5.0,
  1.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  5.0,
  4.0,
  5.0,
  5.0,
  5

In [9]:
# Load SmolLM2-135M model and tokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
print(isinstance(model, torch.nn.Module))  # Should print: True

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

True


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((576,), eps=1e-05)
    (rotary_emb): LlamaRotaryEm

In [10]:
# 3. Tokenize the reviews with padding for batch processing
encodings = tokenizer(reviews, return_tensors="pt", padding=True, truncation=True)
input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# 4. Forward pass with output_hidden_states=True to get all hidden states
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)

# 5. Extract last hidden states (batch_size, seq_len, hidden_dim)
last_hidden_states = outputs.hidden_states[-1]

# 6. Compute sentence embeddings by averaging token embeddings excluding padding tokens
# attention_mask has 1 for real tokens, 0 for padding
expanded_mask = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()  # (batch, seq_len, hidden_dim)
sum_embeddings = torch.sum(last_hidden_states * expanded_mask, dim=1)
sum_mask = torch.clamp(expanded_mask.sum(dim=1), min=1e-9)  # avoid division by zero
sentence_embeddings = sum_embeddings / sum_mask  # (batch_size, hidden_dim)

print("Sentence embeddings shape:", sentence_embeddings.shape)  # (10, hidden_dim)

# --- Cosine similarity for a given keyword ---

# Example keyword
keyword = "quality"

# Tokenize and embed the keyword the same way
keyword_enc = tokenizer(keyword, return_tensors="pt")
keyword_input_ids = keyword_enc['input_ids'].to(device)
keyword_attention_mask = keyword_enc['attention_mask'].to(device)

with torch.no_grad():
    keyword_outputs = model(keyword_input_ids, attention_mask=keyword_attention_mask, output_hidden_states=True)

keyword_last_hidden = keyword_outputs.hidden_states[-1]
keyword_mask = keyword_attention_mask.unsqueeze(-1).expand(keyword_last_hidden.size()).float()
keyword_embedding = (keyword_last_hidden * keyword_mask).sum(dim=1) / torch.clamp(keyword_mask.sum(dim=1), min=1e-9)

# Compute cosine similarity between keyword embedding and each review embedding
cosine_similarities = F.cosine_similarity(sentence_embeddings, keyword_embedding)

for i, (review, sim) in enumerate(zip(reviews, cosine_similarities)):
    print(f"\nReview #{i+1} similarity to '{keyword}': {sim.item():.4f}")
    print(review)

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).