In [6]:
!hf auth login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `Flipkart` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `Flipkart`


In [7]:
# ============================================
# Colab RAG with google/embeddinggemma-300m
# ============================================

!pip -q install --upgrade sentence-transformers chromadb transformers accelerate huggingface_hub datasets bitsandbytes tiktoken

import os, textwrap
import numpy as np
import torch
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# -------------------------------
# Config
# -------------------------------
class CFG:
    db_path = "./chroma_db"
    collection = "docs"
    embed_model = "google/embeddinggemma-300m"
    embed_dim = 768   # 768 / 512 / 256 / 128
    gen_model = "google/gemma-3-1b-it"
    gen_max_new_tokens = 256
    gen_temperature = 0.2
    gen_top_p = 0.95
    batch_size = 16
    device = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------
# Helpers
# -------------------------------
def mrl_truncate_and_renorm(emb: np.ndarray, dim: int) -> np.ndarray:
    """Matryoshka: keep first dim dims, then L2 renorm."""
    if dim == emb.shape[-1]:
        return emb
    e = emb[..., :dim]
    if e.ndim == 1:
        return e / (np.linalg.norm(e) + 1e-12)
    return e / (np.linalg.norm(e, axis=1, keepdims=True) + 1e-12)

# -------------------------------
# Load models
# -------------------------------
embed_model = SentenceTransformer(CFG.embed_model)
print("Embedding model loaded on:", embed_model.device)

tokenizer = AutoTokenizer.from_pretrained(CFG.gen_model, use_fast=True)

load_kwargs = dict(device_map="auto", torch_dtype=torch.bfloat16)
text_model = AutoModelForCausalLM.from_pretrained(CFG.gen_model, **load_kwargs)
text_model.eval()

# -------------------------------
# Setup Chroma
# -------------------------------
client = chromadb.PersistentClient(path=CFG.db_path, settings=Settings(anonymized_telemetry=False))
collection = client.get_or_create_collection(name=CFG.collection)

def add_texts(texts):
    ids, docs, metas, all_embs = [], [], [], []
    for name, text in texts.items():
        chunks = [text]  # simple, no chunking for demo
        embs = embed_model.encode_document(
            chunks, batch_size=CFG.batch_size,
            convert_to_numpy=True, normalize_embeddings=True
        )
        embs = mrl_truncate_and_renorm(embs, CFG.embed_dim)
        for i, (ch, e) in enumerate(zip(chunks, embs)):
            uid = f"{name}_{i}"
            ids.append(uid); docs.append(ch)
            metas.append({"source": name, "chunk_id": i})
            all_embs.append(e)
    collection.add(ids=ids, documents=docs, embeddings=all_embs, metadatas=metas)
    return len(ids)

# -------------------------------
# Retrieval
# -------------------------------
def retrieve(query, k=3):
    q_emb = embed_model.encode_query(query, convert_to_numpy=True, normalize_embeddings=True)
    q_emb = mrl_truncate_and_renorm(q_emb, CFG.embed_dim)
    res = collection.query(query_embeddings=[q_emb], n_results=k,
                           include=["documents","metadatas","distances"])
    hits = []
    for i in range(len(res["ids"][0])):
        hits.append({
            "doc": res["documents"][0][i],
            "meta": res["metadatas"][0][i],
            "score": 1 - res["distances"][0][i],
        })
    return hits

def build_prompt(query, contexts):
    ctx = "\n\n".join([f"[{c['meta']['source']}] {c['doc']}" for c in contexts])
    return f"<system>You are a helpful assistant.</system>\n<user>Question: {query}\n\nContext:\n{ctx}</user>\n<assistant>"

def generate_answer(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(CFG.device)
    with torch.no_grad():
        out = text_model.generate(
            **inputs,
            max_new_tokens=CFG.gen_max_new_tokens,
            temperature=CFG.gen_temperature,
            top_p=CFG.gen_top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(out[0], skip_special_tokens=True).split("<assistant>")[-1].strip()

def answer_query(query, k=3):
    hits = retrieve(query, k)
    prompt = build_prompt(query, hits)
    return generate_answer(prompt), hits

# -------------------------------
# Demo
# -------------------------------
docs = {
    "doc1": "EmbeddingGemma-300M is a multilingual embedding model from Google. It supports Matryoshka dimensions: 768, 512, 256, 128.",
    "doc2": "Gemma-3 is a family of open models from Google, optimized for efficiency and long-context understanding."
}
add_texts(docs)

q = "What is EmbeddingGemma and what dimensionalities can it output?"
ans, hits = answer_query(q, k=2)
print("Question:", q)
print("\nRetrieved docs:")
for h in hits:
    print(f"- ({h['meta']}) score={h['score']:.3f} -> {h['doc']}")
print("\nAnswer:", ans)


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/16.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/58.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/312 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/9.44M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

3_Dense/model.safetensors:   0%|          | 0.00/9.44M [00:00<?, ?B/s]

Embedding model loaded on: cpu


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Question: What is EmbeddingGemma and what dimensionalities can it output?

Retrieved docs:
- ({'chunk_id': 0, 'source': 'doc1'}) score=0.221 -> EmbeddingGemma-300M is a multilingual embedding model from Google. It supports Matryoshka dimensions: 768, 512, 256, 128.
- ({'source': 'doc2', 'chunk_id': 0}) score=-0.076 -> Gemma-3 is a family of open models from Google, optimized for efficiency and long-context understanding.

Answer: The difference between EmbeddingGemma and Gemma-3 is that EmbeddingGemma is a multilingual model, while Gemma-3 is a family of open models optimized for efficiency and long-context
