#Experiment 1

In [1]:
!pip install sentence-transformers faiss-cpu numpy tabulate tqdm

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [3]:
"""dime_demo.py — Minimal but runnable DIME pipeline"""
import faiss, numpy as np
from sentence_transformers import SentenceTransformer
from tabulate import tabulate
from tqdm import tqdm

# ---------------------------------------------------------------------
# 1. Toy corpus (replace with your own docs) ---------------------------
# ---------------------------------------------------------------------
CORPUS = {
    "doc_1": "Paris is the capital of France and a major European city.",
    "doc_2": "Berlin is the vibrant capital of Germany, known for its art scene.",
    "doc_3": "Rome, the capital of Italy, hosts the Vatican City.",
    "doc_4": "Toronto is the largest city in Canada and the capital of Ontario.",
    "doc_5": "The City of Light is a popular nickname for Paris in France.",
}

QUERIES = [
    "capital of France",
    "German capital city",
]

# ---------------------------------------------------------------------
# 2. Encoder & embeddings ---------------------------------------------
# ---------------------------------------------------------------------
print("Loading SBERT …")
model = SentenceTransformer("all-MiniLM-L6-v2")  # 384‑D embeddings
ids, docs = list(CORPUS.keys()), list(CORPUS.values())
embs = model.encode(docs, convert_to_numpy=True, show_progress_bar=False)

# ---------------------------------------------------------------------
# 3. Build FAISS index -------------------------------------------------
# ---------------------------------------------------------------------
dim = embs.shape[1]
index = faiss.IndexFlatIP(dim)  # inner‑product (dot‑product) search
index.add(embs)
print(f"Indexed {index.ntotal} docs, dim={dim}")

# ---------------------------------------------------------------------
# 4. DIME utilities ----------------------------------------------------
# ---------------------------------------------------------------------

def softmax(x, tau=1.0):
    x = (x / tau) - np.max(x)  # avoid overflow
    return np.exp(x) / np.exp(x).sum()


def dime_filter(q_vec, top_vecs, top_scores, keep_frac=0.6, tau=1.0, weighted=True):
    """Return filtered (masked) query vector q̃."""
    if weighted:
        w = softmax(top_scores, tau)
        centroid = np.average(top_vecs, axis=0, weights=w)
    else:
        centroid = top_vecs.mean(axis=0)

    # Dimension importance & mask
    importance = q_vec * centroid  # element‑wise product
    thresh = np.quantile(importance, 1 - keep_frac)
    q_tilde = np.where(importance >= thresh, q_vec, 0.0)
    return q_tilde


def search(q_vec, k=3):
    q_norm = q_vec / np.linalg.norm(q_vec)  # SBERT vectors are L2‑norm sensitive
    scores, idx = index.search(q_norm[None, :], k)
    return scores[0], idx[0]


# ---------------------------------------------------------------------
# 5. Run everything ----------------------------------------------------
# ---------------------------------------------------------------------
for q in QUERIES:
    print("=== QUERY:", q, "===")
    q_vec = model.encode(q, convert_to_numpy=True)

    # First‑stage retrieval
    scores1, idx1 = search(q_vec, k=5)
    top_vecs = embs[idx1]

    # DIME mask & re‑rank (no 2nd index call) -------------------------
    q_tilde = dime_filter(q_vec, top_vecs, scores1, keep_frac=0.6, tau=0.7)
    reranked_scores = top_vecs @ (q_tilde / np.linalg.norm(q_tilde))
    order = np.argsort(-reranked_scores)

    # Pretty print -----------------------------------------------------
    print("First‑stage vs. DIME rerank:")
    rows = []
    for rank, i in enumerate(idx1):
        rows.append([
            rank + 1,
            ids[i],
            f"{scores1[rank]:.3f}",
            f"{reranked_scores[rank]:.3f}",
            docs[i][:50] + "…",
        ])
    print(tabulate(rows, headers=["Rank", "ID", "Score1", "DIME", "Text"], tablefmt="github"))

Loading SBERT …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Indexed 5 docs, dim=384
=== QUERY: capital of France ===
First‑stage vs. DIME rerank:
|   Rank | ID    |   Score1 |   DIME | Text                                                |
|--------|-------|----------|--------|-----------------------------------------------------|
|      1 | doc_1 |    0.78  |  0.828 | Paris is the capital of France and a major Europea… |
|      2 | doc_5 |    0.528 |  0.602 | The City of Light is a popular nickname for Paris … |
|      3 | doc_3 |    0.428 |  0.497 | Rome, the capital of Italy, hosts the Vatican City… |
|      4 | doc_2 |    0.275 |  0.36  | Berlin is the vibrant capital of Germany, known fo… |
|      5 | doc_4 |    0.255 |  0.341 | Toronto is the largest city in Canada and the capi… |
=== QUERY: German capital city ===
First‑stage vs. DIME rerank:
|   Rank | ID    |   Score1 |   DIME | Text                                                |
|--------|-------|----------|--------|-----------------------------------------------------|
|      1 | do

#Experiment 2

In [8]:
!pip install datasets sentence-transformers numpy tabulate




In [10]:
!pip install --upgrade datasets


Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [10]:
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tabulate import tabulate

# ---- CONFIG ----
N_QUERIES = 50      # Scan this many queries to find an improvement
TOP_K = 5           # 1 positive, 4 negatives
KEEP_FRAC = 0.4
TAU = 0.7

# ---- LOAD DATA ----
dataset = load_dataset("ms_marco", "v2.1")
val = dataset["validation"]
model = SentenceTransformer("all-MiniLM-L6-v2")

def softmax(x, tau=1.0):
    x = (x / tau) - np.max(x)
    return np.exp(x) / np.exp(x).sum()

found = 0
for i in range(N_QUERIES):
    query = val[i]["query"]
    answers = val[i]["answers"]
    passages = val[i]["passages"]["passage_text"]
    is_selected = val[i]["passages"]["is_selected"]

    # Ensure at least one positive and enough negatives
    positives = [txt for sel, txt in zip(is_selected, passages) if sel == 1]
    negatives = [txt for sel, txt in zip(is_selected, passages) if sel == 0]
    if not positives or len(negatives) < (TOP_K - 1):
        continue

    # Build pool: 1 positive, TOP_K-1 negatives
    candidates = [positives[0]] + negatives[:TOP_K-1]
    labels = ["✔️"] + [""] * (TOP_K-1)
    gold_idx = 0  # positive always at position 0

    # --- Encode ---
    cand_vecs = model.encode(candidates, convert_to_numpy=True)
    q_vec = model.encode(query, convert_to_numpy=True)

    # --- Baseline ---
    scores = cand_vecs @ q_vec
    ranked_idx = np.argsort(-scores)
    base_rank = list(ranked_idx).index(gold_idx) + 1

    # --- DIME ---
    top_idx = ranked_idx[:TOP_K]
    top_vecs = cand_vecs[top_idx]
    top_scores = scores[top_idx]
    weights = softmax(top_scores, TAU)
    centroid = np.average(top_vecs, axis=0, weights=weights)
    importance = q_vec * centroid
    keep_count = int(len(importance) * KEEP_FRAC)
    keep_dims = np.argsort(importance)[-keep_count:]
    mask = np.zeros_like(importance)
    mask[keep_dims] = 1
    q_dime = q_vec * mask
    dime_scores = cand_vecs @ q_dime
    dime_ranked_idx = np.argsort(-dime_scores)
    dime_rank = list(dime_ranked_idx).index(gold_idx) + 1

    if dime_rank < base_rank:
        print("="*100)
        print(f"Query: {query}")
        print(f"Ground-truth answer: {answers[0] if answers else ''}\n")
        print("BASELINE ranking:")
        print(tabulate(
            [(i+1, candidates[j][:90].replace('\n',' '), labels[j], scores[j])
             for i, j in enumerate(ranked_idx)],
            headers=["Rank", "Passage", "Gold", "Score"], tablefmt="github"))
        print("\nDIME RE-RANKING:")
        print(tabulate(
            [(i+1, candidates[j][:90].replace('\n',' '), labels[j], dime_scores[j])
             for i, j in enumerate(dime_ranked_idx)],
            headers=["Rank", "Passage", "Gold", "Score"], tablefmt="github"))
        print(f"\nGold passage baseline position: {base_rank}")
        print(f"Gold passage DIME position: {dime_rank}")
        print("Did DIME improve the rank? YES\n")
        found += 1
    if found >= 1:
        break

if found == 0:
    print(f"Tried {N_QUERIES} examples but did not find a case where DIME improved the rank. Try increasing N_QUERIES or KEEP_FRAC or use TOP_K=10.")


Query: why did rachel carson write an obligation to endure
Ground-truth answer: Rachel Carson writes The Obligation to Endure because believes that as man tries to eliminate unwanted insects and weeds, however he is actually causing more problems by polluting the environment.

BASELINE ranking:
|   Rank | Passage                                                                                    | Gold   |    Score |
|--------|--------------------------------------------------------------------------------------------|--------|----------|
|      1 | Carson subtly defers her writing in just the right writing technique for it to not be subj |        | 0.802501 |
|      2 | The Obligation to Endure by Rachel Carson Rachel Carson's essay on The Obligation to Endur | ✔️     | 0.793415 |
|      3 | Ashley Deemer. Eastern gateway Community College. Abstract. In the following pages the rea |        | 0.694328 |
|      4 | The essay starts out with the statement “The history of life on earth has