#Experiment 1

In [1]:
!pip install sentence-transformers faiss-cpu numpy tabulate tqdm

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (4.8 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.1-cp312-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading hf_xet-1.1.5-cp37-abi3-macosx_11_0_arm64.whl.metadata (879 bytes)
Collecting sympy>=1.13.3 (from torch>=1.11.0->sentence-transformers)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers

In [2]:
"""dime_demo.py — Minimal but runnable DIME pipeline"""
import faiss, numpy as np
from sentence_transformers import SentenceTransformer
from tabulate import tabulate
from tqdm import tqdm

# ---------------------------------------------------------------------
# 1. Toy corpus (replace with your own docs) ---------------------------
# ---------------------------------------------------------------------
CORPUS = {
    "doc_1": "Paris is the capital of France and a major European city.",
    "doc_2": "Berlin is the vibrant capital of Germany, known for its art scene.",
    "doc_3": "Rome, the capital of Italy, hosts the Vatican City.",
    "doc_4": "Toronto is the largest city in Canada and the capital of Ontario.",
    "doc_5": "The City of Light is a popular nickname for Paris in France.",
}

QUERIES = [
    "capital of France",
    "German capital city",
]

# ---------------------------------------------------------------------
# 2. Encoder & embeddings ---------------------------------------------
# ---------------------------------------------------------------------
print("Loading SBERT …")
model = SentenceTransformer("all-MiniLM-L6-v2")  # 384‑D embeddings
ids, docs = list(CORPUS.keys()), list(CORPUS.values())
embs = model.encode(docs, convert_to_numpy=True, show_progress_bar=False)

# ---------------------------------------------------------------------
# 3. Build FAISS index -------------------------------------------------
# ---------------------------------------------------------------------
dim = embs.shape[1]
index = faiss.IndexFlatIP(dim)  # inner‑product (dot‑product) search
index.add(embs)
print(f"Indexed {index.ntotal} docs, dim={dim}")

# ---------------------------------------------------------------------
# 4. DIME utilities ----------------------------------------------------
# ---------------------------------------------------------------------

def softmax(x, tau=1.0):
    x = (x / tau) - np.max(x)  # avoid overflow
    return np.exp(x) / np.exp(x).sum()


def dime_filter(q_vec, top_vecs, top_scores, keep_frac=0.6, tau=1.0, weighted=True):
    """Return filtered (masked) query vector q̃."""
    if weighted:
        w = softmax(top_scores, tau)
        centroid = np.average(top_vecs, axis=0, weights=w)
    else:
        centroid = top_vecs.mean(axis=0)

    # Dimension importance & mask
    importance = q_vec * centroid  # element‑wise product
    thresh = np.quantile(importance, 1 - keep_frac)
    q_tilde = np.where(importance >= thresh, q_vec, 0.0)
    return q_tilde


def search(q_vec, k=3):
    q_norm = q_vec / np.linalg.norm(q_vec)  # SBERT vectors are L2‑norm sensitive
    scores, idx = index.search(q_norm[None, :], k)
    return scores[0], idx[0]


# ---------------------------------------------------------------------
# 5. Run everything ----------------------------------------------------
# ---------------------------------------------------------------------
for q in QUERIES:
    print("=== QUERY:", q, "===")
    q_vec = model.encode(q, convert_to_numpy=True)

    # First‑stage retrieval
    scores1, idx1 = search(q_vec, k=5)
    top_vecs = embs[idx1]

    # DIME mask & re‑rank (no 2nd index call) -------------------------
    q_tilde = dime_filter(q_vec, top_vecs, scores1, keep_frac=0.6, tau=0.7)
    reranked_scores = top_vecs @ (q_tilde / np.linalg.norm(q_tilde))
    order = np.argsort(-reranked_scores)

    # Pretty print -----------------------------------------------------
    print("First‑stage vs. DIME rerank:")
    rows = []
    for rank, i in enumerate(idx1):
        rows.append([
            rank + 1,
            ids[i],
            f"{scores1[rank]:.3f}",
            f"{reranked_scores[rank]:.3f}",
            docs[i][:50] + "…",
        ])
    print(tabulate(rows, headers=["Rank", "ID", "Score1", "DIME", "Text"], tablefmt="github"))

Loading SBERT …
Indexed 5 docs, dim=384
=== QUERY: capital of France ===
First‑stage vs. DIME rerank:
|   Rank | ID    |   Score1 |   DIME | Text                                                |
|--------|-------|----------|--------|-----------------------------------------------------|
|      1 | doc_1 |    0.78  |  0.828 | Paris is the capital of France and a major Europea… |
|      2 | doc_5 |    0.528 |  0.602 | The City of Light is a popular nickname for Paris … |
|      3 | doc_3 |    0.428 |  0.497 | Rome, the capital of Italy, hosts the Vatican City… |
|      4 | doc_2 |    0.275 |  0.36  | Berlin is the vibrant capital of Germany, known fo… |
|      5 | doc_4 |    0.255 |  0.341 | Toronto is the largest city in Canada and the capi… |
=== QUERY: German capital city ===
First‑stage vs. DIME rerank:
|   Rank | ID    |   Score1 |   DIME | Text                                                |
|--------|-------|----------|--------|----------------------------------------------------

#Experiment 2

In [3]:
!pip install datasets sentence-transformers numpy tabulate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
Downloading multiprocess-0.70.16-py312-none-any.whl (146 kB)
Downloading xxhash-3.5.0-cp312-cp312-macosx_11_0_arm64.whl (30 kB)
Installing collected packages: xxhash, multiprocess, datasets
Successfully installed datasets-4.0.0 multiprocess-0.70.16 xxhash-3.5.0


In [4]:
!pip install --upgrade datasets


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [5]:
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tabulate import tabulate

# ---- CONFIG ----
N_QUERIES = 50      # Scan this many queries to find an improvement
TOP_K = 5           # 1 positive, 4 negatives
KEEP_FRAC = 0.4
TAU = 0.7

# ---- LOAD DATA ----
dataset = load_dataset("ms_marco", "v2.1")
val = dataset["validation"]
model = SentenceTransformer("all-MiniLM-L6-v2")

def softmax(x, tau=1.0):
    x = (x / tau) - np.max(x)
    return np.exp(x) / np.exp(x).sum()

found = 0
for i in range(N_QUERIES):
    query = val[i]["query"]
    answers = val[i]["answers"]
    passages = val[i]["passages"]["passage_text"]
    is_selected = val[i]["passages"]["is_selected"]

    # Ensure at least one positive and enough negatives
    positives = [txt for sel, txt in zip(is_selected, passages) if sel == 1]
    negatives = [txt for sel, txt in zip(is_selected, passages) if sel == 0]
    if not positives or len(negatives) < (TOP_K - 1):
        continue

    # Build pool: 1 positive, TOP_K-1 negatives
    candidates = [positives[0]] + negatives[:TOP_K-1]
    labels = ["✔️"] + [""] * (TOP_K-1)
    gold_idx = 0  # positive always at position 0

    # --- Encode ---
    cand_vecs = model.encode(candidates, convert_to_numpy=True)
    q_vec = model.encode(query, convert_to_numpy=True)

    # --- Baseline ---
    scores = cand_vecs @ q_vec
    ranked_idx = np.argsort(-scores)
    base_rank = list(ranked_idx).index(gold_idx) + 1

    # --- DIME ---
    top_idx = ranked_idx[:TOP_K]
    top_vecs = cand_vecs[top_idx]
    top_scores = scores[top_idx]
    weights = softmax(top_scores, TAU)
    centroid = np.average(top_vecs, axis=0, weights=weights)
    importance = q_vec * centroid
    keep_count = int(len(importance) * KEEP_FRAC)
    keep_dims = np.argsort(importance)[-keep_count:]
    mask = np.zeros_like(importance)
    mask[keep_dims] = 1
    q_dime = q_vec * mask
    dime_scores = cand_vecs @ q_dime
    dime_ranked_idx = np.argsort(-dime_scores)
    dime_rank = list(dime_ranked_idx).index(gold_idx) + 1

    if dime_rank < base_rank:
        print("="*100)
        print(f"Query: {query}")
        print(f"Ground-truth answer: {answers[0] if answers else ''}\n")
        print("BASELINE ranking:")
        print(tabulate(
            [(i+1, candidates[j][:90].replace('\n',' '), labels[j], scores[j])
             for i, j in enumerate(ranked_idx)],
            headers=["Rank", "Passage", "Gold", "Score"], tablefmt="github"))
        print("\nDIME RE-RANKING:")
        print(tabulate(
            [(i+1, candidates[j][:90].replace('\n',' '), labels[j], dime_scores[j])
             for i, j in enumerate(dime_ranked_idx)],
            headers=["Rank", "Passage", "Gold", "Score"], tablefmt="github"))
        print(f"\nGold passage baseline position: {base_rank}")
        print(f"Gold passage DIME position: {dime_rank}")
        print("Did DIME improve the rank? YES\n")
        found += 1
    if found >= 1:
        break

if found == 0:
    print(f"Tried {N_QUERIES} examples but did not find a case where DIME improved the rank. Try increasing N_QUERIES or KEEP_FRAC or use TOP_K=10.")


README.md: 0.00B [00:00, ?B/s]

v2.1/validation-00000-of-00001.parquet:   0%|          | 0.00/210M [00:00<?, ?B/s]

v2.1/train-00000-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

v2.1/train-00001-of-00007.parquet:   0%|          | 0.00/240M [00:00<?, ?B/s]

v2.1/train-00002-of-00007.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

v2.1/train-00003-of-00007.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

v2.1/train-00004-of-00007.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

v2.1/train-00005-of-00007.parquet:   0%|          | 0.00/242M [00:00<?, ?B/s]

v2.1/train-00006-of-00007.parquet:   0%|          | 0.00/244M [00:00<?, ?B/s]

v2.1/test-00000-of-00001.parquet:   0%|          | 0.00/204M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/101093 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/808731 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/101092 [00:00<?, ? examples/s]

Query: why did rachel carson write an obligation to endure
Ground-truth answer: Rachel Carson writes The Obligation to Endure because believes that as man tries to eliminate unwanted insects and weeds, however he is actually causing more problems by polluting the environment.

BASELINE ranking:
|   Rank | Passage                                                                                    | Gold   |    Score |
|--------|--------------------------------------------------------------------------------------------|--------|----------|
|      1 | Carson subtly defers her writing in just the right writing technique for it to not be subj |        | 0.802501 |
|      2 | The Obligation to Endure by Rachel Carson Rachel Carson's essay on The Obligation to Endur | ✔️      | 0.793415 |
|      3 | Ashley Deemer. Eastern gateway Community College. Abstract. In the following pages the rea |        | 0.694328 |
|      4 | The essay starts out with the statement “The history of life on earth ha