In [10]:
import os
import gc
import torch
import chromadb
from chromadb.utils import embedding_functions
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Pipeline)
from sentence_transformers import CrossEncoder


# --- Configuration -----------------------------------------------------------
# Hugging Face model id of the causal LM used by this notebook.
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# The vector store lives in an "arxiv" folder under the current working directory.
DB_PATH = os.path.join(os.getcwd(), "arxiv")

# --- Cleanup -----------------------------------------------------------------
# Run a garbage-collection pass and clear PyTorch's CUDA cache, so that re-running
# the notebook starts from as little leftover memory as possible.
gc.collect()
torch.cuda.empty_cache()

In [11]:
# Persistent Chroma client backed by the directory configured in DB_PATH.
client = chromadb.PersistentClient(path=DB_PATH)

# Sentence-transformer embedding function attached to the collection below.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# Reuse the "Arxiv-Database" collection if it already exists, otherwise create it.
collection = client.get_or_create_collection(
    name="Arxiv-Database",
    embedding_function=embedding_func,
)

def search(query, top_k_retrieval=20, top_k_rerank=5):
    """Retrieve candidate chunks from the collection and re-rank them.

    Relies on the notebook-level ``collection`` and ``cross_encoder`` objects,
    so the cells that create them must have been executed before this
    function is called.

    Args:
        query: Query text; sent to ``collection.query`` and paired with every
            retrieved document for cross-encoder scoring.
        top_k_retrieval: Number of candidates requested from the collection.
        top_k_rerank: Maximum number of re-ranked results to return.

    Returns:
        A list of dicts ordered by descending cross-encoder score. Each dict
        has the keys ``rank`` (1-based), ``score`` (plain ``float``),
        ``source``, ``page`` and ``content``. ``source``/``page`` are ``None``
        when the chunk carries no such metadata. An empty list is returned
        when the collection yields no documents.
    """
    # Stage 1: Semantic Retrieval (Bi-Encoder)
    results = collection.query(
        query_texts=[query],
        n_results=top_k_retrieval
    )

    # Only one query text is sent, so only the first result list is relevant.
    documents = (results['documents'] or [[]])[0]
    metadatas = (results['metadatas'] or [[]])[0]

    # Nothing retrieved: return early instead of scoring an empty batch.
    if not documents:
        return []

    # Stage 2: Re-ranking (Cross-Encoder)
    # Prepare pairs: (Query, Document_Context)
    pairs = [[query, doc] for doc in documents]

    # Predict one relevance score per pair
    scores = cross_encoder.predict(pairs)

    # Sort candidate indices by score (descending)
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    # Stage 3: Return top-k results
    retrieved = []
    for rank, idx in enumerate(ranked_indices[:top_k_rerank], start=1):
        # A chunk may have been stored without metadata (or without these
        # keys); fall back to None rather than aborting the whole search.
        meta = (metadatas[idx] if idx < len(metadatas) else None) or {}
        retrieved.append({
            "rank": rank,
            # Convert to a built-in float so the result is plain Python data.
            "score": float(scores[idx]),
            "source": meta.get('source'),
            "page": meta.get('page'),
            "content": documents[idx]
        })

    return retrieved

    

In [12]:
# 4-bit NF4 quantization settings (double quantization, bfloat16 compute)
# handed to the causal-LM loader below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer and quantized model are both resolved from MODEL_ID.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# NOTE(review): no device / device_map argument is passed, so placement is left
# to the library defaults — confirm this is the intended behaviour.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, quantization_config=bnb_config)

# The cross-encoder is the re-ranker that search() reads as a notebook global.
print("⏳ Loading Cross-Encoder Model...")
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
print("✅ Cross-Encoder Loaded.")


⏳ Loading Cross-Encoder Model...


config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

✅ Cross-Encoder Loaded.


In [14]:
# Example run: retrieve and re-rank context for one question, then print the
# text of every returned hit.
query = "What is the Assessment Misalignment problem in Large Language Models?"
docs = search(query)  # uses search()'s default top-k settings

print("--- Retrieved Context ---")
for doc in docs:
    # NOTE(review): the "\n" sits before the colon, so each hit prints as
    # "Content " followed by ": <text>" on the next line — confirm this layout
    # is intended.
    print(f"Content \n: {doc['content']}")

--- Retrieved Context ---
Content 
: and a certain search overhead. Although our detached alignment loss can mitigate the assessment
misalignment problem without requiring any hyper-parameters, it sometimes falls short in comparison
to the boundary constraint alignment loss, especially in ranking situations. Therefore, how to design
a dynamic boundary constraint without introducing the hyper-parameter is a meaningful question,
which leaves for further work.
REFERENCES
Shourya Aggarwal, Divyanshu Mandowara, Vishwajeet Agrawal, Dinesh Khandelwal, Parag Singla,
and Dinesh Garg. Explanations for CommonsenseQA: New Dataset and Models. In Proceed-
ings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th
International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp.
3050–3065, Online, August 2021. Association for Computational Linguistics. doi: 10.18653/v1/
2021.acl-long.238. URL https://aclanthology.org/2021.acl-long.238.
Conte