In [2]:
import os
import gc
import torch
import chromadb
from chromadb.utils import embedding_functions
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Pipeline)
from sentence_transformers import CrossEncoder

from dotenv import load_dotenv
from openai import OpenAI
from typing import List, Dict
# --- Cleanup ----------------------------------------------------------------
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA
# memory, before anything new is loaded.
gc.collect()
torch.cuda.empty_cache()

# --- Configuration ----------------------------------------------------------
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Vector-store directory: an "arxiv" folder under the current working directory.
DB_PATH = os.path.join(os.getcwd(), "arxiv")

In [3]:
# Open the persistent Chroma store located at DB_PATH.
client = chromadb.PersistentClient(path=DB_PATH)

# Embedding function attached to the collection (sentence-transformers model
# "all-MiniLM-L6-v2").
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

# Fetch the collection, creating it if it does not exist yet.
collection = client.get_or_create_collection(
    name="Arxiv-Database",
    embedding_function=embedding_func,
)

def search(query, top_k_retrieval=20, top_k_rerank=5, store=None, reranker=None):
    """Retrieve candidate chunks for ``query`` and re-rank them.

    The function works in two stages: a vector-store query fetches up to
    ``top_k_retrieval`` candidates, then a cross-encoder scores every
    (query, document) pair and the best ``top_k_rerank`` are returned.

    Args:
        query: The user question as a string.
        top_k_retrieval: Number of candidates requested from the vector store.
        top_k_rerank: Maximum number of re-ranked results to return.
        store: Object exposing ``query(query_texts=..., n_results=...)`` and
            returning a mapping with ``'documents'`` and ``'metadatas'``.
            Defaults to the notebook-level ``collection``.
        reranker: Object exposing ``predict(pairs)`` that returns one score per
            pair. Defaults to the notebook-level ``cross_encoder``.

    Returns:
        A list of dicts ordered from best to worst score, each with the keys
        ``rank`` (1-based), ``score``, ``source``, ``page`` and ``content``.
        The list is empty when the store returns no documents.

    Raises:
        KeyError: If a metadata entry has no ``'source'`` or ``'page'`` key.
    """
    # The globals are looked up at call time, so the cell that defines them
    # only has to run before the first call, not before this definition.
    if store is None:
        store = collection
    if reranker is None:
        reranker = cross_encoder

    # Stage 1: semantic retrieval (bi-encoder)
    results = store.query(query_texts=[query], n_results=top_k_retrieval)
    # One query text was sent, so only the first result list is relevant.
    documents = results['documents'][0]
    metadatas = results['metadatas'][0]

    # Nothing retrieved: return early instead of handing the re-ranker an
    # empty batch.
    if not documents:
        return []

    # Stage 2: re-ranking (cross-encoder) over (query, document) pairs
    pairs = [[query, doc] for doc in documents]
    scores = reranker.predict(pairs)

    # Candidate indices ordered by score, highest first.
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    # Stage 3: keep the top-k results
    return [
        {
            "rank": rank,
            "score": scores[idx],
            "source": metadatas[idx]['source'],
            "page": metadatas[idx]['page'],
            "content": documents[idx],
        }
        for rank, idx in enumerate(ranked_indices[:top_k_rerank], start=1)
    ]
    

In [3]:
# NOTE: the block below is disabled. It would load MODEL_ID locally through
# transformers with 4-bit (NF4, double-quantised) bitsandbytes weights.
# It is kept commented out for reference only.
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     quantization_config=bnb_config,
#     # device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
# )

# Load the re-ranking model. search() reads the module-level name
# `cross_encoder`, so this cell must run before search() is first called.
print("⏳ Loading Cross-Encoder Model...")
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
print("✅ Cross-Encoder Loaded.")


⏳ Loading Cross-Encoder Model...
✅ Cross-Encoder Loaded.


In [15]:
# Smoke-test the retrieval pipeline with a sample question.
query = "What is the Assessment Misalignment problem in Large Language Models?"
docs = search(query)

# Print the text of every returned chunk, best-ranked first.
print("--- Retrieved Context ---")
for doc in docs:
    content = doc['content']
    print(f"Content \n: {content}")

--- Retrieved Context ---
Content 
: the reasoning ability of LLMs by alleviating the assessment misalignment problem caused by VFT.
To address the assessment misalignment problem, in this paper, we propose an alignment fine-tuning
(AFT) paradigm to improve LLM reasoning with three steps: 1) fine-tuning LLMs using COT
training data; 2) generating multiple COT responses for each question using the fine-tuned LLMs,
and categorizing them as positive and negative based on whether they deduce the correct answer;
3) calibrating the scores of positive and negative responses given by LLMs with a novel constraint
alignment (CA) loss. Specifically, the CA loss ensures that all positive scores (the scores of positive
COTs) are larger than negative scores. In addition, the negative scores are protected by a constraint
term, which is proven to be very important in preventing model degradation. Beyond just binary
positive and negative feedback, the CA loss can be seamlessly adapted to ranking situat

In [21]:
# Read a local .env file (if present) into the process environment. RAGGenerator
# falls back to the OPENAI_API_KEY / OPENAI_BASE_URL environment variables.
load_dotenv()

class RAGGenerator:
    """Answer questions with an OpenAI-compatible chat model, grounded in retrieved context."""

    def __init__(self, api_key: str = None, base_url: str = None, model_name: str = "gpt-5-nano:free",
                 domain: str = "arXiv research papers", temperature: float = 0.1):
        """
        Initialize the OpenAI-compatible client.

        Args:
            api_key: Your API Key (use "dummy" for local models like Ollama).
                Falls back to the OPENAI_API_KEY environment variable.
            base_url: The API endpoint (e.g., "http://localhost:11434/v1" for Ollama).
                Falls back to the OPENAI_BASE_URL environment variable.
            model_name: The specific model to target (e.g., "llama3", "gpt-4o")
            domain: Subject area named in the prompt's opening sentence.
            temperature: Sampling temperature sent with every completion request.
        """
        self.client = OpenAI(
            api_key=api_key or os.getenv("OPENAI_API_KEY"),
            base_url=base_url or os.getenv("OPENAI_BASE_URL")
        )
        self.model_name = model_name
        self.domain = domain
        self.temperature = temperature

    def construct_prompt(self, query: str, context_chunks: List[str]) -> str:
        """
        Builds the prompt by combining the user query with retrieved context.

        Args:
            query: The user's question.
            context_chunks: Retrieved text chunks, in the order they should appear.

        Returns:
            The full prompt text, ending with an "### ANSWER:" header line.
        """
        # Join chunks with a clear separator
        context_str = "\n\n---\n\n".join(context_chunks)

        prompt = f"""You are a helpful assistant for {self.domain}.
Answer the user's question based ONLY on the following context.
If the answer is not in the context, say "I don't know."

The context may contain Markdown tables. Please interpret the rows and columns accurately.

### CONTEXT:
{context_str}

### USER QUESTION:
{query}

### ANSWER:
"""
        return prompt

    def generate_answer(self, query: str, context_chunks: List[str]) -> str:
        """
        Sends the prompt to the LLM and returns the response.

        Args:
            query: The user's question.
            context_chunks: Retrieved text chunks used as grounding context.

        Returns:
            The model's answer text, or "" when the response carries no text.
            Failures are not raised: any exception from the request is returned
            as the string "Error during inference: <message>".
        """
        prompt = self.construct_prompt(query, context_chunks)

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "You are a precise technical assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=self.temperature,  # Low by default to keep RAG answers close to the context
            )
            content = response.choices[0].message.content
            # A response can carry no text content; honour the declared str return type.
            return content if content is not None else ""
        except Exception as e:
            return f"Error during inference: {e}"


In [22]:
# Build the generator with its defaults: API key and base URL come from the
# environment, and the default model name is used.
rag = RAGGenerator()

In [31]:
# 1. Retrieve and re-rank the chunks relevant to the question
query = "How does the 'Constraint' objective in AFT prevent model degradation?"
docs = search(query)

# Prefix every chunk with its source so the origin travels with the text.
context = [hit['source'] + "-> " + hit['content'] for hit in docs]

# 2. Generate the grounded response
print(f"Query: {query}\n")
answer = rag.generate_answer(query, context)
print(f"Response:\n{answer}")

Query: How does the 'Constraint' objective in AFT prevent model degradation?

Response:
The Constraint objective keeps negative scores confined to a reasonable range using a constraint term in the CA loss. This bound prevents negative scores from drifting too far, which protects against deterioration of the model’s performance (i.e., prevents model degradation) while still ensuring positive scores exceed negatives.
