In [170]:
# LLM config

from langchain_openai import ChatOpenAI
from langchain import hub
from os import getenv
from dotenv import load_dotenv
load_dotenv()

# IO-NET

# llm = ChatOpenAI(
#     model = "Qwen/QwQ-32B",
#     base_url= "https://api.intelligence.io.solutions/api/v1",
#     api_key=getenv("OPENAI_API_KEY")
# )

# smol = ChatOpenAI(
#     model = "Qwen/QwQ-32B",
#     base_url= "https://api.intelligence.io.solutions/api/v1",
#     api_key=getenv("OPENAI_API_KEY")
# )

# LMSTUDIO
llm = ChatOpenAI(
    model = "qwen/qwen3-4b",
    base_url = "http://127.0.0.1:1234/v1"
)

smol = ChatOpenAI(
    model = "qwen/qwen3-4b",
    base_url = "http://127.0.0.1:1234/v1"
)

# # OPENROUTER
# smol = ChatOpenAI(
#     model = "qwen/qwen3-8b:free",
#     base_url = "https://openrouter.ai/api/v1",
#     api_key = getenv("OPENROUTER_API_KEY")
# )

# llm = ChatOpenAI(
#     model = "qwen/qwen3-8b:free",
#     base_url = "https://openrouter.ai/api/v1",
#     api_key = getenv("OPENROUTER_API_KEY")
# )

In [163]:
# Embedding config

from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

collection_name = "arxiv-cvpr-main"

embeddings = OllamaEmbeddings(
   model="nomic-embed-text:latest"
)

client = QdrantClient(
    url="http://localhost:6333",
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

In [171]:
search_query = "How is visual hallucination still an issue in LVLMs?"

In [172]:
import json
from pathlib import Path
from collections import Counter
from typing import List, Dict
from langchain_core.prompts import PromptTemplate
import random



def get_rewritten_queries(question: str, llm) -> List[str]:
    """Generate multiple versions of the input question using an LLM."""
    multi_query_template = PromptTemplate.from_template("""You are an AI language model assistant. Your task is to generate three 
    different versions of the given user question to retrieve relevant documents from a vector 
    database of research papers in the field Computer Vision and Pattern Recognition. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide ONLY the alternative questions separated by newlines. Original question: {question}""")
    
    multi_query_chain = multi_query_template | llm
    queries = multi_query_chain.invoke({"question": question}).content.split('\n')
    
    # Clean up queries and add original question
    queries = [q.strip() for q in queries if q.strip()]
    queries.append(question)
    return queries

def get_top_paper_id(queries: List[str], vector_store) -> str:
    """Get the most frequent paper ID from multiple queries."""
    all_results = []
    
    for query in queries:
        results = vector_store.similarity_search(
            query,
            k=1,
            score_threshold=0.4
        )
        if results:
            all_results.append(results[0].metadata['id'])

    print(f"All results: {all_results}")
    # Return most common paper ID if we have results, else None
    if all_results:
        return Counter(all_results).most_common(1)[0][0]
    return None

def get_paper_id_from_search_query(search_query: str, abstracts_vector_store_collection_name) -> str:
    """Get the paper ID from a search query using query reconstruction and similarity search."""

    abstract_vector_store = QdrantVectorStore(
        client=client,
        collection_name=abstracts_vector_store_collection_name,
        embedding=embeddings,
    )
    
    # print(f"\nOriginal Question: {search_query}")
    queries = get_rewritten_queries(search_query, smol)
    # print("Rewritten queries:")
    for i, q in enumerate(queries, 1):
        print(f"{i}. {q}")
    
    # Get most frequent paper ID from all queries
    predicted_paper_id = get_top_paper_id(queries, abstract_vector_store)
    # print(f"Predicted Paper ID for re-written queries: {predicted_paper_id}")

    return predicted_paper_id, queries

In [173]:
paper_id, rewritten_queries = get_paper_id_from_search_query(
    search_query,
    "arxiv-abstracts"
)

print(f"\nPredicted Paper ID: {paper_id}")

1. <think>
2. Okay, the user is asking about why visual hallucinations are still a problem in LVLMs (Large Vision Language Models). I need to come up with three different versions of this question to help retrieve relevant documents from a vector database. The goal is to generate alternative questions that might cover different angles or aspects of the original query, so that the search can find more diverse and relevant papers.
3. First, let me break down the original question. The key points are "visual hallucination" and "LVLMs". The user is interested in understanding why this issue persists. To generate alternative questions, I should consider different perspectives or angles that might be covered in the literature.
4. One angle could be the technical aspects. For example, how do LVLMs generate hallucinations? Maybe looking at factors like training data or model architecture. Another angle could be the impact on performance, like how hallucinations affect tasks like image captioni

In [174]:
from qdrant_client import models

def get_context_for_qa(paper_id: str, rewritten_queries: List[str], vector_store, k : int = 3) -> List[models.Record]:
    """Get context for QA from the vector store based on paper ID and search query."""
    results = []
    for query in rewritten_queries:
        # Perform similarity search with filter for the specific paper ID
        individual_results = vector_store.similarity_search_with_score(
            search_query, 
            k=k, 
            score_threshold=0.4,
            filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="metadata.id",
                        match=models.MatchValue(value=str(paper_id))
                    )
                ]
            )
        )
        if individual_results:
            for doc in individual_results:
                if doc not in results:
                    results.append(doc)
    
    return results

def get_context_for_qa_without_id(rewritten_queries: List[str], vector_store, k : int = 3) -> List[models.Record]:
    """Get context for QA from the vector store based on paper ID and search query."""
    results = []
    for query in rewritten_queries:
        # Perform similarity search with filter for the specific paper ID
        individual_results = vector_store.similarity_search_with_score(
            search_query, 
            k=k, 
            score_threshold=0.4,
        )
        if individual_results:
            for doc in individual_results:
                if doc not in results:
                    results.append(doc)
    
    return results


In [175]:

results = get_context_for_qa_without_id(rewritten_queries, vector_store)
final_context = [res for res, score in results if res.page_content and len(res.page_content) > 0]

final_context

[Document(metadata={'id': '2505.01958', 'title': 'a comprehensive analysis for visual object hallucination in large   vision-language models', 'categories': 'cs.cv cs.cl', 'abstract': 'large vision-language models (lvlms) demonstrate remarkable capabilities in multimodal tasks, but visual object hallucination remains a persistent issue. it refers to scenarios where models generate inaccurate visual object-related information based on the query input, potentially leading to misinformation and concerns about safety and reliability. previous works focus on the evaluation and mitigation of visual hallucinations, but the underlying causes have not been comprehensively investigated. in this paper, we analyze each component of llava-like lvlms -- the large language model, the vision backbone, and the projector -- to identify potential sources of error and their impact. based on our observations, we propose methods to mitigate hallucination for each problematic component. additionally, we deve

In [176]:
# RAG Chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate


# TODO: improve prompt to include more persona
prompt = PromptTemplate.from_template("""You are an expert in CVPR topics and help students to learn by answering questions solely based on the provided context which are taken from research papers in arxiv.

Focus on explaining concepts in detail and substantiate answers with relevant context from the given information.

# Steps

1. **Identify Key Concepts**: Upon receiving a question, pinpoint the core topics within CVPR relevant to the inquiry.
2. **Contextual Analysis**: Thoroughly review the provided context to gather accurate and pertinent information specific to the question.
3. **Detailed Explanation**: Craft a comprehensive explanation, incorporating key details and any relevant examples that illuminate the concept.
4. **Clarification and Depth**: Ensure the response is clear, well-substantiated, and sufficiently detailed to aid student understanding.

# Output Format

- Provide a paragraph elaborating the concept or answering the inquiry.
- Ensure clarity and depth, utilizing examples if applicable.

# Notes

- Always derive the response solely from the given context.
- Ensure terminologies and technical details are accurately explained within the framework of the provided context.
                                      
Context: {context}
Question: {question}
Answer the question based on the context provided above. If the context is not sufficient, say "I don't know" or "I don't have enough information to answer this question." Do not make up answers or provide information not present in the context.                                      
""")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    prompt | llm 
)

response1 = rag_chain.invoke({"question": search_query, "context": format_docs(final_context)})



if("I don't" in response1.content):
    print("Retrying without paper_id...")
    results_without_id = get_context_for_qa_without_id(rewritten_queries, vector_store)
    final_context_without_id = [res for res, score in results if res.page_content and len(res.page_content) > 0]
    response2 = rag_chain.invoke({"question": search_query, "context": format_docs(final_context_without_id)})
    print("Response without paper_id: ", response2.content)
    print("\n\n")
else:
    print("Response with paper_id: ", response1.content)
    print("\n\n")


Response with paper_id:  Visual hallucination remains a significant issue in Large Vision Language Models (LVLMs) due to the inherent challenges in aligning visual and textual information across different model components. The LLM, while capable of generating coherent responses based on image inputs, can still produce inaccurate or fabricated information when the visual and textual components do not align properly. The vision backbone, such as CLIP, is trained on image-caption pairs that are often brief and noisy, leading to a lack of fine-grained details in the learned visual representations. This results in the model being unable to capture subtle differences between images and captions, contributing to hallucinations. Additionally, the projector module struggles to align visual features with textual embeddings, as evidenced by low cosine similarity between caption embeddings and projected image features. These misalignments can cause the model to generate responses that are factuall