In [2]:
# LLM config

from langchain_openai import ChatOpenAI
from langchain import hub
from os import getenv
from dotenv import load_dotenv
load_dotenv()

# IO-NET

# llm = ChatOpenAI(
#     model = "Qwen/QwQ-32B",
#     base_url= "https://api.intelligence.io.solutions/api/v1",
#     api_key=getenv("OPENAI_API_KEY")
# )

# smol = ChatOpenAI(
#     model = "Qwen/QwQ-32B",
#     base_url= "https://api.intelligence.io.solutions/api/v1",
#     api_key=getenv("OPENAI_API_KEY")
# )

# # LMSTUDIO
# llm = ChatOpenAI(
#     model = "qwen3-14b-cvpr-chat-full",
#     base_url = "http://127.0.0.1:1234/v1"
# )

# smol = ChatOpenAI(
#     model = "qwen3-14b-cvpr-chat-full",
#     base_url = "http://127.0.0.1:1234/v1"
# )

# OPENROUTER
# `smol` handles query rewriting and `llm` handles answer generation; right now
# both handles share one endpoint/model configuration, so it is declared once.
_openrouter_settings = dict(
    model="qwen/qwen3-8b:free",
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),  # read from the environment / .env file
)

smol = ChatOpenAI(**_openrouter_settings)
llm = ChatOpenAI(**_openrouter_settings)

In [3]:
# Embedding config

from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Name of the Qdrant collection that is searched for QA context.
collection_name = "arxiv-cvpr-main"

# Embedding model served by Ollama; used to embed every query sent to Qdrant.
embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")

# Client for the Qdrant instance at the URL below.
client = QdrantClient(url="http://localhost:6333")

# LangChain wrapper tying the client, the collection and the embedder together.
vector_store = QdrantVectorStore(
    embedding=embeddings,
    collection_name=collection_name,
    client=client,
)

In [4]:
# The user question that drives the rest of the notebook: it is rewritten into
# several variants, used to pick a paper, and finally answered by the RAG chain.
search_query = "How is visual hallucination still an issue in LVLMs?"

In [5]:
import json
from pathlib import Path
from collections import Counter
from typing import List, Dict
from langchain_core.prompts import PromptTemplate
import random



def get_rewritten_queries(question: str, llm) -> List[str]:
    """Generate multiple versions of the input question using an LLM.

    The model is asked for three alternative phrasings, one per line. Reasoning
    models may prepend their chain of thought wrapped in ``<think>`` tags; that
    text is removed before the answer is split into queries, otherwise every
    line of reasoning would be treated as a search query.

    Args:
        question: The original user question.
        llm: A chat model that can be piped after a prompt template; the
            result of ``invoke`` must expose the generated text as ``.content``.

    Returns:
        The non-empty, whitespace-stripped lines of the model's answer followed
        by the original ``question`` as the last element.
    """
    import re  # local import so this definition does not rely on another cell

    multi_query_template = PromptTemplate.from_template("""You are an AI language model assistant. Your task is to generate three 
    different versions of the given user question to retrieve relevant documents from a vector 
    database of research papers in the field Computer Vision and Pattern Recognition. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide ONLY the alternative questions separated by newlines. Original question: {question}""")

    multi_query_chain = multi_query_template | llm
    answer_text = multi_query_chain.invoke({"question": question}).content

    # 1) Remove well-formed <think>...</think> blocks (they may span many lines).
    answer_text = re.sub(r"<think>.*?</think>", "", answer_text, flags=re.DOTALL)
    # 2) Some responses arrive with only the closing tag left (the opening tag
    #    was stripped upstream): keep just what follows the last closing tag.
    if "</think>" in answer_text:
        answer_text = answer_text.rsplit("</think>", 1)[1]
    # 3) An opening tag with no closing tag means the reasoning never finished
    #    (e.g. truncated output): nothing after it is a usable query.
    if "<think>" in answer_text:
        answer_text = answer_text.split("<think>", 1)[0]

    # Clean up queries and add original question
    queries = [line.strip() for line in answer_text.split('\n') if line.strip()]
    queries.append(question)
    return queries

def get_top_paper_id(queries: List[str], vector_store) -> str:
    """Vote for a paper: each query nominates its single best match.

    Every query is searched with ``k=1`` and ``score_threshold=0.4``; the
    ``metadata['id']`` of each hit is collected, the collected IDs are printed,
    and the ID that occurs most often is returned.

    Args:
        queries: Query strings to search with.
        vector_store: Object exposing ``similarity_search(query, k=...,
            score_threshold=...)`` whose hits carry a ``metadata`` mapping
            with an ``'id'`` key.

    Returns:
        The most frequently nominated paper ID, or ``None`` when no query
        produced a hit (note that the annotation only mentions ``str``).
    """
    nominated_ids = []

    for text in queries:
        hits = vector_store.similarity_search(text, k=1, score_threshold=0.4)
        if not hits:
            # Nothing cleared the score threshold for this query.
            continue
        best_hit = hits[0]
        nominated_ids.append(best_hit.metadata['id'])

    print(f"All results: {nominated_ids}")

    if not nominated_ids:
        return None

    winner, _votes = Counter(nominated_ids).most_common(1)[0]
    return winner

def get_paper_id_from_search_query(search_query: str, abstracts_vector_store_collection_name) -> str:
    """Resolve a search query to a paper ID via query rewriting and voting.

    A vector store is opened on the given collection (reusing the notebook-level
    ``client`` and ``embeddings``), the query is expanded with
    ``get_rewritten_queries`` using the ``smol`` model, every variant is printed
    as a numbered line, and ``get_top_paper_id`` picks the winning paper.

    Args:
        search_query: The user's original question.
        abstracts_vector_store_collection_name: Name of the Qdrant collection
            to search.

    Returns:
        A ``(predicted_paper_id, queries)`` tuple — despite the ``-> str``
        annotation — where ``queries`` is the list of query variants that were
        searched.
    """
    abstract_vector_store = QdrantVectorStore(
        embedding=embeddings,
        collection_name=abstracts_vector_store_collection_name,
        client=client,
    )

    queries = get_rewritten_queries(search_query, smol)

    # Echo the variants (1-based numbering) so the run is easy to inspect.
    numbered_lines = (f"{position}. {variant}" for position, variant in enumerate(queries, start=1))
    for line in numbered_lines:
        print(line)

    # Majority vote across all variants decides the paper.
    predicted_paper_id = get_top_paper_id(queries, abstract_vector_store)

    return (predicted_paper_id, queries)

In [6]:
# Resolve the question to a single paper and keep the query variants
# for the context-retrieval step that follows.
paper_id, rewritten_queries = get_paper_id_from_search_query(
    search_query=search_query,
    abstracts_vector_store_collection_name="arxiv-abstracts",
)

print(f"\nPredicted Paper ID: {paper_id}")

1. What are the underlying causes of visual hallucination in large vision-language models (LVLMs) that persist despite recent advancements?
2. How do limitations in training data or model architecture specifically contribute to visual hallucination in LVLMs, and what are the consequences for their performance?
3. What strategies are currently being explored to address visual hallucination in LVLMs, and why do these approaches still fall short in practical applications?
4. How is visual hallucination still an issue in LVLMs?
All results: [2505.01958, 2505.01958, 2505.01958, 2505.01958]

Predicted Paper ID: 2505.01958


In [37]:
from qdrant_client import models
from langchain_core.documents import Document

def get_context_for_qa(paper_id: str, rewritten_queries: List[str], vector_store, k : int = 3) -> List[tuple]:
    """Collect de-duplicated context chunks for one paper across all query variants.

    Each query is run against ``vector_store`` restricted to points whose
    ``metadata.id`` equals ``str(paper_id)``. Hits are merged in query order and
    a chunk is kept only the first time its ``page_content`` is seen, so the
    score stored with it is the one from its first occurrence.

    Args:
        paper_id: Identifier of the paper to restrict the search to; it is
            converted with ``str()`` before matching.
        rewritten_queries: Query strings to search with.
        vector_store: Object exposing ``similarity_search_with_score(query,
            k=..., score_threshold=..., filter=...)`` and returning
            ``(document, score)`` pairs.
        k: Maximum number of hits requested per query.

    Returns:
        A list of ``(document, score)`` tuples with unique ``page_content``;
        empty when nothing matched.
    """
    # The filter does not depend on the query, so build it once.
    # NOTE(review): the ID is matched as a string; confirm the collection
    # stores `metadata.id` as a string rather than a number.
    paper_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.id",
                match=models.MatchValue(value=str(paper_id))
            )
        ]
    )

    unique_hits = []
    seen_contents = set()  # page_content values already collected

    for query in rewritten_queries:
        hits = vector_store.similarity_search_with_score(
            query,
            k=k,
            score_threshold=0.4,
            filter=paper_filter
        )
        # Every batch (including the first) goes through the same dedup path,
        # and the list returned by the store is never mutated.
        for doc, score in hits or []:
            if doc.page_content in seen_contents:
                continue
            seen_contents.add(doc.page_content)
            unique_hits.append((doc, score))

    return unique_hits

def get_context_for_qa_without_id(rewritten_queries: List[str], vector_store, k : int = 3) -> List[tuple]:
    """Collect de-duplicated context chunks across all query variants, with no paper filter.

    Each query is run against the whole collection behind ``vector_store``.
    Hits are merged in query order and a chunk is kept only the first time its
    ``page_content`` is seen, so the score stored with it is the one from its
    first occurrence.

    Args:
        rewritten_queries: Query strings to search with.
        vector_store: Object exposing ``similarity_search_with_score(query,
            k=..., score_threshold=...)`` and returning ``(document, score)``
            pairs.
        k: Maximum number of hits requested per query.

    Returns:
        A list of ``(document, score)`` tuples with unique ``page_content``;
        empty when nothing matched.
    """
    unique_hits = []
    seen_contents = set()  # page_content values already collected

    for query in rewritten_queries:
        # Unfiltered similarity search over the entire collection.
        hits = vector_store.similarity_search_with_score(
            query,
            k=k,
            score_threshold=0.4,
        )
        # Every batch (including the first) goes through the same dedup path,
        # and the list returned by the store is never mutated.
        for doc, score in hits or []:
            if doc.page_content in seen_contents:
                continue
            seen_contents.add(doc.page_content)
            unique_hits.append((doc, score))

    return unique_hits


In [38]:

# Retrieve chunks restricted to the predicted paper, then keep only the
# documents that actually carry text (the scores are dropped here).
results = get_context_for_qa(paper_id, rewritten_queries, vector_store)
final_context = [doc for doc, _score in results if doc.page_content]

len(final_context)

6

In [185]:
# RAG Chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate


# Answer-generation prompt for the RAG chain. It takes two template variables:
#   {context}  - the retrieved paper excerpts, already joined into one string
#   {question} - the user's question
# The final instruction asks the model to reply "I don't know" / "I don't have
# enough information..." when the context is insufficient; the retry logic
# further down checks the response for the substring "I don't".
# TODO: improve prompt to include more persona
prompt = PromptTemplate.from_template("""You are an expert in CVPR topics and help students to learn by answering questions solely based on the provided context which are taken from research papers in arxiv.

Focus on explaining concepts in detail and substantiate answers with relevant context from the given information.

# Steps

1. **Identify Key Concepts**: Upon receiving a question, pinpoint the core topics within CVPR relevant to the inquiry.
2. **Contextual Analysis**: Thoroughly review the provided context to gather accurate and pertinent information specific to the question.
3. **Detailed Explanation**: Craft a comprehensive explanation, incorporating key details and any relevant examples that illuminate the concept.
4. **Clarification and Depth**: Ensure the response is clear, well-substantiated, and sufficiently detailed to aid student understanding.

# Output Format

- Provide a paragraph elaborating the concept or answering the inquiry.
- Ensure clarity and depth, utilizing examples if applicable.

# Notes

- Always derive the response solely from the given context.
- Ensure terminologies and technical details are accurately explained within the framework of the provided context.
                                      
Context: {context}
Question: {question}
Answer the question based on the context provided above. If the context is not sufficient, say "I don't know" or "I don't have enough information to answer this question." Do not make up answers or provide information not present in the context.                                      
""")

def format_docs(docs):
    """Merge documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    separator = "\n\n"
    chunks = [item.page_content for item in docs]
    return separator.join(chunks)

# RAG pipeline: the filled-in prompt is piped straight into the answer model,
# so invoking the chain returns the model's message (text lives in `.content`).
rag_chain = prompt | llm

# First attempt: answer using only chunks from the predicted paper.
response1 = rag_chain.invoke({"question": search_query, "context": format_docs(final_context)})

# The prompt tells the model to answer "I don't know" / "I don't have enough
# information..." when the context is insufficient; either phrasing contains
# "I don't", which triggers a second attempt over the whole collection.
if "I don't" in response1.content:
    print("Retrying without paper_id...")
    results_without_id = get_context_for_qa_without_id(rewritten_queries, vector_store)
    # Build the fallback context from the *unfiltered* hits. Iterating over the
    # paper-filtered `results` here would re-ask with the context that already failed.
    final_context_without_id = [doc for doc, _score in results_without_id if doc.page_content]
    response2 = rag_chain.invoke({"question": search_query, "context": format_docs(final_context_without_id)})
    print("Response without paper_id: ", response2.content)
    print("\n\n")
else:
    print("Response with paper_id: ", response1.content)
    print("\n\n")


Response with paper_id:   class
</think>

Visual hallucination remains an issue in LVLMs because errors can occur in any of the three key components: the large language model (LLM), the vision backbone, and the projector. The LLM may produce faithful outputs when captions are provided but has limited capacity for perception-based hallucinations without adequate visual input. Errors in the vision backbone—such as misclassifying or missing objects due to reliance on noisy internet captions—can result in incorrect features being extracted from images. Lastly, the projector, responsible for aligning visual and textual spaces, may struggle with low cosine similarity between caption embeddings and projected image features, leading to hallucinations when mapping visual information into language space. These challenges highlight the need for targeted strategies like fine-grained perception-based instruction tuning or contrastive alignment objectives to reduce hallucination in LVLMs.



