In [None]:
# LLM config

from langchain_openai import ChatOpenAI
from langchain import hub
from os import getenv
from dotenv import load_dotenv
load_dotenv()

# llm = ChatOpenAI(
#     model = "Qwen/QwQ-32B",
#     base_url= "https://api.intelligence.io.solutions/api/v1",
#     api_key=getenv("OPENAI_API_KEY")
# )

smol = ChatOpenAI(
    model = "qwen/qwen3-8b:free",
    base_url = "https://openrouter.ai/api/v1",
    api_key = getenv("OPENROUTER_API_KEY")
)

llm = ChatOpenAI(
    model = "qwen/qwen3-8b:free",
    base_url = "https://openrouter.ai/api/v1",
    api_key = getenv("OPENROUTER_API_KEY")
)

In [None]:
# Text Loading 

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from scraping.metadata import extract_metadata
import pprint

directory_name = "data"
file_name = "2505.00312 aware-net_adaptive_weighted_averaging_for_robust_ensemble_network_in_deepfake_detection.pdf"

#TODO: generalize this to automatically split all files from a directory
# from langchain_community.document_loaders import FileSystemBlobLoader
# from langchain_community.document_loaders.generic import GenericLoader
# from langchain_community.document_loaders.parsers import PyMuPDFParser

# loader = GenericLoader(
#     blob_loader=FileSystemBlobLoader(
#         path=directory_name,
#         glob="*.pdf",
#     ),
#     blob_parser=PyMuPDFParser(),
# )
# documents = loader.load()
# print(documents[0].page_content)

loader = PyMuPDFLoader(
    file_path=f"{directory_name}/{file_name}",
    extract_tables="markdown"
)
documents = loader.load()

#TODO: clean up the documents byremoving references etc.

def clean_arxiv_content(text):

    import re
    # Remove references, bibliography or works cited sections regardless of case or extra formatting
    text = re.sub(
        r'\n\s*(References|Bibliography|Works Cited)\s*:?\s*\n.*', 
        '', 
        text, 
        flags=re.DOTALL | re.IGNORECASE
    )
    
    # Remove citation patterns [1], [2-5], (Author, 2023)
    text = re.sub(r'\[\d+(?:[-,]\s*\d+)*\]', '', text)
    text = re.sub(r'\([A-Za-z\s]+,?\s*\d{4}[a-z]?\)', '', text)
    
    # Clean LaTeX artifacts
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # \textbf{}, \cite{}, etc.
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # \section, \subsection
    
    # # Remove figure/table references
    # text = re.sub(r'Figure\s+\d+', 'Figure', text, flags=re.IGNORECASE)
    # text = re.sub(r'Table\s+\d+', 'Table', text, flags=re.IGNORECASE)
    # text = re.sub(r'Equation\s+\(\d+\)', '', text, flags=re.IGNORECASE)
    
    # Clean extra whitespace
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n\s*\n', '\n\n', text)
    
    return text.strip()

# print("\nBefore cleaning:")
# print(documents[6].page_content)

for doc in documents:
    doc.page_content = clean_arxiv_content(doc.page_content)

# print("\nAfter cleaning:")
# print(documents[6].page_content)

In [None]:
# Text Splitting

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap = 150,
    length_function = len,
    add_start_index = True,
)

texts = text_splitter.split_documents(documents)

# replace metadata of every chunk with the metadata of the original document
metadata = extract_metadata(file_name)
for i, text in enumerate(texts):
    texts[i].metadata = metadata


print(f"Total number of chunks: {len(texts)}")
# print(texts[0].metadata)
print(f"First chunk content: {texts[0].page_content}")

In [54]:
# Embedding config

from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

collection_name = "arxiv-abstracts"

embeddings = OllamaEmbeddings(
   model="nomic-embed-text:latest"
)

client = QdrantClient(
    url="http://localhost:6333",
)

client.create_collection(
    collection_name=collection_name,
    vectors_config={
        "size": 768, # Size of the embedding vector
        "distance": "Cosine"
    }
)
vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

In [55]:
import pandas as pd
from langchain_core.documents import Document
df = pd.read_csv('final_paper_list.csv')

abstr = df['abstract']
ids = df['id']
df = pd.DataFrame({
    "page_content": abstr,
    "metadata": [id for id in ids]
})
df = df.rename(columns={"page_content": "content", "metadata": "metadata"})


abstracts = [Document(page_content=row['content'], metadata={"id": row['metadata']}) for _, row in df.iterrows()]

abstracts

[Document(metadata={'id': 2505.00312}, page_content="deepfake detection has become increasingly important due to the rise of synthetic media, which poses significant risks to digital identity and cyber presence for security and trust. while multiple approaches have improved detection accuracy, challenges remain in achieving consistent performance across diverse datasets and manipulation types. in response, we propose AWARE-NET - a novel two-tier ensemble framework for deepfake detection based on deep learning that hierarchically combines multiple instances of three state-of-the-art architectures: xception, res2net101, and efficientnet-b7. our framework employs a unique approach where each architecture is instantiated three times with different initializations to enhance model diversity, followed by a learnable weighting mechanism that dynamically combines their predictions. unlike traditional fixed-weight ensembles, our first-tier averages predictions within each architecture family to

In [56]:

# Add documents to the vector store
vector_store.add_documents(abstracts)

['f46e2114e96e4da2955c1438f001faa3',
 '7d99f515539c413c84f7e30902162a4a',
 'c66e978a766f46c09a0403ef42cb486f',
 'dd04fdd2e53b43fd8353615c60fdbffa',
 '331abeeb596b49af96afa80b4dd6dcaf',
 '06fdc245d08c4d718eec22da2e5a5ffd',
 '8490a6ddf92e4680a4a303a21d209dab',
 '29a44c22348e48bab29d875139e65817',
 '5780e2cd1aba460980640a1e50e28cf0',
 '4df79840fc8f4e259560384877875bca',
 '6ea0fe3d16c340cfb60ccdd655206f92',
 '23f458a244fb40b78ee572beea62c2cf',
 '439b9c39e8f8484dae7e385536cc01d3',
 '7682fae2fecb459aa17039b88b0b53bd',
 '7debd3796e084a2db5b04101d5e23acf',
 'ddd8502c98b34bbfbb7411ceee5ad83b',
 '7de6bfc70023425cae440530413ded21',
 '0b63516cd0bd4dd6855a59273bdba141',
 '4acf38a6e34f4c27b39acba334bac300',
 'dec60f78ed8243bb85128f0b7fe5ffa0',
 'f08e415888544e72ab8d43dc96098d04',
 '3754a06cf6de423183ab2d19427ca15f']

In [None]:
import json
import os
from pathlib import Path

def evaluate_similarity_search(directory_path="final_datasets"):
    # Get all JSON files from the directory
    json_files = list(Path(directory_path).glob("*.jsonl"))
    print(f"Found {len(json_files)} JSON files in {directory_path}")
    matches = 0
    total = 0
    
    for file_path in json_files:
        paper_id = file_path.stem  # Get filename without extension
        print(f"Processing file: {file_path} with paper ID: {paper_id}")
        
        try:
            # Read JSON file
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            # Handle both single object and list of objects
            if isinstance(data, dict):
                data = [data]
            
            # Process each question in the file
            for item in data:
                if 'question' in item:
                    total += 1
                    question = item['question']
                    
                    # Perform similarity search
                    results = vector_store.similarity_search(
                        question,
                        k=1,
                        score_threshold=0.4
                    )
                    
                    if results:
                        top_result_id = results[0].metadata['id']
                        
                        # Compare paper_id with the top result's ID
                        if str(top_result_id) == paper_id:
                            matches += 1
                            
                    print(f"Question: {question}")
                    print(f"Expected ID: {paper_id}")
                    print(f"Retrieved ID: {top_result_id if results else 'No results'}")
                    print("Match: ", str(top_result_id) == paper_id if results else False)
                    print("-" * 50)
                    
        except json.JSONDecodeError as e:
            print(f"Error parsing file {file_path}: {e}")
            continue
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            continue
    
    # Calculate accuracy
    accuracy = (matches / total) * 100 if total > 0 else 0
    print(f"\nFinal Results:")
    print(f"Total questions processed: {total}")
    print(f"Correct matches: {matches}")
    print(f"Accuracy: {accuracy:.2f}%")

# Run the evaluation
evaluate_similarity_search("/Users/adithyankrishnan/Downloads/arxiv-rag/final_datasets")

In [None]:
sq = "What new benchmarks are introduced in the paper by Liqiang Jing et al. for evaluating hallucination"

results = vector_store.similarity_search(
   sq, 
    k=5, 
    score_threshold=0.4
)
results = [result.metadata for result in results]
print("Search Results:")
for result in results:
    print(result)

In [None]:
import json
from pathlib import Path
from collections import Counter
from typing import List, Dict
from langchain_core.prompts import PromptTemplate
import random

def get_rewritten_queries(question: str, llm) -> List[str]:
    """Generate multiple versions of the input question using an LLM."""
    multi_query_template = PromptTemplate.from_template("""You are an AI language model assistant. Your task is to generate three 
    different versions of the given user question to retrieve relevant documents from a vector 
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search. 
    Provide ONLY the alternative questions separated by newlines. Original question: {question}""")
    
    multi_query_chain = multi_query_template | llm
    queries = multi_query_chain.invoke({"question": question}).content.split('\n')
    
    # Clean up queries and add original question
    queries = [q.strip() for q in queries if q.strip()]
    queries.append(question)
    return queries

def get_top_paper_id_with_rank_aggregation(queries: List[str], vector_store, k: int = 3) -> str:
    """Get the paper ID with the lowest summed rank across multiple rewritten queries."""
    rank_scores = {}

    for query in queries:
        results = vector_store.similarity_search(query, k=k, score_threshold=0.4)
        
        for rank, doc in enumerate(results, start=1):  # rank starts at 1
            paper_id = doc.metadata['id']
            rank_scores[paper_id] = rank_scores.get(paper_id, 0) + rank
    
    if rank_scores:
        # Return the paper with the lowest total rank sum
        return min(rank_scores.items(), key=lambda x: x[1])[0]
    
    return None


def get_top_paper_id(queries: List[str], vector_store) -> str:
    """Get the most frequent paper ID from multiple queries."""
    all_results = []
    
    for query in queries:
        results = vector_store.similarity_search(
            query,
            k=1,
            score_threshold=0.4
        )
        if results:
            all_results.append(results[0].metadata['id'])
    
    # Return most common paper ID if we have results, else None
    if all_results:
        return Counter(all_results).most_common(1)[0][0]
    return None

def evaluate_similarity_search_with_rewriting(directory_path: str = "final_datasets"):
    """Evaluate similarity search with query rewriting."""
    json_files = list(Path(directory_path).glob("*.jsonl"))
    print(f"Found {len(json_files)} JSON files in {directory_path}")
    
    # Tracking metrics
    total = 0
    matches = 0
    results_log: List[Dict] = []

    index = random.randint(0, 50)
    
    for file_path in json_files[:7]:
        paper_id = file_path.stem
        print(f"\nProcessing file: {file_path}")
        
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
            
            if isinstance(data, dict):
                data = [data]
            
            for item in data[index:index+1]:  # Process only one item for brevity
                if 'question' in item:
                    total += 1
                    question = item['question']
                    
                    # Get multiple versions of the question
                    print(f"\nOriginal Question: {question}")
                    queries = get_rewritten_queries(question, smol)
                    print("Rewritten queries:")
                    for i, q in enumerate(queries, 1):
                        print(f"{i}. {q}")
                    
                    # Get most frequent paper ID from all queries
                    predicted_paper_id = get_top_paper_id(queries, vector_store)
                    print(f"Predicted Paper ID for re-written queries: {predicted_paper_id}")
                    
                    # Check if it's a match
                    is_match = str(predicted_paper_id) == paper_id if predicted_paper_id else False
                    if is_match:
                        matches += 1
                    
                    # Log results
                    result = {
                        'original_question': question,
                        'rewritten_queries': queries,
                        'expected_id': paper_id,
                        'predicted_id': predicted_paper_id,
                        'is_match': is_match
                    }
                    results_log.append(result)
                    
                    print(f"Expected ID: {paper_id}")
                    print(f"Predicted ID: {predicted_paper_id}")
                    print(f"Match: {is_match}")
                    print("-" * 50)
                    
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            continue
    
    # Calculate and display metrics
    accuracy = (matches / total) * 100 if total > 0 else 0
    print("\nFinal Results:")
    print(f"Total questions processed: {total}")
    print(f"Correct matches: {matches}")
    print(f"Accuracy: {accuracy:.2f}%")
    
    return results_log

# Run the evaluation
results = evaluate_similarity_search_with_rewriting("/Users/adithyankrishnan/Downloads/arxiv-rag/final_datasets")

In [None]:
# Retrieval
from langchain_core.prompts import PromptTemplate

search_query = "Explain the training implementation of the aware-net model in the paper in detail."

# Multi-query generation

multi_query_template = PromptTemplate.from_template("""You are an AI language model assistant. Your task is to generate three 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide ONLY the alternative questions separated by newlines. Original question: {question}""")

multi_query_chain = (
    multi_query_template | smol
)

def split_queries(queries):
    return queries.split("\n")

multiple_queries = split_queries(multi_query_chain.invoke({"question": search_query}).content)

print("Generated Queries:")
for i, query in enumerate(multiple_queries):
    print(f"{i+1}. {query}")

# TODO: set a threshold for similarity search score
# TODO: decide a vaue of k OR make it configurable

results = vector_store.similarity_search(
    search_query, 
    k=5, 
    score_threshold=0.4
)

# # pprint.pprint(results)

# for i in results:
#     print(i.page_content)
#     print("\n")



In [None]:
# RAG Chain
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate


# TODO: improve prompt to include more persona
prompt = PromptTemplate.from_template("""You are an expert in CVPR topics and help students to learn by answering questions solely based on the provided context which are taken from research papers in arxiv.

Focus on explaining concepts in detail and substantiate answers with relevant context from the given information.

# Steps

1. **Identify Key Concepts**: Upon receiving a question, pinpoint the core topics within CVPR relevant to the inquiry.
2. **Contextual Analysis**: Thoroughly review the provided context to gather accurate and pertinent information specific to the question.
3. **Detailed Explanation**: Craft a comprehensive explanation, incorporating key details and any relevant examples that illuminate the concept.
4. **Clarification and Depth**: Ensure the response is clear, well-substantiated, and sufficiently detailed to aid student understanding.

# Output Format

- Provide a paragraph elaborating the concept or answering the inquiry.
- Ensure clarity and depth, utilizing examples if applicable.

# Notes

- Always derive the response solely from the given context.
- Ensure terminologies and technical details are accurately explained within the framework of the provided context.
                                      
Context: {context}
Question: {question}
Answer the question based on the context provided above. If the context is not sufficient, say "I don't know" or "I don't have enough information to answer this question." Do not make up answers or provide information not present in the context.                                      
""")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    prompt | llm
)

response1 = rag_chain.invoke({"question": search_query, "context": format_docs(results)})
print("Response with context: ", response1.content)

print("\n\n")

response2 = rag_chain.invoke({"question": search_query, "context": ""})
print("Response without context: ", response2.content)
