In [None]:
# LLM config

from langchain_openai import ChatOpenAI
from langchain import hub
from os import getenv
from dotenv import load_dotenv
# Load environment variables via python-dotenv before reading the API key below.
load_dotenv()

# Chat model settings: the ChatOpenAI client is pointed at a custom base_url
# and authenticated with the OPENAI_API_KEY environment variable.
llm_settings = {
    "model": "meta-llama/Llama-3.3-70B-Instruct",
    "base_url": "https://api.intelligence.io.solutions/api/v1",
    "api_key": getenv("OPENAI_API_KEY"),
}
llm = ChatOpenAI(**llm_settings)

In [None]:
# Text Loading 

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from scraping.metadata import extract_metadata
import pprint

# Folder that holds the source PDFs; joined with file_name below to build the load path.
directory_name = "data"
# The single paper processed by this notebook. The same name is later passed to
# extract_metadata() to build the per-chunk metadata.
file_name = "2505.00312 aware-net_adaptive_weighted_averaging_for_robust_ensemble_network_in_deepfake_detection.pdf"

#TODO: generalize this to automatically split all files from a directory
# from langchain_community.document_loaders import FileSystemBlobLoader
# from langchain_community.document_loaders.generic import GenericLoader
# from langchain_community.document_loaders.parsers import PyMuPDFParser

# loader = GenericLoader(
#     blob_loader=FileSystemBlobLoader(
#         path=directory_name,
#         glob="*.pdf",
#     ),
#     blob_parser=PyMuPDFParser(),
# )
# documents = loader.load()
# print(documents[0].page_content)

# Build the PDF path from the configured folder and file name, then load it.
# extract_tables is set to "markdown" on the loader.
pdf_path = f"{directory_name}/{file_name}"
loader = PyMuPDFLoader(file_path=pdf_path, extract_tables="markdown")
documents = loader.load()

#TODO: clean up the documents by removing references etc.

def clean_arxiv_content(text):
    """Strip bibliography sections, citation markers and LaTeX commands from text.

    Args:
        text: Raw text to clean (e.g. the ``page_content`` of one loaded page).
            ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text with leading and trailing whitespace removed.

    Notes:
        * Everything from a "References" / "Bibliography" / "Works Cited"
          heading (case-insensitive, optional trailing colon, alone on its
          line) to the end of ``text`` is removed. The heading is recognised
          at the very start or very end of ``text`` as well as mid-text.
        * Removing citations leaves the surrounding whitespace untouched, so
          double spaces may remain; whitespace normalisation is disabled below.
    """
    import re

    if not text:
        return ""

    # Remove references, bibliography or works cited sections regardless of case
    # or extra formatting. `(?:\A|\n)` lets the heading be the first line of the
    # text, and `(?:\n.*|\Z)` lets it be the last line.
    text = re.sub(
        r'(?:\A|\n)\s*(?:References|Bibliography|Works Cited)\s*:?[ \t]*(?:\n.*|\Z)',
        '',
        text,
        flags=re.DOTALL | re.IGNORECASE
    )

    # Remove numeric citation markers: [1], [2-5], [2–5] (en dash), [1, 2].
    text = re.sub(r'\[\d+(?:\s*[-\u2013,]\s*\d+)*\]', '', text)
    # Remove author-year citations: (Author, 2023), (Smith et al., 2023),
    # (Smith & Jones, 2021a).
    text = re.sub(r"\([A-Za-z\s.&'\-]+,?\s*\d{4}[a-z]?\)", '', text)

    # Clean LaTeX artifacts.
    # NOTE(review): the first pattern drops the braced argument too, so the text
    # inside e.g. \textbf{...} is lost along with the command — confirm intended.
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', text)  # \textbf{}, \cite{}, etc.
    text = re.sub(r'\\[a-zA-Z]+', '', text)  # \section, \subsection

    # # Remove figure/table references
    # text = re.sub(r'Figure\s+\d+', 'Figure', text, flags=re.IGNORECASE)
    # text = re.sub(r'Table\s+\d+', 'Table', text, flags=re.IGNORECASE)
    # text = re.sub(r'Equation\s+\(\d+\)', '', text, flags=re.IGNORECASE)

    # # Clean extra whitespace
    # text = re.sub(r'\s+', ' ', text)
    # text = re.sub(r'\n\s*\n', '\n\n', text)

    return text.strip()

# Page shown in the before/after preview. Page index 6 is used when the PDF has
# at least seven pages; shorter PDFs fall back to their last page instead of
# raising IndexError. The value is -1 (no preview) when nothing was loaded.
preview_index = min(6, len(documents) - 1)

if preview_index >= 0:
    print("\nBefore cleaning:")
    print(documents[preview_index].page_content)

# Clean every loaded page in place.
# NOTE(review): cleaning runs per page, so a references section that continues
# onto later pages is only cut on the page that carries the heading.
for doc in documents:
    doc.page_content = clean_arxiv_content(doc.page_content)

if preview_index >= 0:
    print("\nAfter cleaning:")
    print(documents[preview_index].page_content)

In [None]:
# Text Splitting

# Split the cleaned pages into overlapping chunks of up to 1000 characters
# (length measured with len), with 200 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200,
    length_function = len,
    add_start_index = True,
)

texts = text_splitter.split_documents(documents)

# Replace the metadata of every chunk with the metadata of the original document.
# Each chunk gets its own copy so that mutating one chunk's metadata later
# cannot silently change all the others.
# NOTE(review): assumes extract_metadata() returns a mapping — confirm.
# NOTE(review): this replacement discards whatever metadata the loader/splitter
# attached (add_start_index=True is requested above) — confirm that is intended.
metadata = extract_metadata(file_name)
for chunk in texts:
    chunk.metadata = dict(metadata)


print(f"Total number of chunks: {len(texts)}")
# print(texts[0].metadata)
if texts:
    print(f"First chunk content: {texts[0].page_content}")

In [None]:
# Embedding config

from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Name of the Qdrant collection that stores the chunk embeddings.
collection_name = "arxiv-reader"

# Embedding model served through Ollama.
embeddings = OllamaEmbeddings(
   model="nomic-embed-text:latest"
)

# Qdrant instance expected at localhost:6333.
client = QdrantClient(
    url="http://localhost:6333",
)

# Create the collection only when it is missing, so the cell can be re-run
# (e.g. Restart & Run All) against a Qdrant instance that already has it.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config={
            "size": 768, # Size of the embedding vector
            "distance": "Cosine"
        }
    )
vector_store = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)

In [None]:

# Add the chunks in `texts` to the vector store.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks again — confirm, and consider passing stable ids.
vector_store.add_documents(texts)

In [None]:
# Retrieval

search_query = "Explain me about the implementation of aware-net in the paper in detail."

# TODO: set a threshold for similarity search score
# TODO: decide a value of k OR make it configurable

# Search options kept together so they are easy to tune in one place.
search_options = {"k": 3, "score_threshold": 0.5}
results = vector_store.similarity_search(search_query, **search_options)

# Show how many chunks came back, then the chunks themselves.
for shown in (len(results), results):
    pprint.pprint(shown)

In [None]:
# RAG Chain
from langchain_core.output_parsers import StrOutputParser

# TODO: improve prompt to include more persona
# Pull the "rlm/rag-prompt" prompt via `hub`; the chain below invokes it with
# the keys "question" and "context".
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents joined by ``"\\n\\n"``; ``""`` for no documents.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# Pipe the prompt into the chat model; invoking the chain returns the model's message.
rag_chain = prompt | llm

# Same question asked twice: once with the retrieved chunks as context, once
# with an empty context, so the two answers can be compared side by side.
inputs_with_context = {"question": search_query, "context": format_docs(results)}
response1 = rag_chain.invoke(inputs_with_context)
print("Response with context: ", response1.content)

print("\n\n")

inputs_without_context = {"question": search_query, "context": ""}
response2 = rag_chain.invoke(inputs_without_context)
print("Response without context: ", response2.content)
