In [1]:
import logging
import sys
import os

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Loading data (Ingestion)
from llama_index.core import SimpleDirectoryReader
import os

# Create papers directory if it doesn't exist
papers_dir = "./papers"
if not os.path.exists(papers_dir):
    os.makedirs(papers_dir)

# Load all PDF files from the papers directory and its subdirectories
documents = SimpleDirectoryReader(
    input_dir=papers_dir,
    recursive=True,  # Include subdirectories
    filename_as_id=True,  # Use filenames as document IDs
    required_exts=['.pdf']  # Only load PDF files
).load_data()

print(f"Loaded {len(documents)} documents")
print("\nSample document:")
print(f"Document ID: {documents[0].doc_id}")
print(f"Text preview: {documents[0].text[:200]}...")
# documents = SimpleDirectoryReader(
#     input_files=["./eBook-How-to-Build-a-Career-in-AI.pdf"]
# ).load_data()

# print(type(documents), "\n")
# print(len(documents), "\n")
# print(type(documents[0]))
# print(documents[0])

Loaded 134 documents

Sample document:
Document ID: /Users/panda/Desktop/RAG-Based-Research-Paper-Assistant/papers/2502.19587v1.pdf_part_0
Text preview: NeoBERT: A Next-Generation BERT
Lola Le Breton1,2,3 Quentin Fournier2 Mariam El Mezouar4 Sarath Chandar1,2,3,5
1Chandar Research Lab 2Mila – Quebec AI Institute 3Polytechnique Montréal
4Royal Military...


In [3]:
from llama_index.core import Document

document = Document(text="\n\n".join([doc.text for doc in documents]))

print(type(document))

<class 'llama_index.core.schema.Document'>


In [4]:
# Defining the hierarchical node parser
# Splits a document into a recursive hierarchy Nodes
from llama_index.core.node_parser import HierarchicalNodeParser

# top-level nodes - chunk size 2048
# second-level nodes - chunk size 512
# third-level nodes - chunk size 128
# each parent node contains 4 children nodes
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[2048, 512, 128]
)

# Parsing the document into a hierarchy of nodes
nodes = node_parser.get_nodes_from_documents([document])


from llama_index.core.node_parser import get_leaf_nodes

leaf_nodes = get_leaf_nodes(nodes)
print(leaf_nodes[30].text)

Their findings suggested that most language models
were operating in a “depth-inefficiency” regime, where allocating more parameters to width rather than
depth would have improved performance. In contrast, small language models like BERT, RoBERTa, and
NomicBERT are instead in a width-inefficiency regime. To maximize NeoBERT’s parameter efficiency while
ensuring it remains a seamless plug-and-play replacement, we retain the original BERTbase width of 768 and
instead increase its depth to achieve this optimal ratio.
Positional Information Transformers inherently lack the ability to distinguish token positions.


In [5]:
# Setting LLM and ebemdding model
# from llama_index.llms.openai import OpenAI
# from llama_index.core import Settings

# Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# from llama_index.embeddings.openai import OpenAIEmbedding

# Settings.embed_model = OpenAIEmbedding(
#     model="text-embedding-3-small", embed_batch_size=100
# )

In [6]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore
from IPython.display import Markdown, display
from llama_index.core import StorageContext
import time

load_dotenv()
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "rag-based-research-paper-assistant-v2"

existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        name=index_name,
        dimension=1536,
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
pinecone_index = pc.Index(index_name)
time.sleep(1)
# view index stats
print(pinecone_index.describe_index_stats())

# initialize vector store
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    text_key="text"
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Get current index stats
index_stats = pinecone_index.describe_index_stats()

# Only index documents if the index is empty
if index_stats.total_vector_count == 0:
    print("Indexing documents to Pinecone...")
    storage_context.docstore.add_documents(nodes)
    automerging_index = VectorStoreIndex(
        leaf_nodes,
        storage_context=storage_context,
    )
else:
    print(f"Using existing index with {index_stats.total_vector_count} vectors")
    automerging_index = VectorStoreIndex.from_vector_store(vector_store)


{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 990}},
 'total_vector_count': 990,
 'vector_type': 'dense'}
Using existing index with 990 vectors


In [7]:
# # Storage and Indexing
# import os
# from llama_index.core import VectorStoreIndex, StorageContext
# from llama_index.core import load_index_from_storage

# # check if the index already exists
# # if not, create a new index
# if not os.path.exists("./merging_index_test1"):
#     # Creating a storage context
#     # A utility container for storing nodes, indices, and vectors
#     storage_context = StorageContext.from_defaults()
#     storage_context.docstore.add_documents(nodes)

#     # Creating a vector store index
#     automerging_index = VectorStoreIndex(
#             leaf_nodes,
#             storage_context=storage_context,
#             # service_context=auto_merging_context
#         )

#     # Persisting the index
#     automerging_index.storage_context.persist(persist_dir="./merging_index_test1")
# else:
#     # Loading the index from storage
#     automerging_index = load_index_from_storage(
#         StorageContext.from_defaults(persist_dir="./merging_index_test1"),
#         # service_context=auto_merging_context
#     )

In [8]:
# Building the Query Engine
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

# Creating a retriever, get top 12 similar nodes
automerging_retriever = automerging_index.as_retriever(
    similarity_top_k=12
)

# When combined with the HierarchicalNodeParser, this enables us to
    # automatically replace retrieved nodes with their parents 
        # when a majority of children are retrieved.
retriever = AutoMergingRetriever(
    automerging_retriever, 
    automerging_index.storage_context, 
    verbose=True
)

# Reranking the retrieved nodes and returning the top 6
rerank = SentenceTransformerRerank(top_n=6, model="BAAI/bge-reranker-base")

# Creating the query engine
auto_merging_engine = RetrieverQueryEngine.from_args(
    # automerging_retriever, node_postprocessors=[rerank]
    retriever, node_postprocessors=[rerank]
)

In [9]:
auto_merging_response = auto_merging_engine.query(
    'What is Collab-Overcooked"?'
)
print(auto_merging_response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


ValueError: doc_id e56b8151-ae0a-45ce-b69c-76952bb43f57 not found.