In [6]:
import logging
import sys
import os

# Send INFO-and-above log records to stdout so they appear in the cell output.
# basicConfig() already attaches one stdout handler to the root logger; the
# extra StreamHandler that used to be added here made every record print twice
# (once formatted as "LEVEL:name:message", once as the bare message).
# force=True replaces any handlers left over from a previous run of this cell,
# so re-executing it does not stack up additional handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [7]:
import warnings
warnings.filterwarnings('ignore')

# Ingestion: read the e-book PDF into a list of llama_index Document objects
from llama_index.core import SimpleDirectoryReader

pdf_reader = SimpleDirectoryReader(
    input_files=["./eBook-How-to-Build-a-Career-in-AI.pdf"]
)
documents = pdf_reader.load_data()

# Quick sanity check: container type, number of loaded documents,
# then the type and preview of the first one.
first_document = documents[0]
print(type(documents), "\n")
print(len(documents), "\n")
print(type(first_document))
print(first_document)

<class 'list'> 

41 

<class 'llama_index.core.schema.Document'>
Doc ID: ac2dc905-fc5f-4fd8-ba49-008f3be81b90
Text: PAGE 1 Founder, DeepLearning.AI Collected Insights from Andrew
Ng How to  Build Your Career in AI A Simple Guide


In [8]:
from llama_index.core import Document

# Concatenate the text of every loaded document into one Document,
# separating the pieces with a blank line.
all_texts = [loaded.text for loaded in documents]
document = Document(text="\n\n".join(all_texts))

print(type(document))

<class 'llama_index.core.schema.Document'>


In [9]:
# Hierarchical node parsing: split the document into a recursive hierarchy of nodes
from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes

# Three levels of chunking, coarse to fine:
#   level 1 (top)    -> chunk size 2048
#   level 2 (middle) -> chunk size 512
#   level 3 (leaf)   -> chunk size 128
# Each level's chunk size is 4x the next one, so a parent holds roughly 4 children.
node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=[2048, 512, 128])

# Full hierarchy (parents and leaves) for the combined document
nodes = node_parser.get_nodes_from_documents([document])

# Only the finest-grained nodes; show one of them as a spot check
leaf_nodes = get_leaf_nodes(nodes)
print(leaf_nodes[30].text)

Of course, I also encourage learning driven by curiosity. If something interests you, go ahead 
and learn it regardless of how useful it might turn out to be!  Maybe this will lead to a creative 
spark or technical breakthrough.
How much math do you need to know to be a machine learning engineer?


In [10]:
# Setting LLM and embedding model
# from llama_index.llms.openai import OpenAI
# from llama_index.core import Settings

# Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# from llama_index.embeddings.openai import OpenAIEmbedding

# Settings.embed_model = OpenAIEmbedding(
#     model="text-embedding-3-small", embed_batch_size=100
# )

In [11]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.pinecone import PineconeVectorStore
from IPython.display import Markdown, display
from llama_index.core import StorageContext
from llama_index.core.storage.docstore import SimpleDocumentStore

# Load environment variables (PINECONE_API_KEY is read below) from a local .env file
load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

import time

index_name = "rag-based-research-paper-assistant"

# Local file that keeps the full node hierarchy (parents + leaves).
# Pinecone only receives the leaf nodes, but AutoMergingRetriever looks up parent
# nodes by id in the index's docstore. Re-parsing the PDF generates new node ids,
# so the hierarchy that matches the stored vectors must be persisted and reloaded.
docstore_path = "./automerging_docstore.json"

existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# Create the serverless index on first use
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        # Must equal the output size of the embedding model in use
        # (assumed 1536 for the OpenAI embedding model — confirm if the model changes)
        dimension=1536,
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    # Block until Pinecone reports the new index as ready
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Connect to the index
pinecone_index = pc.Index(index_name)
time.sleep(1)

# Fetch the stats once and reuse them for both the printout and the branch below
index_stats = pinecone_index.describe_index_stats()
print(index_stats)

# Vector store wrapper around the Pinecone index
vector_store = PineconeVectorStore(
    pinecone_index=pinecone_index,
    text_key="text"
)

if index_stats.total_vector_count == 0:
    # First run: embed + upsert the leaf nodes, keep every node in the docstore
    print("Indexing documents to Pinecone...")
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    storage_context.docstore.add_documents(nodes)
    automerging_index = VectorStoreIndex(
        leaf_nodes,
        storage_context=storage_context,
    )
    # Persist only after the upsert succeeded, so the file's presence implies
    # that the vectors it describes are in Pinecone.
    storage_context.docstore.persist(persist_path=docstore_path)
elif os.path.exists(docstore_path):
    # Later runs: reuse the vectors and restore the matching node hierarchy
    print(f"Using existing index with {index_stats.total_vector_count} vectors")
    storage_context = StorageContext.from_defaults(
        docstore=SimpleDocumentStore.from_persist_path(docstore_path),
        vector_store=vector_store,
    )
    # nodes=[] attaches to the existing vectors without re-embedding anything,
    # while keeping the storage context (and therefore the docstore) we built.
    automerging_index = VectorStoreIndex(
        nodes=[],
        storage_context=storage_context,
    )
else:
    # Vectors exist but the node hierarchy was never saved (e.g. the index was
    # populated before the docstore was persisted). Plain vector retrieval
    # still works; parent lookups for auto-merging will not.
    print(f"Using existing index with {index_stats.total_vector_count} vectors")
    print(
        f"WARNING: {docstore_path} not found - the docstore is empty, so "
        "AutoMergingRetriever cannot resolve parent nodes. Delete the vectors "
        "in the Pinecone index and re-run this cell to rebuild both."
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    automerging_index = VectorStoreIndex.from_vector_store(vector_store)


INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/Users/panda/Desktop/RAG-Based-Research-Paper-Assistant/venv/lib/python3.11/site-packages/pinecone_plugins'])
Discovering subpackages in _NamespacePath(['/Users/panda/Desktop/RAG-Based-Research-Paper-Assistant/venv/lib/python3.11/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
Installing plugin inference into Pinecone
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/Users/panda/Desktop/RAG-Based-Research-Paper-Assistant/venv/lib/python3.11/site-packages/pinecone_plugins'])
Discovering subpackages in _NamespacePath(['/Users/panda/Desktop/RAG-Based-Research-Paper-Assistant/venv/lib/python3.11/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looki

Upserted vectors:   0%|          | 0/110 [00:00<?, ?it/s]

In [12]:
# # Storage and Indexing
# import os
# from llama_index.core import VectorStoreIndex, StorageContext
# from llama_index.core import load_index_from_storage

# # check if the index already exists
# # if not, create a new index
# if not os.path.exists("./merging_index_test1"):
#     # Creating a storage context
#     # A utility container for storing nodes, indices, and vectors
#     storage_context = StorageContext.from_defaults()
#     storage_context.docstore.add_documents(nodes)

#     # Creating a vector store index
#     automerging_index = VectorStoreIndex(
#             leaf_nodes,
#             storage_context=storage_context,
#             # service_context=auto_merging_context
#         )

#     # Persisting the index
#     automerging_index.storage_context.persist(persist_dir="./merging_index_test1")
# else:
#     # Loading the index from storage
#     automerging_index = load_index_from_storage(
#         StorageContext.from_defaults(persist_dir="./merging_index_test1"),
#         # service_context=auto_merging_context
#     )

In [13]:
# Building the Query Engine
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

# Base retriever over the index: fetch the 12 most similar nodes
automerging_retriever = automerging_index.as_retriever(similarity_top_k=12)

# Auto-merging wrapper. Together with the HierarchicalNodeParser hierarchy, it
# replaces retrieved nodes with their parent when a majority of that parent's
# children were retrieved. Parents are looked up via the index's storage context.
retriever = AutoMergingRetriever(
    automerging_retriever,
    automerging_index.storage_context,
    verbose=True,
)

# Rerank the (possibly merged) nodes and keep the best 6
rerank = SentenceTransformerRerank(model="BAAI/bge-reranker-base", top_n=6)

# Query engine = auto-merging retrieval + reranking.
# (Pass `automerging_retriever` instead of `retriever` to skip auto-merging.)
auto_merging_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    node_postprocessors=[rerank],
)

In [15]:
# Other questions that can be tried here:
#   'How do I pick projects to strengthen my AI resume?'
#   "What is the importance of networking in AI?"
question = 'What are the best practices for building a career in AI?'

# Run the question through the auto-merging query engine and show the answer
auto_merging_response = auto_merging_engine.query(question)
print(auto_merging_response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:llama_index.core.retrievers.auto_merging_retriever:> Merging 3 nodes into parent node.
> Parent node id: d704f99c-fcfd-4a69-8ed7-265566a31128.
> Parent node text: LEARNING
PROJECTS
JOB

PAGE 8
Learning Technical 
Skills for a Promising 
AI Career
CHAPTER 2
LEA...

> Merging 3 nodes into parent node.
> Parent node id: d704f99c-fcfd-4a69-8ed7-265566a31128.
> Parent node text: LEARNING
PROJECTS
JOB

PAGE 8
Learning Technical 
Skills for a Promising 
AI Career
CHAPTER 2
LEA...

> Merging 3 nodes into parent node.
> Parent node id: d704f99c-fcfd-4a69-8ed7-265566a31128.
> Parent node text: LEARNING
PROJECTS
JOB

PAGE 8
Learning Technical 
Skills for a Promising 
AI Career
CHAPTER 2
LEA...



Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
Start by learning foundational technical skills such as machine learning models, deep learning, software development, and math relevant to machine learning. Prioritize topic selection and understand core concepts behind machine learning. Additionally, work on projects to gain practical experience and build a portfolio that shows skill progression. Finally, find the right AI job for you by using informational interviews, scoping successful AI projects, and aligning projects with your career goals.
