In [26]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; a later cell
# reads CO_API_KEY from os.environ, so this must run first.
load_dotenv()


True

In [27]:
from llama_index.core import SimpleDirectoryReader

# Read every file under ./paul_graham/ into Document objects and show how
# many were loaded.
documents = SimpleDirectoryReader(input_dir="./paul_graham/").load_data()
len(documents)

1

In [28]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter configuration: chunks of size 512 with an overlap of 64 between
# neighbouring chunks.
node_parser = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=64,
)

# Turn the loaded documents into the nodes that get stored and indexed below.
nodes = node_parser.get_nodes_from_documents(documents)


In [29]:
# Whitespace-separated word count of the first chunk, as a rough size check.
# NOTE(review): chunk_size is presumably measured in tokens, not words — verify.
len(nodes[0].text.split())

355

In [30]:
from llama_index.vector_stores.deeplake import DeepLakeVectorStore

# Activeloop account and dataset name that together form the hub:// path.
my_activeloop_id = "charanvardhan"
my_activeloop_dataset = "LlamaIndex_paulgraham_essay"
dataset_path = f"hub://{my_activeloop_id}/{my_activeloop_dataset}"

# Create a DeepLake vector store.
# NOTE(review): a stray "pip install deeplake[enterprise]" was pasted into this
# comment; presumably that package is a prerequisite — confirm and install it
# outside the notebook.
# overwrite=False keeps whatever is already stored at dataset_path.
vector_store = DeepLakeVectorStore(
    dataset_path=dataset_path,
    overwrite=False,
)

In [31]:
from llama_index.core import StorageContext

# Bundle the Deep Lake vector store into a storage context, then register the
# parsed nodes in its docstore so they can be looked up by node ID later.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context.docstore.add_documents(nodes)

In [32]:
# Collect the IDs of everything held in the docstore and show a short sample.
all_node_ids = [*storage_context.docstore.docs.keys()]
print(f"Total nodes: {len(all_node_ids)}")
print("First 5 node IDs:", all_node_ids[:5])


Total nodes: 42
First 5 node IDs: ['1ee5f532-adf4-4e5a-9510-75055a85cce6', '555ec155-01bf-414e-ac0c-bd78f40b10eb', 'f918e3df-2b18-4d88-bb0b-e0a9d9e3118f', '4ecb4164-f644-4b27-901d-8bd54121de62', 'd518d469-8525-4711-8f57-b0a55cbc411d']


In [33]:
# Fetch each node back from the docstore by ID, then display the text of the
# first one as a sanity check.
all_nodes = list(map(storage_context.docstore.get_node, all_node_ids))
all_nodes[0].text


'What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an early version of Fortran. You had to type programs on punch cards, then stack

In [34]:
from llama_index.core import VectorStoreIndex

# Build the index over the parsed nodes, using the storage context that wraps
# the Deep Lake vector store.
# NOTE(review): the store was opened with overwrite=False, so re-running this
# cell presumably adds the same nodes to the dataset again — confirm before
# re-executing.
vector_index = VectorStoreIndex(nodes, storage_context=storage_context)

In [35]:
# Streaming query engine over the index; the response is consumed below with
# print_response_stream(). similarity_top_k=10 sets how many nodes are retrieved.
query_engine = vector_index.as_query_engine(streaming=True, similarity_top_k=10)

In [36]:
  # type: ignore

In [37]:
# Exploratory introspection: list the attributes available on the query engine.
# Nothing later in the notebook depends on this cell.
dir(query_engine)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_apply_node_postprocessors',
 '_aquery',
 '_as_query_component',
 '_get_prompt_modules',
 '_get_prompts',
 '_node_postprocessors',
 '_query',
 '_response_synthesizer',
 '_retriever',
 '_update_prompts',
 '_validate_prompts',
 'aquery',
 'aretrieve',
 'as_query_component',
 'asynthesize',
 'callback_manager',
 'from_args',
 'get_prompts',
 'query',
 'retrieve',
 'retriever',
 'synthesize',
 'update_prompts',
 'with_retriever']

In [38]:
# Ask a question through the streaming engine and print the answer as it arrives.
streaming_response = query_engine.query("What does Paul Graham do?")
streaming_response.print_response_stream()

Paul Graham organizes a Summer Founders Program for undergraduates, where he invites them to apply and selects a few to fund. He also gives talks on startups and provides advice on starting a startup. Additionally, he shares insights on seed funding and the importance of successful startup founders as sources of advice.

## SubQuestion Query Engine

In [39]:
# Rebind `query_engine` to a non-streaming engine over the same index; this is
# the engine that gets wrapped as a tool in the next cell.
query_engine = vector_index.as_query_engine(similarity_top_k=10)

In [40]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

# Expose the essay query engine as a named tool. The tool name shows up in the
# sub-question log; the description is presumably what the engine uses to
# decide which tool should answer each sub-question, so it should be spelled
# correctly.
query_engine_tools = [
    QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="pg_essay",
            description="Paul Graham essay on what I worked on",
        ),
    ),
]

# NOTE: this rebinds `query_engine` from the plain vector engine to the
# sub-question engine that wraps it. Re-running this cell on its own would wrap
# the sub-question engine inside itself; re-create the base engine first.
query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    use_async=True,
)

In [41]:
# Display the engine's repr to confirm `query_engine` is now the SubQuestionQueryEngine.
query_engine

<llama_index.core.query_engine.sub_question_query_engine.SubQuestionQueryEngine at 0x128f13010>

In [42]:
import nest_asyncio

# Allow nested event loops: the sub-question engine was created with
# use_async=True and has to run inside the notebook's already-running loop.
nest_asyncio.apply()


In [43]:
# Compound question: the sub-question engine breaks it into separate
# sub-questions (logged below) and combines their answers into `response`.
response = query_engine.query(
    "How was Paul Grahams life different before, during, and after YC?"
)

Generated 3 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What did Paul Graham work on before Y Combinator?
[0m[1;3;38;2;90;149;237m[pg_essay] Q: What did Paul Graham work on during Y Combinator?
[0m[1;3;38;2;11;159;203m[pg_essay] Q: What did Paul Graham work on after Y Combinator?
[0m[1;3;38;2;11;159;203m[pg_essay] A: After Y Combinator, Paul Graham worked on angel investing.
[0m[1;3;38;2;237;90;200m[pg_essay] A: Before Y Combinator, Paul Graham was involved in projects with Robert and Trevor, and he was also considering angel investing.
[0m[1;3;38;2;90;149;237m[pg_essay] A: Paul Graham worked on scaling startup funding and creating a community of startups through Y Combinator. He also focused on providing support and resources to batches of startups, fostering a network where startups could help each other and even become each other's customers. Additionally, he initially intended to work on three things: hacking, writing essays, and Y Combinator, but as Y Combinator g

In [44]:
# Show the synthesized answer produced from the sub-question results.
print("The final response :\n", response)

The final response :
 Paul Graham's life was focused on projects with Robert and Trevor before Y Combinator, during Y Combinator he worked on scaling startup funding and building a community of startups, and after Y Combinator, he shifted his focus to angel investing.


## Reranking Documents

In [45]:
import cohere

# Get your Cohere API key at www.cohere.com.
# NOTE(review): no key is passed here, so the client presumably reads it from
# the environment (CO_API_KEY is used later in this notebook) — verify.
co = cohere.Client()

# Toy query and candidate passages for the standalone rerank demo.
# NOTE: this rebinds `documents`, which earlier held the loaded essay Document
# objects; re-run the loading cell before re-running the node-parsing cell.
query = "What is the capital of the United States?"
documents = [
    "Carson City is the capital city of the American state of Nevada. At the  2010 United States Census, Carson City had a population of 55,274.",
    "The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean that are a political division controlled by the United States. Its capital is Saipan.",
    "Charlotte Amalie is the capital and largest city of the United States Virgin Islands. It has about 20,000 people. The city is on the island of Saint Thomas.",
    "Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district. ",
    "Capital punishment (the death penalty) has existed in the United States since before the United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states.",
    "North Dakota is a state in the United States. 672,591 people lived in North Dakota in the year 2010. The capital and seat of government is Bismarck.",
]

In [76]:
# Rerank the passages against the query and keep the three best matches.
results = co.rerank(
    model="rerank-english-v2.0",
    query=query,
    documents=documents,
    top_n=3,
)

In [83]:
# Report each reranked hit: its rank, its position in the original `documents`
# list, the passage text, and the relevance score to four decimals.
for rank, item in enumerate(results.results, start=1):
    doc_idx, text = item.index, documents[item.index]
    print(
        f"Document Rank: {rank}, Document Index: {doc_idx}",
        f"Document: {text}",
        f"relevance: {item.relevance_score:.4f}\n",
        sep="\n",
    )

Document Rank: 1, Document Index: 3
Document: Washington, D.C. (also known as simply Washington or D.C., and officially as the District of Columbia) is the capital of the United States. It is a federal district. 
relevance: 0.9800

Document Rank: 2, Document Index: 1
Document: The Commonwealth of the Northern Mariana Islands is a group of islands in the Pacific Ocean that are a political division controlled by the United States. Its capital is Saipan.
relevance: 0.2991

Document Rank: 3, Document Index: 4
Document: Capital punishment (the death penalty) has existed in the United States since before the United States was a country. As of 2017, capital punishment is legal in 30 of the 50 states.
relevance: 0.2849



## Cohere in LlamaIndex

In [93]:
import os
from llama_index.postprocessor.cohere_rerank import CohereRerank

In [94]:
# Reranking postprocessor that keeps the two best nodes; the API key comes
# from the CO_API_KEY environment variable.
cohere_rerank = CohereRerank(
    api_key=os.environ["CO_API_KEY"],
    top_n=2,
)

In [95]:
# Rebind `query_engine`: retrieve ten candidates from the index, then pass them
# through the Cohere reranker before the answer is synthesized.
query_engine = vector_index.as_query_engine(
    similarity_top_k=10,
    node_postprocessors=[cohere_rerank],
)

In [96]:
# Query through the rerank-enabled engine and print the answer.
response = query_engine.query("what did Sam Altman do in this essay?")
print(response)

Sam Altman was approached to become the president of Y Combinator (YC) after the original founders decided to hand over control of the organization to ensure its longevity. Initially hesitant due to his interest in starting a startup focused on nuclear reactors, Sam eventually agreed to take over as president of YC starting with the winter 2014 batch.
