In [1]:
from gettext import install 
import pip 

In [2]:
import os

In [3]:
from dotenv import load_dotenv 
# Load key/value pairs from a local .env file into the process environment
# (python-dotenv). The os.getenv("openai_api_key") lookup further down relies
# on this having run first.
load_dotenv()

True

In [4]:
# Expose the API key under the upper-case OPENAI_API_KEY name.
# os.getenv() returns None when the variable is missing, and assigning None
# into os.environ raises an opaque "str expected, not NoneType" TypeError, so
# check explicitly and fail with an actionable message instead.
# The upper-case name is accepted as a fallback because environment variable
# names are case-sensitive on POSIX systems.
_api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "No OpenAI API key found: define 'openai_api_key' (or 'OPENAI_API_KEY') "
        "in your .env file or environment before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = _api_key

In [5]:
import llama_index

In [6]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


In [7]:
# Read the contents of the local "data" folder into Document objects and
# echo the resulting list as the cell output.
data_reader = SimpleDirectoryReader("data")
documents = data_reader.load_data()
documents

[Document(id_='e29427d7-9944-40bb-9306-50d321319691', embedding=None, metadata={'file_path': 'data\\Atlanta.txt', 'file_name': 'Atlanta.txt', 'file_type': 'text/plain', 'file_size': 75773, 'creation_date': '2024-02-14', 'last_modified_date': '2024-02-14', 'last_accessed_date': '2024-02-15'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Atlanta ( at-LAN-tə, or  at-LAN-ə) is the capital and most populous city in the U.S. state of Georgia. It is the seat of Fulton County, and a portion of the city extends into neighboring DeKalb County. With a population of 498,715 living within the city limits, Atlanta is the eighth most populous city in the Southeast and 38th most populous city in the United States according to the 2020 U.S. census. It is the core of t

In [8]:
# Build a vector store index over the loaded documents.
index = VectorStoreIndex.from_documents(documents=documents)

In [9]:
# Echo the index object to confirm it was created (shows its repr only).
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x2001c987580>

In [10]:
# Auto retriever
from llama_index.core.query_engine import RetrieverQueryEngine 

In [11]:
# Retriever that returns the 2 most similar nodes, wrapped in a query engine.
retriever = index.as_retriever(similarity_top_k=2)
retriever_query_engine = RetrieverQueryEngine.from_args(retriever=retriever)

# Run one placeholder query through the retriever-backed engine.
response = retriever_query_engine.query("A user query")

In [12]:
# Sub-question query engine

In [13]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

In [14]:
# Wrap the retriever-backed engine as a single named tool.
pg_essay_metadata = ToolMetadata(
    name="pg_essay",
    description="Paul Graham essay on What I Worked On",
)
pg_essay_tool = QueryEngineTool(
    query_engine=retriever_query_engine,
    metadata=pg_essay_metadata,
)
query_engine_tools = [pg_essay_tool]

# Sub-question engine over that tool list, with async execution enabled.
query_engine = SubQuestionQueryEngine.from_defaults(
    use_async=True,
    query_engine_tools=query_engine_tools,
)

In [15]:
# The sub-question engine above is created with use_async=True.
# NOTE(review): nest_asyncio.apply() presumably patches asyncio so that async
# code can run inside the event loop Jupyter already has running — confirm
# against the nest_asyncio documentation.
import nest_asyncio

nest_asyncio.apply()

In [16]:
# Ask one compound question; the log printed below shows how it is broken
# into sub-questions that are each answered by the pg_essay tool.
response = query_engine.query("What was the author's content,theme,plans in the data?")


Generated 3 sub questions.
[1;3;38;2;237;90;200m[pg_essay] Q: What is the content of the essay?
[0m[1;3;38;2;90;149;237m[pg_essay] Q: What is the theme of the essay?
[0m[1;3;38;2;11;159;203m[pg_essay] Q: What are the author's plans mentioned in the essay?
[0m[1;3;38;2;90;149;237m[pg_essay] A: The theme of the essay is the relationship between intelligence and the generation of new ideas.
[0m[1;3;38;2;11;159;203m[pg_essay] A: The author does not mention any specific plans in the essay.
[0m[1;3;38;2;237;90;200m[pg_essay] A: The essay discusses the relationship between intelligence and new ideas. It argues that writing ability is an important ingredient in discovering new ideas and that there is a kind of thinking that one does by writing. The essay also explores other ingredients in having new ideas, such as independent-mindedness and an obsessive interest in a particular topic. It suggests that there are general techniques and collections of techniques for generating new idea

In [17]:
# Show the synthesized final answer as plain text.
print(str(response))

The author's content in the essay is the relationship between intelligence and the generation of new ideas. The theme of the essay is also the relationship between intelligence and the generation of new ideas. However, the author does not mention any specific plans in the essay.


In [18]:
### MULTI-DOCUMENT AGENTS

In [19]:
%pip install llama-index-llms-openai

Note: you may need to restart the kernel to use updated packages.


In [20]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
)
from llama_index.core import SummaryIndex
from llama_index.core.schema import IndexNode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.llms.openai import OpenAI
from llama_index.core.callbacks import CallbackManager

In [21]:
# Wikipedia page titles to download and index, one agent per city.
wiki_titles = ["Toronto", "Seattle", "Chicago"]


In [22]:
from pathlib import Path

import requests

# Download the plain-text extract of each Wikipedia page into data/<title>.txt.
WIKI_API_URL = "https://en.wikipedia.org/w/api.php"

# Create the output directory once, up front; exist_ok keeps re-runs safe.
data_path = Path("data")
data_path.mkdir(exist_ok=True)

for title in wiki_titles:
    # A dedicated name keeps the query `response` from earlier cells intact.
    wiki_response = requests.get(
        WIKI_API_URL,
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,
            "explaintext": True,
        },
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Surface HTTP errors here instead of failing later while parsing the body.
    wiki_response.raise_for_status()

    page = next(iter(wiki_response.json()["query"]["pages"].values()))
    if "extract" not in page:
        raise ValueError(f"Wikipedia returned no text extract for page {title!r}")
    wiki_text = page["extract"]

    # Write as UTF-8 explicitly: the platform default encoding (for example
    # cp1252 on Windows) cannot represent every character in Wikipedia text.
    (data_path / f"{title}.txt").write_text(wiki_text, encoding="utf-8")

In [23]:
# Map each wiki title to the documents loaded from its downloaded text file.
city_docs = {}
for wiki_title in wiki_titles:
    city_reader = SimpleDirectoryReader(input_files=[f"data/{wiki_title}.txt"])
    city_docs[wiki_title] = city_reader.load_data()

In [24]:
import os

# Re-export the API key under the upper-case OPENAI_API_KEY name (this repeats
# the setup done near the top of the notebook).
# os.getenv() returns None when the variable is missing, and assigning None
# into os.environ raises an opaque TypeError, so check explicitly and fail
# with an actionable message instead. The upper-case name is accepted as a
# fallback because environment variable names are case-sensitive on POSIX.
_api_key = os.getenv("openai_api_key") or os.getenv("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "No OpenAI API key found: define 'openai_api_key' (or 'OPENAI_API_KEY') "
        "in your .env file or environment before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = _api_key

In [25]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Global defaults: a temperature-0 chat model for generation and the ada-002
# model for embeddings.
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

In [26]:
from llama_index.agent.openai import OpenAIAgent
from llama_index.core import load_index_from_storage, StorageContext
from llama_index.core.node_parser import SentenceSplitter
import os

node_parser = SentenceSplitter()

# Per-city agents and plain vector query engines, keyed by wiki title.
agents = {}
query_engines = {}

# Nodes from every city, collected for the single-index baseline.
all_nodes = []

# The function-calling LLM is configured identically for every agent, so it
# is created once instead of once per loop iteration.
function_llm = OpenAI(model="gpt-4")

for wiki_title in wiki_titles:
    nodes = node_parser.get_nodes_from_documents(city_docs[wiki_title])
    all_nodes.extend(nodes)

    # Build the vector index on first run and persist it; on later runs load
    # the persisted copy from disk instead of rebuilding it.
    persist_dir = f"./data/{wiki_title}"
    if not os.path.exists(persist_dir):
        vector_index = VectorStoreIndex(nodes)
        vector_index.storage_context.persist(persist_dir=persist_dir)
    else:
        vector_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=persist_dir),
        )

    # build summary index
    summary_index = SummaryIndex(nodes)

    # Define query engines. The `llm` argument must be an LLM instance; the
    # previous code passed the imported `llama_index` module here, which is
    # not an LLM. Use the LLM configured on Settings instead.
    vector_query_engine = vector_index.as_query_engine(llm=Settings.llm)
    summary_query_engine = summary_index.as_query_engine(llm=Settings.llm)

    # define tools
    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name="vector_tool",
                description=(
                    "Useful for questions related to specific aspects of"
                    f" {wiki_title} (e.g. the history, arts and culture,"
                    " sports, demographics, or more)."
                ),
            ),
        ),
        QueryEngineTool(
            query_engine=summary_query_engine,
            metadata=ToolMetadata(
                name="summary_tool",
                description=(
                    "Useful for any requests that require a holistic summary"
                    f" of EVERYTHING about {wiki_title}. For questions about"
                    " more specific sections, please use the vector_tool."
                ),
            ),
        ),
    ]

    # build agent
    agent = OpenAIAgent.from_tools(
        query_engine_tools,
        llm=function_llm,
        verbose=True,
        system_prompt=f"""\
You are a specialized agent designed to answer queries about {wiki_title}.
You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
""",
    )

    agents[wiki_title] = agent
    query_engines[wiki_title] = vector_index.as_query_engine(
        similarity_top_k=2
    )

In [33]:
# Baseline: one vector index over the nodes of all cities, queried with the
# 4 most similar nodes.
base_index = VectorStoreIndex(nodes=all_nodes)
base_query_engine = base_index.as_query_engine(similarity_top_k=4)

In [34]:
# Baseline answer from the combined index, printed as plain text.
baseline_question = "Tell me about the arts and culture in Toronto"
response = base_query_engine.query(baseline_question)
print(response)

Toronto has a vibrant arts and culture scene. The city is home to numerous art galleries and museums, including the Royal Ontario Museum, the Art Gallery of Ontario, and the Gardiner Museum of ceramic art. Toronto also hosts the Ontario Science Centre, the Bata Shoe Museum, and the Textile Museum of Canada. The city has a thriving theatre and performing arts scene, with ballet and dance companies, opera companies, symphony orchestras, and theatres. Notable performance venues include the Four Seasons Centre for the Performing Arts, Roy Thomson Hall, and the Princess of Wales Theatre. Toronto is also a major center for film and television production, with the Toronto International Film Festival being a prominent annual event. The city is known for its festivals, such as Caribana and Pride Week, which attract large numbers of visitors. Overall, Toronto offers a rich and diverse arts and culture experience for residents and tourists alike.
