In [None]:
!pip install sentence-transformers
!pip install llama-index
!pip install langchain
!pip install faiss-gpu
!pip install openai

# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

# !pip install protobuf
# !pip install --upgrade protobuf

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!pip install pdf2image

In [None]:
from pathlib import Path

from llama_index import (
    GPTVectorStoreIndex,
    ServiceContext,
    StorageContext,
    download_loader,
    load_index_from_storage,
)

years = [2022, 2021, 2020, 2019]
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

loader = UnstructuredReader()

doc_set = {}
all_docs = []

for year in years:
    year_docs = loader.load_data(
        file=Path(f"/content/drive/MyDrive/Colab Notebooks/data/UBER/UBER_{year}.html"),
        split_documents=False,
    )
    for d in year_docs:
        d.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs.extend(year_docs)

In [25]:
from llama_index import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os

os.environ['OPENAI_API_KEY'] = 'YOUR_OPENAI_API_KEY'

embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
)

service_context = ServiceContext.from_defaults(embed_model=embed_model, chunk_size_limit=512)
index_set = {}

for year in years:
  storage_context = StorageContext.from_defaults()
  cur_index = GPTVectorStoreIndex.from_documents(
      doc_set[year],
      service_context=service_context,
      storage_context=storage_context
  )
  index_set[year] = cur_index
  storage_context.persist(persist_dir=f'./storage/{year}')

!pwd
!ls storage

/content
2019  2020  2021  2022	root


In [26]:
from llama_index import (
    GPTListIndex,
    LLMPredictor,
    ServiceContext,
    load_graph_from_storage,
)
from langchain.chat_models import ChatOpenAI
from llama_index.indices.composability import ComposableGraph

index_summaries = [f"UBER 10-k Filing for {year} fiscal year" for year in years]

llm_predictor = LLMPredictor(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=512)
)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)
storage_context = StorageContext.from_defaults()

graph = ComposableGraph.from_indices(
    GPTListIndex,
    [index_set[y] for y in years],
    index_summaries=index_summaries,
    service_context=service_context,
    storage_context=storage_context,
)

root_id = graph.root_id

storage_context.persist(persist_dir=f"./storage/root")

graph = load_graph_from_storage(
    root_id=root_id, service_context=service_context, storage_context=storage_context
)



In [27]:
from langchain.agents import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent
from llama_index.langchain_helpers.agents import (
    LlamaToolkit,
    create_llama_chat_agent,
    IndexToolConfig,
)

In [28]:
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform

decompose_transform = DecomposeQueryTransform(llm_predictor, verbose=True)

In [29]:
from llama_index.query_engine.transform_query_engine import TransformQueryEngine

custom_query_engines = {}

for index in index_set.values():
    query_engine = index.as_query_engine()
    query_engine = TransformQueryEngine(
        query_engine,
        query_transform=decompose_transform,
        transform_extra_info={"index_summary": index.index_struct.summary},
    )
    custom_query_engines[index.index_id] = query_engine

custom_query_engines[graph.root_id] = graph.root_index.as_query_engine(
    response_mode="tree_summarize", verbose=True
)

In [30]:
graph_config = IndexToolConfig(
    query_engine=query_engine,
    name=f"Graph Index",
    description="useful for when you want to answer queries that require analyzing multiple SEC 10-K documents for Uber.",
    tool_kwargs={"return_direct": True},
)

In [31]:
index_configs = []
for y in years:
    query_engine = index_set[y].as_query_engine(similarity_top_k=3)
    tool_config = IndexToolConfig(
        query_engine=query_engine,
        name=f"Vector Index {y}",
        description=f"useful for when you want to answer queries about the {y} SEC 10-K for Uber",
        tool_kwargs={"return_direct": True},
    )
    index_configs.append(tool_config)

In [32]:
toolkit = LlamaToolkit(index_configs=index_configs + [graph_config])

In [33]:
memory = ConversationBufferMemory(memory_key="chat_history")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=512)

agent_chain = create_llama_chat_agent(toolkit, llm, memory=memory, verbose=True)

In [34]:
agent_chain.run(input="hi, i am bob")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? No
AI: Hello Bob! How can I assist you today?[0m

[1m> Finished chain.[0m


'Hello Bob! How can I assist you today?'

In [35]:
agent_chain.run(input="What were some of the biggest risk factors in 2020 for Uber?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: Vector Index 2020
Action Input: "What were some of the biggest risk factors in 2020 for Uber?"[0m
Observation: [38;5;200m[1;3m
Some of the biggest risk factors in 2020 for Uber included:

1. Safety and security risks for Drivers and riders, including potential robbery, assault, violent or fatal attacks, and other criminal acts.

2. Inability to collect the entire service fee for certain cash-based trips.

3. Loss or material modification of credit card acceptance privileges.

4. Compliance risks with respect to a variety of rules and regulations, including anti-money laundering laws.

5. Inability to attract and engage Drivers, consumers, merchants, shippers, and carriers, as well as contractors and consultants that support global operations.

6. Lack of protections and benefits for independent workers.[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


'\nSome of the biggest risk factors in 2020 for Uber included:\n\n1. Safety and security risks for Drivers and riders, including potential robbery, assault, violent or fatal attacks, and other criminal acts.\n\n2. Inability to collect the entire service fee for certain cash-based trips.\n\n3. Loss or material modification of credit card acceptance privileges.\n\n4. Compliance risks with respect to a variety of rules and regulations, including anti-money laundering laws.\n\n5. Inability to attract and engage Drivers, consumers, merchants, shippers, and carriers, as well as contractors and consultants that support global operations.\n\n6. Lack of protections and benefits for independent workers.'

In [36]:
cross_query_str = "Compare/contrast the risk factors described in the Uber 10-K across years. Give answer in bullet points."
agent_chain.run(input=cross_query_str)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: Vector Index 2022
Action Input: Compare/contrast the risk factors described in the Uber 10-K across years.[0m
Observation: [36;1m[1;3m
In the Uber 10-K for 2022, the risk factors discussed include the potential for Drivers to be classified as employees, workers, or quasi-employees instead of independent contractors, the competitive nature of the mobility, delivery, and logistics industries, the potential for operating expenses to increase significantly, the need to attract and maintain a critical mass of Drivers, consumers, merchants, Shippers, and Carriers, the need to retain and attract high-quality personnel, the importance of maintaining and enhancing the brand and reputation, and the need to address historical workplace culture and forward-leaning approach challenges.

In comparison, the risk factors discussed in the Uber 10-K for prior years include the potential for Driv

'\nIn the Uber 10-K for 2022, the risk factors discussed include the potential for Drivers to be classified as employees, workers, or quasi-employees instead of independent contractors, the competitive nature of the mobility, delivery, and logistics industries, the potential for operating expenses to increase significantly, the need to attract and maintain a critical mass of Drivers, consumers, merchants, Shippers, and Carriers, the need to retain and attract high-quality personnel, the importance of maintaining and enhancing the brand and reputation, and the need to address historical workplace culture and forward-leaning approach challenges.\n\nIn comparison, the risk factors discussed in the Uber 10-K for prior years include the potential for Drivers to be classified as employees, workers, or quasi-employees instead of independent contractors, the competitive nature of the mobility, delivery, and logistics industries, the potential for operating expenses to increase significantly, t

In [38]:
while True:
    text_input = input("User: ")
    response = agent_chain.run(input=text_input)
    print(f"Agent: {response}")

User: What were some of the legal proceedings against Uber in 2022?


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: Vector Index 2022
Action Input: "legal proceedings"[0m
Observation: [36;1m[1;3m
Yes, legal proceedings are mentioned in the context information. The context information mentions that Uber is involved in legal proceedings, litigation, claims, and government investigations in the ordinary course of business, as well as class action lawsuits alleging that Uber entities conspired to injure certain participants in the taxi, hire-car, and limousine industries. The context information also mentions that Uber is subject to claims related to the classification of Drivers and the compliance of its business with applicable law, and that Uber is obligated to develop and maintain proper and effective internal controls over financial reporting.[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
Agent: 
Yes, legal proceedings a

KeyboardInterrupt: ignored