In [1]:
#!mkdir data
#!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
#!unzip data/UBER.zip -d data

A subdirectory or file data already exists.
'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.


In [2]:

from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [3]:
!pip3 install llama_index

Collecting llama_index
  Downloading llama_index-0.6.9-py3-none-any.whl (403 kB)
     -------------------------------------- 403.9/403.9 kB 2.8 MB/s eta 0:00:00
Collecting openai>=0.26.4
  Downloading openai-0.27.7-py3-none-any.whl (71 kB)
     ---------------------------------------- 72.0/72.0 kB 3.9 MB/s eta 0:00:00
Collecting dataclasses-json
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting langchain>=0.0.154
  Downloading langchain-0.0.174-py3-none-any.whl (869 kB)
     -------------------------------------- 869.7/869.7 kB 6.9 MB/s eta 0:00:00
Collecting fsspec>=2023.5.0
  Downloading fsspec-2023.5.0-py3-none-any.whl (160 kB)
     ------------------------------------- 160.1/160.1 kB 10.0 MB/s eta 0:00:00
Collecting tenacity<9.0.0,>=8.2.0
  Downloading tenacity-8.2.2-py3-none-any.whl (24 kB)
Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-win_amd64.whl (635 kB)
     -------------------------------------- 635.3/635.3 kB 9.9 MB/s eta 0:00:00
Collec

In [3]:
from llama_index import download_loader, GPTVectorStoreIndex, ServiceContext
from pathlib import Path

Could not import azure.core python package.


### Ingest Unstructured Data Through the Unstructured.io Reader
##### Leverage the capabilities of Unstructured.io HTML parsing. Downloaded through LlamaHub.

In [4]:
years = [2022, 2021, 2020, 2019]

In [6]:
UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

In [10]:
!pip3 install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.16.3-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.16.3


In [8]:
!pip3 install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10


In [7]:
loader = UnstructuredReader()
doc_set = {}
all_docs = []
for year in years:
    year_docs = loader.load_data(file=Path(f'./data/UBER/UBER_{year}.html'), split_documents=False)
    # insert year metadata into each year
    for d in year_docs:
        d.extra_info = {"year": year}
    doc_set[year] = year_docs
    all_docs.extend(year_docs)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [8]:
import os
os.environ["OPENAI_API_KEY"] = 'Api key'

### Setup a Vector Index for each SEC filing
##### We setup a separate vector index for each SEC filing from 2019-2022.

##### We also optionally initialize a "global" index by dumping all files into the vector store.

In [10]:
# initialize simple vector indices + global vector index
# NOTE: don't run this cell if the indices are already loaded! 
index_set = {}
service_context = ServiceContext.from_defaults(chunk_size_limit=512)
for year in years:
    cur_index = GPTVectorStoreIndex.from_documents(doc_set[year], service_context=service_context)
    index_set[year] = cur_index


In [11]:
index_set = {}
for year in years:
    index_set[year] = cur_index

In [12]:
from llama_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from llama_index.indices.composability import ComposableGraph

### Composing a Graph to synthesize answers across 10-K filings (2019-2022)
##### We want our queries to aggregate/synthesize information across all 10-K filings. To do this, we define a List index on top of the 4 vector indices.

In [13]:
# set summary text for each doc
index_summaries = [f"UBER 10-k Filing for {year} fiscal year" for year in years]

In [14]:
# set number of output tokens
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, max_tokens=512))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

In [15]:
# define a list index over the vector indices
# allows us to synthesize information across each index
graph = ComposableGraph.from_indices(
    GPTListIndex, 
    [index_set[y] for y in years], 
    index_summaries=index_summaries,
    service_context=service_context
)

### Setting up the Chatbot Agent
##### We use Langchain to define the outer chatbot abstraction. We use LlamaIndex as a core Tool within this abstraction.

In [16]:
from langchain.agents import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent

from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig

In [17]:
# define a decompose transform
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform
from llama_index.query_engine.transform_query_engine import TransformQueryEngine
decompose_transform = DecomposeQueryTransform(
    llm_predictor, verbose=True
)

# define custom query engines
custom_query_engines = {}
for index in index_set.values():
    query_engine = index.as_query_engine()
    query_engine = TransformQueryEngine(
        query_engine,
        query_transform=decompose_transform,
        transform_extra_info={'index_summary': index.index_struct.summary},
    )
    custom_query_engines[index.index_id] = query_engine
custom_query_engines[graph.root_id] = graph.root_index.as_query_engine(
    response_mode='tree_summarize',
    verbose=True,
)

# construct query engine
graph_query_engine = graph.as_query_engine(custom_query_engines=custom_query_engines)

In [18]:
# index configs
index_configs = []
for y in range(2019, 2023):
    query_engine = index_set[y].as_query_engine(
        similarity_top_k=3,
    )
    tool_config = IndexToolConfig(
        query_engine=query_engine, 
        name=f"Vector Index {y}",
        description=f"useful for when you want to answer queries about the {y} SEC 10-K for Uber",
        tool_kwargs={"return_direct": True, "return_sources": True},
    )
    index_configs.append(tool_config)
    
# graph config
graph_config = IndexToolConfig(
    query_engine=graph_query_engine,
    name=f"Graph Index",
    description="useful for when you want to answer queries that require analyzing multiple SEC 10-K documents for Uber.",
    tool_kwargs={"return_direct": True, "return_sources": True},
    return_sources=True
)

toolkit = LlamaToolkit(
    index_configs=index_configs,
    graph_configs=[graph_config]
)

In [19]:
memory = ConversationBufferMemory(memory_key="chat_history")
llm=OpenAI(temperature=0)
agent_chain = create_llama_chat_agent(
    toolkit,
    llm,
    memory=memory,
    verbose=True
)

In [20]:
agent_chain.run(input="hi, This is deepak jaiswal")



[1m> Entering new AgentExecutor chain...[0m


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


[32;1m[1;3m
Thought: Do I need to use a tool? No
AI: Hi Deepak, nice to meet you! How can I help you today?[0m

[1m> Finished chain.[0m


'Hi Deepak, nice to meet you! How can I help you today?'

In [21]:
agent_chain.run(input="What were some of the biggest risk factors in 2020 for Uber?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: Vector Index 2020
Action Input: Risk Factors[0m
Observation: [33;1m[1;3m{'answer': '\nRisks related to our business include: competition from existing, well-established, and low-cost alternatives; low barriers to entry; low switching costs; and well-capitalized competitors in nearly every major geographic region. We also face risks related to data security and privacy, including potential security breaches, data loss, and undesirable activities on our platform. Additionally, our platform is highly technical, and any undetected errors could adversely affect our business.', 'sources': [{'start': 219104, 'end': 221323, 'ref_doc_id': 'aad2e979-92e6-4344-a5bd-97ffcf56db8b', 'score': 0.8055078648934106}, {'start': 70976, 'end': 73149, 'ref_doc_id': 'aad2e979-92e6-4344-a5bd-97ffcf56db8b', 'score': 0.8006703509795317}, {'start': 259320, 'end': 261573, 'ref_doc_id': 'aad2e979-92e6-4344

"{'answer': '\\nRisks related to our business include: competition from existing, well-established, and low-cost alternatives; low barriers to entry; low switching costs; and well-capitalized competitors in nearly every major geographic region. We also face risks related to data security and privacy, including potential security breaches, data loss, and undesirable activities on our platform. Additionally, our platform is highly technical, and any undetected errors could adversely affect our business.', 'sources': [{'start': 219104, 'end': 221323, 'ref_doc_id': 'aad2e979-92e6-4344-a5bd-97ffcf56db8b', 'score': 0.8055078648934106}, {'start': 70976, 'end': 73149, 'ref_doc_id': 'aad2e979-92e6-4344-a5bd-97ffcf56db8b', 'score': 0.8006703509795317}, {'start': 259320, 'end': 261573, 'ref_doc_id': 'aad2e979-92e6-4344-a5bd-97ffcf56db8b', 'score': 0.7939831430277798}]}"

In [24]:
cross_query_str = (
    "Compare/contrast the risk factors described in the Uber 10-K across years. Give answer in bullet points."
)
agent_chain.run(input=cross_query_str)



[1m> Entering new AgentExecutor chain...[0m


Retrying langchain.llms.openai.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIConnectionError: Error communicating with OpenAI: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')).


[32;1m[1;3m
AI: The risk factors described in the Uber 10-K have changed over the years. In 2019, the risk factors included competition from existing, well-established, and low-cost alternatives; low barriers to entry; low switching costs; and well-capitalized competitors in nearly every major geographic region. In 2020, the risk factors included data security and privacy risks, such as potential security breaches, data loss, and undesirable activities on the platform. In 2021, the risk factors included the potential for regulatory changes, the impact of the COVID-19 pandemic, and the potential for increased competition. In 2022, the risk factors included the potential for increased competition, the impact of the COVID-19 pandemic, and the potential for regulatory changes.[0m

[1m> Finished chain.[0m


'The risk factors described in the Uber 10-K have changed over the years. In 2019, the risk factors included competition from existing, well-established, and low-cost alternatives; low barriers to entry; low switching costs; and well-capitalized competitors in nearly every major geographic region. In 2020, the risk factors included data security and privacy risks, such as potential security breaches, data loss, and undesirable activities on the platform. In 2021, the risk factors included the potential for regulatory changes, the impact of the COVID-19 pandemic, and the potential for increased competition. In 2022, the risk factors included the potential for increased competition, the impact of the COVID-19 pandemic, and the potential for regulatory changes.'

### Setup Chatbot Loop Within Notebook
##### We'll keep a running loop so that you can converse with the agent.

In [26]:
# reinitialize agent
memory = ConversationBufferMemory(memory_key="chat_history")
llm=OpenAI(temperature=0)
agent_chain = create_llama_chat_agent(
    toolkit,
    llm,
    memory=memory,
)

In [None]:
while True:
    text_input = input("User: ")
    response = agent_chain.run(input=text_input)
    print(f'Agent: {response}')

User: What were some of the legal proceedings against Uber in 2022?
Agent: 
It is not possible to answer this question without prior knowledge.
User: What were some of the legal proceedings against Uber in 2022?
Agent: 
Legal proceedings against Uber include the Australia Class Action, AB5 Lawsuits and Governmental Inquiries, State Unemployment Tax Proceedings, Google v. Levandowski and Google v. Levandowski & Ron, Copenhagen Criminal Prosecution, Aslam, Farrar, Hoy and Mithu v. Uber B.V., Uber Britannia Ltd. and Uber London Ltd., Malden Transportation v. Uber Technologies, Inc., and Swiss Social Security Reclassification.
User: What were some of the legal proceedings against Uber in 2020?
Agent: Legal proceedings against Uber in 2020 include the California AB5 Lawsuits, the US Department of Justice Antitrust Investigation, the UK Supreme Court Employment Status Case, the Netherlands Supreme Court Employment Status Case, the European Commission Antitrust Investigation, the US Departmen