In [None]:
!pip install sentence-transformers
!pip install llama-index
!pip install langchain
!pip install faiss-gpu
!pip install openai
!pip install pdf2image

In [2]:
from llama_index import (
    download_loader, GPTVectorStoreIndex, LangchainEmbedding, ServiceContext, StorageContext, LLMPredictor,
    GPTListIndex
)
from pathlib import Path
import os
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from llama_index.indices.composability import ComposableGraph

os.environ['OPENAI_API_KEY'] = ''

UnstructuredReader = download_loader('UnstructuredReader', refresh_cache=True)
loader = UnstructuredReader()

years = [year for year in range(2019, 2023)]


doc_set = {}
all_docs = []
for year in years:
  docs = loader.load_data(
      file=Path(f'/content/drive/MyDrive/Colab Notebooks/data/UBER/UBER_{year}.html'),
      split_documents=False
  )
  for doc in docs:
    doc.extra_info = {'year': year}
  doc_set[year] = docs
  all_docs.extend(docs)

embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
)

llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0, max_tokens=512)
llm_predictor = LLMPredictor(llm=llm)

service_context = ServiceContext.from_defaults(embed_model=embed_model, llm_predictor=llm_predictor, chunk_size_limit=512)

index_set = {}

for year in years:
  storage_context = StorageContext.from_defaults()
  cur_index = GPTVectorStoreIndex.from_documents(
      doc_set[year],
      service_context=service_context,
      storage_context=storage_context
  )
  index_set[year] = cur_index

index_summaries = [f'UBER 10-k Filing for {year} fiscal year' for year in years]

graph_storage_context = StorageContext.from_defaults()

graph = ComposableGraph.from_indices(
    GPTListIndex,
    [index_set[year] for year in years],
    index_summaries=index_summaries,
    service_context=service_context,
    storage_context=graph_storage_context
)

from langchain.agents import Tool, initialize_agent
from langchain.chains.conversation.memory import ConversationBufferMemory
from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform
from llama_index.query_engine.transform_query_engine import TransformQueryEngine

decompose_transform = DecomposeQueryTransform(llm_predictor=llm_predictor, verbose=True)

index_configs = []

for index in index_set.values():
  query_engine = TransformQueryEngine(
      query_engine=index.as_query_engine(similarity_top_k=3),
      query_transform=decompose_transform,
      transform_extra_info={'index_summary': index.index_struct.summary}
  )
  tool_config = IndexToolConfig(
      query_engine=query_engine,
      name=f'Vector Index {year}',
      description=f"useful for when you want to answer queries about the {year} SEC 10-K for Uber",
      tool_kwargs={"return_direct": True},
  )
  index_configs.append(tool_config)

graph_query_engine = graph.root_index.as_query_engine(response_mode='tree_summarize', verbose=True)

graph_config = IndexToolConfig(
    query_engine=graph_query_engine,
    name='Graph Index',
    description='useful for when you want to answer queries that require analyzing multiple SEC 10-K documents for Uber.',
    tool_kwargs={'return_direct': True}
)

toolkit = LlamaToolkit(index_configs=index_configs + [graph_config])

memory = ConversationBufferMemory(memory_key='chat_history')

agent_chain = create_llama_chat_agent(toolkit, llm, memory=memory, verbose=True)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Downloading (…)9e268/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)f2cd19e268/README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading (…)cd19e268/config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)9e268/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

Downloading (…)d19e268/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]



In [3]:
while True:
    text_input = input("User: ")
    response = agent_chain.run(input=text_input)
    print(f'Agent: {response}')

User: hi, i am bob


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? No
AI: Hello Bob! How can I assist you today?[0m

[1m> Finished chain.[0m
Agent: Hello Bob! How can I assist you today?
User: What were some of the biggest risk factors in 2020 for Uber?


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? Yes
Action: Vector Index 2022
Action Input: "What were some of the biggest risk factors in 2020 for Uber?"[0m[33;1m[1;3m> Current query: What were some of the biggest risk factors in 2020 for Uber?
[0m[38;5;200m[1;3m> New query: What were some of the key risks identified by Uber in their 10-k filing for the 2022 fiscal year?
[0m




Observation: [31;1m[1;3mSome of the key risks identified by Uber in their 10-k filing for the 2022 fiscal year include the potential reclassification of drivers as employees, intense competition in the mobility, delivery, and logistics industries, the need to lower fares and offer incentives to remain competitive, significant losses and uncertain profitability, the risk of losing critical mass of platform users, the importance of retaining high-quality personnel, the need to maintain and enhance brand and reputation, operational, compliance, and cultural challenges, diversity and inclusion, investment in new ventures, dependence on large metropolitan areas, the potential failure of autonomous vehicle technologies, security and data privacy breaches, cyberattacks, climate change risks, the impact of outbreaks of contagious diseases, and reliance on third-party marketplaces and software providers.[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
Agent: Some of the key risks identified

KeyboardInterrupt: ignored