# Edgar: Comparative Q&A

In [2]:
import os
import sys
import nest_asyncio
nest_asyncio.apply()
from typing import List, Any
import logging


# Resolve the kit directory (parent of the working directory) and the
# repository root (parent of the kit directory).
current_dir = os.getcwd()
kit_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
repo_dir = os.path.abspath(os.path.join(kit_dir, os.pardir))

# Make modules that live under either directory importable.
sys.path.extend([kit_dir, repo_dir])

# Langchain imports
from langchain.vectorstores import Chroma
from langchain.embeddings.huggingface import HuggingFaceInstructEmbeddings
from langchain.chains import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import (
    PromptTemplate,
    load_prompt
)
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.documents.base import Document
from langchain.retrievers.multi_query import MultiQueryRetriever

# Llama index imports
from llama_index import SimpleDirectoryReader, ServiceContext, VectorStoreIndex
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.llms import (
    CustomLLM,
    CompletionResponse,
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.llms.base import llm_completion_callback


#from utils.sambanova_endpoint import SambaNovaEndpoint
from langchain_community.llms.sambanova import SambaStudio, Sambaverse
from utils.model_wrappers.langchain_llms import SambaNovaFastAPI


from dotenv import load_dotenv
# Load environment variables from the .env file at the repository root
load_dotenv(os.path.join(repo_dir,'.env'))

# Configure logging and raise the multi-query retriever logger to INFO
# (this is what prints the "Generated queries" lines further down)
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

## Get the data

In [9]:
# Folder that holds the downloaded 10-K reports
dir_path = f'{kit_dir}/data/sec-edgar-filings/reports'

# Report whether the folder is already present; create it otherwise
if os.path.exists(dir_path):
    print("Directory already exists:", dir_path)
else:
    os.makedirs(dir_path)
    print("Directory created:", dir_path)

Directory created: /Users/rodrigom/ai-starter-kit-snova/edgar_qna/data/sec-edgar-filings/reports


In [10]:
# Download the Uber and Lyft 2021 10-K PDFs from the llama_index example data into dir_path
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf' -O '{dir_path}/uber_2021.pdf'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/lyft_2021.pdf' -O '{dir_path}/lyft_2021.pdf'

--2024-08-09 16:20:23--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/uber_2021.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1880483 (1.8M) [application/octet-stream]
Saving to: ‘/Users/rodrigom/ai-starter-kit-snova/edgar_qna/data/sec-edgar-filings/reports/uber_2021.pdf’


2024-08-09 16:20:23 (16.8 MB/s) - ‘/Users/rodrigom/ai-starter-kit-snova/edgar_qna/data/sec-edgar-filings/reports/uber_2021.pdf’ saved [1880483/1880483]

--2024-08-09 16:20:24--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/10k/lyft_2021.pdf
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting t

## Llama index

### Uber vs Lyft 2021

In [70]:
class SambaNovaLLMWrapper(CustomLLM):
    """llama_index `CustomLLM` adapter around a SambaNova LangChain LLM.

    A new LangChain LLM object is built by `_get_sambanova_llm` on every
    `complete` / `stream_complete` call.
    """

    # Values reported to llama_index through `metadata`.
    # NOTE(review): `num_output` (256) is lower than the 512
    # `max_tokens_to_generate` requested from the endpoint — confirm intended.
    context_window: int = 3900
    num_output: int = 256
    model_name: str = "sambanova_llama7b"

    def _get_sambanova_llm(self):
        """Build and return the LangChain LLM used to serve completions.

        Sambaverse is the active backend; the SambaStudio and
        SambaNovaFastAPI alternatives are kept below, commented out.
        """

        # Using Sambaverse
        llm = Sambaverse(
           sambaverse_model_name='Meta/llama-2-7b-chat-hf',
           model_kwargs={
               'do_sample': False,
               'max_tokens_to_generate': 512,
               'select_expert': 'llama-2-7b-chat-hf',
               'process_prompt': False,
               'temperature': 0.01,
           },
        )

        # Using SambaStudio
        # llm = SambaStudio(
        #     streaming=True,
        #     model_kwargs={
        #         'max_tokens_to_generate': 512,
        #         'select_expert': 'Meta-Llama-3-70B-Instruct',
        #         'process_prompt': False,
        #     },
        # )

        # Using SambaNovaFastAPI
        #llm = SambaNovaFastAPI(
        #        max_tokens = 512,
        #        model= 'llama3-8b',
        #)

        return llm

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Send `prompt` to the wrapped LLM and return its output as a `CompletionResponse`.

        Extra keyword arguments are accepted but not forwarded.
        """
        llm = self._get_sambanova_llm()
        response = llm(prompt)
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(
        self, prompt: str, **kwargs: Any
    ) -> CompletionResponseGen:
        """Yield incremental `CompletionResponse` objects for `prompt`.

        The full completion is requested first and then iterated; each yielded
        item carries the text accumulated so far and the newest piece as `delta`.
        Assumes the LLM call returns a string, in which case each piece is a
        single character — TODO confirm.
        """
        llm = self._get_sambanova_llm()
        llm_response = llm(prompt)
        # The accumulator must exist before the first `+=`; without this line
        # the first token raised UnboundLocalError.
        response = ""
        for token in llm_response:
            response += token
            yield CompletionResponse(text=response, delta=token)

In [40]:
# Instantiate the llama_index LLM wrapper (SambaNovaLLMWrapper) defined above
llm = SambaNovaLLMWrapper()

In [41]:
# Instantiate the embedding model (passed to the llama_index service context)
embedding_model = HuggingFaceInstructEmbeddings(
    query_instruction="Represent the query for retrieval: "
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [71]:
# Declare service context: bundles the LLM and the embedding model for llama_index.
# NOTE: `llm` must be the SambaNovaLLMWrapper instance here; the LangChain section
# later rebinds `llm` to a Sambaverse object, so run this cell before that one.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding_model)

In [43]:
## Load data: read each company's 10-K PDF into llama_index documents
lyft_docs = SimpleDirectoryReader(
    input_files=[os.path.join(kit_dir,"data/sec-edgar-filings/reports/lyft_2021.pdf")]
).load_data()
uber_docs = SimpleDirectoryReader(
    input_files=[os.path.join(kit_dir,"data/sec-edgar-filings/reports/uber_2021.pdf")]
).load_data()

## Build indices: one vector index per company, embedded via the service context
lyft_index = VectorStoreIndex.from_documents(lyft_docs, show_progress=True, service_context=service_context)

uber_index = VectorStoreIndex.from_documents(uber_docs, show_progress=True, service_context=service_context)

## Build query engines (similarity_top_k=3: presumably the 3 most similar nodes per query)
lyft_engine = lyft_index.as_query_engine(similarity_top_k=3)

uber_engine = uber_index.as_query_engine(similarity_top_k=3)

Parsing nodes: 100%|██████████| 238/238 [00:00<00:00, 645.67it/s]
Generating embeddings: 100%|██████████| 350/350 [03:14<00:00,  1.80it/s]
Parsing nodes: 100%|██████████| 307/307 [00:00<00:00, 678.65it/s]
Generating embeddings: 100%|██████████| 419/419 [03:38<00:00,  1.91it/s]


In [44]:
# Instantiate query engine tools: one tool per company engine; the tool name
# (e.g. "uber_10k") is the tag shown next to each sub-question in the output
query_engine_tools = [
    QueryEngineTool(
        query_engine=lyft_engine,
        metadata=ToolMetadata(
            name="lyft_10k",
            description=(
                "Provides information about Lyft financials for year 2021"
            ),
        ),
    ),
    QueryEngineTool(
        query_engine=uber_engine,
        metadata=ToolMetadata(
            name="uber_10k",
            description=(
                "Provides information about Uber financials for year 2021"
            ),
        ),
    ),
]

# Instantiate Sub query engine: splits a comparative question into
# sub-questions that are routed to the tools above
s_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    service_context=service_context
)

In [45]:
## Run queries: two comparative questions answered through the sub-question engine
# NOTE: `response` is overwritten by the second query; only the printed text is kept
response = s_engine.query(
    "Compare and contrast the customer segments and geographies that grew the fastest"
)

print(response)

response = s_engine.query(
    "Compare revenue growth of Uber and Lyft from 2020 to 2021"
)

print(response)

  warn_deprecated(


stream true, handleling requests
Generated 4 sub questions.
[1;3;38;2;237;90;200m[uber_10k] Q: What are the customer segments that grew the fastest for Uber
[0m[1;3;38;2;90;149;237m[uber_10k] Q: What are the geographies that grew the fastest for Uber
[0m[1;3;38;2;11;159;203m[lyft_10k] Q: What are the customer segments that grew the fastest for Lyft
[0m[1;3;38;2;155;135;227m[lyft_10k] Q: What are the geographies that grew the fastest for Lyft
[0mstream true, handleling requests
[1;3;38;2;237;90;200m[uber_10k] A:  The customer segments that grew the fastest for Uber are not explicitly mentioned in the provided context. However, it can be inferred that the active advertising merchants grew to over 170,000 during the fourth quarter of 2021, indicating a significant growth in this segment. Additionally, Uber One, Uber Pass, Eats Pass, and Rides Pass membership programs had over 6 million members, suggesting growth in these segments as well.
[0mstream true, handleling requests
[1;

## Langchain

### Uber vs Lyft 2021

In [46]:
# Text-splitter settings shared by the Uber and Lyft loading cells:
# maximum chunk length and overlap between consecutive chunks
# (presumably measured in characters, the splitter's default — confirm)
chunk_size = 1000
chunk_overlap = 0

In [48]:
# Load uber data and tag every loaded document with company/year metadata
# (the 'company' key is used later by the retriever filter)
loader = PyPDFLoader(os.path.join(kit_dir,"data/sec-edgar-filings/reports/uber_2021.pdf"))
data = loader.load()
for document in data:
    document.metadata['company'] = 'Uber'
    document.metadata['year'] = 2021

# Split the tagged documents into chunks
# NOTE: `loader`, `data` and `text_splitter` are reused by the Lyft cell
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
uber_splits = text_splitter.split_documents(data)

In [55]:
# Check uber splits: display the first three chunks with their metadata
uber_splits[:3]

[Document(metadata={'source': '/Users/rodrigom/ai-starter-kit-snova/edgar_qna/data/sec-edgar-filings/reports/uber_2021.pdf', 'page': 0, 'company': 'Uber', 'year': 2021}, page_content='UNITED STATESSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\n____________________________________________ \nFORM\n 10-K____________________________________________ \n(Mark One)\n☒\n ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the fiscal year ended\n December 31, 2021OR\n☐\n TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the transition period from_____ to _____            \nCommission File Number: 001-38902\n____________________________________________ \nUBER TECHNOLOGIES, INC.\n(Exact name of registrant as specif\nied in its charter)____________________________________________ \nDelaware\n45-2647441 (State or other jurisdiction of inco\nrporation or organization)(I.R.S. Employer Identification No.) 1515 3r

In [49]:
# Load lyft data and tag every loaded document with company/year metadata
# (the 'company' key is used later by the retriever filter)
loader = PyPDFLoader(os.path.join(kit_dir,"data/sec-edgar-filings/reports/lyft_2021.pdf"))
data = loader.load()
for document in data:
    document.metadata['company'] = 'Lyft'
    document.metadata['year'] = 2021

# Split the tagged documents into chunks
# NOTE: this rebinds `loader`, `data` and `text_splitter` from the Uber cell
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
lyft_splits = text_splitter.split_documents(data)

In [54]:
# Check lyft splits: display the first three chunks with their metadata
lyft_splits[:3]

[Document(metadata={'source': '/Users/rodrigom/ai-starter-kit-snova/edgar_qna/data/sec-edgar-filings/reports/lyft_2021.pdf', 'page': 0, 'company': 'Lyft', 'year': 2021}, page_content='UNITED STATESSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-K\n \n(Mark One)\n☒\nANNUAL REPORT PURS UANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934For the fiscal year ended December 31, 2021\nOR\n☐\nTRANSITION REPORT PURS UANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 FOR THE TRANSITION PERIODFROM                      TO\nCommission File Number 001-38846\nLyft, Inc.\n(Exact name of Registrant as specified in i\nts Charter)Delaware\n20-8809830 (State or other jurisdiction of\nincorporation or organization)\n(I.R.S. EmployerIdentification No.)\n185 Berry Street, Suite 5000\nSan Francisco, California\n94107 (Address of principal executive offices)\n(Zip Code) Registrant’s telephone number, including area code: (844) 250\n-2773Securities registered p

In [50]:
# Merge both companies' chunks into one corpus (Uber first, then Lyft)
splits = uber_splits + lyft_splits

# Report how many chunks each filing produced and the combined total
print(len(uber_splits), "uber split docs")
print(len(lyft_splits), "lyft split docs")
print(len(splits), "all docs")

1499 uber split docs
1043 lyft split docs
2542 all docs


In [51]:
# Load embeddings and create vector store: every chunk in `splits` (Uber + Lyft)
# is embedded and indexed in a single Chroma collection
embedding = HuggingFaceInstructEmbeddings(
    query_instruction="Represent the query for retrieval: "
)

vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

load INSTRUCTOR_Transformer
max_seq_length  512


In [74]:
# Create the LangChain LLM object used by the chains below.
# NOTE: this rebinds `llm`, which earlier held the llama_index SambaNovaLLMWrapper.
# Using Sambaverse
llm = Sambaverse(
           sambaverse_model_name='Meta/llama-2-7b-chat-hf',
           model_kwargs={
               'do_sample': False,
               'max_tokens_to_generate': 512,
               'select_expert': 'llama-2-7b-chat-hf',
               'process_prompt': False,
               'temperature': 0.01,
           },
       )

# Using SambaStudio
# llm = SambaStudio(
#             streaming=True,
#             model_kwargs={
#                 'max_tokens_to_generate': 512,
#                 'select_expert': 'Meta-Llama-3-70B-Instruct',
#                 'process_prompt': False,
#             },
#         )

# Using SambaNovaFastAPI 
#llm = SambaNovaFastAPI(
#        max_tokens = 512,
#        model= 'llama3-8b',
#)

In [60]:
# Output parser that turns the raw LLM completion into a list of questions
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Parse newline-separated LLM output, keeping only lines that contain a '?'."""

    def parse(self, text: str) -> List[str]:
        """Return each line of `text` that contains a question mark, stripped of surrounding whitespace."""
        questions = []
        for raw_line in text.strip().split("\n"):
            if '?' not in raw_line:
                continue  # not a question (this also skips blank lines)
            questions.append(raw_line.strip())
        return questions

# Parser instance shared by the query-decomposition chains below
output_parser = LineListOutputParser()

# Testing parser: two indented, numbered questions should come back as two stripped strings
parsing = output_parser.parse("  1. What are the revenue breakdowns for Document 1?\n                       2. What are the revenue breakdowns for Document 2?")
parsing

['1. What are the revenue breakdowns for Document 1?',
 '2. What are the revenue breakdowns for Document 2?']

In [61]:
# Saving and Loading a prompt template: this prompt asks the LLM to break a
# complex query into a list of simpler questions
query_decomposition_prompt = PromptTemplate(
    input_variables=["question"],
    
    template="""Given the following complex query, decompose the query into a list of questions directly and concisely.
    Complex query: {question}
    List of decomposed questions: """,
)


# Round-trip through YAML: write the prompt to the kit's prompts folder, then
# reload it from that file so the chains use the saved version
query_decomposition_prompt.save(os.path.join(kit_dir,'prompts/edgar_comparative_qna-query_decomposition_prompt.yaml'))
query_decomposition_prompt = load_prompt(os.path.join(kit_dir,'prompts/edgar_comparative_qna-query_decomposition_prompt.yaml'))

In [79]:
# Testing a Chain including the parser: the 'text' entry of the result is the
# list of decomposed questions produced by `output_parser`
llm_chain = LLMChain(llm=llm, prompt=query_decomposition_prompt, output_parser=output_parser)
llm_chain.invoke("What are the key risks mentioned in the risk factors section of both Microsoft and Apple's 10-K reports, and how do they differ in terms of potential impact and mitigation strategies?")

{'question': "What are the key risks mentioned in the risk factors section of both Microsoft and Apple's 10-K reports, and how do they differ in terms of potential impact and mitigation strategies?",
 'text': ["1. What are the key risks mentioned in Microsoft's 10-K report?",
  "2. What are the key risks mentioned in Apple's 10-K report?",
  "3. How do the key risks in Microsoft's report differ from the key risks in Apple's report?",
  "4. How do the potential impacts of the key risks in Microsoft's report differ from the potential impacts of the key risks in Apple's report?",
  "5. What are the mitigation strategies mentioned in Microsoft's report to address the key risks?",
  "6. What are the mitigation strategies mentioned in Apple's report to address the key risks?",
  "7. How do the mitigation strategies in Microsoft's report differ from the mitigation strategies in Apple's report?"]}

In [114]:
# Setting up the Chain and MultiqueryRetriever: the chain decomposes the
# question, and the retriever runs each decomposed question against the vector store
llm_chain = LLMChain(llm=llm, prompt=query_decomposition_prompt, output_parser=output_parser)

multiquery_retriever = MultiQueryRetriever(
    # Base retriever: 3 results per query, limited to chunks whose
    # 'company' metadata is Uber or Lyft
    retriever=vectordb.as_retriever(search_kwargs={
        'k': 3,
        'filter': {'$or': [{'company': {'$eq': 'Uber'}}, {'company': {'$eq': 'Lyft'}}]},
    }), 
    llm_chain=llm_chain, 
    # NOTE(review): recent LangChain releases appear to ignore `parser_key` —
    # confirm against the installed version
    parser_key="decomposed_questions", 
    verbose = True
)  

question = "What are the key risks mentioned in the risk factors section of both Uber and Lyft's 10-K reports, and how do they differ in terms of potential impact and mitigation strategies?"

# Testing multiquery results: keep only the first six retrieved documents
multiquery_retrieved_docs = multiquery_retriever.get_relevant_documents(
    query=question
)[:6]
multiquery_retrieved_docs

INFO:langchain.retrievers.multi_query:Generated queries: ["1. What are the key risks mentioned in Uber's 10-K report?", "2. What are the key risks mentioned in Lyft's 10-K report?", "3. How do the key risks mentioned in Uber's 10-K report differ from the key risks mentioned in Lyft's 10-K report?", "4. How do the key risks mentioned in both Uber and Lyft's 10-K reports differ in terms of potential impact?", "5. What are the mitigation strategies mentioned in Uber's 10-K report for the key risks mentioned?", "6. What are the mitigation strategies mentioned in Lyft's 10-K report for the key risks mentioned?", "7. How do the mitigation strategies mentioned in Uber's 10-K report differ from the mitigation strategies mentioned in Lyft's 10-K report?"]


[Document(metadata={'company': 'Uber', 'page': 11, 'source': '/Users/rodrigom/ai-starter-kit-snova/edgar_qna/data/sec-edgar-filings/reports/uber_2021.pdf', 'year': 2021}, page_content='additional discussion, see the risk factor titled “—If we are unable to attract or maintain a critical mass of Drivers, consumers, merchants, shippers, andcarriers, whether\n as a result of competition or other factors, our platform will become less appealing to platform users, and our financial results would be adverselyimpacted.”\n included  in  Part  I,  Item  1A  of  this  Annual  Report  on  Form  10-K  as  well  our  2021  ESG  Report  and  our  2021  People  and  Culture  Report.  Theinformation in these r\neports is not a part of this Form 10-K.Additional Information\nWe\n were  founded  in  2009  and  incorporated  as  Ubercab,  Inc.,  a  Delaware  corporation,  in  July  2010.  In  February  2011,  we  changed  our  name  to  UberTechnologies, Inc. Our principa\nl executive offices are located 

In [115]:
# Define prompt for answering and summarization; it expects two variables:
# {original_question} and {context}
summarization_prompt_template = """You're a helpful assistant. Follow these rules:
1. Use only the information provided in the context section.
2. Provide relevant information to answer the question.
Write an answer to the following question based on the following context information and metadata:
Question:
{original_question}
Context:
{context}
Answer: """
summarization_prompt = PromptTemplate.from_template(summarization_prompt_template)

# Round-trip through YAML: write the prompt to the kit's prompts folder, then reload it from that file
summarization_prompt.save(os.path.join(kit_dir,'prompts/edgar_comparative_qna-answering_and_summarization_prompt.yaml'))
summarization_prompt = load_prompt(os.path.join(kit_dir,'prompts/edgar_comparative_qna-answering_and_summarization_prompt.yaml'))

In [116]:
# Transform the retrieved docs to include metadata in page_content, so the
# company/year/page values become part of the text placed in the prompt context
docs_for_summary = []
for doc in multiquery_retrieved_docs:
    # Keep only the company, year and page entries, rendered as "key: value" pairs
    metadata_str = ", ".join([f"{key}: {value}" for key, value in doc.metadata.items() if key in ("company", "year", "page")])
    extended_page_content = f"Metadata: \"{metadata_str}\", Information: \"{doc.page_content}\""
    # The new Document carries text only; the original metadata dict is not copied over
    extended_doc = Document(page_content=extended_page_content)
    docs_for_summary.append(extended_doc)
docs_for_summary

[Document(page_content='Metadata: "company: Uber, page: 11, year: 2021", Information: "additional discussion, see the risk factor titled “—If we are unable to attract or maintain a critical mass of Drivers, consumers, merchants, shippers, andcarriers, whether\n as a result of competition or other factors, our platform will become less appealing to platform users, and our financial results would be adverselyimpacted.”\n included  in  Part  I,  Item  1A  of  this  Annual  Report  on  Form  10-K  as  well  our  2021  ESG  Report  and  our  2021  People  and  Culture  Report.  Theinformation in these r\neports is not a part of this Form 10-K.Additional Information\nWe\n were  founded  in  2009  and  incorporated  as  Ubercab,  Inc.,  a  Delaware  corporation,  in  July  2010.  In  February  2011,  we  changed  our  name  to  UberTechnologies, Inc. Our principa\nl executive offices are located at 1515 3rd Street, San Francisco, California 94158, and our telephone number is (415) 612-8582.10

In [118]:
# Define StuffDocumentsChain for question answering: the documents are passed
# to the prompt through its {context} variable
# NOTE: `llm_chain` is rebound here to the summarization chain (no output parser)
llm_chain = LLMChain(llm=llm, prompt=summarization_prompt)
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="context", verbose=True)

# Answer the original comparative question from the metadata-augmented documents
response = stuff_chain.invoke({"input_documents": docs_for_summary, 'original_question': question})
print(response['output_text'])



[1m> Entering new StuffDocumentsChain chain...[0m

[1m> Finished chain.[0m

Both Uber and Lyft face similar risks related to driver attraction and retention, as well as potential impacts on their financial results. However, there are some differences in the potential impact and mitigation strategies for these risks.

For Uber, the risk of being unable to attract or maintain a critical mass of drivers is a significant concern, as it could lead to a decrease in the appeal of their platform to users. To mitigate this risk, Uber has implemented various strategies, such as offering incentives to drivers, improving the driver experience, and investing in technology to improve the efficiency and safety of the platform.

For Lyft, the risk of being unable to attract or retain qualified drivers is also a concern, particularly given the competitive nature of the ride-hailing industry. To mitigate this risk, Lyft has implemented strategies such as offering competitive compensation and benef

In [None]:
# Other comparative questions to try; nothing in the notebook reads this list.
# To use one, assign it to `question` and re-run the retrieval and answering cells.
questions = [
    "What are the revenue breakdowns for Uber and Lyft in their respective 10-K reports, and how do they compare in terms of total revenue and revenue from different segments?",
    "What are the key risks mentioned in the risk factors section of both Uber and Lyft's 10-K reports, and how do they differ in terms of potential impact and mitigation strategies?",
    "How do the corporate governance structures of Uber and Lyft, as outlined in their 10-K filings, compare in terms of board composition, executive compensation, and shareholder rights?",
    "What are the major investments and acquisitions disclosed in the investment section of Uber and Lyft's 10-K reports, and how do they reflect each company's strategic priorities and growth strategies?",
    "How do the research and development expenditures disclosed in Uber and Lyft's 10-K reports compare in terms of absolute spending and percentage of revenue, and what insights can be drawn regarding their innovation efforts?",
    "What are the legal proceedings and regulatory issues disclosed in the legal proceedings section of both Uber and Lyft's 10-K filings, and how do they differ in terms of nature, severity, and potential impact on the companies?",
    "How do the financial performance metrics such as net income, operating margins, and cash flow ratios disclosed in Uber and Lyft's 10-K reports compare, and what factors contribute to any observed differences?",
    "What are the geographical revenue breakdowns provided in the geographic segments section of both Uber and Lyft's 10-K reports, and how do they reflect each company's international presence and market diversification?",
    "How do the sustainability initiatives and environmental disclosures in Uber and Lyft's 10-K filings compare, including information on energy consumption, carbon footprint, and supply chain sustainability efforts?",
    "What are the forward-looking statements and risk factors outlined in the Management's Discussion and Analysis (MD&A) sections of Uber and Lyft's 10-K reports, and how do they reflect each company's outlook, challenges, and opportunities in the market?",
]