In [34]:
import os
from tqdm import tqdm

import pandas as pd

import langchain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
 
from langchain_google_vertexai import (
    VertexAI , 
    VertexAIEmbeddings
)
from datasets import load_dataset

In [35]:
# Vertex AI model identifiers, named once so they are easy to find and swap.
LLM_MODEL_NAME = "gemini-1.0-pro-002"
EMBEDDING_MODEL_NAME = "textembedding-gecko@003"

# Text-generation model that will answer the questions.
llm = VertexAI(model_name=LLM_MODEL_NAME)

# Embedding model used to vectorise documents and queries.
embedding_function = VertexAIEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [6]:
# Fetch the financial Q&A dataset by its identifier and display
# its splits as the cell output.
DATASET_ID = "adityarane/financial-qa-dataset"
financial_qa_dataset = load_dataset(DATASET_ID)
financial_qa_dataset

DatasetDict({
    train: Dataset({
        features: ['Questions', 'Answers', 'Contexts', 'Document', 'Page_no', 'Year', 'Sector', 'Entity', 'Document_type', 'Quarter'],
        num_rows: 475
    })
    test: Dataset({
        features: ['Questions', 'Answers', 'Contexts', 'Document', 'Page_no', 'Year', 'Sector', 'Entity', 'Document_type', 'Quarter'],
        num_rows: 53
    })
})

In [8]:
# Materialise each split as a DataFrame and stack them so the corpus contains
# the contexts of BOTH splits. The test frame must come from the 'test' split;
# reading 'train' twice only duplicated the training rows.
dataset_train = pd.DataFrame(financial_qa_dataset['train'])
dataset_test = pd.DataFrame(financial_qa_dataset['test'])
# ignore_index=True gives the combined frame a unique 0..n-1 index instead of
# repeating each split's own row labels.
dataset_df = pd.concat([dataset_train, dataset_test], ignore_index=True)
dataset_df.head()

Unnamed: 0,Questions,Answers,Contexts,Document,Page_no,Year,Sector,Entity,Document_type,Quarter
0,What was the Accumulated other comprehensive ...,"($1,434) million",Year Ended December\n$ in millions 2022 2021 2...,2022-10-k.pdf,page_2,2022,Financial Services,Goldman Sachs,annual report,
1,"In 2021, how much cash did Goldman Sachs use f...","$30,465 million",Year Ended December\n$ in millions 2022 2021 2...,2022-10-k.pdf,page_3,2022,Financial Services,Goldman Sachs,annual report,
2,What was the net earnings of Goldman Sachs in ...,"$11,261 million",Year Ended December\n$ in millions 2022 2021 2...,2022-10-k.pdf,page_3,2022,Financial Services,Goldman Sachs,annual report,
3,What was the depreciation and amortization exp...,"$1,098 million",Table of Contents\nNVIDIA CORPORATION AND SUBS...,4e9abe7b-fdc7-4cd2-8487-dc3a99f30e98.pdf,page_4,2022,Technology,Nvidia,annual report,
4,What was the total Accumulated other equity of...,"€57,829 million",Accumulated other equity\nin € million NoteSub...,BMW-Group-Bericht-2020-EN.pdf,page_7,2020,Automotive,BMW,annual report,


In [20]:
# Keep only the first row for each distinct 'Contexts' value, so every context
# passage appears once; rows are compared on that column alone.
dataset_dedup = dataset_df.drop_duplicates(subset=['Contexts'], keep='first')
dataset_dedup.shape

(88, 10)

In [21]:
# Columns copied from each row into the Document metadata, in this order.
METADATA_COLUMNS = ['Document', 'Page_no', 'Year', 'Sector', 'Document_type']

# One Document per de-duplicated row: the context text is the page content and
# the selected columns travel along as metadata.
docs = [
    Document(
        page_content=record['Contexts'],
        metadata={column: record[column] for column in METADATA_COLUMNS},
    )
    for _, record in dataset_dedup.iterrows()
]


In [25]:
# Build the Chroma vector store from the prepared documents, embedding them
# with the Vertex AI embedding function.
db = Chroma.from_documents(documents=docs, embedding=embedding_function)

In [30]:
# Set retriever: wrap the Chroma store in a retriever, passing no search options
retriever = db.as_retriever()

In [33]:
# Query the retriever.
# A `search_kwargs` argument handed to `invoke` is not applied to the search:
# the result count and score threshold are fixed when the retriever is created,
# and a threshold is only enforced with
# search_type="similarity_score_threshold". Build a dedicated retriever here so
# both settings actually take effect.
query = "In 2021, how much cash did Goldman Sachs use?"
threshold_retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5, "k": 3},
)

# Use a separate name for the hits so the `docs` list that was loaded into
# Chroma is not overwritten (re-running the indexing cell would otherwise
# index the search results instead of the corpus).
retrieved_docs = threshold_retriever.invoke(query)

# print results; the threshold can legitimately filter out every document
if retrieved_docs:
    print(retrieved_docs[0].page_content)
else:
    print("No document met the score threshold.")

Year Ended December
$ in millions 2022 2021 2020
Cash flows from operating activities   
Net earnings $ 11,261 $ 21,635 $ 9,459 
Adjustments to reconcile net earnings to net cash provided by/(used for) operating activities   
Depreciation and amortization  2,455  2,015  1,902 
Deferred income taxes  (2,412)  5  (833) 
Share-based compensation  4,083  2,348  1,920 
Gain related to extinguishment of unsecured borrowings  –  –  (1) 
Provision for credit losses  2,715  357  3,098 
Changes in operating assets and liabilities:   
Customer and other receivables and payables, net  35,014  21,971  (30,895) 
Collateralized transactions (excluding other secured financings), net  (100,996)  (70,058)  (13,007) 
Trading assets  45,278  15,232  (33,405) 
Trading liabilities  8,062  26,616  44,892 
Loans held for sale, net  3,161  (5,556)  1,820 
Other, net  87  (8,267)  (3,485) 
Net cash provided by/(used for) operating activities  8,708  6,298  (18,535) 
Cash flows from investing activities   
Purch

In [36]:
# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders declared as input variables on the PromptTemplate below.
template = """You are employed by HR COnsultancy.
You are helpful chatbot that answers questions related to finance

Use the following context to answer the question at the end.
{context}

- Answer only if you are very confident of the answer.
- If you cannot answer using from the context alone, say "I cannot determine the answer to that due to lack of context"
- If the context is empty, just say "I do not know the answer to that."

Answers should be informative.

Question: {question}
Helpful Answer:"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# Extra arguments forwarded to the underlying "stuff" chain: our custom prompt.
chain_type_kwargs = dict(prompt=PROMPT)

# Retrieval-augmented QA chain over the retriever; the source documents are
# returned alongside the answer so they can be inspected afterwards.
retrieval_qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [40]:
# Question that is passed to the RetrievalQA chain as its "query" input
query = "Goldman Sachs Net earnings in 2022??"

In [41]:
# Run the chain through `invoke`, the same entry point already used for the
# retriever, instead of calling the chain object directly (deprecated).
# The response is a dict; 'result' holds the generated answer.
response = retrieval_qa.invoke({"query": query})
print(response['result'])

## Goldman Sach's Net Earnings in 2022: 

Based on the provided financial statements, Goldman Sachs' net earnings in 2022 were **$11,261 million**. 

This information can be found on page 123 of the 2022 Form 10-K under the "Consolidated Statements of Earnings" section. 



In [39]:
# Show the documents the chain retrieved to produce the answer above.
print(response['source_documents'])

[Document(page_content='Year Ended December\n$ in millions 2022 2021 2020\nCash flows from operating activities   \nNet earnings $ 11,261 $ 21,635 $ 9,459 \nAdjustments to reconcile net earnings to net cash provided by/(used for) operating activities   \nDepreciation and amortization  2,455  2,015  1,902 \nDeferred income taxes  (2,412)  5  (833) \nShare-based compensation  4,083  2,348  1,920 \nGain related to extinguishment of unsecured borrowings  –  –  (1) \nProvision for credit losses  2,715  357  3,098 \nChanges in operating assets and liabilities:   \nCustomer and other receivables and payables, net  35,014  21,971  (30,895) \nCollateralized transactions (excluding other secured financings), net  (100,996)  (70,058)  (13,007) \nTrading assets  45,278  15,232  (33,405) \nTrading liabilities  8,062  26,616  44,892 \nLoans held for sale, net  3,161  (5,556)  1,820 \nOther, net  87  (8,267)  (3,485) \nNet cash provided by/(used for) operating activities  8,708  6,298  (18,535) \nCas