In [16]:
import os
from pathlib import Path
from pprint import pprint

import giskard
import nest_asyncio
import numpy as np
import pandas as pd
from dotenv import find_dotenv, load_dotenv
from giskard.llm.client.openai import OpenAIClient
from giskard.rag import (
    AgentAnswer,
    KnowledgeBase,
    QATestset,
    RAGReport,
    evaluate,
    generate_testset,
)
from giskard.rag.metrics.ragas_metrics import (
    ragas_answer_relevancy,
    ragas_context_precision,
    ragas_context_recall,
    ragas_faithfulness,
)
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.groq import Groq
from llama_parse import LlamaParse

# Load environment variables from the .env file located by find_dotenv()
# (presumably the API keys used by the LLM clients below — confirm).
load_dotenv(find_dotenv())

# Patch asyncio so that libraries which start their own event loop can run
# inside the notebook's already-running loop.
nest_asyncio.apply()

# prepare data


## download pdf


In [2]:
# Local location of the Amazon 2023 10-K; every later cell reads from this path.
pdf_path = "../data/llm_eval/amzn_2023_10k.pdf"
pdf_url = "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001018724/c7c14359-36fa-40c3-b3ca-5bf7f3fa0b96.pdf"

if not os.path.exists(pdf_path):
    # Make sure the target directory exists before wget writes into it.
    os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
    # Download to the SAME path that is checked above and loaded below
    # (previously the file was written to ../data/, so it was never found).
    exit_status = os.system(f'wget "{pdf_url}" -O "{pdf_path}"')
    if exit_status != 0:
        # `wget -O` can leave an empty or partial file behind on failure; remove
        # it so the next run does not report "File already exists" for a stub.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
        raise RuntimeError(
            f"Download failed (wget exit status {exit_status}): {pdf_url}"
        )
else:
    print("File already exists")

File already exists


## parse pdf


In [3]:
# Read the downloaded PDF into llama-index documents (the recorded metadata
# below shows one document per page label).
documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()

# building RAG pipeline


In [4]:
# Split the loaded documents into nodes with chunk_size=1024 and no overlap
# between consecutive chunks.
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=0)
nodes = splitter.get_nodes_from_documents(documents)

In [5]:
# Peek at the first document's metadata and the number of loaded documents.
documents[0].metadata, len(documents)

({'page_label': '1',
  'file_name': 'amzn_2023_10k.pdf',
  'file_path': '../data/llm_eval/amzn_2023_10k.pdf',
  'file_type': 'application/pdf',
  'file_size': 800598,
  'creation_date': '2024-09-09',
  'last_modified_date': '2024-02-02'},
 94)

In [6]:
# Same check after splitting: first node's metadata and the number of nodes.
nodes[0].metadata, len(nodes)

({'page_label': '1',
  'file_name': 'amzn_2023_10k.pdf',
  'file_path': '../data/llm_eval/amzn_2023_10k.pdf',
  'file_type': 'application/pdf',
  'file_size': 800598,
  'creation_date': '2024-09-09',
  'last_modified_date': '2024-02-02'},
 118)

In [7]:
# Index the nodes using a locally loaded BGE-small embedding model.
index = VectorStoreIndex(
    nodes,
    embed_model="local:BAAI/bge-small-en-v1.5",
)

# Groq-hosted Llama 3 70B generates the answers.
groq_llm = Groq(
    model="llama3-70b-8192",
    temperature=0.0,
)

# Query engine over the index: top-3 similar nodes per query, streamed responses.
query_engine = index.as_query_engine(llm=groq_llm, similarity_top_k=3, streaming=True)

## query the llm


In [8]:
# Smoke-test the pipeline with a single question before the full evaluation.
query = "What is the company's revenue for the first quarter of 2023?"
response = query_engine.query(query)

In [9]:
# Pretty-print the raw response object. The recorded output shows a
# StreamingResponse (the engine was created with streaming=True), so this
# prints the generator and source nodes rather than the answer text.
pprint(response)

StreamingResponse(response_gen=<generator object stream_chat_response_to_tokens.<locals>.gen at 0x34cef3510>,
                  source_nodes=[NodeWithScore(node=TextNode(id_='cbf84cf6-300c-4bd9-93f1-7b4e9702125f', embedding=None, metadata={'page_label': '31', 'file_name': 'amzn_2023_10k.pdf', 'file_path': '../data/llm_eval/amzn_2023_10k.pdf', 'file_type': 'application/pdf', 'file_size': 800598, 'creation_date': '2024-09-09', 'last_modified_date': '2024-02-02'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0dd280f3-850d-4445-bfbb-2d3ab86fa576', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '31', 'file_name': 'amzn_2023_10k.pdf', 'file_path': '../data/llm_eval/amzn_2023_10k.pdf', 'file_type

In [10]:
# Converting the response to str yields the generated answer text
# (see the recorded output below).
pprint(str(response))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


("The company's revenue for the first quarter of 2023 is not explicitly stated "
 'in the provided context. However, the guidance for the first quarter of 2024 '
 'is provided, which is expected to be between $138.0 billion and $143.5 '
 'billion, or to grow between 8% and 13% compared with the first quarter of '
 '2023.')


In [11]:
# Inspect the retrieved nodes (NodeWithScore objects) attached to the response.
pprint(response.source_nodes)

[NodeWithScore(node=TextNode(id_='cbf84cf6-300c-4bd9-93f1-7b4e9702125f', embedding=None, metadata={'page_label': '31', 'file_name': 'amzn_2023_10k.pdf', 'file_path': '../data/llm_eval/amzn_2023_10k.pdf', 'file_type': 'application/pdf', 'file_size': 800598, 'creation_date': '2024-09-09', 'last_modified_date': '2024-02-02'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0dd280f3-850d-4445-bfbb-2d3ab86fa576', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '31', 'file_name': 'amzn_2023_10k.pdf', 'file_path': '../data/llm_eval/amzn_2023_10k.pdf', 'file_type': 'application/pdf', 'file_size': 800598, 'creation_date': '2024-09-09', 'last_modified_date': '2024-02-02'}, hash='0260761441ff8d826bb06075

# Giskard AI


## generate a test set on the 10k report


In [12]:
# One row per node: the node texts are the raw material for the knowledge base.
knowledge_base_df = pd.DataFrame({"text": [node.text for node in nodes]})

knowledge_base_df.head()

Unnamed: 0,text
0,Table of Contents\nUNITED STATES\nSECURITIES A...
1,Yes ☐ No ☒\nAggregate market value of vot...
2,"Table of Contents\nAMAZON.COM, INC.\nFORM 10-K..."
3,"Table of Contents\nAMAZON.COM, INC.\nPART I\nI..."
4,Table of Contents\nCompetition\nOur businesses...


In [13]:
# NOTE(review): this looks like the heuristic Giskard's KnowledgeBase uses for
# its default minimum topic size (documents per topic), not the number of
# clusters — the topic-finding log further down reports 6 topics while this
# evaluates to 7. Confirm against the library before relying on it.
round(2 + np.log(len(knowledge_base_df)))

7

In [17]:
# Use OpenAI (gpt-4o-mini) for Giskard's LLM-assisted steps.
giskard.llm.set_llm_api("openai")
gpt4o_mini = OpenAIClient("gpt-4o-mini")

# Register the client as Giskard's global default. This is a setter call: its
# return value is not the client, so it must not be passed on as `llm_client`
# (the previous code did that and only worked through the global fallback).
giskard.llm.set_default_client(gpt4o_mini)

# Build the knowledge base from the node texts, passing the client explicitly.
knowledge_base = KnowledgeBase(knowledge_base_df, llm_client=gpt4o_mini)

In [18]:
%%time
testset = generate_testset(knowledge_base, num_questions=60,
  agent_description="A chatbot answering questions about the Amazon 10K financial report of 2023")

2024-09-10 10:43:51,232 pid:32707 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-09-10 10:44:03,475 pid:32707 MainThread giskard.rag  INFO     Found 6 topics in the knowledge base.


Generating questions:   0%|          | 0/60 [00:00<?, ?it/s]

CPU times: user 3.72 s, sys: 664 ms, total: 4.39 s
Wall time: 2min 22s


In [19]:
# Convert the generated test set into a pandas DataFrame for inspection.
df_testset = testset.to_pandas()

In [20]:
# Preview the first generated questions with their reference answers/contexts.
df_testset.head()

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
337ee6de-03c2-4a21-b860-9d353f3c02cf,What is the total square footage of leased and...,The total square footage of leased and owned p...,Document 25: Table of Contents\nItem 2. Proper...,[],"{'question_type': 'simple', 'seed_document_id'..."
14123ed0-64a5-4935-8273-4d8d1e35923f,What factors are considered in estimating self...,We estimate self-insurance liabilities by cons...,Document 64: We estimate self-insurance liabil...,[],"{'question_type': 'simple', 'seed_document_id'..."
c69f37a8-60cc-49b7-b914-ca2908dd0ee6,Who is the President and Chief Executive Offic...,Andrew R. Jassy,Document 98: Table of Contents\nSIGNATURES\nPu...,[],"{'question_type': 'simple', 'seed_document_id'..."
4d80002d-1004-4600-9a14-eb40a4d26b4e,Who is the Senior Vice President and Chief Fin...,Brian T. Olsavsky,Document 97: Table of Contents\n32.2 Certifica...,[],"{'question_type': 'simple', 'seed_document_id'..."
dd8c2000-3125-428c-bb2a-071d3d74cf4a,What are some risks related to the optimizatio...,Failures to adequately predict customer demand...,Document 12: Table of Contents\n•changes in us...,[],"{'question_type': 'simple', 'seed_document_id'..."


## check test set


In [21]:
# Lift the question type out of each row's metadata dict into its own column
# so the test set can be filtered by type in the cells below.
df_testset["question_type"] = [
    meta["question_type"] for meta in df_testset["metadata"]
]

In [22]:
# Number of generated questions per question type.
df_testset["question_type"].value_counts()

question_type
simple                 10
complex                10
distracting element    10
situational            10
double                 10
conversational         10
Name: count, dtype: int64

In [23]:
# Take the 8th "simple" question and print every test-set question (with its
# reference answer) that shares the same reference context.
df_simple = df_testset[df_testset["question_type"] == "simple"]
ref_context = df_simple["reference_context"].iloc[7]
df_ = df_testset[df_testset["reference_context"] == ref_context]

for question, reference_answer in zip(df_["question"], df_["reference_answer"]):
    print(question)
    print(reference_answer)
    print("\n\n")

What was the total assets of Amazon.com, Inc. as of December 31, 2023?
$527,854 million





In [24]:
# Take the 8th "complex" question and print every test-set question (with its
# reference answer) that shares the same reference context.
# NOTE: `df_simple` is reused as the name of the per-type subset; here it holds
# the "complex" questions.
df_simple = df_testset[df_testset["question_type"] == "complex"]
ref_context = df_simple["reference_context"].iloc[7]
df_ = df_testset[df_testset["reference_context"] == ref_context]

for question, reference_answer in zip(df_["question"], df_["reference_answer"]):
    print(question)
    print(reference_answer)
    print("\n\n")

What was the final ruling on the validity of the patent in the Eolas case, and what subsequent actions were taken by both the district court and the appeals court following the initial judgment?
In May 2022, the district court granted summary judgment, holding that the patent is invalid.





In [25]:
# Take the 8th "distracting element" question and print every test-set question
# (with its reference answer) that shares the same reference context.
# NOTE: `df_simple` is reused as the name of the per-type subset; here it holds
# the "distracting element" questions.
df_simple = df_testset[df_testset["question_type"] == "distracting element"]
ref_context = df_simple["reference_context"].iloc[7]
df_ = df_testset[df_testset["reference_context"] == ref_context]

for question, reference_answer in zip(df_["question"], df_["reference_answer"]):
    print(question)
    print(reference_answer)
    print("\n\n")

Considering the complexities involved in evaluating tax positions and their implications on financial reporting, what specific processes are implemented to manage material risks related to cybersecurity in the context of tax contingencies?
We have processes in place for assessing, identifying, and managing material risks from potential unauthorized occurrences on or through our electronic information systems.



What processes do you have in place for cybersecurity and what could adversely affect the confidentiality, integrity, or availability of your information systems?
We have processes for assessing, identifying, and managing material risks from potential unauthorized occurrences on or through our electronic information systems, which could adversely affect their confidentiality, integrity, or availability.





In [26]:
# Take the 8th "situational" question and print every test-set question (with
# its reference answer) that shares the same reference context.
# NOTE: `df_simple` is reused as the name of the per-type subset; here it holds
# the "situational" questions.
df_simple = df_testset[df_testset["question_type"] == "situational"]
ref_context = df_simple["reference_context"].iloc[7]
df_ = df_testset[df_testset["reference_context"] == ref_context]

for question, reference_answer in zip(df_["question"], df_["reference_answer"]):
    print(question)
    print(reference_answer)
    print("\n\n")

Hi there! As an employee who resides outside of the United States and is currently reviewing the Amazon 2023 financial report, I'm curious about what steps I should take to request information about my Data related to the new equity awards and their impact on my participation in the company's stock plan.
If you reside outside of the United States, you may, at any time, request a list with the names and addresses of any potential recipients of the Data, request access to the Data, request additional information about the storage and processing of Data, require any necessary amendments to Data or refuse or withdraw the consents herein, in any case without cost, by contacting in writing your human resources representative.





In [27]:
# Take the 8th "double" question and print every test-set question (with its
# reference answer) that shares the same reference context.
# NOTE: `df_simple` is reused as the name of the per-type subset; here it holds
# the "double" questions.
df_simple = df_testset[df_testset["question_type"] == "double"]
ref_context = df_simple["reference_context"].iloc[7]
df_ = df_testset[df_testset["reference_context"] == ref_context]

for question, reference_answer in zip(df_["question"], df_["reference_answer"]):
    print(question)
    print(reference_answer)
    print("\n\n")

What are the potential risks related to data security mentioned in the report, and how do labor market constraints affect the fulfillment network?
The potential risks related to data security include data loss, theft, misuse, unauthorized access, or other security breaches, while labor market constraints affect productivity by increasing payroll costs and hindering the hiring, training, and deployment of sufficient personnel in the fulfillment network.





In [28]:
# Take the 8th "conversational" question and print every test-set question
# (with its reference answer) that shares the same reference context.
# NOTE: `df_simple` is reused as the name of the per-type subset; here it holds
# the "conversational" questions.
df_simple = df_testset[df_testset["question_type"] == "conversational"]
ref_context = df_simple["reference_context"].iloc[7]
df_ = df_testset[df_testset["reference_context"] == ref_context]

for question, reference_answer in zip(df_["question"], df_["reference_answer"]):
    print(question)
    print(reference_answer)
    print("\n\n")

What was the amount?
$24,023 million





# Evaluation of the RAG pipeline


In [29]:
def answer_fn(question, history=None):
    """Answer one test-set question with the RAG query engine.

    Parameters
    ----------
    question : str
        The question to ask the agent.
    history : list of dict, optional
        Earlier conversation turns. Assumed to be dicts with "role" and
        "content" keys, as in the test set's `conversation_history` column —
        confirm against the Giskard version in use. Without it, follow-up
        questions such as "What was the amount?" carry no subject.

    Returns
    -------
    AgentAnswer
        The generated answer text together with the texts of the retrieved
        nodes. The context-based RAGAS metrics need these documents; a bare
        string answer gives them nothing to score.
    """
    if history:
        # Prepend the previous turns so retrieval and generation see the
        # context the follow-up question refers to.
        previous_turns = "\n".join(
            f"{turn['role']}: {turn['content']}" for turn in history
        )
        prompt = f"{previous_turns}\nuser: {question}"
    else:
        prompt = question

    response = query_engine.query(prompt)
    # str() drains the streamed response into the final answer text.
    message = str(response)
    retrieved_texts = [source.node.text for source in response.source_nodes]
    return AgentAnswer(message=message, documents=retrieved_texts)


# Run the agent over the whole test set and score it with the RAGAS metrics.
report = evaluate(
    answer_fn,
    testset=testset,
    knowledge_base=knowledge_base,
    metrics=[
        ragas_context_recall,
        ragas_context_precision,
        ragas_faithfulness,
        ragas_answer_relevancy,
    ],
)

Asking questions to the agent:   0%|          | 0/60 [00:00<?, ?it/s]

CorrectnessMetric evaluation:   0%|          | 0/60 [00:00<?, ?it/s]

RagasMetric evaluation:   0%|          | 0/60 [00:00<?, ?it/s]



RagasMetric evaluation:   0%|          | 0/60 [00:00<?, ?it/s]



RagasMetric evaluation:   0%|          | 0/60 [00:00<?, ?it/s]



RagasMetric evaluation:   0%|          | 0/60 [00:00<?, ?it/s]



In [30]:
# Show the HTML rendering of the evaluation report inline. `display` is not
# imported in this notebook — presumably the IPython built-in available in
# notebook kernels.
display(report.to_html(embed=True))

