# Imports and load data

In [27]:
from dotenv import load_dotenv
from rag_utils import setup_rag_embeddings, build_rag_chain
from ragas import SingleTurnSample, EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import FactualCorrectness
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI as ChatOpenAIProxy
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

import asyncio
import pandas as pd
import os
import json

# Load settings from a .env file; override=True lets values from the file
# replace variables that are already set in the environment.
load_dotenv(override=True)

# --- Paths ---------------------------------------------------------------
# DATA_DIR is handed to setup_rag_embeddings as data_dir, DATASET is the JSON
# file with questions / ground truths, TEMP prefixes the saved GraphRAG results.
DATA_DIR = "input_big_context/"
DATASET = "evaluation_dataset_big_context.json"
TEMP = "temp_large_context/"

# --- Model names ---------------------------------------------------------
model_name = "gpt-4o-mini"
model_name_SAP = "gpt-4o"

# --- LLM / embedding clients ---------------------------------------------
proxy_client = get_proxy_client("gen-ai-hub")
llm_sap = ChatOpenAIProxy(
    proxy_model_name=model_name_SAP,
    proxy_client=proxy_client,
)
llm = ChatOpenAI(model=model_name, temperature=0)

# Disabled alternative that would route embeddings through the proxy client:
# embeddings_model = OpenAIEmbeddings(proxy_model_name='text-embedding-ada-002', proxy_client=proxy_client)
embeddings_model_own = OpenAIEmbeddings(model="text-embedding-3-small")



In [None]:
# One-off helper, only needed the first time the PDF reports are parsed:
# for every ".pdf" file in DATA_DIR it parses the file, joins the parsed
# documents with blank lines and writes the text to a ".md" file of the same
# name inside DATA_DIR.
# The code is disabled by wrapping it in a string literal, so running this
# cell performs no conversion.
# NOTE(review): LlamaParse is not among the imports visible in this notebook;
# it has to be imported before this snippet can be re-enabled.
"""
for file in os.listdir(DATA_DIR):
    if file.endswith(".pdf"):
        try:
            print(f"Converting {file} to markdown")
            md_text = LlamaParse(
                result_type="markdown", 
                verbose=True,
                #use_vendor_multimodal_model=True,
                #vendor_multimodal_model_name="openai-gpt-4o-mini",
                #vendor_multimodal_api_key=os.getenv("OPENAI_API_KEY"),
                language="en",
                numWorkers=5).load_data(DATA_DIR + file)
            combined_md_text = "\n\n".join([doc.text for doc in md_text])
            md_file_path = DATA_DIR + file.replace(".pdf", ".md")
            print(f"Saving markdown to {md_file_path}")
            with open(md_file_path, "w", encoding="utf-8") as f:
                f.write(combined_md_text)
            print(f"Successfully converted {file}")
        except Exception as e:
            print(f"Error converting {file}: {e}")
"""

# RAG

In [3]:
# Set up the embeddings-backed retriever over DATA_DIR, then wire it into the
# RAG chain together with the chat model from the configuration cell.
# NOTE(review): faiss_path is presumably where the FAISS index is stored —
# confirm in rag_utils.setup_rag_embeddings.
retriever = setup_rag_embeddings(
    data_dir=DATA_DIR,
    faiss_path="faiss_big_context",
    embeddings_model=embeddings_model_own,
)
rag_chain = build_rag_chain(
    retriever=retriever,
    llm=llm,
)

Embedding documents...
amount of documents used:  4
Saved 5029 chunks to faiss_big_context.




prompt input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


Map the generated questions and ground truths into the evaluation dataset


In [4]:

# Load the synthetic evaluation data. The encoding is passed explicitly so the
# JSON file is decoded the same way on every platform; without it open() uses
# the locale's preferred encoding, which is not UTF-8 on every system.
with open(DATASET, 'r', encoding='utf-8') as f:
    synthetic_data = json.load(f)

# The file may wrap the records as {"responses": [...]}; unwrap them if so.
if isinstance(synthetic_data, dict) and 'responses' in synthetic_data:
    synthetic_data = synthetic_data['responses']

# Missing keys fall back to an empty string so the three lists stay aligned
# with the records by position.
queries = [item.get('question', "") for item in synthetic_data]
ground_truths = [item.get('ground_truth', "") for item in synthetic_data]
contexts = [item.get('context', "") for item in synthetic_data]

answers = []
retrieved_contexts = []

for query in queries:
    # Generate the RAG answer for this question.
    answer = rag_chain.invoke(query)
    answers.append(answer)
    print("Query: ",query)
    print("Anwer: ",answer)
    # Query the retriever separately to record the passages for evaluation.
    # NOTE(review): presumably the same documents the chain used internally —
    # verify against rag_utils.build_rag_chain.
    retrieved_context = [doc.page_content for doc in retriever.invoke(query)]
    retrieved_contexts.append(retrieved_context)
    print("Retrieved context:",retrieved_context)


Query:  How can SAP leverage generative AI to enhance its sustainability solutions, enabling customers to more effectively track and reduce their carbon footprints while simultaneously improving the efficiency of their internal processes?
Anwer:  SAP can leverage generative AI to enhance its sustainability solutions by embedding AI capabilities across its portfolio, enabling customers to systematically track and manage their carbon emissions. This integration simplifies user interactions and improves operational efficiency, allowing organizations to take effective action towards sustainability. Additionally, SAP's focus on AI-driven efficiencies supports its goal of helping customers reduce their environmental impact while optimizing internal processes.
Retrieved context: ['SAP has been an early adopter of AI technology. During the SAP Sapphire conference in May, the Company showcased the use of AI in various products. At the beginning of the second half of 2023, the Company announced 

In [5]:
# populate eval dataset

# One SingleTurnSample per question: the RAG answer and the retrieved passages
# are paired with the reference answer. zip() stops at the shortest input list.
evaluation_samples = [
    SingleTurnSample(
        user_input=question,
        response=rag_answer,
        reference=reference_answer,
        retrieved_contexts=passages,
    )
    for question, rag_answer, passages, reference_answer in zip(
        queries, answers, retrieved_contexts, ground_truths
    )
]

evaluation_dataset = EvaluationDataset(samples=evaluation_samples)



In [6]:
# Quick inspection of the dataset: the sample list, its container type, and
# the type and attribute names of the first sample (one item per line).
print(
    evaluation_dataset.samples,
    type(evaluation_dataset.samples),
    type(evaluation_dataset.samples[0]),
    dir(evaluation_dataset.samples[0]),
    sep="\n",
)



<class 'list'>
<class 'ragas.dataset_schema.SingleTurnSample'>
['__abstractmethods__', '__annotations__', '__class__', '__class_getitem__', '__class_vars__', '__copy__', '__deepcopy__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__fields__', '__fields_set__', '__format__', '__ge__', '__get_pydantic_core_schema__', '__get_pydantic_json_schema__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__pretty__', '__private_attributes__', '__pydantic_complete__', '__pydantic_core_schema__', '__pydantic_custom_init__', '__pydantic_decorators__', '__pydantic_extra__', '__pydantic_fields_set__', '__pydantic_generic_metadata__', '__pydantic_init_subclass__', '__pydantic_parent_namespace__', '__pydantic_post_init__', '__pydantic_private__', '__pydantic_root_model__', '__pydantic_serializer__', '__pydantic_validator__', '__reduce__', '__reduce_ex__', '__repr_

In [10]:

# Judge model for the RAG answers: the chat model from the configuration cell.
evaluator_llm = LangchainLLMWrapper(llm)

# FactualCorrectness is scored once per mode; each mode gets its own column name.
# (A Faithfulness(llm=evaluator_llm) metric used to be listed here but is disabled.)
factual_metric_names = {
    "precision": "FactualCorrectness_Precision",
    "recall": "FactualCorrectness_Recall",
    "f1": "FactualCorrectness_F1",
}
metrics = [
    FactualCorrectness(llm=evaluator_llm, mode=mode, name=column_name)
    for mode, column_name in factual_metric_names.items()
]

results = evaluate(dataset=evaluation_dataset, metrics=metrics)
df_rag = results.to_pandas()


Evaluating: 100%|██████████| 102/102 [04:04<00:00,  2.40s/it]


# Results

In [11]:
pd.set_option('display.max_colwidth', 50) # 50 is the pandas default; None disables truncation

df_rag

# Metric glossary (the evaluation cells in this notebook only compute the three factual correctness variants):
# context recall = share of the relevant documents (or pieces of information) that were successfully retrieved
# factual correctness (precision) = proportion of the claims made in the response that are also found in the reference / how many are correct
# factual correctness (recall) = proportion of the facts in the reference that are also present in the response / how many are found
# faithfulness = whether the claims made in the response are supported by the retrieved context
# semantic similarity = how similar the response is to the ground truth
 


Unnamed: 0,user_input,retrieved_contexts,response,reference,FactualCorrectness_Precision,FactualCorrectness_Recall,FactualCorrectness_F1
0,How can SAP leverage generative AI to enhance ...,[SAP has been an early adopter of AI technolog...,SAP can leverage generative AI to enhance its ...,SAP can leverage generative AI to significantl...,0.5,0.17,0.15
1,Considering the increasing importance of ESG f...,[in two different ways: by offering solutions ...,SAP can integrate sustainable practices by opt...,To minimize its environmental impact and enhan...,0.29,0.08,0.1
2,"Given the rapid advancements in AI, how can SA...",[SAP has been an early adopter of AI technolog...,SAP can strategically invest in research and d...,To maintain its leadership and expand into bus...,0.57,0.22,0.31
3,How can SAP utilize AI to optimize its global ...,"[To address new and critical environmental, ec...",SAP can utilize AI to optimize its global supp...,SAP can leverage AI to optimize its global sup...,0.5,0.28,0.34
4,In what ways can SAP leverage AI to enhance it...,[SAP Business AI refers to artificial intellig...,SAP can leverage AI to enhance customer experi...,SAP can leverage AI in several ways to enhance...,0.64,0.28,0.37
5,Considering the potential risks associated wit...,"[In addition, our Global AI Ethics Policy help...",SAP can develop and implement ethical guidelin...,To develop and implement responsible AI at SAP...,0.29,0.1,0.08
6,How can SAP leverage AI to improve its risk ma...,"[To address new and critical environmental, ec...",SAP can leverage AI to enhance its risk manage...,SAP can leverage AI to bolster its risk manage...,0.5,0.24,0.31
7,How can SAP utilize AI to enhance its sales an...,[SAP Business AI refers to artificial intellig...,SAP can enhance its sales and marketing effort...,"SAP can leverage AI, particularly generative A...",0.5,0.13,0.0
8,How can SAP leverage AI to improve its talent ...,[- Build SAP’s Skills for the Future by attrac...,SAP can leverage AI to enhance its talent acqu...,SAP can leverage AI in several ways to enhance...,0.5,0.17,0.27
9,How can SAP utilize AI to enhance its product ...,[SAP has been an early adopter of AI technolog...,SAP can enhance its product development proces...,"SAP can leverage AI, particularly generative A...",0.67,0.17,0.0


In [12]:
# Column-wise mean of the three factual correctness scores, rounded to 4 decimals.
metric_columns = [
    'FactualCorrectness_Precision',
    'FactualCorrectness_Recall',
    'FactualCorrectness_F1',
]
average_metrics = df_rag[metric_columns].mean().round(4)

# Heading and values are printed on separate lines.
print("Average for each metric:", average_metrics, sep="\n")

Average for each metric:
FactualCorrectness_Precision    0.5982
FactualCorrectness_Recall       0.2232
FactualCorrectness_F1           0.3176
dtype: float64


# GraphRAG

## Indexing

In [13]:
# graphrag initial setup
!mkdir ./graphrag_big_context
!mkdir ./input_big_context
!python -m graphrag init --root ./graphrag_big_context

The syntax of the command is incorrect.
The syntax of the command is incorrect.


⠋ GraphRAG Indexer 
Initializing project at 
E:\Repositories\graphrag-businessqa-evaluation\graphrag_big_context
⠋ GraphRAG Indexer 


┌───────────────────── Traceback (most recent call last) ─────────────────────┐
│ e:\Repositories\graphrag-businessqa-evaluation\.conda\Lib\site-packages\gra │
│ phrag\cli\main.py:105 in _initialize_cli                                    │
│                                                                             │
│   102 │   """Generate a default configuration file."""                      │
│   103 │   from graphrag.cli.initialize import initialize_project_at         │
│   104 │                                                                     │
│ > 105 │   initialize_project_at(path=root)                                  │
│   106                                                                       │
│   107                                                                       │
│   108 @app.command("index")                                                 │
│                                                                             │
│ ┌──────────────────────────────── loca

In [15]:
# Indexing: runs the GraphRAG indexing CLI against the project root
# ./graphrag_big_context. Only run once unless you want to update the index.
!python -m graphrag index --root ./graphrag_big_context


⠋ GraphRAG Indexer 
Logging enabled at 
E:\Repositories\graphrag-businessqa-evaluation\graphrag_big_context\logs\indexi
ng-engine.log
⠋ GraphRAG Indexer 
⠋ GraphRAG Indexer 
⠙ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 4 files loaded (4 filtered) - 100%  0…
⠙ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 4 files loaded (4 filtered) - 100%  0…
└── create_base_text_units
⠹ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 4 files loaded (4 filtered) - 100%  0…
└── create_base_text_units
⠏ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 4 files loaded (4 filtered) - 100%  0…
└── create_base_text_units
⠙ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 4 files loaded (4 filtered) - 100%  0…
└── create_base_text_units
⠹ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 4 files loaded (4 filtered) - 100%  0…
└── create_base_text_units
🚀 create_base_text_units
⠸ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 4 fi

## Query Engine

### Global Search

Follows the implementation guide in the GraphRAG docs: https://microsoft.github.io/graphrag/examples_notebooks/global_search/

In [17]:
from graphrag_utils import setup_graphrag
import pandas as pd
# Imported under an alias so it does not replace the LangChain ChatOpenAI
# class that the first cell imports under the same name.
from graphrag.query.llm.oai.chat_openai import ChatOpenAI as GraphRAGChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
import os
import json
import asyncio

api_key = os.environ["GRAPHRAG_API_KEY"]

# Stored under its own name: `llm` is the LangChain chat model from the first
# cell and is still needed whenever the RAG evaluation cell is re-run.
# The model name comes from the shared `model_name` setting, which is also
# passed to setup_graphrag below.
graphrag_llm = GraphRAGChatOpenAI(
    api_key=api_key,
    model=model_name,
    api_type=OpenaiApiType.OpenAI,
    max_retries=3,
)
community_level = 1

# Explicit encoding so the JSON file is decoded identically on every platform.
with open(DATASET, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Questions and ground-truth answers, aligned by position.
queries = [response['question'] for response in data['responses']]
references = [response['ground_truth'] for response in data['responses']]

search_engine_global = setup_graphrag(model_name, graphrag_llm, community_level)

async def perform_global_search(query):
    """Run one search on ``search_engine_global`` and return its ``response``.

    The query is printed before the search and the response afterwards.
    """
    print(f"Performing search with query: {query}")
    search_result = await search_engine_global.asearch(query)
    response = search_result.response
    print(f"Result for query: {query} is: {response}")
    return response

async def run_queries_sequentially(queries, references, delay_seconds=60):
    """Run the global search for each query, one at a time, and collect samples.

    Args:
        queries: Questions to send to ``perform_global_search``.
        references: Ground-truth answers, aligned with ``queries`` by position.
            Pairs are formed with ``zip``, so surplus items in the longer
            list are ignored.
        delay_seconds: Pause between two consecutive searches, used to stay
            below rate / token limits. Defaults to 60 seconds.

    Returns:
        A list with one ``SingleTurnSample`` (user_input, response, reference)
        per processed pair, in input order.
    """
    pairs = list(zip(queries, references))
    evaluation_samples = []
    for i, (query, reference) in enumerate(pairs):
        result = await perform_global_search(query)
        sample = SingleTurnSample(
            user_input=query,
            response=result,
            reference=reference
        )
        evaluation_samples.append(sample)

        # Pause between searches, but not after the last pair that is
        # actually processed (counted on the zipped pairs, not on queries).
        if i < len(pairs) - 1:
            await asyncio.sleep(delay_seconds)

    return evaluation_samples

# Top-level await: runs all searches one after another, pausing between them.
# This rebinds `evaluation_samples`, which previously held the RAG samples.
evaluation_samples = await run_queries_sequentially(queries, references)

# Wrap the GraphRAG samples for ragas; this rebinds `evaluation_dataset`,
# which previously held the RAG dataset.
evaluation_dataset = EvaluationDataset(samples=evaluation_samples)


Missing reports for communities: [43, 12, 23, 4, 11, 42, 2, 25, 14, 26, 27, 29, 18, 38, 15, 32, 1, 36, 31, 39, 22, 28, 13, 37, 30, 16, 47, 6, 44, 49, 7, 48, 24, 35, 20, 45, 50, 41, 33, 19, 40, 379, 364, 246, 80, 161, 144, 169, 362, 149, 280, 403, 365, 293, 342, 130, 184, 370, 369, 338, 124, 133, 353, 129, 366, 317, 381, 383, 314, 358, 132, 110, 304, 274, 399, 345, 230, 347, 315, 312, 350, 275, 346, 322, 325, 343, 302, 352, 348, 397, 400, 354, 367, 308, 269, 281, 290, 284, 286, 303, 331, 291, 289, 277, 320, 299, 283, 298, 297, 300, 301, 413, 382, 251, 318, 412, 282, 295, 285, 378, 296, 416, 363, 360, 387, 173, 385, 390, 388, 386, 392, 418, 407, 361, 408, 406, 270, 417, 398, 377, 404, 307, 334, 368, 309, 376, 336, 375, 356, 419, 310, 339, 344, 384, 372, 337, 401, 405, 159, 330, 335, 332, 333, 340, 311, 306, 329, 394, 328, 359, 323, 414, 409, 410, 393, 287, 411, 389, 374, 324, 326, 327, 396, 279, 355, 402, 305, 288, 415, 634, 628, 456, 765, 650, 469, 573, 776, 683, 529, 509, 884, 541, 840

                                     id  human_readable_id  community  level  \
0  eb3dcb87-2f35-41f6-acd4-ab9b2b0ddefe               1071       1071      4   
1  3c26ae78-a0fe-4aba-a560-fb6423582011               1072       1072      4   
2  81cb72d7-a386-4cf3-8d1f-c8935e6fc1cf               1073       1073      4   
3  b45e85bf-2ddc-4d70-b911-0c9eb360f9b2               1074       1074      4   
4  fdf8ff70-2451-4c40-9336-c115ca1e646f               1075       1075      4   

                                               title  \
0      Corporate Environmental Performance Community   
1           A. M. Romi and Kelley School of Business   
2  Corporate Social Responsibility Research Commu...   
3     Cowen et al. and Correlation Studies Community   
4           Panel B and Emissions Analysis Community   

                                             summary  \
0  The community focuses on Corporate Environment...   
1  The community centers around A. M. Romi, an au...   
2  This commun

In [20]:
# Keep only the plain fields of each sample so the GraphRAG answers can be
# stored as JSON and reloaded without repeating the search step.
data_to_save = [
    {
        "user_input": sample.user_input,
        "response": sample.response,
        "reference": sample.reference
    }
    for sample in evaluation_samples
]

# TEMP already ends with a separator, so the file name is joined without an
# extra "/" (this yields the same path the loading cell reads). The folder is
# created first because open(..., 'w') does not create missing directories.
os.makedirs(TEMP, exist_ok=True)
output_path = os.path.join(TEMP, 'evaluation_dataset_final_after_grag.json')
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(data_to_save, f, ensure_ascii=False, indent=4)


In [28]:
# Reload the saved GraphRAG answers and rebuild the ragas dataset from them.
saved_results_path = TEMP+'evaluation_dataset_final_after_grag.json'
with open(saved_results_path, 'r', encoding="utf-8") as f:
    dataset = json.load(f)

evaluation_dataset = EvaluationDataset.from_dict(dataset)

# NOTE(review): the RAG answers were judged with `llm` (model_name), while this
# run is judged with `llm_sap` (model_name_SAP), so the two score tables come
# from different judge models — confirm this is intended.
evaluator_llm = LangchainLLMWrapper(llm_sap)
# Disabled: evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

# FactualCorrectness is scored once per mode; each mode gets its own column name.
# (A Faithfulness(llm=evaluator_llm) metric used to be listed here but is disabled.)
factual_metric_names = {
    "precision": "FactualCorrectness_Precision",
    "recall": "FactualCorrectness_Recall",
    "f1": "FactualCorrectness_F1",
}
metrics = [
    FactualCorrectness(llm=evaluator_llm, mode=mode, name=column_name)
    for mode, column_name in factual_metric_names.items()
]

results = evaluate(dataset=evaluation_dataset, metrics=metrics)
df_grag = results.to_pandas()



Evaluating: 100%|██████████| 102/102 [04:58<00:00,  2.93s/it]


In [1]:
# Raise the column width to 2500 characters, then display the GraphRAG scores.
# This changes the global pandas option that the RAG results cell sets to 50.
# NOTE(review): this cell needs `pd` and `df_grag` from earlier cells; its
# recorded execution count is 1 and the recorded output is a NameError, so it
# was last run on a fresh kernel — re-run it after the cells above.
pd.set_option('display.max_colwidth', 2500) # 50 is the pandas default; None disables truncation
df_grag

NameError: name 'pd' is not defined

In [30]:
# Mean of each factual correctness score (rounded to 4 decimals) for both
# pipelines, computed over the same three columns.
metric_columns = [
    'FactualCorrectness_Precision',
    'FactualCorrectness_Recall',
    'FactualCorrectness_F1',
]
average_metrics_grag = df_grag[metric_columns].mean().round(4)
average_metrics_rag = df_rag[metric_columns].mean().round(4)

# Each heading is followed by its values on the next lines.
print("Average for each metric (GraphRAG):", average_metrics_grag, sep="\n")
print("Average for each metric (RAG):", average_metrics_rag, sep="\n")


Average for each metric (GraphRAG):
FactualCorrectness_Precision    0.5085
FactualCorrectness_Recall       0.5041
FactualCorrectness_F1           0.4794
dtype: float64
Average for each metric (RAG):
FactualCorrectness_Precision    0.5982
FactualCorrectness_Recall       0.2232
FactualCorrectness_F1           0.3176
dtype: float64
