# Imports and load data

In [12]:
from dotenv import load_dotenv
from rag_utils import setup_rag_embeddings, build_rag_chain
from ragas import SingleTurnSample, EvaluationDataset, evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import FactualCorrectness
#from gen_ai_hub.proxy.langchain.openai import ChatOpenAI as ChatOpenAIProxy
#from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from langchain_openai import OpenAIEmbeddings, ChatOpenAI


import asyncio
import pandas as pd
import os
import json

load_dotenv(override=True)

model_name = "gpt-4o-mini"
model_name_SAP = "gpt-4o"
DATA_DIR = "input_references/"
DATASET = "evaluation_dataset_references.json"
TEMP = "temp_reference/"

#proxy_client = get_proxy_client('gen-ai-hub')
#llm_sap = ChatOpenAIProxy(proxy_model_name=model_name_SAP, proxy_client=proxy_client, )
llm = ChatOpenAI(model=model_name, temperature=0)
#embeddings_model = OpenAIEmbeddings(proxy_model_name='text-embedding-ada-002', proxy_client=proxy_client)
embeddings_model_own = OpenAIEmbeddings(model="text-embedding-3-small")



In [None]:
# only for parsing the pdf reports to markdown for the first time
"""
for file in os.listdir(DATA_DIR):
    if file.endswith(".pdf"):
        try:
            print(f"Converting {file} to markdown")
            md_text = LlamaParse(
                result_type="markdown", 
                verbose=True,
                #use_vendor_multimodal_model=True,
                #vendor_multimodal_model_name="openai-gpt-4o-mini",
                #vendor_multimodal_api_key=os.getenv("OPENAI_API_KEY"),
                language="en",
                numWorkers=5).load_data(DATA_DIR + file)
            combined_md_text = "\n\n".join([doc.text for doc in md_text])
            md_file_path = DATA_DIR + file.replace(".pdf", ".md")
            print(f"Saving markdown to {md_file_path}")
            with open(md_file_path, "w", encoding="utf-8") as f:
                f.write(combined_md_text)
            print(f"Successfully converted {file}")
        except Exception as e:
            print(f"Error converting {file}: {e}")
"""

# RAG

In [13]:
#setup embeddings and chain
retriever = setup_rag_embeddings(data_dir=DATA_DIR, faiss_path="faiss_references", embeddings_model=embeddings_model_own)
rag_chain = build_rag_chain(retriever=retriever, llm=llm)

Loading existing FAISS index from faiss_references...




prompt input_variables=['context', 'question'] input_types={} partial_variables={} metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


Map generated questions and ground_truths into evaluation dataset


In [20]:

with open(DATASET, 'r') as f:
    synthetic_data = json.load(f)

if isinstance(synthetic_data, dict) and 'responses' in synthetic_data:
    synthetic_data = synthetic_data['responses']

queries = [item.get('question', "") for item in synthetic_data]
ground_truths = [item.get('ground_truth', "") for item in synthetic_data]
contexts = [item.get('context', "") for item in synthetic_data]

answers = []
retrieved_contexts = []

for query in queries:
    answer = rag_chain.invoke(query)
    answers.append(answer)
    print("Query: ",query)
    print("Anwer: ",answer)
    retrieved_context = [doc.page_content for doc in retriever.invoke(query)]
    retrieved_contexts.append(retrieved_context)
    print("Retrieved context:",retrieved_context)

Query:  How can the integration of a Corporate Sustainability Management System (CSMS) in the accounting sector strategically enhance a company's financial performance while simultaneously advancing environmental conservation and Corporate Social Responsibility (CSR)?
Anwer:  The integration of a Corporate Sustainability Management System (CSMS) in the accounting sector can enhance financial performance by strategically allocating environmental costs, which can lead to more efficient resource use and reduced waste. This approach not only improves profitability but also supports environmental conservation and Corporate Social Responsibility (CSR) initiatives. By aligning financial strategies with sustainability goals, companies can foster a positive public image and potentially attract more customers and investors.
Retrieved context: ['Though their activities, companies have an impact on environmental problems and nature conservation. The accounting sector can play a role in environment

In [21]:
# populate eval dataset

evaluation_samples = []

for query, answer, retrieved_context, ground_truth in zip(queries, answers, retrieved_contexts, ground_truths):
    sample = SingleTurnSample(
        user_input=query,
        response=answer,
        reference=ground_truth,
        retrieved_contexts=retrieved_context
    )
    evaluation_samples.append(sample)

evaluation_dataset = EvaluationDataset(samples=evaluation_samples)



In [23]:

evaluator_llm = LangchainLLMWrapper(llm)

metrics = [
    FactualCorrectness(llm=evaluator_llm, mode="precision", name="FactualCorrectness_Precision"),
    FactualCorrectness(llm=evaluator_llm, mode="recall", name="FactualCorrectness_Recall"),
    FactualCorrectness(llm=evaluator_llm, mode="f1", name="FactualCorrectness_F1"),
]
results = evaluate(dataset=evaluation_dataset, metrics=metrics)
df_rag = results.to_pandas()


Evaluating:   0%|          | 0/81 [00:00<?, ?it/s]Exception raised in Job[3]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-IDSzXwOFLXiOywnvb6RkIg85 on tokens per min (TPM): Limit 200000, Used 195148, Requested 4896. Please try again in 13ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:   1%|          | 1/81 [01:28<1:58:39, 88.99s/it]Exception raised in Job[15]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-IDSzXwOFLXiOywnvb6RkIg85 on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}})
Evaluating:   2%|▏         | 2/81 [01:39<56:21, 42.80s/it]  Exception raised in Job[7]: RateLimitEr

KeyboardInterrupt: 

Exception raised in Job[10]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-IDSzXwOFLXiOywnvb6RkIg85 on tokens per min (TPM): Limit 200000, Used 195714, Requested 4918. Please try again in 189ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[0]: TimeoutError()
Exception raised in Job[14]: TimeoutError()
Exception raised in Job[8]: TimeoutError()
Exception raised in Job[4]: TimeoutError()
Exception raised in Job[5]: TimeoutError()
Exception raised in Job[13]: TimeoutError()
Exception raised in Job[1]: TimeoutError()
Exception raised in Job[2]: TimeoutError()
Exception raised in Job[12]: TimeoutError()
Exception raised in Job[6]: TimeoutError()
Exception raised in Job[16]: TimeoutError()
Exception raised in Job[17]: TimeoutError()
Exception raised in Job[20]: RateLimitError(Error code: 429 - {'error': {'message'

# Results

In [17]:
pd.set_option('display.max_colwidth', 50) # default is 50 / None for unlimited

df_rag

# context recall = measures how many of the relevant documents (or pieces of information) were successfully retrieved
# factual correctness (precision) = proportion of correct claims made that also found in the reference / how many are correct 
# factual correctness (recall) = proportion of facts in the reference that are also present in the response / how many are found
# faithfulness = are the claims made in the response supported by the retrieved context
# semantic similarity = how similar the response is to the ground truth
 


Unnamed: 0,user_input,retrieved_contexts,response,reference,FactualCorrectness_Precision,FactualCorrectness_Recall,FactualCorrectness_F1
0,How can the integration of a Corporate Sustain...,"[Though their activities, companies have an im...",The integration of a Corporate Sustainability ...,Implementing a Corporate Sustainability Manage...,0.44,0.67,0.5
1,"How can the integration of board independence,...","[board independence, the absence of CEO dualit...","The integration of board independence, gender ...","A solid Green Shared Vision, which integrates ...",0.43,0.37,0.38
2,How can corporations strategically integrate s...,[# In what ways can AI enhance stakeholder eng...,Corporations can strategically integrate susta...,Integrating a comprehensive range of practices...,0.4,0.14,0.33
3,How can multinational corporations strategical...,[While GRI provides such comprehensive sets of...,Multinational corporations can strategically a...,The integration of global reporting initiative...,0.44,0.5,0.63
4,How can the strategic integration of managemen...,[how the various aspects of sustainability may...,The strategic integration of management roles ...,Achieving corporate sustainability standards n...,0.5,0.5,0.55
5,How can Generative AI be strategically leverag...,[Introducing generative AI to sustainable HRM ...,Generative AI can enhance management accountin...,The integration of management accounting into ...,0.14,0.11,0.12
6,How can companies strategically harmonize thei...,[to improve corporate sustainability programs ...,Companies can harmonize their internal sustain...,Integrating environmental performance into bus...,0.5,0.57,0.53
7,How can organizations strategically integrate ...,[to improve corporate sustainability programs ...,Organizations can strategically integrate deca...,"Implementing decarbonization strategies, when ...",0.33,0.8,0.46
8,How can companies strategically integrate comp...,[When integrating the concept of sustainabilit...,Companies can strategically integrate comprehe...,To effectively enhance corporate social respon...,0.33,0.5,0.55
9,How does the integration of Corporate Social R...,[Corporate social responsibility (CSR) actions...,The integration of Corporate Social Responsibi...,The positive and significant impact of Corpora...,0.75,0.45,0.6


In [18]:
average_metrics = df_rag[['FactualCorrectness_Precision', 'FactualCorrectness_Recall', 'FactualCorrectness_F1']].mean().round(4)

print("Average for each metric:")
print(average_metrics)

Average for each metric:
FactualCorrectness_Precision    0.3685
FactualCorrectness_Recall       0.4341
FactualCorrectness_F1           0.4026
dtype: float64


# GraphRAG

## Indexing

In [12]:
# graphrag initial setup
!mkdir ./graphrag
!mkdir ./input_references
!python -m graphrag init --root ./graphrag

The syntax of the command is incorrect.
The syntax of the command is incorrect.


⠋ GraphRAG Indexer 
Initializing project at E:\Repositories\graphrag-businessqa-evaluation\graphrag
⠋ GraphRAG Indexer 


┌───────────────────── Traceback (most recent call last) ─────────────────────┐
│ e:\Repositories\graphrag-businessqa-evaluation\.conda\Lib\site-packages\gra │
│ phrag\cli\main.py:105 in _initialize_cli                                    │
│                                                                             │
│   102 │   """Generate a default configuration file."""                      │
│   103 │   from graphrag.cli.initialize import initialize_project_at         │
│   104 │                                                                     │
│ > 105 │   initialize_project_at(path=root)                                  │
│   106                                                                       │
│   107                                                                       │
│   108 @app.command("index")                                                 │
│                                                                             │
│ ┌──────────────────────────────── loca

In [14]:
# indexing, only run once unless you want to update the index
!python -m graphrag index --root ./graphrag


⠋ GraphRAG Indexer 
Logging enabled at 
E:\Repositories\graphrag-businessqa-evaluation\graphrag\logs\indexing-engine.lo
g
⠋ GraphRAG Indexer 
⠋ GraphRAG Indexer 
⠹ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
⠹ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
└── create_base_text_units
⠹ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
└── create_base_text_units
⠋ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
└── create_base_text_units
⠙ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
└── create_base_text_units
⠸ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
└── create_base_text_units
⠼ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
└─


├── create_final_nodes
├── create_final_communities
├── create_final_relationships
├── create_final_text_units
└── create_final_community_reports
⠙ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
├── create_base_text_units
├── create_final_documents
├── create_base_entity_graph
├── create_final_entities
├── create_final_nodes
├── create_final_communities
├── create_final_relationships
├── create_final_text_units
└── create_final_community_reports
⠸ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
├── create_base_text_units
├── create_final_documents
├── create_base_entity_graph
├── create_final_entities
├── create_final_nodes
├── create_final_communities
├── create_final_relationships
├── create_final_text_units
└── create_final_community_reports
⠦ GraphRAG Indexer 
├── Loading Input (InputFileType.text) - 48 files loaded (48 filtered)  100%  …
├── create_base_text_units
├── create_final

## Query Engine

### Global Search

follows the implementations guide by the docs https://microsoft.github.io/graphrag/examples_notebooks/global_search/

In [4]:
from graphrag_utils import setup_graphrag
import pandas as pd
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType

api_key = os.environ["GRAPHRAG_API_KEY"]

llm = ChatOpenAI(
    api_key=api_key,
    model="gpt-4o-mini",
    api_type=OpenaiApiType.OpenAI,
    max_retries=3,
)
community_level = 1

with open(DATASET, 'r') as file:
    data = json.load(file)
    
queries = [response['question'] for response in data['responses']]
references = [response['ground_truth'] for response in data['responses']]

search_engine_global = setup_graphrag(model_name, llm, community_level)

async def perform_global_search(query):
    print(f"Performing search with query: {query}")
    result = await search_engine_global.asearch(query)
    print(f"Result for query: {query} is: {result.response}")
    return result.response

async def run_queries_sequentially(queries, references):
    evaluation_samples = []
    for i, (query, reference) in enumerate(zip(queries, references)):
        result = await perform_global_search(query)
        sample = SingleTurnSample(
            user_input=query,
            response=result,
            reference=reference
        )
        evaluation_samples.append(sample)

        # delay for 60 seconds to avoid rate limiting / token limit
        if i < len(queries) - 1:
            await asyncio.sleep(60)

    return evaluation_samples

evaluation_samples = await run_queries_sequentially(queries, references)

evaluation_dataset = EvaluationDataset(samples=evaluation_samples)


Missing reports for communities: [47, 31, 5, 22, 0, 29, 12, 40, 8, 44, 23, 11, 7, 13, 42, 20, 24, 41, 36, 2, 21, 16, 14, 48, 10, 32, 9, 37, 6, 30, 50, 38, 18, 19, 46, 28, 49, 34, 43, 411, 410, 332, 102, 257, 261, 429, 53, 315, 309, 60, 168, 263, 365, 366, 129, 383, 267, 316, 154, 160, 259, 64, 117, 182, 278, 377, 380, 232, 59, 172, 386, 328, 374, 378, 318, 283, 373, 399, 90, 348, 72, 161, 252, 381, 201, 412, 272, 236, 229, 233, 159, 246, 388, 253, 228, 416, 239, 396, 370, 368, 97, 96, 215, 295, 329, 330, 120, 327, 91, 191, 245, 214, 173, 218, 213, 212, 94, 119, 106, 333, 122, 210, 165, 121, 217, 153, 66, 340, 204, 256, 144, 392, 133, 265, 199, 98, 74, 258, 205, 127, 202, 128, 107, 103, 292, 260, 335, 243, 110, 184, 291, 277, 95, 58, 132, 417, 92, 351, 398, 183, 230, 176, 414, 391, 297, 112, 255, 270, 288, 70, 143, 273, 266, 287, 322, 284, 285, 290, 192, 68, 424, 422, 279, 360, 430, 166, 301, 431, 171, 425, 99, 56, 276, 421, 141, 147, 148, 116, 140, 136, 82, 125, 219, 149, 359, 156, 186

                                     id  human_readable_id  community  level  \
0  e81dfd94-daa2-4634-a127-e13f3fe64f68                971        971      3   
1  f718455d-7a0d-4b4a-8060-4f11596624fb                972        972      3   
2  cb3e2aaf-1b90-46a7-abd3-d12df292f0ab                973        973      3   
3  6e3a62da-ba3a-432a-8daa-bf55a95def53                976        976      3   
4  6d94c47e-f5fc-478d-9ff5-ae26715b1365                979        979      3   

                                               title  \
0        Plan Creation Leadership and Sustainability   
1                     IEEE and AI Research Community   
2  Tingting Zhang and the Hospitality Research Co...   
3                   Firm Risk and Leverage Community   
4  Baron and Diermeier: Pioneers in Corporate Soc...   

                                             summary  \
0  The community centers around Plan Creation, wi...   
1  The community is centered around the IEEE, a p...   
2  The communi

TypeError: Object of type SingleTurnSample is not JSON serializable

In [27]:
print(evaluation_dataset)

print(json.dumps(evaluation_dataset, indent=4))


EvaluationDataset(features=['user_input', 'retrieved_contexts', 'response', 'reference'], len=27)


TypeError: Object of type EvaluationDataset is not JSON serializable

In [6]:
from langchain_openai import ChatOpenAI
chat = ChatOpenAI(model=model_name, temperature=0)

evaluator_llm = LangchainLLMWrapper(chat)

metrics = [
    FactualCorrectness(llm=evaluator_llm, mode="precision", name="FactualCorrectness_Precision"),
    FactualCorrectness(llm=evaluator_llm, mode="recall", name="FactualCorrectness_Recall"),
    FactualCorrectness(llm=evaluator_llm, mode="f1", name="FactualCorrectness_F1"),
]
results = evaluate(dataset=evaluation_dataset, metrics=metrics)
df_grag = results.to_pandas()



Evaluating: 100%|██████████| 81/81 [04:00<00:00,  2.97s/it]


In [7]:
pd.set_option('display.max_colwidth', 50) # default is 50 / None for unlimited
df_grag

Unnamed: 0,user_input,response,reference,FactualCorrectness_Precision,FactualCorrectness_Recall,FactualCorrectness_F1
0,How can the integration of a Corporate Sustain...,## Enhancing Financial Performance through CSM...,Implementing a Corporate Sustainability Manage...,0.32,0.9,0.83
1,"How can the integration of board independence,...",## Enhancing CSR Performance through Governanc...,"A solid Green Shared Vision, which integrates ...",0.35,0.71,0.39
2,How can corporations strategically integrate s...,# Strategic Integration of Sustainable Practic...,Integrating a comprehensive range of practices...,0.56,0.71,0.57
3,How can multinational corporations strategical...,# Strategic Alignment of Multinational Corpora...,The integration of global reporting initiative...,0.35,0.75,0.32
4,How can the strategic integration of managemen...,## Strategic Integration of Management Roles i...,Achieving corporate sustainability standards n...,0.27,0.86,0.43
5,How can Generative AI be strategically leverag...,## Strategic Leveraging of Generative AI for C...,The integration of management accounting into ...,0.1,0.1,0.12
6,How can companies strategically harmonize thei...,# Harmonizing Internal Sustainability Initiati...,Integrating environmental performance into bus...,0.16,0.85,0.19
7,How can organizations strategically integrate ...,## Strategic Integration of Decarbonization St...,"Implementing decarbonization strategies, when ...",0.37,1.0,0.43
8,How can companies strategically integrate comp...,# Strategic Integration of Valuation Categorie...,To effectively enhance corporate social respon...,0.15,0.57,0.3
9,How does the integration of Corporate Social R...,## Influence of CSR Initiatives on Financial P...,The positive and significant impact of Corpora...,0.46,1.0,0.67


In [19]:
average_metrics_grag = df_grag[['FactualCorrectness_Precision', 'FactualCorrectness_Recall', 'FactualCorrectness_F1']].mean().round(4)
average_metrics_rag = df_rag[['FactualCorrectness_Precision', 'FactualCorrectness_Recall', 'FactualCorrectness_F1']].mean().round(4)

print("Average for each metric (GraphRAG):")
print(average_metrics_grag)
print("Average for each metric (RAG):")
print(average_metrics_rag)


Average for each metric (GraphRAG):
FactualCorrectness_Precision    0.3859
FactualCorrectness_Recall       0.7452
FactualCorrectness_F1           0.4870
dtype: float64
Average for each metric (RAG):
FactualCorrectness_Precision    0.3685
FactualCorrectness_Recall       0.4341
FactualCorrectness_F1           0.4026
dtype: float64
