# Imports and init models

In [None]:
from dotenv import load_dotenv
from datasets import Dataset
from rag_utils import setup_rag_embeddings, build_rag_chain
from ragas import SingleTurnSample, EvaluationDataset
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.metrics import Faithfulness, FactualCorrectness
from ragas import evaluate
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
from gen_ai_hub.proxy.core.proxy_clients import get_proxy_client
from gen_ai_hub.proxy.langchain.openai import OpenAIEmbeddings

import asyncio
import pandas as pd
import os
import json

load_dotenv(override=True)
DATA_DIR = "reports/"

proxy_client = get_proxy_client('gen-ai-hub')
llm_sap = ChatOpenAI(proxy_model_name="gpt-4o", proxy_client=proxy_client)
embeddings_model = OpenAIEmbeddings(proxy_model_name='text-embedding-ada-002', proxy_client=proxy_client)

# RAG

In [5]:
#setup embeddings and chain
retriever = setup_rag_embeddings(data_dir=DATA_DIR, faiss_path="faiss_1", embeddings_model=embeddings_model)
rag_chain = build_rag_chain(retriever=retriever, llm=llm)

TypeError: setup_rag_embeddings() got an unexpected keyword argument 'faiss_path'

Map generated questions and ground_truths into evaluation dataset


In [None]:

with open('synthetic_data_big_context.json', 'r') as f:
    synthetic_data = json.load(f)

if isinstance(synthetic_data, dict) and 'responses' in synthetic_data:
    synthetic_data = synthetic_data['responses']

queries = [item.get('question', "") for item in synthetic_data]
ground_truths = [item.get('ground_truth', "") for item in synthetic_data]
contexts = [item.get('context', "") for item in synthetic_data]

answers = []
retrieved_contexts = []

for query in queries:
    answer = rag_chain.invoke(query)
    answers.append(answer)
    print("Query: ",query)
    print("Anwer: ",answer)
    retrieved_context = [doc.page_content for doc in retriever.invoke(query)]
    retrieved_contexts.append(retrieved_context)
    print("Retrieved context:",retrieved_context)


In [None]:
# populate eval dataset

evaluation_samples = []

for query, answer, retrieved_context, ground_truth in zip(queries, answers, retrieved_contexts, ground_truths):
    sample = SingleTurnSample(
        user_input=query,
        response=answer,
        reference=ground_truth,
        retrieved_contexts=retrieved_context
    )
    evaluation_samples.append(sample)

evaluation_dataset = EvaluationDataset(samples=evaluation_samples)



In [None]:
print(evaluation_dataset.samples)
print(type(evaluation_dataset.samples))
print(type(evaluation_dataset.samples[0]))
print(dir(evaluation_dataset.samples[0]))



In [None]:

evaluator_llm = LangchainLLMWrapper(llm)
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())


metrics = [
    FactualCorrectness(llm=evaluator_llm, mode="precision", name="FactualCorrectness_Precision"),
    FactualCorrectness(llm=evaluator_llm, mode="recall", name="FactualCorrectness_Recall"),
    FactualCorrectness(llm=evaluator_llm, mode="f1", name="FactualCorrectness_F1"),
    Faithfulness(llm=evaluator_llm), 
 
]
results = evaluate(dataset=evaluation_dataset, metrics=metrics)
df = results.to_pandas()

# Results

In [None]:
pd.set_option('display.max_colwidth', 250) # default is 50 / None for unlimited

df

# context recall = measures how many of the relevant documents (or pieces of information) were successfully retrieved
# factual correctness (precision) = proportion of correct claims made that also found in the reference / how many are correct 
# factual correctness (recall) = proportion of facts in the reference that are also present in the response / how many are found
# faithfulness = are the claims made in the response supported by the retrieved context
# semantic similarity = how similar the response is to the ground truth
 


# GraphRAG

## Indexing

In [None]:
# graphrag initial setup
!mkdir -p ./graphrag
!python -m graphrag init --root ./graphrag

In [None]:
# indexing, only run once unless you want to update the index
# !graphrag index --root ./graphrag

## Query Engine

### Global Search

follows the implementations guide by the docs https://microsoft.github.io/graphrag/examples_notebooks/global_search/

In [None]:
from utils.graphrag_utils import setup_graphrag
import os
import json
import asyncio

api_key = os.environ["GRAPHRAG_API_KEY"]
community_level = 2
model_name = "gpt-4o"

with open('synthetic_data_big_context.json', 'r') as file:
    data = json.load(file)
    
queries = [response['question'] for response in data['responses']]

search_engine_global = setup_graphrag(api_key, model_name, community_level)

async def perform_global_search(query):
    print(f"Performing search with query: {query}")
    result = await search_engine_global.asearch(query)
    print(f"Result for query: {query} is: {result.response}")
    return result.response

tasks = [perform_global_search(query) for query in queries]
results = await asyncio.gather(*tasks)

evaluation_samples = []
for query, result in zip(queries, results):
    sample = SingleTurnSample(
        user_input=query,
        response=result,
    )
    evaluation_samples.append(sample)
query = "What is McKinsey ?"
result = await search_engine_global.asearch(query)
print(result)

evaluation_dataset = EvaluationDataset(samples=evaluation_samples)

In [None]:
evaluator_llm = LangchainLLMWrapper(llm)
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

metrics = [
    FactualCorrectness(llm=evaluator_llm, mode="precision", name="FactualCorrectness_Precision"),
    FactualCorrectness(llm=evaluator_llm, mode="recall", name="FactualCorrectness_Recall"),
    Faithfulness(llm=evaluator_llm), 
]
results = evaluate(dataset=evaluation_dataset, metrics=metrics)
df = results.to_pandas()



In [None]:
#pd.set_option('display.max_colwidth', 300) # default is 50 / None for unlimited
df