In [None]:
!pip install -q ragas rapidfuzz
!pip install -q sentence_transformers tiktoken lark
!pip install -q langchain langchain-core langchain-google-genai

In [4]:
from dotenv import load_dotenv
import os
import pandas as pd

# Load variables from a .env file (when one is found) into the process
# environment before anything is looked up.
load_dotenv()

# Read the Gemini credential from the environment; this is None when
# GEMINI_API_KEY is not set.
gemini_api_key = os.environ.get("GEMINI_API_KEY")

In [5]:
from langchain_google_genai import ChatGoogleGenerativeAI

MODEL_NAME = "gemini-2.5-flash-preview-05-20"

# Three clients for the same model that differ only in sampling temperature:
# 0 -> gemini_chat, 0.5 -> gemini_chat_half, 1 -> gemini_chat_1.
gemini_chat, gemini_chat_half, gemini_chat_1 = (
    ChatGoogleGenerativeAI(model=MODEL_NAME, temperature=temp, api_key=gemini_api_key)
    for temp in (0, 0.5, 1)
)


In [None]:
from ragas.llms import LangchainLLMWrapper
from langchain.embeddings import HuggingFaceEmbeddings

# Adapt the temperature-0 Gemini chat client to the LLM interface ragas uses.
ragas_llm = LangchainLLMWrapper(gemini_chat)

# Embedding model used alongside the LLM for test-set generation and evaluation.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

In [None]:
# TODO: `documents` is not defined in the visible part of this notebook. Load
# the LangChain documents the test set should be generated from before
# running this cell, otherwise it fails with a NameError.

from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator

# TestsetGenerator expects ragas-style embeddings: its default transforms call
# methods that raw LangChain embedding objects do not provide. The LLM is
# already wrapped (ragas_llm), so wrap the embeddings the same way here.
# `embeddings` itself is left untouched for the other cells that use it.
generator = TestsetGenerator(
    llm=ragas_llm,
    embedding_model=LangchainEmbeddingsWrapper(embeddings),
)

# Synthesize 10 evaluation samples from the documents.
test_set = generator.generate_with_langchain_docs(documents, testset_size=10)


In [None]:
# Convert the generated test set to a DataFrame and pickle it to disk so it
# can be reloaded later without regenerating.
test_df = test_set.to_pandas()
test_df.to_pickle('./ragas_testset.pkl')

# Preview the first three rows.
test_df.iloc[:3]

In [None]:
import pandas as pd

# Reload the pickled test set written by the cell that calls to_pickle().
# Only unpickle files you created yourself: pickle can execute arbitrary code.
test_df = pd.read_pickle(filepath_or_buffer='./ragas_testset.pkl')

In [None]:
# Pull the question column and the reference-answer column out of the test
# set as plain Python lists, in row order.
test_questions, test_ground_truths = (
    test_df[column].values.tolist() for column in ("user_input", "reference")
)

In [None]:
# TODO: `rag_agent` is not defined in the visible part of this notebook.
# Import or build the agent under test before running this cell.

# Per-question outputs, index-aligned with test_questions.
results = dict(answers=[], contexts=[])

for user_question in test_questions:
    agent_output = rag_agent.invoke({"question": user_question})

    results["answers"].append(agent_output["response"])
    # Each context is stored as a single-element list.
    # NOTE(review): if the agent already returns a list of passages this
    # nests it one level deeper -- confirm against the agent's output schema.
    results["contexts"].append([agent_output["context"]])



In [None]:
from datasets import Dataset

# Assemble the questions, agent answers, contexts and reference answers
# column-wise into a Hugging Face Dataset for evaluation.
response_dataset = Dataset.from_dict(
    dict(
        question=test_questions,
        answer=results["answers"],
        contexts=results["contexts"],
        ground_truth=test_ground_truths,
    )
)


In [None]:
# Persist the assembled dataset so evaluation can be rerun without calling
# the agent again.
response_dataset.save_to_disk(dataset_path='./response')


In [None]:
from datasets import load_from_disk
# Reload the dataset from the directory written by save_to_disk().
response_dataset = load_from_disk(dataset_path='./response')


In [10]:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness, context_recall, context_precision

# Metrics passed to evaluate(), in the order they are computed and reported.
metrics = [faithfulness, answer_correctness, context_recall, context_precision]

In [16]:
# Score the stored agent responses with the selected metrics, using the
# wrapped Gemini model as judge LLM and the HuggingFace embeddings.
# NOTE(review): the name `eval` shadows Python's built-in eval() in this
# kernel; it is kept unchanged in case other cells refer to it.
eval = evaluate(
    dataset=response_dataset,
    metrics=metrics,
    llm=ragas_llm,
    embeddings=embeddings,
)

# Print the raw scores held on the evaluation result.
print(f"RAG Score: {eval.scores}")


Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


RAG Score: [{'faithfulness': 1.0, 'answer_correctness': 1.0, 'context_recall': 1.0, 'context_precision': 0.9999999999}]
