# RAGAS로 벤치마크 데이터셋 현실적으로 생성해보기

In [None]:
import os
# Keep an OPENAI_API_KEY that is already exported in the environment; only
# fall back to the empty placeholder when none is set. The previous
# unconditional assignment replaced a valid key with '' on every run.
os.environ.setdefault("OPENAI_API_KEY", '')
from llama_index.core import Document
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.settings import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from pprint import pprint
import nest_asyncio

# Apply the nest_asyncio patch before any async library call is made
# (presumably because the notebook kernel already runs an event loop — confirm).
nest_asyncio.apply()

# Read the files in the local 'papers' directory into llama_index documents.
paper_reader = SimpleDirectoryReader('papers')
documents = paper_reader.load_data()

In [None]:
from datasets import load_dataset

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context, conditional
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Generator LLM: the model used when creating the golden dataset.
generator_llm = OpenAI(model="gpt-4o-mini")
# Critic LLM: the model used to compute the various scores
# (handed to ragas as `critic_llm`).
critic_llm = OpenAI(model="gpt-4o")
embeddings = OpenAIEmbedding(model="text-embedding-3-small")

generator = TestsetGenerator.from_llama_index(
    generator_llm=generator_llm,
    critic_llm=critic_llm,
    embeddings=embeddings,
)

# Share of each question-evolution type in the generated test set.
# An earlier 0.2/0.3/0.3/0.2 mix was assigned before the generator was built
# and then immediately overwritten by this one, so it never took effect; the
# dead assignment has been removed and only the effective mix is kept.
distributions = {
    simple: 0.1,
    reasoning: 0.4,
    multi_context: 0.4,
    conditional: 0.1,
}

testset = generator.generate_with_llamaindex_docs(
    documents,
    10,  # requested test-set size
    distributions,
    with_debugging_logs=True,
)
testset.to_pandas()

In [None]:
# Pre-processing: drop malformed QA pairs from the first generation round.
import pandas as pd
ds = testset.to_dataset()

# Hand-listed row positions of QA pairs that cannot be answered.
exclude_idx = [1,4,9]

# Build the lookup set once (it used to be rebuilt for every row) and hand
# `select` a concrete list rather than a one-shot generator.
excluded_rows = set(exclude_idx)
keep_rows = [i for i in range(len(ds)) if i not in excluded_rows]
ds_ex = ds.select(keep_rows)

# Round-trip through a plain dict to get a pandas DataFrame of the kept rows.
ds_dict = ds_ex.to_dict()
ds_df = pd.DataFrame(ds_dict)
ds_df

In [None]:
# 생성 쿼리에 클렌징 필요 시: ds_df.at[2, 'question'] = "직위 변경 신청 조건은?"


In [None]:
# Repeat the generation step when more questions are needed after cleaning.
# NOTE: this rebinds `testset`; the previous round survives only through the
# DataFrame that was built from it earlier.
testset = generator.generate_with_llamaindex_docs(
    documents,
    10,
    distributions,
    with_debugging_logs=True,
)
testset.to_pandas()

In [None]:
# Pre-process the second generation round and keep it in a separate DataFrame.
ds = testset.to_dataset()

# Hand-listed row positions of QA pairs that cannot be answered.
exclude_idx = [2,3,7,8]

# Build the lookup set once (it used to be rebuilt for every row) and hand
# `select` a concrete list rather than a one-shot generator.
excluded_rows = set(exclude_idx)
keep_rows = [i for i in range(len(ds)) if i not in excluded_rows]
ds_ex = ds.select(keep_rows)

# Round-trip through a plain dict to get a pandas DataFrame of the kept rows.
ds_dict = ds_ex.to_dict()
ds_df2 = pd.DataFrame(ds_dict)
ds_df2

In [None]:
# Repeat the generation step when more questions are needed after cleaning.
# NOTE: this rebinds `testset`; the previous round survives only through the
# DataFrame that was built from it earlier.
testset = generator.generate_with_llamaindex_docs(
    documents,
    10,
    distributions,
    with_debugging_logs=True,
)
testset.to_pandas()

In [None]:
# Pre-process the third generation round and keep it in a separate DataFrame.
ds = testset.to_dataset()

# Hand-listed row positions of QA pairs that cannot be answered.
exclude_idx = [2,5,7,9]

# Build the lookup set once (it used to be rebuilt for every row) and hand
# `select` a concrete list rather than a one-shot generator.
excluded_rows = set(exclude_idx)
keep_rows = [i for i in range(len(ds)) if i not in excluded_rows]
ds_ex = ds.select(keep_rows)

# Round-trip through a plain dict to get a pandas DataFrame of the kept rows.
ds_dict = ds_ex.to_dict()
ds_df3 = pd.DataFrame(ds_dict)
ds_df3

In [None]:
# Stack the three cleaned rounds into one golden dataset. `ignore_index=True`
# renumbers the rows 0..n-1 directly, replacing the former
# reset_index() + "drop the first column" detour.
golden_df = pd.concat([ds_df, ds_df2, ds_df3], ignore_index=True)

In [None]:
# Display the combined golden dataset.
golden_df
# Disabled export of the benchmark columns; presumably how the benchmark CSV
# loaded later was produced — TODO confirm, and pass a file path when enabling.
#golden_df[['evolution_type','question','ground_truth']].to_csv()

# 생성된 벤치마크에 Custom Query Engine 결과 붙이고 평가 받아보기

In [None]:
import pandas as pd

# Load the saved benchmark and discard its first column
# (presumably the index written out when the CSV was saved — confirm).
raw_bench = pd.read_csv('benchmark_papers.csv')
bench = raw_bench.iloc[:, 1:]
bench

In [None]:
# Register the LLM and the embedding model on the llama_index Settings object.
rag_llm = OpenAI(model="gpt-4o-mini", temperature=0)
Settings.llm = rag_llm
rag_embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.embed_model = rag_embed_model

# Build a vector index over the loaded documents and take its default
# query engine as the "naive RAG" baseline.
index = VectorStoreIndex.from_documents(documents)
naive_engine = index.as_query_engine()

In [None]:
# Generate naive-RAG answers for the benchmark set.
def process_query(row, engine=None):
    """Answer one benchmark question and collect the retrieved context texts.

    Args:
        row: Mapping-like benchmark row; only ``row['question']`` is read.
        engine: Object with a ``query(question)`` method. When omitted, the
            notebook-level ``naive_engine`` is looked up at call time, so
            ``bench.apply(process_query, axis=1)`` keeps working unchanged.

    Returns:
        pd.Series with two entries: ``answer`` (the ``response`` attribute of
        the query result) and ``contexts`` (a list holding ``node.text`` of
        every entry in the result's ``source_nodes``).
    """
    if engine is None:
        # Resolved lazily so the function can be defined before the engine is.
        engine = naive_engine

    result = engine.query(row['question'])
    contexts = [scored.node.text for scored in result.source_nodes]
    return pd.Series({'answer': result.response, 'contexts': contexts})

# Run every benchmark row through process_query and store the two returned
# fields as new columns on the benchmark.
rag_outputs = bench.apply(process_query, axis=1)
bench[['answer', 'contexts']] = rag_outputs



In [None]:
# Display the benchmark, now carrying the 'answer' and 'contexts' columns.
bench

In [None]:
from ragas import evaluate as evaluate_vanilla
from datasets import Dataset 
# Turn the benchmark into column-oriented lists and wrap them in a Dataset
# for the evaluation call.
bench_columns = bench.to_dict(orient='list')
eval_set = Dataset.from_dict(bench_columns)

In [None]:
from langchain_openai.embeddings import OpenAIEmbeddings
#from langchain_openai.llms import AzureOpenAI, OpenAI
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    answer_similarity,
    context_recall,
)

# Candidate ragas metrics for the evaluation.
# NOTE(review): the evaluate_vanilla(...) call later in this notebook passes
# its own inline list (without answer_relevancy) instead of this variable —
# confirm which set is intended.
metrics = [
    context_precision,
    context_recall,
    faithfulness,
    answer_similarity,
    answer_relevancy
]


In [None]:
#from langchain_openai.llms import OpenAI
# Embedding model handed to the evaluation call.
eval_embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# Metrics scored in this run; answer_relevancy is not among them.
eval_metrics = [
    context_precision,
    faithfulness,
    answer_similarity,
    context_recall,
]

result = evaluate_vanilla(
    embeddings=eval_embeddings,
    dataset=eval_set,
    metrics=eval_metrics,
)

In [None]:
# Display the object returned by the evaluation call.
result

In [None]:
# Display the evaluation result converted to a pandas DataFrame.
result.to_pandas()

In [None]:
def compare_result(df, idx):
    """Print one evaluated sample: question, reference answer, RAG answer, contexts.

    Args:
        df: DataFrame with 'question', 'ground_truth', 'answer' and
            'contexts' columns.
        idx: Row label passed to ``df.loc``.

    Returns:
        None; everything is written to standard output.
    """
    # Look the row up once and read each field from it as it is printed.
    record = df.loc[idx]

    print(f"질문: {record['question']}")
    print('\n')
    print(f"<Ground Truth>\n{record['ground_truth']}\n")
    print(f"<Naive RAG Answer>\n{record['answer']}\n")
    print(f"<Retrieved Contexts>\n{record['contexts']}")
    # An "<Advanced RAG Answer>" section was left disabled by the author:
    #print(f"<Advanced RAG Answer>\n{df.loc[idx]['answer']}\n")

In [None]:
# Print the evaluated sample stored under row label 1.
scored_df = result.to_pandas()
compare_result(scored_df, 1)