In [None]:
from dotenv import load_dotenv
import os
import json
from datasets import Dataset
from ragas import evaluate

# Load environment variables from the .env file
load_dotenv()

from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma

In [None]:
# Embedding model (OpenAI)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Chat LLM (Google Gemini Flash)
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
)

# ChromaDB persistence directory.
# The original machine-specific path is kept as the default so existing runs
# behave exactly as before; set CHROMA_PERSIST_DIRECTORY (for example in .env)
# to point the notebook at a store located somewhere else.
PERSIST_DIRECTORY = os.environ.get(
    "CHROMA_PERSIST_DIRECTORY",
    "C:\\Users\\Sese\\AI_Study_Record\\RAG_AGENT\\rag_0705\\chroma_db",
)
COLLECTION_NAME = "html_docs"

# Open the persisted Chroma collection and expose it as a retriever
db = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
)
retriever = db.as_retriever()


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2024.5.0 which is incompatible.
google-cloud-documentai-toolbox 0.14.2a0 requires Pillow<11.0.0,>=10.0.0, but you have pillow 11.1.0 which is incompatible.
google-generativeai 0.8.5 requires google-ai-generativelanguage==0.6.15, but you have google-ai-generativelanguage 0.6.18 which is incompatible.
llama-index-readers-file 0.4.6 requires beautifulsoup4<5.0.0,>=4.12.3, but you have beautifulsoup4 4.12.2 which is incompatible.
llama-index-readers-file 0.4.6 requires pypdf<6.0.0,>=5.1.0, but you have pypdf 4.3.1 which is incompatible.
markitdown-mcp 0.0.1a4 requires mcp~=1.8.0, but you have mcp 1.10.1 which is incompatible.
notion-database 1.2.2 requires requests==2.32.3, but you have requests 2.32.4 which is incompatible.
notion-database 1.2.2 requires u

Collecting ragas
  Downloading ragas-0.2.15-py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading ragas-0.2.15-py3-none-any.whl (190 kB)
Downloading fsspec-2024.5.0-py3-none-any.whl (316 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
Using cached requests-2.32.4-py3-none-any.whl (64 kB)
Installing collected packages: requests, fsspec, diskcache, ragas

  Attempting uninstall: requests

    Found existing installation: requests 2.31.0

    Uninstalling requests-2.31.0:

      Successfully uninstalled requests-2.31.0

   ---------------------------------------- 0/4 [requests]
  Attempting uninstall: fsspec
   -------------

## 2. 평가 데이터셋 로드

In [None]:
from datasets import load_dataset

# Fetch the evaluation dataset from the Hugging Face Hub; the access token is
# read from the HUGGINGFACE_API_KEY environment variable.
dataset = load_dataset(
    "s1000secent/0705_rag_dataset",
    token=os.environ.get("HUGGINGFACE_API_KEY"),
)
evaluation_data = dataset['0705']

# Peek at the first record only (prints nothing when the split is empty)
sample = next(iter(evaluation_data), None)
if sample is not None:
    print(sample)

# Bare expression: shows the loaded dataset object as the cell output
dataset

Loaded 161 evaluation samples.
First 3 samples:
Sample 1: {'query': '이 문서에서 주로 다루는 증상은 무엇인가?', 'answer': '발열과 불명열', 'relevant_ids': ['63138f2a-6cd4-41e0-acb7-b4709755763f'], 'relevant_docs_metadata': [{'chunk_index': 0, 'chunk_metadata': "{'Header 1': '#TITLE#'}", 'source': 'C:\\Users\\Sese\\autosave\\알렌 이론 추출\\theory_texts\\1636_3826_발열, 불명열.html', 'total_chunks': 3}]}
Sample 2: {'query': '발열의 가장 흔한 원인은 무엇인가요?', 'answer': '감염', 'relevant_ids': ['00d0c747-0aa7-4791-9130-b5b23af111a7'], 'relevant_docs_metadata': [{'chunk_index': 1, 'chunk_metadata': "{'Header 1': '1. 발열(fever)'}", 'source': 'C:\\Users\\Sese\\autosave\\알렌 이론 추출\\theory_texts\\1636_3826_발열, 불명열.html', 'total_chunks': 3}]}
Sample 3: {'query': '불명열(FUO) 진단 시 FDG-PET/CT의 효용이 높은 이유는 무엇인가요?', 'answer': '암, 염증성 질환(혈관염 등)을 쉽게 확인할 수 있기 때문입니다.', 'relevant_ids': ['3eec88cf-e345-4494-ad92-2bec413b5b34'], 'relevant_docs_metadata': [{'chunk_index': 2, 'chunk_metadata': "{'Header 1': '2. 불명열(fever of unknown origin, FUO)'}", 'source': 

In [None]:
import ast

html_path = "C:\\Users\\Sese\\autosave\\알렌 이론 추출"

def convert_to_list(example):
    """Build the evaluation fields for one dataset example.

    For every entry in ``example['relevant_docs_metadata']`` the file named by
    its ``'source'`` field is resolved against ``html_path`` with
    ``os.path.join`` and read in full as UTF-8 text.

    Args:
        example: Mapping with the keys ``'relevant_docs_metadata'`` (an
            iterable of mappings that each carry a ``'source'`` path),
            ``'query'`` and ``'answer'``.

    Returns:
        A dict with ``"contexts"`` (list of file contents, in metadata order),
        ``"user_input"`` (the query) and ``"ground_truth"`` (the answer).

    Raises:
        OSError: if a source file cannot be opened.
    """
    def _read_source(doc_meta):
        # Read the whole source document referenced by one metadata entry
        full_path = os.path.join(html_path, doc_meta['source'])
        with open(full_path, 'r', encoding='utf-8') as handle:
            return handle.read()

    return {
        "contexts": [_read_source(meta) for meta in example['relevant_docs_metadata']],
        "user_input": example['query'],
        "ground_truth": example['answer'],
    }

# Run convert_to_list over the loaded dataset with `map` and rebind `dataset`
# to the result, so later cells see the "contexts", "user_input" and
# "ground_truth" values that convert_to_list returns for each example.
dataset = dataset.map(convert_to_list)
# Bare expression: shows the mapped dataset object as the cell output
dataset

## 4. RAGAS를 이용한 성능 평가

In [None]:
from ragas import evaluate

from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

from langchain.callbacks.stdout import StdOutCallbackHandler
from ragas import evaluate
import os

from tenacity import retry, wait_random_exponential, stop_after_attempt
from openai import RateLimitError
from ragas.llms import LangchainLLMWrapper


Evaluating:   0%|          | 0/628 [00:00<?, ?it/s]

Exception raised in Job[119]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-aVigf2W3LKDGsLzWGD4JJds8 on tokens per min (TPM): Limit 200000, Used 199327, Requested 1718. Please try again in 313ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[235]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-aVigf2W3LKDGsLzWGD4JJds8 on tokens per min (TPM): Limit 200000, Used 200000, Requested 1079. Please try again in 323ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}})
Exception raised in Job[247]: RateLimitError(Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-aVigf2W3LKDGsLzWGD4JJds8 on tokens per min (TPM): Limit 200000,

{'answer_relevancy': 0.6919, 'faithfulness': 0.8438, 'context_recall': 0.8192, 'context_precision': 0.6885}


In [None]:
# Confirm the Gemini credential is present without echoing the secret itself:
# printing the raw key would store it in the saved notebook output.
print("GOOGLE_API_KEY set:", bool(os.getenv("GOOGLE_API_KEY")))

# Wrap the LangChain chat model so ragas can use it as the evaluator LLM
wrapped_llm = LangchainLLMWrapper(llm)


def _report_retries_exhausted(retry_state):
    """tenacity callback run once every retry attempt has failed.

    Prints a notice and returns None; tenacity hands that return value back to
    the caller of the decorated function instead of raising.
    """
    print("최대 재시도 횟수 초과")


@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
    retry_error_callback=_report_retries_exhausted,
)
def safe_evaluate():
    """Run the ragas evaluation on the '0705' split with retries.

    If ``evaluate`` raises, the whole evaluation is re-run after a random
    exponential back-off (1-60 s), for at most 6 attempts in total.

    Returns:
        The value returned by ``ragas.evaluate``, or ``None`` when all
        attempts failed (see ``_report_retries_exhausted``).
    """
    # NOTE(review): this retry only fires when evaluate() itself raises;
    # whether per-job errors such as rate limits propagate out of evaluate()
    # depends on ragas settings - confirm against the ragas documentation.
    return evaluate(
        dataset=dataset['0705'],
        metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],
        llm=wrapped_llm,
    )


result = safe_evaluate()

result


{'answer_relevancy': 0.6919, 'faithfulness': 0.8438, 'context_recall': 0.8192, 'context_precision': 0.6885}