In [1]:
from langchain_openai import OpenAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_chroma import Chroma
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain.schema import Document

import os
import dotenv
import yaml

dotenv.load_dotenv()


True

In [2]:
# --- Runtime configuration ---------------------------------------------------
# Machine-specific values are read from the environment (dotenv.load_dotenv()
# in the import cell loads them from a .env file when one is present).
base_path = os.getenv("BASE_PATH", default=".")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# Directory of the persisted Chroma database. Set CHROMA_DB_PATH to point the
# notebook at another copy; when it is unset the original location is used.
vectordb_path = os.getenv(
    "CHROMA_DB_PATH",
    "C://Users/Sese/AI_Study_Record/RAG_AGENT/rag_0621/chroma_db",
)

# Embedding model used to embed queries against the "html_docs" collection.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=OPENAI_API_KEY,
)

vectorstore = Chroma(
    collection_name="html_docs",
    persist_directory=vectordb_path,
    embedding_function=embeddings,
)

# Gemini chat model.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    api_key=GOOGLE_API_KEY,
)



In [None]:
from langchain_text_splitters import HTMLSectionSplitter

# (tag, label) pairs handed to the splitter; only <h1> is listed.
headers_to_split_on = [("h1", "Header 1")]

# Section splitter configured with the header list above.
html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)

In [4]:
# Directory of the persisted Chroma database. Set CHROMA_DB_PATH to override;
# when it is unset the original location is used.
vectorstore_path = os.getenv(
    "CHROMA_DB_PATH",
    r"C:\Users\Sese\AI_Study_Record\RAG_AGENT\rag_0621\chroma_db",
)
vectorstore = Chroma(
    collection_name="html_docs",
    persist_directory=vectorstore_path,
    embedding_function=embeddings,
)


# Set up the vector store retriever: similarity search, 10 results per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},
)

# queries.yaml is opened relative to the working directory, so show it.
print(os.getcwd())
with open("queries.yaml", "r", encoding="utf-8") as f:
    # safe_load returns None for an empty file; treat that as "no queries"
    # instead of failing on None.get(...).
    queries_yaml = yaml.safe_load(f) or {}

# A present-but-null "queries:" key also yields None, so fall back to a list.
queries = queries_yaml.get("queries") or []

c:\Users\Sese\AI_Study_Record\RAG_AGENT\rag_0621


In [5]:
# Retrieval-only pass: for every query, record which chunks the retriever returns.
query_log = []

for query in queries:
    docs = retriever.invoke(query)
    print(f"\n🔍 Query: {query}")

    hits = []
    for doc in docs:
        meta = doc.metadata
        source = meta.get("source", "Unknown")
        hits.append(
            {
                # Log only the file name; the full path is printed below.
                "source": os.path.basename(source),
                # 99 is the placeholder used when the key is missing from metadata.
                "chunk_index": meta.get("chunk_index", 99),
                "total_chunks": meta.get("total_chunks", 99),
                # Newlines become spaces so the excerpt is a single line.
                "excerpt": doc.page_content.replace("\n", " ").strip(),
            }
        )
        print(f"  - {source}")

    result_entry = {"query": query, "results": hits}
    query_log.append(result_entry)





🔍 Query: 뎅기열과 비슷한 매개체로 인한 감염병은?
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\2522_5676_Leptospira interrogans.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\2522_5660_Vibrio spp..html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1757_4091_요로감염.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\2747_5639_염증의 양상.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\2521_5678_Pseudomonas aeruginosa.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\2525_5684_모기 매개 감염병.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1643_3848_여행 관련 바이러스 감염.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1640_3835_녹농균 감염.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1778_4160_위식도역류질환.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1636_3826_발열, 불명열.html

🔍 Query: 불명열 진단 순서를 설명하시오
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1698_3968_석회화 건염.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1698_3967_회전근개파열.html
  - C:\Users\Sese\autosave\알렌 이론 추출\theor

# Self-RAG

In [12]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

### === Retrieval Grader ===

class GradeDocuments(BaseModel):
    # Structured-output schema for the retrieval grader: one yes/no verdict.
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )


# Prompt text for the relevance grader. Line breaks and indentation are written
# out explicitly so the text sent to the model is visible at a glance.
_RETRIEVAL_GRADER_SYSTEM = (
    "You are a grader assessing relevance of a retrieved document to a user question. \n"
    "     It does not need to be a stringent test. The goal is to filter out erroneous retrievals.\n"
    "     If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.\n"
    "     Give a binary score 'yes' or 'no'."
)
_RETRIEVAL_GRADER_HUMAN = "Retrieved document: \n\n {document} \n\n User question: {question}"

retrieval_grader_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
structured_retrieval_grader = retrieval_grader_llm.with_structured_output(GradeDocuments)

retrieval_grader_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _RETRIEVAL_GRADER_SYSTEM),
        ("human", _RETRIEVAL_GRADER_HUMAN),
    ]
)

# Prompt piped into the structured LLM; takes "document" and "question" inputs.
retrieval_grader = retrieval_grader_prompt | structured_retrieval_grader


### === Generation Chain (RAG) ===

# Answer generator. Uses the `model=` keyword like every other ChatOpenAI
# instance in this cell (the original spelled it `model_name=`).
generator_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# RAG prompt pulled from the LangChain Hub by name.
rag_prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    contents = [item.page_content for item in docs]
    return "\n\n".join(contents)

# Generation pipeline: prompt -> generator model -> string output parser.
rag_chain = (
    rag_prompt
    | generator_llm
    | StrOutputParser()
)


### === Hallucination Grader ===

class GradeHallucinations(BaseModel):
    # Structured-output schema for the hallucination grader: one yes/no verdict.
    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )


# Prompt text for the groundedness grader. Line breaks and indentation are
# written out explicitly so the text sent to the model is visible at a glance.
_HALLUCINATION_GRADER_SYSTEM = (
    "You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n"
    "     If the key claims are reasonably inferred from the facts or are widely known and not contradicted, grade 'yes'.\n"
    "     Avoid penalizing common medical knowledge or minor elaborations. Grade only truly unsupported or conflicting claims as 'no'."
)
_HALLUCINATION_GRADER_HUMAN = "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"

hallucination_grader_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
structured_hallu_grader = hallucination_grader_llm.with_structured_output(GradeHallucinations)

hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _HALLUCINATION_GRADER_SYSTEM),
        ("human", _HALLUCINATION_GRADER_HUMAN),
    ]
)

# Prompt piped into the structured LLM; takes "documents" and "generation" inputs.
hallucination_grader = hallucination_prompt | structured_hallu_grader


### === Answer Grader ===

class GradeAnswer(BaseModel):
    # Structured-output schema for the answer grader: one yes/no verdict.
    binary_score: str = Field(
        description="Answer addresses the question, 'yes' or 'no'"
    )


# Prompt text for the answer grader. Line breaks and indentation are written
# out explicitly so the text sent to the model is visible at a glance.
_ANSWER_GRADER_SYSTEM = (
    "You are a grader assessing whether an answer addresses / resolves a question.\n"
    "     Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."
)
_ANSWER_GRADER_HUMAN = "User question: \n\n {question} \n\n LLM generation: {generation}"

answer_grader_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
structured_answer_grader = answer_grader_llm.with_structured_output(GradeAnswer)

answer_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _ANSWER_GRADER_SYSTEM),
        ("human", _ANSWER_GRADER_HUMAN),
    ]
)

# Prompt piped into the structured LLM; takes "question" and "generation" inputs.
answer_grader = answer_prompt | structured_answer_grader


### === Question Rewriter ===

# Prompt text for the question rewriter. Line breaks and indentation are
# written out explicitly so the text sent to the model is visible at a glance.
_REWRITE_SYSTEM = (
    "You are a question re-writer that converts an input question to a better version optimized for vectorstore retrieval.\n"
    "     Look at the input and try to reason about the underlying semantic intent / meaning."
)
_REWRITE_HUMAN = "Here is the initial question: \n\n {question} \n Formulate an improved question."

question_rewriter_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
rewrite_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _REWRITE_SYSTEM),
        ("human", _REWRITE_HUMAN),
    ]
)

# Rewrite pipeline: prompt -> model -> string output parser; takes a "question" input.
question_rewriter = (
    rewrite_prompt
    | question_rewriter_llm
    | StrOutputParser()
)




In [None]:
def _is_yes(score):
    """Return True only when a grader result's ``binary_score`` is an explicit 'yes'."""
    return str(score.binary_score).strip().lower() == "yes"


def run_self_rag(question, retriever, rag_chain, retrieval_grader, hallucination_grader, answer_grader, question_rewriter, max_iter=5):
    """Answer ``question`` with a retrieve -> grade -> generate -> verify loop.

    Each pass retrieves documents for the current question, keeps those the
    retrieval grader marks 'yes', generates an answer from them, and accepts
    the answer only if the hallucination grader and the answer grader both say
    'yes'. When retrieval yields nothing usable, or the answer does not address
    the question, the question is rewritten before the next pass.

    Args:
        question: The user's original question.
        retriever: Object whose ``invoke(question)`` returns documents exposing
            ``page_content`` and ``metadata``.
        rag_chain: Object whose ``invoke({"context": ..., "question": ...})``
            returns the generated answer.
        retrieval_grader: Object whose ``invoke({"question", "document"})``
            returns a result with a ``binary_score`` attribute.
        hallucination_grader: Same contract, invoked with
            ``{"documents", "generation"}``.
        answer_grader: Same contract, invoked with ``{"question", "generation"}``.
        question_rewriter: Object whose ``invoke({"question": ...})`` returns a
            rewritten question.
        max_iter: Maximum number of passes.

    Returns:
        A ``(generation, question, docs)`` tuple. On success: the accepted
        answer, the question that produced it, and the graded documents it was
        generated from. If the pass budget runs out, the most recently
        generated answer is returned together with the question and documents
        that produced it; if no answer was ever generated the result is
        ``(None, <latest question>, [])``.
    """
    current_question = question
    # Most recent (generation, question, docs) produced together. Kept as one
    # unit so the fallback return never mixes an answer from one pass with the
    # question or documents of another.
    last_attempt = None

    patience = max_iter
    while patience > 0:
        # Every pass consumes one unit of budget, whatever its outcome.
        patience -= 1

        # 1. Retrieve
        docs = retriever.invoke(current_question)
        if not docs:
            print(f"No docs found for: {current_question}")
            current_question = question_rewriter.invoke({"question": current_question})
            continue

        # 2. Grade documents
        filtered_docs = [
            doc
            for doc in docs
            if _is_yes(retrieval_grader.invoke({"question": current_question, "document": doc.page_content}))
        ]
        print(f"Retrieved {len(docs)} docs, {len(filtered_docs)} relevant after grading.")
        for filter_doc in filtered_docs:
            print(f"  - {filter_doc.metadata.get('source', 'Unknown')}: {filter_doc.page_content[:100]}...")

        if not filtered_docs:
            print("No relevant docs after grading. Rewriting question...")
            current_question = question_rewriter.invoke({"question": current_question})
            continue

        # 3. Generate answer (the context is formatted once and reused for the
        #    hallucination check below).
        context = format_docs(filtered_docs)
        generation = rag_chain.invoke({"context": context, "question": current_question})
        last_attempt = (generation, current_question, filtered_docs)

        print(f"Generated answer: {generation}")

        # 4. Check hallucination. Anything other than an explicit 'yes' counts
        #    as not grounded, matching how the other two graders are read.
        hallucination_score = hallucination_grader.invoke({
            "documents": context,
            "generation": generation
        })

        if not _is_yes(hallucination_score):
            # NOTE(review): the question is left unchanged here, so the next
            # pass repeats retrieval and grading with the same input.
            print("Detected hallucination. Regenerating answer...")
            continue

        # 5. Check answer quality
        answer_score = answer_grader.invoke({
            "question": current_question,
            "generation": generation
        })

        if _is_yes(answer_score):
            print("✅ Answer is good and grounded.")
            return generation, current_question, filtered_docs

        print("Answer is not relevant enough. Rewriting question...")
        current_question = question_rewriter.invoke({"question": current_question})

    print("❌ Failed to generate a valid answer. Returning last try.")
    if last_attempt is None:
        # No pass got as far as generating an answer.
        return None, current_question, []
    return last_attempt

# Self-RAG pass: run every query through run_self_rag and log the outcome.
query_log = []

for query in queries:
    print(f"\n🔍 Query: {query}")
    generation, used_question, used_docs = run_self_rag(
        question=query,
        retriever=retriever,
        rag_chain=rag_chain,
        retrieval_grader=retrieval_grader,
        hallucination_grader=hallucination_grader,
        answer_grader=answer_grader,
        question_rewriter=question_rewriter,
    )

    # One record per document returned alongside the answer.
    supporting_chunks = [
        {
            "source": used_doc.metadata.get("source", "Unknown"),
            # 99 is the placeholder used when the key is missing from metadata.
            "chunk_index": used_doc.metadata.get("chunk_index", 99),
            "excerpt": used_doc.page_content.strip().replace("\n", " "),
        }
        for used_doc in used_docs
    ]

    result_entry = {
        "query": query,
        "rewritten_query": used_question,
        "answer": generation,
        "results": supporting_chunks,
    }
    query_log.append(result_entry)



🔍 Query: 뎅기열과 비슷한 매개체로 인한 감염병은?
Retrieved 10 docs, 2 relevant after grading.
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\2525_5684_모기 매개 감염병.html: 제목: 모기 매개 감염병

분류 및 증상 
 
 
 속(genus) 
 관련 질병 및 분류 
 증상 
 대표 유행 지역 
 
 
 
 Anopheles 
 : 얼룩날개모기, 학질모...
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1643_3848_여행 관련 바이러스 감염.html: 제목: 여행 관련 바이러스 감염

뎅기열, 치쿤구니야열, 지카 바이러스 감염증, 황열, 메르스에 대해서 다룬다. 여행력과 임상양상을 보고 가능성이 높은 진단명을 고르는 문제가 자주...
Generated answer: 뎅기열과 비슷한 매개체로 인한 감염병으로는 치쿤구니야열과 지카 바이러스 감염증이 있습니다. 이들 질병은 모두 Aedes 모기에 의해 전파되며, 유사한 증상을 나타냅니다. 또한 황열도 모기 매개 감염병으로 관련이 있습니다.
✅ Answer is good and grounded.

🔍 Query: 불명열 진단 순서를 설명하시오
Retrieved 10 docs, 2 relevant after grading.
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1636_3826_발열, 불명열.html: 제목: 발열, 불명열

감염성 질환에서 주로 동반되는 증상인 발열과 불명열에 대해 다룬다. 시험에 특별히 많이 출제되는 부분은 아니다. 불명열 환자에서 다음에 해야할 검사, 조치를...
  - C:\Users\Sese\autosave\알렌 이론 추출\theory_texts\1636_3826_발열, 불명열.html: 제목: 발열, 불명열

1. 발열(fever) 
 1) 체온 (body temperature) 
 (1)   정상 구