# 첫 번째 주피터 노트북

이 노트북은 Python 프로그래밍을 위한 기본 템플릿입니다.

## 사용 방법
- 각 셀은 `Shift + Enter`로 실행할 수 있습니다.
- 마크다운 셀은 `M`키로, 코드 셀은 `Y`키로 변환할 수 있습니다.
- 새로운 셀은 `A`(위에 추가) 또는 `B`(아래에 추가)키로 생성할 수 있습니다.

In [2]:
%pip install -q langchain langgraph langchain-docling langchain-qdrant langchain-text-splitters langchain-ollama langchain-community sentence-transformers

698.29s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
Note: you may need to restart the kernel to use updated packages.


In [1]:
import warnings

warnings.filterwarnings("ignore")

In [4]:
from langchain_ollama import ChatOllama

reasoning_llm  = ChatOllama(
    model="deepseek-r1:7b",
    stop=["</think>"],
    )

answer_llm = ChatOllama(
    model="exaone3.5",
    temperature=0.0)


In [5]:
from typing import Annotated, List, TypedDict, Literal
from langgraph.graph.message import add_messages
from langchain_core.documents import Document

#RAG 상태정의
class RAGState(TypedDict):
    """RAG 시스템의 상태를 정의합니다."""
    query: str #사용자 질의
    think: str #reasoning_llm이 생성한 사고 과정
    documents: List[Document] #검색된 문서 목록
    answer: str #answer_llm이 생성한 답변
    message: Annotated[List, add_messages] #메시지 저장
    mode: str #모드 저장




In [6]:
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

# File_PATH = "https://arxiv.org/pdf/2408.09869"
File_PATH = "/Users/heesung/work/M_CHO/vds-server/CNITS-DE-007.1-인터페이스 설계서_VDS(표준) Ver 1.0.pdf"

loader = DoclingLoader(
    file_path=File_PATH,
    export_type=ExportType.MARKDOWN,
    )

docs = loader.load()


0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


In [None]:
from IPython.display import Markdown

display(Markdown(docs[0].page_content))
# 마크다운 파일로 저장
import os

# 저장할 디렉토리 경로 설정
save_dir = "/Users/heesung/work/M_CHO/vds-server/documents"

# 디렉토리가 없으면 생성
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# 파일명 설정 (원본 PDF 파일명을 기반으로)
file_name = os.path.splitext(os.path.basename(File_PATH))[0] + ".md"
save_path = os.path.join(save_dir, file_name)

# 마크다운 파일로 저장
with open(save_path, 'w', encoding='utf-8') as f:
    f.write(docs[0].page_content)

print(f"문서가 다음 경로에 저장되었습니다: {save_path}")



In [8]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"), 
    ],
    )
splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]

for d in splits[:3]:
    print(f"- {d.page_content=}")
print("...")


- d.page_content='2024. 02.'
- d.page_content='|   No | 버전    | 변경일     | 변경 내용   | 작성자   | 승인자   |\n|------|---------|------------|-------------|----------|----------|\n|    1 | Rev.1.0 | 2024/02/14 | 최초작성    | 진해리   |          |'
- d.page_content='| 1.   | 차량 검지 시스템 개요·································································································································· 1 프로토콜 개요·································································································································· 1 2.  VDS   |\n|------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n|      | 1                                                                                                               

In [9]:
from IPython.display import Markdown

display(Markdown(splits[40].page_content))

| 항 목                       | 타입   |   길이 | 값                                | 비고   |
|-----------------------------|--------|--------|-----------------------------------|--------|
| T an ac r s ti on Num e b r | B      |      8 |                                   |        |
| Re ul   Code s t            | N      |      1 | 범 위의 정수  값 0  ~   0 x FF  . |        |
| 제어기 상태                 | B      |      2 | 비트 별로 장비상태에 대한         |        |

In [10]:
from langchain_ollama import OllamaEmbeddings

embeddings = OllamaEmbeddings(
    model="bge-m3:latest"
)

In [12]:
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode
vector_store = QdrantVectorStore.from_documents(
    documents=splits, 
    embedding=embeddings, 
    location=":memory:",
    collection_name="rag_collection_0228", 
    retrieval_mode=RetrievalMode.DENSE
)
retriever = vector_store.as_retriever (search_kwargs = {'k' :10})

In [13]:
from langchain.retrievers import ContextualCompressionRetriever 
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")
compressor = CrossEncoderReranker(model=model, top_n=5)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

In [14]:
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import START, StateGraph, END

def classify_node(state: RAGState):
    """질문을 분류하여 처리모드를 결정 합니다."""
    query = state['query']

    # 모드를 상태에 저장
    if "Docling" in query:
        print("====검색 시작====")
        return {"mode" : "retrieve"}
    else:
        print("====생성 시작====")
        return {"mode" : "generate"}
    
def route_by_mode(state: RAGState) -> Literal["retrieve", "generate"]:
    """모드에 따라 다음단계를 결정 합니다."""
    return state["mode"]

def retrieve(state: RAGState):
    """질의를 기반으로 관련 문서를 검색 합니다."""
    query = state["query"]
    print("====검색시작===")
    documents = compression_retriever.invoke(query)
    for doc in documents:
        print(doc.page_content)
        print("-"*100)
    print("===검색 완료===")
    return {"docuements" : documents}

def reasoning(state: RAGState):
    """쿼리를 분석하여 사고 과정을 생성 합니다."""
    query = state["query"]
    documents = state["documents"]
    context = "\n\n".join([doc.page_content for doc in documents])
    reasoning_prompt = ChatPromptTemplate.from_template(
        """주어진 문서를 활용하여 사용자의 질문에 가장 적절한 답변을 작성 해주세요.
        
        질문 : {query}

        문서내용 : 
        {context}

        상세 추론 :"""
    )

    reasoning_chain = reasoning_prompt | reasoning_llm | StrOutputParser()
    print("===추론시작===")
    thinking = reasoning_chain.invoke({"query" : query, "context" : context})

    return {"thinking" : thinking}

# 3 답변 생성 노드(answer LLM)
def generate(state: RAGState):
    """문서와 추론 과정을 기반으로 최종 답변을 생성 합니다."""
    query = state["query"]
    thinking = state["thinking"]
    documents = state["documents"]

    # 문서 내용 추출
    context = "\n\n".join([doc.page_content for doc in documents])

    # 최종 답변을 위한  프롬프트
    answer_prompt = ChatPromptTemplate.from_template(
        """ 사용자의 질문에 한글로 답변 하세요. 제공된 문서와 추론과정이 있다면, 최대한 활용 하세요

        질문 : {query}

        추론과정 : {thinking}

        문서 내용 :  {context}

        """
    )

    print("====답변 생성 시작====")
    answer_chain = answer_prompt | answer_llm | StrOutputParser()
    answer = answer_chain.invoke({
        "query" : query,
        "thinking" : thinking,
        "context": context
    })

    print("====답변 생성 완료====")
    return {
        "answer" :answer,
        "message":[HumanMessage(content=answer)]
    }



In [16]:
from langgraph.checkpoint.memory import MemorySaver
workflow = StateGraph(RAGState)

#노드추가 
workflow.add_node("classify", classify_node)
workflow.add_node("reasoning", reasoning)
workflow.add_node("retrieve", retrieve)
workflow.add_node("generate", generate)

#엣지추가
workflow.add_edge(START, "classify")
workflow.add_conditional_edges(
    "classify",
    route_by_mode,
    {
        "retrieve": "retrieve",
        "generate": "generate"
    }
)
workflow.add_edge("retrieve", "reasoning")
workflow.add_edge("reasoning", "generate")
workflow.add_edge("generate", END)

memory = MemorySaver()
app = workflow.compile(checkpointer=memory)

In [17]:
from IPython.display import Image, display

display(Image(app.get_graph().draw_mermaid_png()))


ReadTimeout: HTTPSConnectionPool(host='mermaid.ink', port=443): Read timed out. (read timeout=10)

In [19]:
# 기존 입력 설정
input = {"query": "오늘 날씨가 어떄?"}
config = {"configurable": {"thread_id": 0}}


Error: Invalid URL 'YOUR_API_ENDPOINT': No scheme supplied. Perhaps you meant https://YOUR_API_ENDPOINT?
