# Naive RAG 구현 

## 1. Setting

1\) 환경 변수 로드

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

## 2. Document Indexing

1\) Load Document
- **BART** 논문 로드

In [2]:
import requests
from langchain_community.document_loaders import PyPDFLoader

# Download the BART paper PDF.
url = "https://arxiv.org/pdf/1910.13461.pdf"
# Fetch before opening the file so a failed request does not leave an empty
# or partial bart_paper.pdf behind; the timeout keeps the cell from hanging.
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of saving an error page as a PDF
with open("bart_paper.pdf", "wb") as f:
    f.write(response.content)

# Load the PDF from the local copy.
loader = PyPDFLoader("bart_paper.pdf")
docs = loader.load()
print(f'PDF 문서 개수: {len(docs)}')

PDF 문서 개수: 10


In [3]:
# Check that sentences are read in the right order even where paragraphs are split
docs[0].page_content[2200:2300]

'al., 2019), the order in which\nmasked tokens are predicted (Yang et al., 2019), and the\navailable co'

In [4]:
# Inspect the extracted text of a page that begins with a figure
docs[1].page_content[:100]

'Bidirectional \nEncoder\nA  _  C  _  E \nB       D    \n(a) BERT: Random tokens are replaced with masks,'

2\) Split Text

- **Semantic Chunking** 방식으로 텍스트를 분할함<br>
    → 임베딩 벡터 간의 **기울기(gradient)** 변화를 기준으로 의미 단위(semantic unit)를 구분함<br>
    → 청크 길이에 일관성이 없으며, 문맥에 따라 길이가 유동적으로 결정됨

- 길이가 100자 미만인 청크는 이미지 기반 텍스트(OCR 등)로 간주하여 제거함<br>
    → 주요 텍스트가 아닌 부가 정보일 가능성이 높기 때문임

- 1차 분할된 청크는 길이 편차가 크므로, 문자열 길이 기준으로 재귀적으로 분할하여 최종적으로는 일관된 길이의 청크를 구성함

In [5]:
from langchain_experimental.text_splitter import SemanticChunker 
from langchain_openai.embeddings import OpenAIEmbeddings

# Split the loaded pages into semantic chunks using the "gradient" breakpoint rule.
text_splitter = SemanticChunker(
    breakpoint_threshold_type="gradient",  # other options: percentile, standard_deviation, interquartile
    embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
)
chunks = text_splitter.split_documents(docs)
print(f"생성된 청크 수: {len(chunks)}")
print(f"각 청크의 길이: {[len(chunk.page_content) for chunk in chunks]}")

생성된 청크 수: 35
각 청크의 길이: [380, 1973, 2279, 3429, 108, 511, 7, 8, 34, 4303, 3184, 1086, 133, 770, 108, 3245, 2897, 568, 481, 257, 1287, 723, 2615, 1152, 1516, 1298, 489, 750, 1123, 771, 1099, 662, 127, 156, 1796]


In [6]:
# Keep chunks of at least 100 characters; print each shorter one (with its
# index) so the dropped content can be inspected.
selected_chunks = []
for idx, chunk in enumerate(chunks):
    content = chunk.page_content
    if len(content) >= 100:
        selected_chunks.append(chunk)
        continue
    print(f'{idx}: {content}')

print(f"생성된 청크 수: {len(selected_chunks)}")
print(f"각 청크의 길이: {[len(chunk.page_content) for chunk in selected_chunks]}")

6: A B C .
7: D E .A .
8: C . E . A _ . D _ E . A _C . _ E .
생성된 청크 수: 32
각 청크의 길이: [380, 1973, 2279, 3429, 108, 511, 4303, 3184, 1086, 133, 770, 108, 3245, 2897, 568, 481, 257, 1287, 723, 2615, 1152, 1516, 1298, 489, 750, 1123, 771, 1099, 662, 127, 156, 1796]


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Re-split the kept chunks, targeting 500-character pieces with a
# 100-character overlap, breaking on line ends and sentence ends.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[" \n", ".\n", ". "],
    chunk_size=500,
    chunk_overlap=100,
)
final_chunks = text_splitter.split_documents(selected_chunks)
print(f"생성된 텍스트 청크 수: {len(final_chunks)}")
print(f"각 청크의 길이: {[len(chunk.page_content) for chunk in final_chunks]}")

생성된 텍스트 청크 수: 112
각 청크의 길이: [380, 398, 350, 381, 396, 448, 383, 448, 427, 333, 440, 306, 497, 81, 367, 490, 488, 344, 463, 449, 425, 108, 422, 146, 441, 380, 402, 396, 467, 487, 317, 465, 428, 437, 296, 354, 417, 429, 497, 308, 384, 258, 363, 351, 409, 415, 262, 133, 770, 108, 431, 465, 422, 446, 487, 359, 484, 329, 496, 152, 470, 481, 488, 383, 406, 182, 486, 132, 481, 257, 432, 473, 382, 473, 250, 312, 458, 401, 349, 475, 407, 495, 344, 420, 476, 421, 500, 375, 360, 484, 443, 468, 489, 484, 339, 495, 480, 317, 475, 335, 481, 458, 200, 496, 262, 127, 156, 484, 462, 470, 472, 208]


3\) Embedding
- 문서 임베딩은 `OpenAI`의 **text-embedding-3-small** 모델을 사용함

In [8]:
from langchain_openai import OpenAIEmbeddings

# Embed the text of every final chunk with text-embedding-3-small,
# requesting 1024-dimensional vectors.
documents = [chunk.page_content for chunk in final_chunks]
embeddings_model = OpenAIEmbeddings(dimensions=1024, model="text-embedding-3-small")
document_embeddings = embeddings_model.embed_documents(documents)
print(f"임베딩 벡터의 개수: {len(document_embeddings)}")
print(f"임베딩 벡터의 차원: {len(document_embeddings[0])}")

임베딩 벡터의 개수: 112
임베딩 벡터의 차원: 1024


In [9]:
from langchain_community.utils.math import cosine_similarity
import numpy as np

def find_most_similar(
    query: str,
    documents: list,
    doc_embeddings: np.ndarray,
    embeddings_model,
) -> tuple[str, float]:
    """Return the document most similar to the query, with its cosine similarity.

    Args:
        query: Text to embed via ``embeddings_model.embed_query``.
        documents: Candidate documents, index-aligned with ``doc_embeddings``.
        doc_embeddings: One embedding per entry of ``documents``.
        embeddings_model: Object exposing an ``embed_query`` method.

    Returns:
        ``(document, similarity)`` for the highest-scoring document.
    """
    query_vector = embeddings_model.embed_query(query)
    # cosine_similarity returns a 1 x len(doc_embeddings) matrix; take its only row.
    scores = cosine_similarity([query_vector], doc_embeddings)[0]
    best = np.argmax(scores)
    return documents[best], scores[best]

# Sanity-check the similarity search with a sample query.
query = "What is BART architecture?"
most_similar_doc, similarity = find_most_similar(
    query=query,
    documents=documents,
    doc_embeddings=document_embeddings,
    embeddings_model=embeddings_model,
)
print(
    f"쿼리: {query}",
    f"가장 유사한 문서: {most_similar_doc}",
    f"유사도: {similarity:.4f}",
    sep="\n",
)

쿼리: What is BART architecture?
가장 유사한 문서: . The architecture is closely related to that used in
BERT, with the following differences: (1) each layer of
the decoder additionally performs cross-attention over
the ﬁnal hidden layer of the encoder (as in the trans-
former sequence-to-sequence model); and (2) BERT
uses an additional feed-forward network before word-
prediction, which BART does not. In total, BART con-
tains roughly 10% more parameters than the equiva-
lently sized BERT model
유사도: 0.6146


4\) Save Vector

- 임베딩된 벡터는 벡터스토어로 `ChromaDB`를 사용하여 저장함

In [10]:
from langchain_chroma import Chroma

# Chroma collection persisted under ./chroma_db; it embeds with embeddings_model.
chroma_db = Chroma(
    collection_name="my_task02",
    embedding_function=embeddings_model,
    persist_directory="./chroma_db",
)
# NOTE: the former bare `chroma_db.get()` call was dropped - it fetched the
# whole collection and discarded the result.

# Store the chunks in the vector store under explicit ids (DOC_0, DOC_1, ...).
doc_ids = [f"DOC_{i}" for i in range(len(final_chunks))]
added_doc_ids = chroma_db.add_documents(documents=final_chunks, ids=doc_ids)
print(f"{len(added_doc_ids)}개의 문서가 성공적으로 벡터 저장소에 추가되었습니다.")

112개의 문서가 성공적으로 벡터 저장소에 추가되었습니다.


5\) Retriever

- **MMR** 기반의 Retriever를 사용하여 문맥 다양성을 고려한 상위 3개 문서 청크를 검색함
- 유사도 계산에는 **Cosine Similarity** 를 사용함

In [11]:
# MMR retriever over the Chroma collection.
chroma_mmr = chroma_db.as_retriever(
    search_type='mmr',
    search_kwargs={
        'k': 3,                 # number of documents to return
        'fetch_k': 8,           # number of documents handed to the MMR algorithm (fetch_k > k)
        'lambda_mult': 0.3,     # diversity trade-off (1 = minimum diversity, 0 = maximum diversity; default 0.5)
        },
)

In [12]:
# Retrieval smoke test
query = "What is the BART architecture?"
retrieved_docs = chroma_mmr.invoke(query)
# Embed the query once: it is identical for every retrieved document, so
# re-embedding it inside the loop only adds API calls.
query_embedding = embeddings_model.embed_query(query)

print(f"쿼리: {query}")
print("검색 결과:")
for i, doc in enumerate(retrieved_docs, 1):
    # Cosine similarity between the query and this document's text.
    score = cosine_similarity(
        [query_embedding],
        [embeddings_model.embed_query(doc.page_content)]
        )[0][0]
    print(f"-{i}-\n{doc.page_content[:100]}...{doc.page_content[-100:]} \n[유사도: {score}]")
    print("-" * 100)

쿼리: What is the BART architecture?
검색 결과:
-1-
. The architecture is closely related to that used in
BERT, with the following differences: (1) each... not. In total, BART con-
tains roughly 10% more parameters than the equiva-
lently sized BERT model 
[유사도: 0.6180143168912577]
----------------------------------------------------------------------------------------------------
-2-
. 2 Model
BART is a denoising autoencoder that maps a corrupted
document to the original document it...gressive decoder. For pre-training,
we optimize the negative log likelihood of the original
document 
[유사도: 0.5477388932809816]
----------------------------------------------------------------------------------------------------
-3-
In the extreme
case, where all information about the source is lost,
BART is equivalent to a languag...xtreme
case, where all information about the source is lost,
BART is equivalent to a language model. 
[유사도: 0.5143756996743966]
---------------------------------------------------

## 3. Prompt Engineering and Chain Testing

1\) Prompt Engineering

- 모든 답변은 제공된 컨텍스트에만 기반하여 작성되도록 함
- 외부 지식이나 사전 학습된 일반 상식은 사용하지 않도록 함
- 컨텍스트 내 명확한 근거가 없을 경우, **답변할 수 없음**으로 응답하도록 함

In [13]:
from langchain.prompts import ChatPromptTemplate

# Prompt that asks for the user's query to be translated into English.
translate_prompt = ChatPromptTemplate.from_template(
    "Translate the following into English: {query}"
)
# Prompt that restricts the answer to the supplied [Context]; the model is told
# to say it does not know when the context is missing or gives no clear basis.
work_prompt = ChatPromptTemplate.from_template("""
Please answer following these rules:
1. Answer the questions based only on [Context].
2. If there is no [Context], answer that you don't know.
3. Do not use external knowledge.
4. If there is no clear basis in [Context], answer that you don't know.
5. You can refer to the previous conversation.

[Context]
{context}

[Question] 
{question}

[Answer]
""")
# Prompt that asks for the produced answer to be translated into Korean.
output_prompt = ChatPromptTemplate.from_template(
    "Translate the following into Korean: {output}"
)

2\) Chain Testing

In [14]:
from langchain_core.runnables import RunnableLambda, RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# Chat model used by the translation and answer chains below.
llm = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0.8,
    top_p=0.7
)
# Parser placed after each llm step in the chains below.
output_parser = StrOutputParser()

def format_docs(docs):
    """Join the reference documents into one numbered context string.

    Each document is rendered as ``"<index>: \\n<page_content>"`` with a
    0-based index, and entries are separated by a blank line.
    """
    numbered = (
        f"{position}: \n{doc.page_content}"
        for position, doc in enumerate(docs)
    )
    return "\n\n".join(numbered)

def format_result(answer):
    """Build the final reply: the answer text followed by a ``[Context]`` section.

    ``answer`` is a mapping with the keys ``'output'`` and ``'context'``.
    """
    return "{output}\n\n[Context]\n{context}".format_map(answer)

# Build the chains.
# translate_chain: user query -> English question (string)
translate_chain = translate_prompt | llm | output_parser
# rag_chain: question -> retrieved documents -> single formatted context string
rag_chain = chroma_mmr | RunnableLambda(format_docs)
# output_chain: answer from the context, then translate that answer into Korean
output_chain = work_prompt | llm | output_parser | output_prompt | llm | output_parser

main_chain = (
    translate_chain |
    RunnableParallel(
        question=RunnablePassthrough(),
        # Compose the retrieval chain directly instead of calling
        # rag_chain.invoke() inside a lambda, so it runs as a regular step
        # of the pipeline.
        context=rag_chain,
    ) |
    RunnableParallel(
        # Carry the context forward so format_result can append it to the reply.
        context=lambda x: x['context'],
        output=output_chain
    ) | RunnableLambda(format_result)
)

In [15]:
# Chain test: a question about the indexed paper.
query = "BART의 강점이 모야?"
answer = main_chain.invoke({"query": query})
print(f"쿼리: {query}", "답변:", answer, sep="\n")

쿼리: BART의 강점이 모야?
답변:
[Context]를 바탕으로, BART의 강점은 다음과 같습니다:

- 모든 ROUGE 지표에서 약 6.0점 가량 이전 BERT 기반 연구를 크게 능가하여 텍스트 생성 작업에서 뛰어난 성능을 보입니다 (Context 0).
- 고품질의 샘플 출력을 생성합니다 (Context 0).
- CONVAI2 데이터셋의 자동 평가 지표에서 이전 연구들을 능가하며 대화 응답 생성에서 우수한 성과를 보입니다 (Context 0).
- BERT와 GPT 사전학습 방식을 모두 일반화한 Transformer 기반 신경망 기계번역 아키텍처를 사용하여 손상 및 재구성(corruption and reconstruction) 접근법으로 학습됩니다 (Context 1).
- 판별 작업에서 RoBERTa 및 XLNet과 비슷한 성능을 보여, 단방향 디코더 레이어가 이러한 작업에서 성능 저하를 일으키지 않음을 증명합니다 (Context 2).

[Context]
0: 
BART outperforms the
best previous work, which leverages BERT, by roughly
6.0 points on all ROUGE metrics—representing a sig-
niﬁcant advance in performance on this problem. Qual-
itatively, sample quality is high (see §6). Dialogue We evaluate dialogue response generation
on C ONVAI2 (Dinan et al., 2019), in which agents
must generate responses conditioned on both the pre-
vious context and a textually-speciﬁed persona. BART
outperforms previous work on two automated metrics.

1: 
BART is trained 

In [16]:
# Chain test: a question unrelated to the indexed paper.
query = "LangChain이 뭐야?"
answer = main_chain.invoke({"query": query})
print(f"쿼리: {query}", "답변:", answer, sep="\n")

쿼리: LangChain이 뭐야?
답변:
모르겠어요.

[Context]
0: 
. arXiv preprint arXiv:1907.10529, 2019. Guillaume Lample and Alexis Conneau. Cross-
lingual language model pretraining. arXiv preprint
arXiv:1901.07291, 2019. Zhenzhong Lan, Mingda Chen, Sebastian Goodman,
Kevin Gimpel, Piyush Sharma, and Radu Sori-
cut. Albert: A lite bert for self-supervised learn-
ing of language representations.

1: 
. Associa-
tion for Computational Linguistics. doi: 10.18653/
v1/N19-1423. URL https://www.aclweb. org/anthology/N19-1423. Emily Dinan, Varvara Logacheva, Valentin Malykh,
Alexander Miller, Kurt Shuster, Jack Urbanek,
Douwe Kiela, Arthur Szlam, Iulian Serban, Ryan
Lowe, et al. The second conversational in-
telligence challenge (convai2).

2: 
. This approach is related to the CLS
token in BERT; however we add the additional token
to the end so that representation for the token in the
decoder can attend to decoder states from the complete
input (Figure 3a). 3.2 Token Classiﬁcation Tasks
For token classiﬁcati

## 4. Chat Interface Implementation

- `Gradio`를 활용하여 Chat Interface를 구현함
- 위에서 테스트한 내용을 기능별로 정리하여 구현함

In [1]:
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from langchain_core.messages import HumanMessage, AIMessage
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableLambda, RunnableMap, RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser

1\) 벡터 저장소 설정

- `OpenAI`의 **text-embedding-3-small** 임베딩 모델과 **Chroma** 벡터 저장소를 사용함 

In [3]:
from langchain_openai import OpenAIEmbeddings

# Create the OpenAIEmbeddings model
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-small",  # name of the model to use
    dimensions=1024
)

In [None]:
from langchain_chroma import Chroma

# Same collection name and persist directory as the indexing section above,
# with embeddings_model as the embedding function.
chroma_db = Chroma(
    collection_name="my_task02",
    embedding_function=embeddings_model,
    persist_directory="./chroma_db",
)

2\) 검색기 정의

- MMR 검색으로 상위 3개 문서를 검색하는 Retriever를 사용함

In [5]:
def format_docs(docs):
    """Join the reference documents into one numbered context string.

    Each document is rendered as ``"<index>: \\n<page_content>"`` with a
    0-based index, and entries are separated by a blank line.
    """
    entries = []
    for position, doc in enumerate(docs):
        entries.append(f"{position}: \n{doc.page_content}")
    return "\n\n".join(entries)

In [6]:
def get_context_chain(question):
    """Retrieve context for ``question`` and return it as one formatted string.

    Builds an MMR retriever over ``chroma_db`` (k=3, fetch_k=8,
    lambda_mult=0.3), pipes its results through ``format_docs`` and invokes
    the resulting chain with ``question``.
    """
    mmr_settings = {'k': 3, 'fetch_k': 8, 'lambda_mult': 0.3}
    retriever = chroma_db.as_retriever(
        search_type='mmr',
        search_kwargs=mmr_settings,
    )
    return (retriever | RunnableLambda(format_docs)).invoke(question)

3\) RAG 프롬프트 구성

In [7]:
def to_english_chain(model):
    """Build a chain that prompts ``model`` to translate ``{query}`` into English.

    The chain is prompt -> model -> ``StrOutputParser``.
    """
    template = "Translate the following into English: {query}"
    return ChatPromptTemplate.from_template(template) | model | StrOutputParser()

def to_korean_chain(model):
    """Build a chain that prompts ``model`` to translate ``{query}`` into Korean.

    The chain is prompt -> model -> ``StrOutputParser``.
    """
    template = "Translate the following into Korean: {query}"
    return ChatPromptTemplate.from_template(template) | model | StrOutputParser()

def get_anwser_chain(model):
    """Build the context-grounded answering chain for ``model``.

    The prompt consists of the ``chat_history`` messages followed by one human
    message holding the answering rules, ``{context}`` and ``{question}``.
    The chain is prompt -> model -> ``StrOutputParser``.
    """
    human_template = """
Please answer following these rules:
1. Answer the questions based only on [Context].
2. If there is no [Context], answer that you don't know.
3. Do not use external knowledge.
4. If there is no clear basis in [Context], answer that you don't know.
5. You can refer to the previous conversation.

[Context]
{context}

[Question] 
{question}

[Answer]
"""
    messages = [
        MessagesPlaceholder("chat_history"),
        ("human", human_template),
    ]
    return ChatPromptTemplate.from_messages(messages) | model | StrOutputParser()

4\) RAG 체인 구성

- 대화 히스토리는 영문으로 작성된 내용만 저장 및 활용함

In [8]:
from typing import Iterator
from operator import itemgetter

In [9]:
from langchain_openai import ChatOpenAI

# Chat model handed to the translation and answer chain builders.
model = ChatOpenAI(
    model="gpt-4.1-mini",
    temperature=0.8,
    top_p=0.7
)
# Module-level conversation history: update_memory appends to it and
# get_streaming_response supplies it to the prompt as chat_history.
memory_store = []

In [10]:
def update_memory(x):
    """Record the question/answer pair in ``memory_store`` and build the reply.

    Appends ``x["question"]`` as a ``HumanMessage`` and ``x["answer"]`` as an
    ``AIMessage``, then returns ``x["korean_answer"]`` followed by a
    ``[Context]`` section containing ``x["context"]``.
    """
    for message_type, key in ((HumanMessage, "question"), (AIMessage, "answer")):
        memory_store.append(message_type(content=x[key]))
    return "{korean_answer}\n\n[Context]\n{context}".format_map(x)

In [11]:
def get_streaming_response(message: str, history) -> str:
    """Answer one chat message: translate, retrieve, answer, translate back.

    Despite the name, the reply is produced by a single blocking ``invoke``
    call and returned as one string; nothing is streamed.

    Args:
        message: The user's message; the first step translates it to English.
        history: Chat history supplied by the caller. Unused here - the
            conversation is tracked in the module-level ``memory_store``.

    Returns:
        The string built by ``update_memory``: the Korean answer followed by
        a ``[Context]`` section.
    """
    translate_chain = to_english_chain(model)
    korean_chain = to_korean_chain(model)
    answer_chain = get_anwser_chain(model)

    full_chain = (
        translate_chain |
        RunnableMap({
            "question": RunnablePassthrough(),  # English question
            "context": get_context_chain,  # plain function; no lambda wrapper needed
            "chat_history": RunnableLambda(lambda _: memory_store)
        }) |
        RunnableMap({
            "question": itemgetter("question"),
            "context": itemgetter("context"),
            # Stored under "query" because the Korean translation prompt reads {query}.
            "query": answer_chain
        }) |
        RunnableMap({
            "question": itemgetter("question"),
            "context": itemgetter("context"),
            "answer": itemgetter("query"),  # English answer, kept for the history
            "korean_answer": korean_chain
        }) |
        RunnableLambda(update_memory)
    )
    return full_chain.invoke(message)

5\) Gradio 스트리밍 구현

In [12]:
import gradio as gr

# Configure the Gradio chat interface
demo = gr.ChatInterface(
    fn=get_streaming_response,         # message handler
    title="BART에 대해", # title of the chat interface
    type="messages"
)

# Launch
demo.launch()

  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




In [13]:
# Shut down the running demo
demo.close()

Closing server running on port: 7860


In [14]:
memory_store

[HumanMessage(content='What is it that makes BART better than BERT?', additional_kwargs={}, response_metadata={}),
 AIMessage(content="Based on the [Context], BART generalizes BERT by using a bidirectional encoder like BERT and a left-to-right decoder like GPT, combining these features in a Transformer-based sequence-to-sequence architecture. Additionally, BART's decoder layers perform cross-attention over the encoder's final hidden layer, which BERT does not have. BART also contains roughly 10% more parameters than an equivalently sized BERT model. These architectural differences, along with its training method of corrupting and reconstructing text, contribute to BART achieving better performance on various tasks compared to BERT.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Explain the strengths of BART.', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Based on the [Context], the strengths of BART include:\n\n- It significantly outperforms p