# LLM 지식 탐색왕 만들기

## 1. Setting

In [None]:
from dotenv import load_dotenv
load_dotenv() # 환경 변수 로드

True

## 2. Document Indexing

### 1\) Load Document
- **BART** 논문을 로드함

In [3]:
# 논문 URL 리스트
urls = {
    "Transformer": "https://arxiv.org/pdf/1706.03762", 
    "BERT": "https://arxiv.org/pdf/1810.04805", 
    "XLNet": "https://arxiv.org/pdf/1906.08237", 
    "BART": "https://arxiv.org/pdf/1910.13461.pdf", 
}

In [None]:
import requests
from langchain_community.document_loaders import PyPDFLoader

documents = []
for source, url in urls.items():
    file_name = f"data/{source}.pdf"
    # PDF 다운로드
    with open(file_name, "wb") as f:
        f.write(requests.get(url).content)

    # 로컬에서 PDF 로드
    loader = PyPDFLoader(file_name)
    docs = loader.load()
    print(f'PDF 문서 개수: {len(docs)}')

    documents += docs

print(f'총 문서 개수: {len(documents)}')

PDF 문서 개수: 15
PDF 문서 개수: 16
PDF 문서 개수: 18
PDF 문서 개수: 10
총 문서 개수: 59


### 2\) Split Texts

- **Semantic Chunking** 방식으로 텍스트를 분할함<br>
    → 임베딩 벡터 간의 **기울기(gradient)** 변화를 기준으로 의미 단위(semantic unit)를 구분함<br>
    → 청크 길이에 일관성이 없으며, 문맥에 따라 길이가 유동적으로 결정됨

- 길이가 100자 미만인 청크는 이미지 기반 텍스트(OCR 등)로 간주하여 제거함<br>
    → 주요 텍스트가 아닌 부가 정보일 가능성이 높기 때문임

- 1차 분할된 청크는 길이 편차가 크므로, 문자열 길이 기준으로 재귀적으로 분할하여 최종적으로는 일관된 길이의 청크를 구성함

In [5]:
from langchain_experimental.text_splitter import SemanticChunker 
from langchain_openai.embeddings import OpenAIEmbeddings

text_splitter = SemanticChunker(
    embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
    breakpoint_threshold_type="gradient",  # 임계값 타입 설정 (gradient, percentile, standard_deviation, interquartile)
)
chunks = text_splitter.split_documents(documents)
print(f"생성된 청크 수: {len(chunks)}")
print(f"각 청크의 길이: {list(len(chunk.page_content) for chunk in chunks)}")

생성된 청크 수: 178
각 청크의 길이: [1741, 1117, 294, 137, 3824, 1783, 42, 184, 2320, 531, 528, 2127, 772, 853, 1852, 237, 227, 2856, 820, 2372, 52, 2920, 158, 2950, 2, 108, 94, 2969, 38, 2, 71, 64, 2813, 262, 693, 118, 812, 2, 133, 684, 359, 3594, 233, 3729, 562, 30, 194, 3473, 238, 3405, 1204, 275, 2495, 1070, 517, 3860, 68, 2127, 1575, 10, 288, 4172, 17, 127, 2502, 1479, 390, 180, 2076, 504, 132, 184, 1076, 732, 791, 140, 25, 1941, 113, 1027, 160, 147, 1371, 2217, 39, 125, 3535, 3344, 525, 199, 25, 1724, 627, 13, 2588, 517, 166, 2999, 256, 952, 171, 3052, 140, 1214, 2956, 3693, 1, 567, 871, 2728, 194, 2681, 1437, 2324, 1181, 1, 448, 2609, 330, 3321, 1, 108, 2343, 176, 1314, 2, 72, 611, 2429, 2, 298, 2497, 197, 2500, 344, 2751, 2608, 56, 2, 1632, 46, 1270, 1243, 380, 1973, 2279, 3429, 108, 511, 7, 8, 34, 4303, 3184, 1086, 133, 770, 108, 3245, 2897, 568, 481, 257, 1287, 723, 2615, 1152, 1516, 1298, 489, 750, 1123, 771, 1099, 662, 127, 156, 1796]


In [6]:
# 불필요한 청크 제거
selected_chunks = []
for idx, chunk in enumerate(chunks):
    content = chunk.page_content
    if len(chunk.page_content) < 100:
        print(f'{idx}: {content}')
    else:
        selected_chunks.append(chunk)

print(f"생성된 청크 수: {len(selected_chunks)}")
print(f"각 청크의 길이: {list(len(chunk.page_content) for chunk in selected_chunks)}")

6: The output is computed as a weighted sum
3
20: Table 3: Variations on the Transformer architecture.
24: 10
26: Learning phrase representations using rnn encoder-decoder for statistical
machine translation.
28: arXiv preprint arXiv:1508.04025, 2015.
29: 11
30: [25] Mitchell P Marcus, Mary Ann Marcinkiewicz, and Beatrice Santorini.
31: Building a large annotated
corpus of english: The penn treebank.
37: 14
45: BERT BERT
E[CLS] E1  E[SEP]...
56: (2018), but the system
has improved substantially after publication.
59: Additional
62: (2018b) presented
76: Curran As-
sociates, Inc.
84: BERT (Ours)
Trm Trm Trm
Trm Trm Trm
...
90: BERT
E[CLS] E1  E[SEP]...
93: jority class.
106: 4
115: 7
120: 9
125: 10
126: [26] Robert Parker, David Graff, Junbo Kong, Ke Chen, and Kazuaki Maeda.
129: 11
137: Rows and columns represent query and key respec-
tively.
138: 15
140: 5https://openreview.net/forum?id=HJePno0cYm
16
149: A B C .
150: D E .A .
151: C . E . A _ . D _ E . A _C . _ E .
생성된 청크 수: 149
각 청크

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 청크 일정한 사이즈로 분할
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,                      
    chunk_overlap=0,
    encoding_name="cl100k_base",  # TikToken 인코더 이름
    separators=[" \n", ".\n", r"(?<=[.!?])\s+"], # 구분자
    is_separator_regex=True,      # 구분자가 정규식인지 여부
    keep_separator=True,          # 구분자 유지 여부
)
final_chunks = text_splitter.split_documents(selected_chunks)
print(f"생성된 텍스트 청크 수: {len(final_chunks)}")
print(f"각 청크의 길이: {list(len(chunk.page_content) for chunk in final_chunks)}")

생성된 텍스트 청크 수: 285
각 청크의 길이: [985, 756, 1117, 294, 137, 1500, 1411, 913, 1223, 560, 184, 1242, 1078, 531, 528, 1181, 946, 772, 853, 1330, 522, 237, 227, 1297, 1032, 527, 604, 216, 1113, 1166, 93, 614, 937, 1275, 94, 158, 823, 1388, 739, 108, 830, 891, 926, 322, 937, 828, 879, 169, 262, 693, 118, 476, 336, 133, 509, 175, 359, 1052, 1062, 1248, 232, 233, 1203, 1030, 1085, 411, 562, 194, 159, 1063, 1153, 985, 111, 238, 1124, 1187, 1003, 91, 1043, 160, 275, 1234, 1122, 139, 1006, 64, 414, 103, 1040, 1014, 1011, 794, 637, 786, 704, 1007, 568, 288, 1068, 1060, 1142, 902, 127, 1204, 718, 580, 1180, 299, 390, 180, 1027, 1049, 504, 132, 184, 1008, 68, 732, 791, 140, 983, 958, 113, 928, 99, 160, 147, 973, 398, 1178, 1039, 125, 1148, 1040, 1058, 289, 1024, 1021, 1131, 168, 525, 199, 465, 1112, 145, 627, 1058, 945, 585, 517, 166, 1159, 1131, 709, 256, 952, 171, 1437, 1224, 390, 140, 772, 442, 1228, 1042, 686, 694, 1084, 1085, 829, 567, 871, 1128, 1216, 384, 194, 1170, 1111, 400, 855, 582, 995, 794,

In [7]:
from pprint import pprint

pprint(final_chunks[2].metadata)
pprint(final_chunks[2].page_content)

{'author': '',
 'creationdate': '2024-04-10T21:11:43+00:00',
 'creator': 'LaTeX with hyperref',
 'keywords': '',
 'moddate': '2024-04-10T21:11:43+00:00',
 'page': 0,
 'page_label': '1',
 'producer': 'pdfTeX-1.40.25',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live '
                    '2023) kpathsea version 6.3.5',
 'source': 'data/Transformer.pdf',
 'subject': '',
 'title': '',
 'total_pages': 15,
 'trapped': '/False'}
('∗Equal contribution. Listing order is random. Jakob proposed replacing RNNs '
 'with self-attention and started\n'
 'the effort to evaluate this idea. Ashish, with Illia, designed and '
 'implemented the first Transformer models and\n'
 'has been crucially involved in every aspect of this work. Noam proposed '
 'scaled dot-product attention, multi-head\n'
 'attention and the parameter-free position representation and became the '
 'other person involved in nearly every\n'
 'detail. Niki designed, implemented, tuned and evaluated countl

### 3\) Embedding
- 문서 임베딩은 `OpenAI`의 **text-embedding-3-small** 모델을 사용함

In [8]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    dimensions=1024
)
document_embeddings = embeddings_model.embed_documents(
    [chunk.page_content for chunk in final_chunks]
)
print(f"임베딩 벡터의 개수: {len(document_embeddings)}")
print(f"임베딩 벡터의 차원: {len(document_embeddings[0])}")

임베딩 벡터의 개수: 285
임베딩 벡터의 차원: 1024


### 4\) Save Vectors

- 임베딩된 벡터는 벡터스토어로 `ChromaDB` 사용하여 저장함

In [35]:
from langchain_chroma import Chroma

chroma_db = Chroma(
    collection_name="papers",
    embedding_function=embeddings_model,
    persist_directory="./chroma_db",
    collection_metadata = {'hnsw:space': 'cosine'},
)

# 문서를 벡터 저장소에 저장
doc_ids = [f"DOC_{i}" for i in range(len(final_chunks))]
chroma_db.delete(ids=doc_ids)
added_doc_ids = chroma_db.add_documents(documents=final_chunks, ids=doc_ids)
print(f"{len(added_doc_ids)}개의 문서가 성공적으로 벡터 저장소에 추가되었습니다.")

285개의 문서가 성공적으로 벡터 저장소에 추가되었습니다.


## 3. Retrieval

### 1\) Evaluation Dataset

In [9]:
from ragas.testset.persona import Persona

# 페르소나 정의 (다양한 관점에서 질문 생성)
personas = [
    Persona(
        name="graduate_researcher",  # 박사과정 연구원: 심도 있는 분석적 질문
        role_description="As a PhD researcher studying LLM, I am delving deep into the LLM structure, pre-training and fine-tuning objectives.",
    ),
    Persona(
        name="junior_developer",   # 주니어 개발자: 실무 중심적 질문
        role_description="As a junior developer developing LLM services at a platform company, I am very interested in practical technical information and values.",
    ),
    Persona(
        name="undergraduate_student",  # 학부생: 기초적인 학습 질문
        role_description="As an undergraduate student majoring in AI, I would like to acquire basic knowledge about LLM technologies and trends.",
    )
]

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings

# LLM과 임베딩 모델 초기화
generator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1"))
generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model="text-embedding-3-small"))

In [None]:
from ragas.testset import TestsetGenerator

# 합성 데이터 생성
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings, persona_list=personas)
dataset = generator.generate_with_langchain_docs(final_chunks, testset_size=20) # 47m 6.8s

Applying CustomNodeFilter:   0%|          | 0/285 [00:00<?, ?it/s]          Node 37881f86-c201-487c-b464-379584623a32 does not have a summary. Skipping filtering.
Node ec12dfdc-8e3f-450b-be8a-078ebb145721 does not have a summary. Skipping filtering.
Node f173bba4-b7f4-4e20-9bc9-975173558d6b does not have a summary. Skipping filtering.
Applying CustomNodeFilter:   1%|▏         | 4/285 [00:00<00:50,  5.60it/s]Node 3b33bd44-f3a8-4121-8204-e6c50e2c0997 does not have a summary. Skipping filtering.
Node 2a5acefd-14e6-4508-9e8f-a9dec394ecc7 does not have a summary. Skipping filtering.
Node 0b951e51-7eb4-4101-bda1-c05e6c9f768f does not have a summary. Skipping filtering.
Node 22be5b0c-3716-4f0e-b3e3-59a473f4d928 does not have a summary. Skipping filtering.
Applying CustomNodeFilter:   6%|▌         | 17/285 [00:00<00:10, 25.97it/s]Node 146c8e0a-4077-416b-8a10-36a4de365443 does not have a summary. Skipping filtering.
Node 8bf0096c-ab34-4683-b894-2b34763fe979 does not have a summary. Skipping fil

In [15]:
# 저장
test_data = dataset.to_pandas()
test_data.to_excel("data/test_data.xlsx", index=False)

In [None]:
import pandas as pd

# 로드
test_data = pd.read_excel("data/test_data.xlsx")
test_data.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Who is Niki Parmar in the context of the Trans...,"['Provided proper attribution is provided, Goo...",Niki Parmar is listed as an author of the pape...,single_hop_specifc_query_synthesizer
1,How is the Transformer model applied to Englis...,['s\nentirely. Experiments on two machine tran...,The Transformer model is applied successfully ...,single_hop_specifc_query_synthesizer
2,Whaat is a Tranformer and whoo were the main p...,['∗Equal contribution. Listing order is random...,The Transformer was first designed and impleme...,single_hop_specifc_query_synthesizer
3,What are gated recurrent neural networks used ...,"['1 Introduction\nRecurrent neural networks, l...",Gated recurrent neural networks have been firm...,single_hop_specifc_query_synthesizer
4,what p100 gpus do in transformer training? i d...,['Recurrent models typically factor computatio...,The Transformer model can be trained to reach ...,single_hop_specifc_query_synthesizer


In [11]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# 벡터스토어 로드
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    dimensions=1024
)
chroma_db = Chroma(
    collection_name="papers",
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

# 벡터 검색기 생성
retriever = chroma_db.as_retriever(search_kwargs={"k": 5})

In [26]:
# 검색된 문서
test_data['retrieved_contexts'] = test_data.user_input.apply(
    lambda query: [doc.page_content for doc in retriever.invoke(query)]
)
test_data.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name,retrieved_contexts,response
0,Who is Niki Parmar in the context of the Trans...,"['Provided proper attribution is provided, Goo...",Niki Parmar is listed as an author of the pape...,single_hop_specifc_query_synthesizer,[∗Equal contribution. Listing order is random....,Niki Parmar is one of the contributors to the ...
1,How is the Transformer model applied to Englis...,['s\nentirely. Experiments on two machine tran...,The Transformer model is applied successfully ...,single_hop_specifc_query_synthesizer,[Table 4: The Transformer generalizes well to ...,The Transformer model is applied to English co...
2,Whaat is a Tranformer and whoo were the main p...,['∗Equal contribution. Listing order is random...,The Transformer was first designed and impleme...,single_hop_specifc_query_synthesizer,[d\nlanguage modeling tasks [34]. To the best ...,The Transformer is a neural network architectu...
3,What are gated recurrent neural networks used ...,"['1 Introduction\nRecurrent neural networks, l...",Gated recurrent neural networks have been firm...,single_hop_specifc_query_synthesizer,"[1 Introduction\nRecurrent neural networks, lo...",Gated recurrent neural networks are used as st...
4,what p100 gpus do in transformer training? i d...,['Recurrent models typically factor computatio...,The Transformer model can be trained to reach ...,single_hop_specifc_query_synthesizer,[. The\ntraining objective is to reconstruct ¯...,"Based on the provided context, the role of GPU..."


In [27]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# 템플릿 생성
template = """Answer the question based only on the following context:

[Context]
{context}

[Question]
{query}

[Answer]
"""
prompt = ChatPromptTemplate.from_template(template)

# RAG 체인 구성
llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0)
qa_chain = prompt | llm | StrOutputParser()

# RAG 체인 생성 답변
test_data['response'] = test_data.apply(
    lambda row: qa_chain.invoke({
        "context": "\n".join(row.retrieved_contexts), 
        "query": row.user_input
    }),
    axis=1
)
test_data.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name,retrieved_contexts,response
0,Who is Niki Parmar in the context of the Trans...,"['Provided proper attribution is provided, Goo...",Niki Parmar is listed as an author of the pape...,single_hop_specifc_query_synthesizer,[∗Equal contribution. Listing order is random....,Niki Parmar is one of the contributors to the ...
1,How is the Transformer model applied to Englis...,['s\nentirely. Experiments on two machine tran...,The Transformer model is applied successfully ...,single_hop_specifc_query_synthesizer,[Table 4: The Transformer generalizes well to ...,The Transformer model is applied to English co...
2,Whaat is a Tranformer and whoo were the main p...,['∗Equal contribution. Listing order is random...,The Transformer was first designed and impleme...,single_hop_specifc_query_synthesizer,[d\nlanguage modeling tasks [34]. To the best ...,The Transformer is a neural network architectu...
3,What are gated recurrent neural networks used ...,"['1 Introduction\nRecurrent neural networks, l...",Gated recurrent neural networks have been firm...,single_hop_specifc_query_synthesizer,"[1 Introduction\nRecurrent neural networks, lo...",Gated recurrent neural networks are used as st...
4,what p100 gpus do in transformer training? i d...,['Recurrent models typically factor computatio...,The Transformer model can be trained to reach ...,single_hop_specifc_query_synthesizer,[. The\ntraining objective is to reconstruct ¯...,"Based on the provided context, the role of GPU..."


In [None]:
from ragas import EvaluationDataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

# 평가셋 처리
evaluation_dataset = EvaluationDataset.from_pandas(
    test_data[['user_input', 'retrieved_contexts', 'response', 'reference']]
)

# LLM 래퍼 생성
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1", temperature=0))

# 생성 평가 (기본 베이스 라인)
result = evaluate(
    dataset=evaluation_dataset,   # 평가 데이터셋
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],   # 평가 메트릭
    llm=evaluator_llm,   # LLM 래퍼
)
result

Evaluating: 100%|██████████| 63/63 [01:07<00:00,  1.07s/it]


{'context_recall': 0.5660, 'faithfulness': 0.7819, 'factual_correctness(mode=f1)': 0.3619}

In [29]:
## 결과 저장
result.to_pandas().to_excel('data/evaluation_dataset.xlsx', index=False)

### 2\) Retrieval Test

In [32]:
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

# Semantic 검색기 생성
chroma_k_retriever = chroma_db.as_retriever(
    search_kwargs={"k": 5},
)

# BM25 검색기 생성
bm25_retriever = BM25Retriever.from_documents(final_chunks)

# 앙상블 검색기 생성
ensemble_retriever = EnsembleRetriever(
    retrievers=[chroma_k_retriever, bm25_retriever], 
    weights=[0.7, 0.3]          # 각 검색기의 가중치
)

In [41]:
# 검색기 실행
test_data['bm25_contexts'] = test_data.user_input.apply(
    lambda query: [doc.page_content for doc in bm25_retriever.invoke(query)]
)
test_data['ensemble_contexts'] = test_data.user_input.apply(
    lambda query: [doc.page_content for doc in ensemble_retriever.invoke(query)]
)
test_data.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name,retrieved_contexts,response,hyde_query,bm25_contexts,ensemble_contexts
0,Who is Niki Parmar in the context of the Trans...,"['Provided proper attribution is provided, Goo...",Niki Parmar is listed as an author of the pape...,single_hop_specifc_query_synthesizer,[∗Equal contribution. Listing order is random....,Niki Parmar is one of the contributors to the ...,Niki Parmar is one of the co-authors of the se...,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....
1,How is the Transformer model applied to Englis...,['s\nentirely. Experiments on two machine tran...,The Transformer model is applied successfully ...,single_hop_specifc_query_synthesizer,[Table 4: The Transformer generalizes well to ...,The Transformer model is applied to English co...,**Application of the Transformer Model to Engl...,[Table 4: The Transformer generalizes well to ...,[Table 4: The Transformer generalizes well to ...
2,Whaat is a Tranformer and whoo were the main p...,['∗Equal contribution. Listing order is random...,The Transformer was first designed and impleme...,single_hop_specifc_query_synthesizer,[d\nlanguage modeling tasks [34]. To the best ...,The Transformer is a neural network architectu...,**Transformer: Definition and Key Contributors...,"[n\nimprovement in our setting. Hence, we excl...",[d\nlanguage modeling tasks [34]. To the best ...
3,What are gated recurrent neural networks used ...,"['1 Introduction\nRecurrent neural networks, l...",Gated recurrent neural networks have been firm...,single_hop_specifc_query_synthesizer,"[1 Introduction\nRecurrent neural networks, lo...",Gated recurrent neural networks are used as st...,**Gated Recurrent Neural Networks in Sequence ...,"[1 Introduction\nRecurrent neural networks, lo...","[1 Introduction\nRecurrent neural networks, lo..."
4,what p100 gpus do in transformer training? i d...,['Recurrent models typically factor computatio...,The Transformer model can be trained to reach ...,single_hop_specifc_query_synthesizer,[. The\ntraining objective is to reconstruct ¯...,"Based on the provided context, the role of GPU...",**The Role of NVIDIA P100 GPUs in Transformer ...,[output values. These are concatenated and onc...,[. The\ntraining objective is to reconstruct ¯...


In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# 문서 생성 체인 생성 (HyDE 기법)
template = """Please create the ideal document content for the given question.
The document should be written in an academic and professional tone.

Question: {question}

Document content:"""
hyde_prompt = ChatPromptTemplate.from_template(template)
hyde_llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0)
hyde_chain = hyde_prompt | hyde_llm | StrOutputParser()

In [None]:
# HyDE 기법 적용 쿼리 생성
test_data['hyde_query'] = test_data.user_input.apply(
    lambda query: hyde_chain.invoke({"question": query})
)
test_data.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name,retrieved_contexts,response,hyde_query
0,Who is Niki Parmar in the context of the Trans...,"['Provided proper attribution is provided, Goo...",Niki Parmar is listed as an author of the pape...,single_hop_specifc_query_synthesizer,[∗Equal contribution. Listing order is random....,Niki Parmar is one of the contributors to the ...,Niki Parmar is one of the co-authors of the se...
1,How is the Transformer model applied to Englis...,['s\nentirely. Experiments on two machine tran...,The Transformer model is applied successfully ...,single_hop_specifc_query_synthesizer,[Table 4: The Transformer generalizes well to ...,The Transformer model is applied to English co...,**Application of the Transformer Model to Engl...
2,Whaat is a Tranformer and whoo were the main p...,['∗Equal contribution. Listing order is random...,The Transformer was first designed and impleme...,single_hop_specifc_query_synthesizer,[d\nlanguage modeling tasks [34]. To the best ...,The Transformer is a neural network architectu...,**Transformer: Definition and Key Contributors...
3,What are gated recurrent neural networks used ...,"['1 Introduction\nRecurrent neural networks, l...",Gated recurrent neural networks have been firm...,single_hop_specifc_query_synthesizer,"[1 Introduction\nRecurrent neural networks, lo...",Gated recurrent neural networks are used as st...,**Gated Recurrent Neural Networks in Sequence ...
4,what p100 gpus do in transformer training? i d...,['Recurrent models typically factor computatio...,The Transformer model can be trained to reach ...,single_hop_specifc_query_synthesizer,[. The\ntraining objective is to reconstruct ¯...,"Based on the provided context, the role of GPU...",**The Role of NVIDIA P100 GPUs in Transformer ...


In [42]:
# HyDE 기법 적용 쿼리 + 앙상블 검색
test_data['hyde_ensemble_contexts'] = test_data.hyde_query.apply(
    lambda query: [doc.page_content for doc in ensemble_retriever.invoke(query)]
)
test_data.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name,retrieved_contexts,response,hyde_query,bm25_contexts,ensemble_contexts,hyde_ensemble_contexts
0,Who is Niki Parmar in the context of the Trans...,"['Provided proper attribution is provided, Goo...",Niki Parmar is listed as an author of the pape...,single_hop_specifc_query_synthesizer,[∗Equal contribution. Listing order is random....,Niki Parmar is one of the contributors to the ...,Niki Parmar is one of the co-authors of the se...,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....
1,How is the Transformer model applied to Englis...,['s\nentirely. Experiments on two machine tran...,The Transformer model is applied successfully ...,single_hop_specifc_query_synthesizer,[Table 4: The Transformer generalizes well to ...,The Transformer model is applied to English co...,**Application of the Transformer Model to Engl...,[Table 4: The Transformer generalizes well to ...,[Table 4: The Transformer generalizes well to ...,"[,\nbigger models are better, and dropout is v..."
2,Whaat is a Tranformer and whoo were the main p...,['∗Equal contribution. Listing order is random...,The Transformer was first designed and impleme...,single_hop_specifc_query_synthesizer,[d\nlanguage modeling tasks [34]. To the best ...,The Transformer is a neural network architectu...,**Transformer: Definition and Key Contributors...,"[n\nimprovement in our setting. Hence, we excl...",[d\nlanguage modeling tasks [34]. To the best ...,"[Provided proper attribution is provided, Goog..."
3,What are gated recurrent neural networks used ...,"['1 Introduction\nRecurrent neural networks, l...",Gated recurrent neural networks have been firm...,single_hop_specifc_query_synthesizer,"[1 Introduction\nRecurrent neural networks, lo...",Gated recurrent neural networks are used as st...,**Gated Recurrent Neural Networks in Sequence ...,"[1 Introduction\nRecurrent neural networks, lo...","[1 Introduction\nRecurrent neural networks, lo...",[Recurrent models typically factor computation...
4,what p100 gpus do in transformer training? i d...,['Recurrent models typically factor computatio...,The Transformer model can be trained to reach ...,single_hop_specifc_query_synthesizer,[. The\ntraining objective is to reconstruct ¯...,"Based on the provided context, the role of GPU...",**The Role of NVIDIA P100 GPUs in Transformer ...,[output values. These are concatenated and onc...,[. The\ntraining objective is to reconstruct ¯...,[Recurrent models typically factor computation...


In [62]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# CrossEncoderReranker 모델 초기화 
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-v2-m3")
re_ranker = CrossEncoderReranker(model=model, top_n=3)

# 앙상블 검색기 + CrossEncoderReranker 문서 검색
cross_encoder_reranker_retriever = ContextualCompressionRetriever(
    base_compressor=re_ranker, 
    base_retriever=ensemble_retriever,
)
test_data['ensemble_reranker_contexts'] = test_data.user_input.apply(
    lambda query: [doc.page_content for doc in cross_encoder_reranker_retriever.invoke(query)]
)
test_data.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name,retrieved_contexts,response,hyde_query,bm25_contexts,ensemble_contexts,hyde_ensemble_contexts,total_mix_contexts,ensemble_reranker_contexts
0,Who is Niki Parmar in the context of the Trans...,"['Provided proper attribution is provided, Goo...",Niki Parmar is listed as an author of the pape...,single_hop_specifc_query_synthesizer,[∗Equal contribution. Listing order is random....,Niki Parmar is one of the contributors to the ...,Niki Parmar is one of the co-authors of the se...,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....,"[Provided proper attribution is provided, Goog...",[∗Equal contribution. Listing order is random....
1,How is the Transformer model applied to Englis...,['s\nentirely. Experiments on two machine tran...,The Transformer model is applied successfully ...,single_hop_specifc_query_synthesizer,[Table 4: The Transformer generalizes well to ...,The Transformer model is applied to English co...,**Application of the Transformer Model to Engl...,[Table 4: The Transformer generalizes well to ...,[Table 4: The Transformer generalizes well to ...,"[,\nbigger models are better, and dropout is v...",[Table 4: The Transformer generalizes well to ...,[s\nentirely. Experiments on two machine trans...
2,Whaat is a Tranformer and whoo were the main p...,['∗Equal contribution. Listing order is random...,The Transformer was first designed and impleme...,single_hop_specifc_query_synthesizer,[d\nlanguage modeling tasks [34]. To the best ...,The Transformer is a neural network architectu...,**Transformer: Definition and Key Contributors...,"[n\nimprovement in our setting. Hence, we excl...",[d\nlanguage modeling tasks [34]. To the best ...,"[Provided proper attribution is provided, Goog...","[Provided proper attribution is provided, Goog...",[d\nlanguage modeling tasks [34]. To the best ...
3,What are gated recurrent neural networks used ...,"['1 Introduction\nRecurrent neural networks, l...",Gated recurrent neural networks have been firm...,single_hop_specifc_query_synthesizer,"[1 Introduction\nRecurrent neural networks, lo...",Gated recurrent neural networks are used as st...,**Gated Recurrent Neural Networks in Sequence ...,"[1 Introduction\nRecurrent neural networks, lo...","[1 Introduction\nRecurrent neural networks, lo...",[Recurrent models typically factor computation...,"[1 Introduction\nRecurrent neural networks, lo...","[1 Introduction\nRecurrent neural networks, lo..."
4,what p100 gpus do in transformer training? i d...,['Recurrent models typically factor computatio...,The Transformer model can be trained to reach ...,single_hop_specifc_query_synthesizer,[. The\ntraining objective is to reconstruct ¯...,"Based on the provided context, the role of GPU...",**The Role of NVIDIA P100 GPUs in Transformer ...,[output values. These are concatenated and onc...,[. The\ntraining objective is to reconstruct ¯...,[Recurrent models typically factor computation...,"[In addition, we apply dropout to the sums of ...",[e\nprevious state-of-the-art model. The Trans...


In [59]:
test_data['total_mix_contexts'] = test_data.hyde_query.apply(
    lambda query: [doc.page_content for doc in cross_encoder_reranker_retriever.invoke(query)]
)
test_data.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name,retrieved_contexts,response,hyde_query,bm25_contexts,ensemble_contexts,hyde_ensemble_contexts,ensemble_cross_contexts,total_mix_contexts
0,Who is Niki Parmar in the context of the Trans...,"['Provided proper attribution is provided, Goo...",Niki Parmar is listed as an author of the pape...,single_hop_specifc_query_synthesizer,[∗Equal contribution. Listing order is random....,Niki Parmar is one of the contributors to the ...,Niki Parmar is one of the co-authors of the se...,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....,"[Provided proper attribution is provided, Goog..."
1,How is the Transformer model applied to Englis...,['s\nentirely. Experiments on two machine tran...,The Transformer model is applied successfully ...,single_hop_specifc_query_synthesizer,[Table 4: The Transformer generalizes well to ...,The Transformer model is applied to English co...,**Application of the Transformer Model to Engl...,[Table 4: The Transformer generalizes well to ...,[Table 4: The Transformer generalizes well to ...,"[,\nbigger models are better, and dropout is v...",[s\nentirely. Experiments on two machine trans...,[Table 4: The Transformer generalizes well to ...
2,Whaat is a Tranformer and whoo were the main p...,['∗Equal contribution. Listing order is random...,The Transformer was first designed and impleme...,single_hop_specifc_query_synthesizer,[d\nlanguage modeling tasks [34]. To the best ...,The Transformer is a neural network architectu...,**Transformer: Definition and Key Contributors...,"[n\nimprovement in our setting. Hence, we excl...",[d\nlanguage modeling tasks [34]. To the best ...,"[Provided proper attribution is provided, Goog...",[d\nlanguage modeling tasks [34]. To the best ...,"[Provided proper attribution is provided, Goog..."
3,What are gated recurrent neural networks used ...,"['1 Introduction\nRecurrent neural networks, l...",Gated recurrent neural networks have been firm...,single_hop_specifc_query_synthesizer,"[1 Introduction\nRecurrent neural networks, lo...",Gated recurrent neural networks are used as st...,**Gated Recurrent Neural Networks in Sequence ...,"[1 Introduction\nRecurrent neural networks, lo...","[1 Introduction\nRecurrent neural networks, lo...",[Recurrent models typically factor computation...,"[1 Introduction\nRecurrent neural networks, lo...","[1 Introduction\nRecurrent neural networks, lo..."
4,what p100 gpus do in transformer training? i d...,['Recurrent models typically factor computatio...,The Transformer model can be trained to reach ...,single_hop_specifc_query_synthesizer,[. The\ntraining objective is to reconstruct ¯...,"Based on the provided context, the role of GPU...",**The Role of NVIDIA P100 GPUs in Transformer ...,[output values. These are concatenated and onc...,[. The\ntraining objective is to reconstruct ¯...,[Recurrent models typically factor computation...,[e\nprevious state-of-the-art model. The Trans...,"[In addition, we apply dropout to the sums of ..."


In [65]:
# 저장
test_data.to_excel('data/retrieval_test.xlsx', index=False)

### 3\) Retrieval Evaluation

In [34]:
from langchain_core.documents import Document

# 평가셋 Document 객체 리스트로 변환
context_docs = test_data.reference_contexts.apply(
    lambda x: [Document(page_content=doc) for doc in eval(x)]
).tolist()
print(f"컨텍스트 문서: {len(context_docs)}개 문서")
print("="*200)
print(context_docs[0])

컨텍스트 문서: 21개 문서
[Document(metadata={}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works. Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solel

In [50]:
from krag.evaluators import RougeOfflineRetrievalEvaluators

def evaluate_qa_test(df_qa_test: pd.Series, max_k=2) -> dict:
    # 문서 Document 객체 리스트로 변환
    qa_test = df_qa_test.apply(
        lambda x: [Document(page_content=doc) for doc in x]
    ).tolist()

    # 평가자 인스턴스 생성
    evaluator = RougeOfflineRetrievalEvaluators(
        actual_docs=context_docs,
        predicted_docs=qa_test, 
        match_method='rouge2',
        threshold=0.8,
    )

    # 평가지표 계산
    result = []
    for k in range(1, max_k+1):
        hit_rate = evaluator.calculate_hit_rate(k=k)['hit_rate']
        mrr = evaluator.calculate_mrr(k=k)['mrr']
        map_score = evaluator.calculate_map(k=k)['map']
        ndcg = evaluator.calculate_ndcg(k=k)['ndcg']

        print(f"K={k}")
        print("-"*200)
        print(f"Hit Rate: {hit_rate:.3f}")
        print(f"MRR: {mrr:.3f}")
        print(f"MAP: {map_score:.3f}")
        print(f"NDCG: {ndcg:.3f}")
        print("="*200)
        print()

        result.append({
            'hit_rate': hit_rate,
            'mrr': mrr,
            'map': map_score,
            'ndcg': ndcg,  
        })
    return pd.DataFrame(result)

In [52]:
_ = evaluate_qa_test(test_data.retrieved_contexts, max_k=5)

K=1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 1.000
MAP: 0.200
NDCG: 1.000

K=2
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 1.000
MAP: 0.400
NDCG: 1.000

K=3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 1.000
MAP: 0.600
NDCG: 1.000

K=4
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.048
MRR: 1.000


In [53]:
_ = evaluate_qa_test(test_data.bm25_contexts, max_k=5)

K=1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.286
MAP: 0.057
NDCG: 1.000

K=2
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.333
MAP: 0.076
NDCG: 0.908

K=3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.365
MAP: 0.105
NDCG: 0.834

K=4
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.389


In [54]:
_ = evaluate_qa_test(test_data.ensemble_contexts, max_k=5)

K=1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 1.000
MAP: 0.200
NDCG: 1.000

K=2
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 1.000
MAP: 0.400
NDCG: 1.000

K=3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 1.000
MAP: 0.600
NDCG: 1.000

K=4
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.143
MRR: 1.000


In [55]:
_ = evaluate_qa_test(test_data.hyde_ensemble_contexts, max_k=5)

K=1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.667
MAP: 0.133
NDCG: 1.000

K=2
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.738
MAP: 0.205
NDCG: 0.924

K=3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.754
MAP: 0.281
NDCG: 0.892

K=4
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.778


In [None]:
_ = evaluate_qa_test(test_data.ensemble_reranker_contexts, max_k=5) # 가장 우수하다고 판단

K=1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.857
MAP: 0.171
NDCG: 1.000

K=2
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.929
MAP: 0.310
NDCG: 0.965

K=3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.929
MAP: 0.446
NDCG: 0.961

K=4
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.929


In [64]:
_ = evaluate_qa_test(test_data.total_mix_contexts, max_k=5)

K=1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.714
MAP: 0.143
NDCG: 1.000

K=2
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.762
MAP: 0.248
NDCG: 0.968

K=3
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.794
MAP: 0.305
NDCG: 0.901

K=4
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Hit Rate: 0.000
MRR: 0.794


### 4\) Generation Evaluation

In [66]:
# RAG 체인 생성 답변
test_data['answer'] = test_data.apply(
    lambda row: qa_chain.invoke({
        "context": "\n".join(row.ensemble_reranker_contexts), 
        "query": row.user_input
    }),
    axis=1
)
test_data.head()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name,retrieved_contexts,response,hyde_query,bm25_contexts,ensemble_contexts,hyde_ensemble_contexts,total_mix_contexts,ensemble_reranker_contexts,answer
0,Who is Niki Parmar in the context of the Trans...,"['Provided proper attribution is provided, Goo...",Niki Parmar is listed as an author of the pape...,single_hop_specifc_query_synthesizer,[∗Equal contribution. Listing order is random....,Niki Parmar is one of the contributors to the ...,Niki Parmar is one of the co-authors of the se...,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....,[∗Equal contribution. Listing order is random....,"[Provided proper attribution is provided, Goog...",[∗Equal contribution. Listing order is random....,Niki Parmar is one of the contributors to the ...
1,How is the Transformer model applied to Englis...,['s\nentirely. Experiments on two machine tran...,The Transformer model is applied successfully ...,single_hop_specifc_query_synthesizer,[Table 4: The Transformer generalizes well to ...,The Transformer model is applied to English co...,**Application of the Transformer Model to Engl...,[Table 4: The Transformer generalizes well to ...,[Table 4: The Transformer generalizes well to ...,"[,\nbigger models are better, and dropout is v...",[Table 4: The Transformer generalizes well to ...,[s\nentirely. Experiments on two machine trans...,The Transformer model is applied to English co...
2,Whaat is a Tranformer and whoo were the main p...,['∗Equal contribution. Listing order is random...,The Transformer was first designed and impleme...,single_hop_specifc_query_synthesizer,[d\nlanguage modeling tasks [34]. To the best ...,The Transformer is a neural network architectu...,**Transformer: Definition and Key Contributors...,"[n\nimprovement in our setting. Hence, we excl...",[d\nlanguage modeling tasks [34]. To the best ...,"[Provided proper attribution is provided, Goog...","[Provided proper attribution is provided, Goog...",[d\nlanguage modeling tasks [34]. To the best ...,A Transformer is a neural network architecture...
3,What are gated recurrent neural networks used ...,"['1 Introduction\nRecurrent neural networks, l...",Gated recurrent neural networks have been firm...,single_hop_specifc_query_synthesizer,"[1 Introduction\nRecurrent neural networks, lo...",Gated recurrent neural networks are used as st...,**Gated Recurrent Neural Networks in Sequence ...,"[1 Introduction\nRecurrent neural networks, lo...","[1 Introduction\nRecurrent neural networks, lo...",[Recurrent models typically factor computation...,"[1 Introduction\nRecurrent neural networks, lo...","[1 Introduction\nRecurrent neural networks, lo...",Gated recurrent neural networks are used as st...
4,what p100 gpus do in transformer training? i d...,['Recurrent models typically factor computatio...,The Transformer model can be trained to reach ...,single_hop_specifc_query_synthesizer,[. The\ntraining objective is to reconstruct ¯...,"Based on the provided context, the role of GPU...",**The Role of NVIDIA P100 GPUs in Transformer ...,[output values. These are concatenated and onc...,[. The\ntraining objective is to reconstruct ¯...,[Recurrent models typically factor computation...,"[In addition, we apply dropout to the sums of ...",[e\nprevious state-of-the-art model. The Trans...,"Based on the context, P100 GPUs are used to pr..."


In [None]:
from ragas import EvaluationDataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import LLMContextRecall, Faithfulness, FactualCorrectness

# 평가셋 처리
result_data = test_data[['user_input', 'ensemble_reranker_contexts', 'answer', 'reference']]
result_data.columns = ['user_input', 'retrieved_contexts', 'response', 'reference']

evaluation_dataset = EvaluationDataset.from_pandas(result_data)

# LLM 래퍼 생성
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4.1", temperature=0))

# 생성 평가
result = evaluate(
    dataset=evaluation_dataset,   # 평가 데이터셋
    metrics=[LLMContextRecall(), Faithfulness(), FactualCorrectness()],   # 평가 메트릭
    llm=evaluator_llm,   # LLM 래퍼
)
result # faithfulness 빼고는 조금 높음

Evaluating: 100%|██████████| 63/63 [01:12<00:00,  1.16s/it]


{'context_recall': 0.5891, 'faithfulness': 0.7242, 'factual_correctness(mode=f1)': 0.3833}