In [1]:
from autorag.utils.util import load_summary_file
from dotenv import load_dotenv

# Load environment variables from a local .env file, if one is found
# (presumably the API keys needed by the LLM modules used later — TODO confirm).
load_dotenv()

True

In [2]:
import nest_asyncio

# Patch asyncio so event loops can be nested; Jupyter already runs its own loop.
# NOTE(review): presumably required by AutoRAG's async calls below — confirm.
nest_asyncio.apply()

In [3]:
import pandas as pd
import os
import pathlib

# Directory two levels above the current working directory.
root_dir = pathlib.PurePath(os.getcwd()).parent.parent
# "data" folder under that directory, kept as a plain string path.
data_dir = str(root_dir / "data")

In [4]:
# Install AutoRAG with its "gpu" extra into the kernel's environment.
# NOTE(review): `autorag` is already imported in the first cell and the version is not
# pinned; on a fresh environment run this cell first, then restart the kernel.
%pip install 'AutoRAG[gpu]'


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# RAG 최적화 실습 with AutoRAG

## 전체 최적화 실행 (AutoRAG)

먼저 전체 최적화를 한 번에 실행한 후, 각 단계의 최적화 과정에 대해 더욱 자세히 알아봅시다.

In [5]:
# Locations of the sample QA and corpus parquet files.
sample_data_dir = os.path.join(data_dir, "optimization_sample")
qa_path, corpus_path = (
    os.path.join(sample_data_dir, parquet_name)
    for parquet_name in ("qa.parquet", "corpus.parquet")
)

In [6]:
# Load the QA set and the corpus into DataFrames so they can be inspected.
qa_df = pd.read_parquet(path=qa_path, engine="pyarrow")
corpus_df = pd.read_parquet(path=corpus_path, engine="pyarrow")

In [7]:
# Preview the first five QA rows.
qa_df.head(n=5)

Unnamed: 0,qid,query,generation_gt,retrieval_gt
0,80_law,"원고 A이 시력 저하등의 증상이 발생한 시점은 언제로 보고, 그 시점 판단의 기준은...","[원고 A의 시력 저하 등의 증상은 2017년 2월 25일 이전, 즉 필러 주입술 ...",[[law - 민사_미성년자인.pdf - 7]]
1,4_finance,2024년도 한국은행의 대정부 일시대출금 한도와 대출조건을 결정하는 과정에서 보완된...,"[2024년도 한국은행의 대정부 일시대출금 한도는 총 50조 원으로, 이전 년도와 ...",[[finance - 2024년 3월_2. 통화신용정책 운영.pdf - 7]]
2,40_public,"2023년에 발표된 '가을철 지역축제 안전관리협조 요청 사항'은 무엇인지, 그리고 ...","[2023년 가을철 지역축제 안전관리협조 요청 사항에 따르면, 인파밀집 행사 시 부...",[[public - 국가안전시스템 개편 보고서.pdf - 8]]
3,69_law,국세기본법 제14조 제2항에서 언급하는 과세표준의 계산에 관한 규정이 어떠한 실질내...,"[국세기본법 제14조 제2항에 따르면, 과세표준의 계산은 소득, 수익, 재산, 행위...",[[law - 행정_원고가.pdf - 22]]
4,10_finance,"비은행 금융기관의 부동산 PF 대출 연체율 상승에 따른 미래적 영향은 무엇이며, 어...","[비은행 금융기관의 부동산 PF 대출의 연체율이 상승하는 경우, 이는 관련 대출 부...",[[finance - 2024년 3월_3. 향후 통화신용정책 방향.pdf - 10]]


In [8]:
# Number of QA pairs in the sample set.
len(qa_df.index)

15

이 샘플에는 QA Pair가 15개뿐이지만, 실제 최적화에서는 50~100개 이상의 QA Pair를 사용하는 것을 권장합니다.

In [9]:
# Preview the first five corpus passages.
corpus_df.head(n=5)

Unnamed: 0,doc_id,contents,metadata
0,law - 형사_피고인이.pdf - 1,서울고등법원\n제3형사부\n판 결\n사 건 2023노1373 통신비밀보호법위반\n피...,{'last_modified_datetime': 2024-06-05 20:59:48...
1,law - 행정_원고가.pdf - 6,로열티로 지급하기로 하는 내용의 라이선스 계약을 체결하였다(갑 제4호증의 2). 그...,{'last_modified_datetime': 2024-06-05 20:59:48...
2,commerce - 이커머스 솔루션 소개자료.pdf - 36,e-Commerce Solution BESPIN GLOBAL\n#인앱영상통화 # 글...,{'last_modified_datetime': 2024-06-05 20:59:48...
3,law - 행정_타인자금.pdf - 5,소유주식수\n주주명 지분율 비고\n계 보통주 우선주\nc 33.730.000 33....,{'last_modified_datetime': 2024-06-05 20:59:48...
4,law - 행정_금품.pdf - 23,별지\n관계 법령\n공무원연금법\n제37조(급여의 환수)\n1 공단은 급여를 받은 ...,{'last_modified_datetime': 2024-06-05 20:59:48...


In [10]:
from autorag.evaluator import Evaluator

# Create the evaluator for the sample QA/corpus pair; trial artifacts are written
# under <root_dir>/autorag_project/evaluation/optimization_sample.
evaluator = Evaluator(
    project_dir=os.path.join(
        root_dir, "autorag_project", "evaluation", "optimization_sample"
    ),
    corpus_data_path=corpus_path,
    qa_data_path=qa_path,
)








In [None]:
# Run the full optimization described by config/optimization/config.yaml.
# Validation is skipped via skip_validation=True.
evaluator.start_trial(
    yaml_path=os.path.join(root_dir, "config", "optimization", "config.yaml"),
    skip_validation=True,
)

이제 단계별 결과를 확인하면서 최적화 과정을 설명합니다.

In [13]:
# Directory of the trial whose results are inspected below.
# NOTE(review): the trial folder name "0" is hard-coded; if the project directory
# already holds earlier trials, a new run may land in a different folder — confirm.
trial_dir = os.path.join(
    os.path.join(root_dir, "autorag_project", "evaluation", "optimization_sample"),
    "0",
)

## Query Expansion & Retrieval

In [15]:
from autorag.utils.util import load_summary_file

# Summary table of the query_expansion node in the retrieve node line.
query_expansion_summary = load_summary_file(
    os.path.join(trial_dir, "retrieve_node_line", "query_expansion", "summary.csv")
)

In [16]:
# Display the per-module results; the `is_best` column flags the selected module.
query_expansion_summary

Unnamed: 0,filename,module_name,module_params,execution_time,query_expansion_retrieval_recall,is_best
0,0.parquet,PassQueryExpansion,{},0.000285,1.0,True
1,1.parquet,QueryDecompose,"{'generator_module_type': 'llama_index_llm', '...",0.213945,1.0,False
2,2.parquet,HyDE,"{'generator_module_type': 'llama_index_llm', '...",1.092337,1.0,False
3,3.parquet,MultiQueryExpansion,"{'generator_module_type': 'llama_index_llm', '...",0.253236,0.933333,False


In [17]:
# Summary table of the retrieval node in the retrieve node line.
retrieval_summary = load_summary_file(
    os.path.join(trial_dir, "retrieve_node_line", "retrieval", "summary.csv")
)

In [18]:
# Display the retrieval results (F1 / nDCG / MAP per module and parameter set).
retrieval_summary

Unnamed: 0,filename,module_name,module_params,execution_time,retrieval_f1,retrieval_ndcg,retrieval_map,is_best
0,0.parquet,VectorDB,"{'top_k': 20, 'vectordb': 'chroma_large'}",0.058605,0.095238,0.880641,0.844949,False
1,1.parquet,BM25,"{'top_k': 20, 'bm25_tokenizer': 'ko_kkma'}",0.034406,0.095238,0.950791,0.933333,False
2,2.parquet,BM25,"{'top_k': 20, 'bm25_tokenizer': 'ko_kiwi'}",0.920244,0.095238,1.0,1.0,True
3,3.parquet,BM25,"{'top_k': 20, 'bm25_tokenizer': 'ko_okt'}",0.017498,0.095238,0.975395,0.966667,False
4,4.parquet,HybridRRF,"{'top_k': 20, 'weight': 4.0, 'target_modules':...",0.978849,0.095238,0.93744,0.916667,False
5,5.parquet,HybridCC,"{'top_k': 20, 'normalize_method': 'dbsf', 'wei...",0.978849,0.095238,1.0,1.0,False
6,6.parquet,HybridCC,"{'top_k': 20, 'normalize_method': 'z', 'weight...",0.978849,0.095238,1.0,1.0,False
7,7.parquet,HybridCC,"{'top_k': 20, 'normalize_method': 'mm', 'weigh...",0.978849,0.095238,1.0,1.0,False
8,8.parquet,HybridCC,"{'top_k': 20, 'normalize_method': 'tmm', 'weig...",0.978849,0.095238,1.0,1.0,False


## Reranker & Passage Filter

In [20]:
# Summary table of the passage_reranker node in the retrieve node line.
reranker_summary = load_summary_file(
    os.path.join(trial_dir, "retrieve_node_line", "passage_reranker", "summary.csv")
)

In [21]:
# Display the reranker results; the `is_best` column flags the selected module.
reranker_summary

Unnamed: 0,filename,module_name,module_params,execution_time,passage_reranker_retrieval_f1,passage_reranker_retrieval_ndcg,passage_reranker_retrieval_map,is_best
0,0.parquet,PassReranker,{'top_k': 4},0.000157,0.4,1.0,1.0,True
1,1.parquet,Tart,{'top_k': 4},10.075981,0.186667,0.442062,0.433333,False
2,2.parquet,Upr,{'top_k': 4},10.647726,0.16,0.375395,0.366667,False
3,3.parquet,FlagEmbeddingReranker,{'top_k': 4},4.489106,0.4,1.0,1.0,False


In [22]:
# Summary table of the passage_filter node in the retrieve node line.
passage_filter_summary = load_summary_file(
    os.path.join(trial_dir, "retrieve_node_line", "passage_filter", "summary.csv")
)

In [23]:
# Display the passage filter results.
# NOTE(review): the saved output lists rows 0-9 with no `is_best=True`, while the trial
# summary at the end of the notebook reports 11.parquet as the best module — the saved
# output looks truncated or stale; re-run to confirm.
passage_filter_summary

Unnamed: 0,filename,module_name,module_params,execution_time,passage_filter_retrieval_f1,passage_filter_retrieval_ndcg,passage_filter_retrieval_map,is_best
0,0.parquet,PassPassageFilter,{},0.00017,0.4,1.0,1.0,False
1,1.parquet,SimilarityThresholdCutoff,{'threshold': 0.85},0.150015,0.708889,0.866667,0.866667,False
2,2.parquet,SimilarityThresholdCutoff,{'threshold': 0.87},0.20831,0.788889,0.866667,0.866667,False
3,3.parquet,SimilarityThresholdCutoff,{'threshold': 0.89},0.363682,0.8,0.8,0.8,False
4,4.parquet,SimilarityPercentileCutoff,{'percentile': 0.6},0.24573,0.0,0.0,0.0,False
5,5.parquet,SimilarityPercentileCutoff,{'percentile': 0.8},0.122055,0.0,0.0,0.0,False
6,6.parquet,SimilarityPercentileCutoff,{'percentile': 0.4},0.125134,0.0,0.0,0.0,False
7,7.parquet,ThresholdCutoff,{'threshold': 0.85},0.000382,0.4,1.0,1.0,False
8,8.parquet,ThresholdCutoff,{'threshold': 0.87},0.000217,0.4,1.0,1.0,False
9,9.parquet,PercentileCutoff,{'percentile': 0.6},0.000324,0.666667,1.0,1.0,False


## Prompt & Generator

In [24]:
# Summary table of the prompt_maker node in the post-retrieve node line.
prompt_maker_summary = load_summary_file(
    os.path.join(trial_dir, "post_retrieve_node_line", "prompt_maker", "summary.csv")
)

In [25]:
# Display the prompt maker results (average prompt tokens and generation metrics per prompt).
prompt_maker_summary

Unnamed: 0,filename,module_name,module_params,execution_time,average_prompt_token,prompt_maker_rouge,prompt_maker_sem_score,prompt_maker_bert_score,is_best
0,0.parquet,Fstring,{'prompt': '단락을 읽고 질문에 답하세요. 답할때 단계별로 천천히 고심하여...,0.000203,2389.0,0.448969,0.953454,0.762986,False
1,1.parquet,Fstring,{'prompt': '단락을 읽고 질문에 답하세요. \n 질문 : {query} \...,0.000159,2262.0,0.448969,0.95346,0.762986,False
2,2.parquet,LongContextReorder,{'prompt': '단락을 읽고 질문에 답하세요. \n 질문: {query} \n...,0.000146,4307.133333,0.448969,0.953472,0.762986,True
3,3.parquet,LongContextReorder,{'prompt': '단락을 읽고 질문에 답하세요. 답할때 단계별로 천천히 고심하여...,0.000883,4434.133333,0.448969,0.95347,0.762986,False


In [26]:
# Summary table of the generator node in the post-retrieve node line.
generator_summary = load_summary_file(
    os.path.join(trial_dir, "post_retrieve_node_line", "generator", "summary.csv")
)

In [27]:
# Display the generator results (ROUGE / SemScore / BERTScore per LLM configuration).
generator_summary

Unnamed: 0,filename,module_name,module_params,execution_time,average_output_token,rouge,sem_score,bert_score,is_best
0,0.parquet,OpenAILLM,"{'llm': 'gpt-4o-mini', 'temperature': 1.0, 'ba...",0.606898,210.466667,0.542832,0.957097,0.789548,False
1,1.parquet,OpenAILLM,"{'llm': 'gpt-4o', 'temperature': 1.0, 'batch':...",0.460583,162.333333,0.622461,0.958303,0.825605,True


## Final Result

In [29]:
# Trial-level summary: one row per node with the module that was selected as best.
final_result = pd.read_csv(
    filepath_or_buffer=os.path.join(trial_dir, "summary.csv"),
)

In [30]:
# Display the best module chosen for each node of the pipeline.
final_result

Unnamed: 0,node_line_name,node_type,best_module_filename,best_module_name,best_module_params,best_execution_time
0,retrieve_node_line,query_expansion,0.parquet,PassQueryExpansion,{},0.000285
1,retrieve_node_line,retrieval,2.parquet,BM25,"{'top_k': 20, 'bm25_tokenizer': 'ko_kiwi'}",0.920244
2,retrieve_node_line,passage_reranker,0.parquet,PassReranker,{'top_k': 4},0.000157
3,retrieve_node_line,passage_filter,11.parquet,PercentileCutoff,{'percentile': 0.4},0.000252
4,post_retrieve_node_line,prompt_maker,2.parquet,LongContextReorder,{'prompt': '단락을 읽고 질문에 답하세요. \\n 질문: {query} \...,0.000146
5,post_retrieve_node_line,generator,1.parquet,OpenAILLM,"{'llm': 'gpt-4o', 'temperature': 1.0, 'batch':...",0.460583
