# **라이브러리 및 환경변수 설정**


In [3]:
import pandas as pd
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.documents import Document
from pydantic import BaseModel, Field
from supabase import create_client, Client
from datetime import datetime
from app.routers.vectorstore import request_table, text_files_to_docs, docs_text_split, docs_insert_db, db_to_document
import os
from glob import glob
from typing import List
from krag.evaluators import OfflineRetrievalEvaluators


# Supabase connection settings come from the environment.
# os.getenv returns None when a variable is not set.
SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_API_KEY = os.getenv('SUPABASE_API_KEY')

# Client object that the rest of this notebook passes as `db` to the project helpers.
supabase: Client = create_client(supabase_url=SUPABASE_URL, supabase_key=SUPABASE_API_KEY)

def check_vectorstore_status():
    """Print a quick status report for the vector store and its retriever.

    Reads the module-level ``vector_store`` and ``retriever`` objects, prints the
    entry count of the underlying collection, runs one sample query, and prints
    how many documents came back plus the first 100 characters of each.
    """
    collection = vector_store._collection
    print(f"저장된 문서 수: {collection.count()}")

    # Sample search as a smoke test. ``invoke`` is the call style already used for
    # retrievers elsewhere in this notebook; ``get_relevant_documents`` is deprecated.
    docs = retriever.invoke("테스트")
    print(f"검색된 문서 수: {len(docs)}")
    for doc in docs:
        print(f"문서 내용: {doc.page_content[:100]}...")

# **신규 데이터 삽입**

In [None]:
# Convert the '.txt' files under the given directory into Document objects
text_into_docs = text_files_to_docs(path = './data/GA_information')

# Split the Document objects into chunks (size=500, overlap=100)
splitted_docs = docs_text_split(docs = text_into_docs, size= 500, overlap = 100, db = supabase)

# Insert the new chunks into the DB
docs_insert_db(splitted_docs = splitted_docs, db = supabase)

# Load the latest documents back from the DB
latest_docs = db_to_document(db = supabase)

# Turn off tokenizers parallelism (original note: disables the HuggingFace model alert)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

[Alert] No Data in table
[Alert] All data get Success
[Alert] All data get Success


  from .autonotebook import tqdm as notebook_tqdm


[Alert] There is no Chroma DB, Create a new one.


In [4]:
# Define the embedding model
embedding = HuggingFaceEmbeddings(
    model_name = 'FronyAI/frony-embed-large-ko-v1',
    model_kwargs = {'device': 'mps'},  # NOTE(review): 'mps' looks like the Apple Metal device — confirm before running on other hardware
    encode_kwargs = {'normalize_embeddings': True}
)

# Local directory where the Chroma store is persisted
vectorstore_path = './ga_assistant_store'

# The store directory does not exist yet: build it from the loaded documents
if not os.path.isdir(vectorstore_path):
    print("[Alert] There is no Chroma DB, Create a new one.")
    vector_store = Chroma.from_documents(
        documents = latest_docs,                    # list of Document objects (documents split by the text splitter)
        embedding = embedding,                      # embedding model defined above
        collection_name = "ga_assistant",           # collection name used to open the collection again later
        persist_directory = vectorstore_path        # local path (without it the data stays in RAM and is lost, so a path is required)
    )

# The store directory already exists: open it
else:
    print("[Alert] Discover DB, add data to existing DB.")
    vector_store = Chroma(
        collection_name = "ga_assistant",
        embedding_function = embedding,
        persist_directory = vectorstore_path
    )

    # NOTE: the add_documents call below is commented out, so this branch only
    # opens the existing store and does not add latest_docs to it.
    # Add new documents
    # vector_store.add_documents(latest_docs)

  from .autonotebook import tqdm as notebook_tqdm


[Alert] Discover DB, add data to existing DB.


# **Compressor retriever**

In [5]:
# Base retriever over the vector store, configured with k = 3
retriever = vector_store.as_retriever(search_kwargs = {'k': 3})

# Cross-encoder model used by the reranker
CrossEncoder = HuggingFaceCrossEncoder(model_name = 'BAAI/bge-reranker-v2-m3')

# Reranker configured with top_n = 2
re_ranker = CrossEncoderReranker(
    model = CrossEncoder,
    top_n = 2
)

# Embedding-similarity filter (threshold 0.3) using the same embedding model as the store
EmbeddingFilter = EmbeddingsFilter(
    embeddings = embedding,
    similarity_threshold = 0.3
)

# Compressor pipeline: the reranker is listed first, the embeddings filter second
compressor_pipeline = DocumentCompressorPipeline(
    transformers = [re_ranker, EmbeddingFilter]
)

# Retriever used for evaluation: the base retriever wrapped with the compressor pipeline
final_retriever = ContextualCompressionRetriever(
    base_compressor = compressor_pipeline,
    base_retriever = retriever
)

# **평가 데이터셋 생성**

In [12]:
# Load the Gemini API key from the environment (None when unset)
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

# Chat model used to generate the evaluation question/answer pairs
llm = ChatGoogleGenerativeAI(
    model = 'gemini-2.5-flash',         # model name
    google_api_key = GEMINI_API_KEY,    # API key
    temperature = 0.3
)

## **Pydantic Class**

In [10]:
# One generated question/answer pair. The Field descriptions below are part of the
# JSON schema emitted by the output parser's format instructions.
class QAPair(BaseModel):
    query: str = Field(description = "AI가 생성한 질문 (write your question in KOREAN)")
    label: str = Field(description = "질문의 대한 답 (write the answer to the fact_based question in KOREAN, making sure it reflects the essence of the question)")

# Container holding a list of QAPair items; this is the model given to PydanticOutputParser.
class QASet(BaseModel):
    qa_pairs: List[QAPair] = Field(description = "query: label의 리스트")

# Parser that turns the model output into a QASet instance
pydantic_parser = PydanticOutputParser(pydantic_object = QASet)

# Display the format instructions string that is later injected into the prompt
pydantic_parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"$defs": {"QAPair": {"properties": {"query": {"description": "AI가 생성한 질문 (write your question in KOREAN)", "title": "Query", "type": "string"}, "label": {"description": "질문의 대한 답 (write the answer to the fact_based question in KOREAN, making sure it reflects the essence of the question)", "title": "Label", "type": "string"}}, "required": ["query", "label"], "title": "QAPair", "type": "object"}}, "properties": {"qa_pairs": {"description": "query: label의 리스트", "items": {"$ref": "#/$defs/QAPair"}, "title": "Qa Pairs", "type": "array"}}, 

## **Create Chain**

In [13]:
prompt_text = ""

# Read the dataset-generation prompt template. The encoding is pinned to UTF-8 so the
# file is decoded the same way on every platform instead of using the locale default.
with open('./prompt/get_evaluation_data.txt', 'r', encoding = 'utf-8') as f:
    prompt_text = f.read()

# Build the prompt from the template text and pre-fill `format_instructions`
# with the parser's instructions.
# NOTE(review): the template file is assumed to contain the `context` and
# `num_questions_per_chunk` placeholders that create_dataset supplies — confirm.
create_dataset_prompt = ChatPromptTemplate.from_template(
    template = prompt_text,
    partial_variables = {'format_instructions': pydantic_parser.get_format_instructions()}
)

# Prompt -> LLM -> Pydantic parser
chain = create_dataset_prompt | llm | pydantic_parser

def create_dataset(context: str, num_questions: int) -> QASet:
    """Run the module-level chain for one piece of text and return its result.

    - context: text passed to the chain under the ``context`` key
    - num_questions: value passed under the ``num_questions_per_chunk`` key
    """
    chain_inputs = {
        'context': context,
        'num_questions_per_chunk': num_questions,
    }
    qa_set = chain.invoke(chain_inputs)
    return qa_set

## **Create Evaluation DataSet**

In [6]:
# Fetch the 'vectorstore' table through the project helper
# (presumably returned as a DataFrame, since it is used as one below — verify in request_table)
df = request_table('vectorstore', db = supabase)
# Remove the 'id' and 'source' columns in place
df.drop(['id', 'source'], axis = 1, inplace = True)

# Preview the first rows
df.head()

[Alert] All data get Success


Unnamed: 0,doc_id,file_name,page_content,date
0,1,Guide_GA.txt,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,2025-08-22
1,2,Guide_GA.txt,"[업무 요청]\n1. 유형자산, 무형자산 신청\n 1) 현재 우리 회사는 직군 별 ...",2025-08-22
2,3,Guide_GA.txt,2. 정기 주차\n 1) 안내 : 현재 우리 회사는 50대의 무료 주차분이 할당되어...,2025-08-22
3,4,Guide_GA.txt,3. 자산 외부 반출\n 1) 안내 : 지급된 사내 자산에 대해선 보안상 외부 반출...,2025-08-22


In [14]:
# Flat list of records: one dict per generated question/answer pair
qa_pairs = []

for _, row in df.iterrows():
    # Ask the chain for 10 question/answer pairs for this row's page_content
    qa_pairs_row = create_dataset(row['page_content'], 10)
    
    # Keep the source row's fields next to every generated pair
    for qa_pair_row in qa_pairs_row.qa_pairs:
        qa_pairs.append(
            {
                'doc_id': row['doc_id'],
                'context': row['page_content'],
                'file_name': row['file_name'],
                'date': row['date'],
                'query': qa_pair_row.query,
                'label': qa_pair_row.label
            }
        )

# Evaluation dataset: one row per generated question
final_df = pd.DataFrame(qa_pairs)

In [15]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   doc_id     40 non-null     object
 1   context    40 non-null     object
 2   file_name  40 non-null     object
 3   date       40 non-null     object
 4   query      40 non-null     object
 5   label      40 non-null     object
dtypes: object(6)
memory usage: 2.0+ KB


In [16]:
final_df.head()

Unnamed: 0,doc_id,context,file_name,date,query,label
0,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,총무팀 구성원 중 한 명의 이름은 무엇인가요?,이동훈님입니다.
1,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님의 이메일 주소는 무엇인가요?,main3373@gmail.com입니다.
2,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님의 연락처는 몇 번인가요?,010-3271-7132입니다.
3,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님의 역할 중 하나는 무엇인가요?,총무 일괄 지원입니다.
4,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님은 어떤 IT 관련 지원을 담당하나요?,"PC, 소프트웨어 장애 등 IT 관련 지원을 담당합니다."


# **Retriever Evaluation**

## **Extract Label Docs**

In [63]:
def df_to_documents(df: pd.DataFrame, content_column: str, metadata_columns: List[str] = None) -> List[Document]:
    """
    Build one Document per DataFrame row.
    - df: source DataFrame
    - content_column: column whose value, converted with str(), becomes page_content
    - metadata_columns: columns copied into metadata; None selects every column
      except content_column. Each metadata value is stored as a one-element list.
    """
    meta_names = metadata_columns
    if meta_names is None:
        meta_names = [name for name in df.columns if name != content_column]

    def _to_document(record):
        # Content is read first, then the metadata columns in the order given.
        text = record[content_column]
        meta = {}
        for name in meta_names:
            meta[name] = [record[name]]
        return Document(page_content=str(text), metadata=meta)

    return [_to_document(record) for _, record in df.iterrows()]

# Columns copied into each label Document's metadata.
# NOTE(review): 'context' is also used as the content column below, so the same text
# ends up in both page_content and metadata — confirm this is intended.
metadata_list = ['doc_id', 'context', 'file_name', 'date']

# Extract label docs: one Document per row of final_df, with `context` as page_content
labels = df_to_documents(final_df, content_column = 'context', metadata_columns = metadata_list)

In [17]:
# Context/query pairs taken from the evaluation dataset
question_paper = final_df[['context', 'query']]

# One retrieval result per query, in the same row order as final_df
predicts = []

for _, row in question_paper.iterrows():
    context = row['context']  # assigned but not used inside this loop
    query = row['query']

    # Retrieve documents for the query with the compression retriever
    response = final_retriever.invoke(query)

    predicts.append(response)

# Display the retrieval result for the first query
predicts[0]

[_DocumentWithState(metadata={'file_name': 'Guide_GA.txt', 'doc_id': 'DOC_1', 'source': './data/GA_information/Guide_GA.txt'}, page_content='총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일: main3373@gmail.com, 연락처: 010-3271-7132)\n - 역할 : 총무 일괄 지원, IT 총무(PC, 소프트웨어 장애 등 IT 관련 지원)', state={'embedded_doc': [0.00371673540212214, 0.013163432478904724, 0.012836224399507046, 0.0044601052068173885, 0.005280079320073128, -0.017548272386193275, 0.013564631342887878, -0.010397655889391899, 0.014133743941783905, -0.03356335684657097, -0.05586544796824455, -0.030161190778017044, -0.013288481160998344, -0.0377313606441021, 0.0061449347995221615, -0.00033826602157205343, 0.04087163135409355, -0.013186515308916569, -0.015328912995755672, 0.0054232398979365826, 0.006415150128304958, -0.014714265242218971, 0.01162585336714983, 0.00014906014257576317, 0.01392163522541523, -0.004393705632537603, 0.03719600290060043, -0.034023597836494446, 0.00835737306624651, -0.0019524514209479094, 0.004267474170774221, 0.

In [69]:
def single_mrr(label: Document, predict: List[Document]) -> float:
    """Return the reciprocal rank of ``label`` within ``predict``.

    Documents are compared by exact ``page_content`` equality. The first match at
    1-based position ``p`` gives ``1 / p``; when nothing matches the result is ``0.0``.
    """
    # Lazily yields 1/position for every match; only the first one is consumed.
    reciprocal_ranks = (
        1 / position
        for position, candidate in enumerate(predict, start=1)
        if candidate.page_content == label.page_content
    )

    return next(reciprocal_ranks, 0.0)

def mrr_mean(labels: List[Document], predicts: List[List[Document]]) -> float:
    """Return the mean reciprocal rank over paired labels and retrieval results.

    - labels: expected Document for each query
    - predicts: retrieved Documents for each query; ``predicts[i]`` is scored
      against ``labels[i]``

    Returns 0.0 when ``labels`` is empty. Raises IndexError when ``predicts`` has
    fewer entries than ``labels``; extra entries in ``predicts`` are ignored.
    """
    # Nothing to score: return 0.0 instead of dividing by zero.
    if not labels:
        return 0.0

    reciprocal_ranks = [
        single_mrr(label = label, predict = predicts[i])
        for i, label in enumerate(labels)
    ]

    return sum(reciprocal_ranks) / len(reciprocal_ranks)

In [62]:
predicts

[[_DocumentWithState(metadata={'file_name': 'Guide_GA.txt', 'doc_id': 'DOC_1', 'source': './data/GA_information/Guide_GA.txt'}, page_content='총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일: main3373@gmail.com, 연락처: 010-3271-7132)\n - 역할 : 총무 일괄 지원, IT 총무(PC, 소프트웨어 장애 등 IT 관련 지원)', state={'embedded_doc': [0.00371673540212214, 0.013163432478904724, 0.012836224399507046, 0.0044601052068173885, 0.005280079320073128, -0.017548272386193275, 0.013564631342887878, -0.010397655889391899, 0.014133743941783905, -0.03356335684657097, -0.05586544796824455, -0.030161190778017044, -0.013288481160998344, -0.0377313606441021, 0.0061449347995221615, -0.00033826602157205343, 0.04087163135409355, -0.013186515308916569, -0.015328912995755672, 0.0054232398979365826, 0.006415150128304958, -0.014714265242218971, 0.01162585336714983, 0.00014906014257576317, 0.01392163522541523, -0.004393705632537603, 0.03719600290060043, -0.034023597836494446, 0.00835737306624651, -0.0019524514209479094, 0.004267474170774221, 0

In [70]:
# Mean reciprocal rank of the retriever over the whole evaluation set
result = mrr_mean(labels = labels, predicts = predicts)

# Display the score
result

0.925

In [18]:
import requests

BASE_URL = 'http://127.0.0.1:8001'
ENDPOINT = '/evaluate_metrics/llm/mrrs_mean'

# Flatten the Document objects into plain dicts so they can be sent as JSON
labels_data = [{'page_content': doc.page_content, 'metadata': doc.metadata} for doc in labels]
predicts_data = [[{'page_content': doc.page_content, 'metadata': doc.metadata} for doc in sublist] for sublist in predicts]

payload = {'labels': labels_data, 'predicts': predicts_data}

response = requests.post(f"{BASE_URL}{ENDPOINT}", json = payload, timeout = 20)
# Raise on a 4xx/5xx reply so an error body is never printed as if it were the metric
response.raise_for_status()

print(f"MRR: {response.json()}")

NameError: name 'labels' is not defined

In [20]:
import requests

def request_llm(query):
    """Send ``query`` to the local RAG endpoint and return the response body as text.

    - query: text sent as the ``input_text`` field of the JSON payload

    Raises requests.HTTPError when the server replies with a 4xx/5xx status, so an
    error body is never returned as if it were a model answer.
    """
    BASE_URL = 'http://127.0.0.1:8001'
    ENDPOINT = '/request/rag_model/lcel'

    payload = {'input_text': query}

    # The 600-second timeout is kept from the original call
    response = requests.post(f"{BASE_URL}{ENDPOINT}", json = payload, timeout = 600)
    response.raise_for_status()

    return response.text

# Send one sample query to the endpoint
result = request_llm("총무팀 인원")

# Display the returned text
result

'**요약**  \n총무팀은 **1명**으로 구성되어 있습니다.  \n\n**상세 내용**  \n1. **총무팀 인원**  \n   - 현재 총무팀에는 **이동훈님** 한 분이 소속되어 있습니다.  \n\n2. **이동훈님 정보**  \n   - **이메일**: main3373@gmail.com  \n   - **연락처**: 010-3271-7132  \n   - **역할**: 총무 일괄 지원, IT 총무(PC, 소프트웨어 장애 등 IT 관련 지원)  \n\n필요하신 경우 위 연락처로 문의해 주시면 도움을 드리겠습니다.'

In [None]:
# Query/label pairs taken from the evaluation dataset
question_paper = final_df.loc[:, ['query', 'label']]

questions = question_paper['query']
# NOTE: this rebinds the notebook-level name `labels` to the 'label' column
labels = question_paper['label']

# One endpoint answer per question, in question order
predicts = []

for question in questions:
    result = request_llm(question)
    predicts.append(result)

In [38]:
# Load predictions from a CSV file; this rebinds `predicts` to a DataFrame
predicts = pd.read_csv('./data/piece_1.csv')

# Attach the CSV column named '0' as the 'predict' column (pandas aligns the values by index)
final_df['predict'] = predicts['0']

In [43]:
# Save the dataset, now including the 'predict' column, to CSV
final_df.to_csv('./data/question_paper.csv')

# **LLM as judge**

In [44]:
from langchain.evaluation import load_evaluator, EvaluatorType
from langchain_openai import ChatOpenAI

# Judge model shared by the three evaluators
gpt = ChatOpenAI(model = 'gpt-4o-mini', temperature = 0.1)

# Three evaluator variants loaded by name: 'qa', 'context_qa' and 'cot_qa'
evaluator_qa = load_evaluator(evaluator = 'qa', llm = gpt)
evaluator_context_qa = load_evaluator(evaluator = 'context_qa', llm = gpt)
evaluator_cot_qa = load_evaluator(evaluator = 'cot_qa', llm = gpt)

# NOTE: these three Series are assigned but the loop below reads final_df directly
query = final_df['query']
predictions = final_df['predict']
labels = final_df['label']

index_range = range(0, len(final_df))

# One result dict per row for each evaluator
qa_results = []
context_qa_results = []
cot_qa_results = []

# NOTE(review): all three evaluators receive the 'label' column as `reference`.
# The 'context_qa' and 'cot_qa' evaluators may expect the source context there
# instead — confirm against the LangChain evaluator documentation.
for i in index_range:
    qa_result = evaluator_qa.evaluate_strings(
        input = final_df['query'].iloc[i],
        prediction = final_df['predict'].iloc[i],
        reference = final_df['label'].iloc[i]
    )

    context_qa_result = evaluator_context_qa.evaluate_strings(
        input = final_df['query'].iloc[i],
        prediction = final_df['predict'].iloc[i],
        reference = final_df['label'].iloc[i]
    )

    cot_qa_result = evaluator_cot_qa.evaluate_strings(
        input = final_df['query'].iloc[i],
        prediction = final_df['predict'].iloc[i],
        reference = final_df['label'].iloc[i]
    )

    qa_results.append(qa_result)
    context_qa_results.append(context_qa_result)
    cot_qa_results.append(cot_qa_result)

In [47]:
qa_results

[{'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'INCORRECT', 'value': 'INCORRECT', 'score': 0},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1},
 {'reasoning'

In [48]:
# Keep only the 'score' entry of each evaluator result
qa_results_score = [result['score'] for result in qa_results]
context_qa_results_score = [result['score'] for result in context_qa_results]
cot_qa_results_score = [result['score'] for result in cot_qa_results]

In [None]:
# Add one score column per evaluator to the evaluation dataset
final_df['general_qa'] = qa_results_score
final_df['context_qa'] = context_qa_results_score
final_df['cot_qa_result'] = cot_qa_results_score

Unnamed: 0,doc_id,context,file_name,date,query,label,predict,general_qa,context_qa,cot_qa_result
0,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,총무팀 구성원 중 한 명의 이름은 무엇인가요?,이동훈님입니다.,이동훈님입니다.,1,1,1
1,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님의 이메일 주소는 무엇인가요?,main3373@gmail.com입니다.,이동훈님의 이메일 주소는 **main3373@gmail.com** 입니다.,1,1,1
2,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님의 연락처는 몇 번인가요?,010-3271-7132입니다.,**Summary** \n이동훈님의 연락처는 **010-3271-7132** 입니...,1,1,1
3,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님의 역할 중 하나는 무엇인가요?,총무 일괄 지원입니다.,**Summary** \n이동훈님의 역할 중 하나는 **총무 일괄 지원**입니다....,1,1,1
4,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님은 어떤 IT 관련 지원을 담당하나요?,"PC, 소프트웨어 장애 등 IT 관련 지원을 담당합니다.","**Summary** \n이동훈님은 총무팀에서 IT 관련 지원을 담당하며, 주로 ...",1,1,1


In [50]:
# Save the scored dataset to CSV
final_df.to_csv('./data/evaluated_question_paper.csv')

In [52]:
# Select the three evaluator score columns
scores = final_df[['general_qa', 'context_qa', 'cot_qa_result']]

# Mean score per evaluator
scores.mean()

general_qa       0.875
context_qa       0.900
cot_qa_result    0.925
dtype: float64