# **Base**

In [1]:
import os
from dotenv import load_dotenv
from langchain_chroma import Chroma
from supabase import Client, create_client
from langchain_huggingface import HuggingFaceEmbeddings
from app.routers.vectorstore import request_table, text_files_to_docs, docs_text_split, docs_insert_db, db_to_document

load_dotenv()

SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_API_KEY = os.getenv('SUPABASE_API_KEY')

supabase: Client = create_client(supabase_url=SUPABASE_URL, supabase_key=SUPABASE_API_KEY)

# Define Embedding Model
embedding = HuggingFaceEmbeddings(
    model_name = 'FronyAI/frony-embed-large-ko-v1',
    model_kwargs = {'device': 'mps'},
    encode_kwargs = {'normalize_embeddings': True}
)

# Define Vectorstore path
vectorstore_path = './ga_assistant_store'

# Load Vectorstore
vector_store = Chroma(
    collection_name = "ga_assistant",
    embedding_function = embedding,
    persist_directory = vectorstore_path
)

  from .autonotebook import tqdm as notebook_tqdm


# **Retriever Evaluation**

## **Retriever Definition**

In [2]:
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.retrievers.document_compressors import EmbeddingsFilter

retriever = vector_store.as_retriever(search_kwargs = {'k': 3})

CrossEncoder = HuggingFaceCrossEncoder(model_name = 'BAAI/bge-reranker-v2-m3')

re_ranker = CrossEncoderReranker(
    model = CrossEncoder,
    top_n = 2
)

EmbeddingFilter = EmbeddingsFilter(
    embeddings = embedding,
    similarity_threshold = 0.3
)

compressor_pipeline = DocumentCompressorPipeline(
    transformers = [re_ranker, EmbeddingFilter]
)

final_retriever = ContextualCompressionRetriever(
    base_compressor = compressor_pipeline,
    base_retriever = retriever
)

## **Create Evaluation Dataset**

### **Create Pydantic Class**

In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List

# Load API Key
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')

# Definition model object
llm = ChatGoogleGenerativeAI(
    model = 'gemini-2.5-flash',         # model name
    google_api_key = GEMINI_API_KEY,    # API KEY
    temperature = 0.3
)

class QAPair(BaseModel):
    query: str = Field(description = "AI가 생성한 질문 (write your question in KOREAN)")
    label: str = Field(description = "질문의 대한 답 (write the answer to the fact_based question in KOREAN, making sure it reflects the essence of the question)")

class QASet(BaseModel):
    qa_pairs: List[QAPair] = Field(description = "query: label의 리스트")

pydantic_parser = PydanticOutputParser(pydantic_object = QASet)

pydantic_parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"$defs": {"QAPair": {"properties": {"query": {"description": "AI가 생성한 질문 (write your question in KOREAN)", "title": "Query", "type": "string"}, "label": {"description": "질문의 대한 답 (write the answer to the fact_based question in KOREAN, making sure it reflects the essence of the question)", "title": "Label", "type": "string"}}, "required": ["query", "label"], "title": "QAPair", "type": "object"}}, "properties": {"qa_pairs": {"description": "query: label의 리스트", "items": {"$ref": "#/$defs/QAPair"}, "title": "Qa Pairs", "type": "array"}}, 

### **Create Chain for get Dataset**

In [4]:
from langchain_core.prompts import ChatPromptTemplate

prompt_text = ""

with open('./prompt/get_evaluation_data.txt', 'r') as f:
    prompt_text = f.read()

create_dataset_prompt = ChatPromptTemplate.from_template(
    template = prompt_text,
    partial_variables = {'format_instructions': pydantic_parser.get_format_instructions()}
)

chain = create_dataset_prompt | llm | pydantic_parser

def create_dataset(context: str, num_questions: int) -> QASet:
    """ Query functions using chains """
    return chain.invoke({'context': context, 'num_questions_per_chunk': num_questions})

### **Chain Execute**

In [5]:
import pandas as pd

# Get vectorstore latest data in supabase table
df = request_table('vectorstore', db = supabase)
df.drop(['id', 'source'], axis = 1, inplace = True)

# final return variable
qa_pairs = []

# Process to get evaluation data
for _, row in df.iterrows():
    qa_pairs_row = create_dataset(row['page_content'], 10)
    
    for qa_pair_row in qa_pairs_row.qa_pairs:
        # preprocess -> append
        qa_pairs.append(
            {
                'doc_id': row['doc_id'],
                'context': row['page_content'],
                'file_name': row['file_name'],
                'date': row['date'],
                'query': qa_pair_row.query,
                'label': qa_pair_row.label
            }
        )

final_df = pd.DataFrame(qa_pairs)
final_df.to_csv('./data/question_paper.csv', index = False)

final_df.head()

[Alert] All data get Success


Unnamed: 0,doc_id,context,file_name,date,query,label
0,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이 문서는 어떤 내용을 안내하고 있습니까?,총무팀 업무 메뉴얼을 안내하고 있습니다.
1,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,총무팀 구성원 중 한 명의 이름은 무엇입니까?,이동훈님입니다.
2,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님의 이메일 주소는 무엇입니까?,main3373@gmail.com입니다.
3,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님의 연락처는 몇 번입니까?,010-3271-7132입니다.
4,1,총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일:...,Guide_GA.txt,2025-08-22,이동훈님의 주요 역할 중 하나는 무엇입니까?,총무 일괄 지원입니다.


## **Dataset preprocess**

In [6]:
from langchain_core.documents import Document

# Convert DataFrame to Document
def df_to_documents(df: pd.DataFrame, content_column: str, metadata_columns: List[str] = None) -> List[Document]:
    """
    Dataframe convert to Document object
    - df: DataFrame
    - content_column: will be page_content element
    - metadata_columns = will be metadta -> Default None(All)
    """
    if metadata_columns is None:
        metadata_columns = [col for col in df.columns if col != content_column]

    docs = []

    for _, row in df.iterrows():
        page_content = row[content_column]
        metadata = {col: [row[col]] for col in metadata_columns}
        
        docs.append(Document(page_content=str(page_content), metadata=metadata))

    return docs

metadata_list = ['doc_id', 'context', 'file_name', 'date']

# Extract label docs: Column 'context'
labels = df_to_documents(final_df, content_column = 'context', metadata_columns = metadata_list)

# Feature: final_df['context']
predicts = []

for _, row in final_df.iterrows():
    context = row['context']
    query = row['query']

    response = final_retriever.invoke(query)

    predicts.append(response)

print(f"Label 샘플 추출: {labels[0]}")
print(f"Feature 샘플 추출: {predicts[0]}")

Label 샘플 추출: page_content='총무팀 업무 메뉴얼을 안내합니다.

[총무팀 구성원]
1. 이동훈님 (이메일: main3373@gmail.com, 연락처: 010-3271-7132)
 - 역할 : 총무 일괄 지원, IT 총무(PC, 소프트웨어 장애 등 IT 관련 지원)' metadata={'doc_id': ['1'], 'context': ['총무팀 업무 메뉴얼을 안내합니다.\n\n[총무팀 구성원]\n1. 이동훈님 (이메일: main3373@gmail.com, 연락처: 010-3271-7132)\n - 역할 : 총무 일괄 지원, IT 총무(PC, 소프트웨어 장애 등 IT 관련 지원)'], 'file_name': ['Guide_GA.txt'], 'date': ['2025-08-22']}
Feature 샘플 추출: []


## **Create a function to calculate MRR**

In [7]:
def single_mrr(label: Document, predict: List[Document]) -> float:
    rank = 1

    # Search rank
    for doc in predict:
        if doc.page_content == label.page_content:
            return 1 / rank
        
        rank += 1
    
    # If no label, return 0.0
    return 0.0

def mrr_mean(labels: List[Document], predicts: List[List[Document]]) -> float:
    max_range = len(labels)
    
    mrrs = []
    
    for i in range(0, max_range):
        label = labels[i]
        predict = predicts[i]

        mrr = single_mrr(label = label, predict = predict)
        mrrs.append(mrr)

    mrr_mean = sum(mrrs) / len(mrrs)

    return mrr_mean

## **Run calculation**

In [8]:
result = mrr_mean(labels = labels, predicts = predicts)

print(f"Retriever의 평균 mrr 지표: {result}")

Retriever의 평균 mrr 지표: 0.8625


## **Appendix: Calling with API**

In [10]:
import requests

BASE_URL = 'http://127.0.0.1:8000'
ENDPOINT = '/evaluate_metrics/llm/mrrs_mean'

# Preprocessing for data transmission
labels_data = [{'page_content': doc.page_content, 'metadata': doc.metadata} for doc in labels]
predicts_data = [[{'page_content': doc.page_content, 'metadata': doc.metadata} for doc in sublist] for sublist in predicts]

# Convert to type: json
payload = {'labels': labels_data, 'predicts': predicts_data}

# request
response = requests.post(f"{BASE_URL}{ENDPOINT}", json = payload, timeout = 20)

# response output
print(f"MRR: {response.json()}")

MRR: 0.8625


# **LLM Evaluation**

## **Run predicts**

In [13]:
def request_llm(query):
    BASE_URL = 'http://127.0.0.1:8000'
    ENDPOINT = '/request/rag_model/lcel'

    payload = {'input_text': query}

    response = requests.post(f"{BASE_URL}{ENDPOINT}", json = payload, timeout = 600).text

    return response

questions = final_df['query']
labels = final_df['label']

predicts = []

for question in questions:
    result = request_llm(question)
    predicts.append(result)

final_df['predict'] = predicts

## **Evaluator Definition**

In [14]:
from langchain.evaluation import load_evaluator, EvaluatorType
from langchain_openai import ChatOpenAI

gpt = ChatOpenAI(model = 'gpt-4o-mini', temperature = 0.1)

evaluator_qa = load_evaluator(evaluator = 'qa', llm = gpt)
evaluator_context_qa = load_evaluator(evaluator = 'context_qa', llm = gpt)
evaluator_cot_qa = load_evaluator(evaluator = 'cot_qa', llm = gpt)

## **Run Evaluator**

In [16]:
# Extract Feature, Predictions, labels
query = final_df['query']
predictions = final_df['predict']
labels = final_df['label']

# Define index range to use 'iloc'
index_range = range(0, len(final_df))

# Final Return variables
qa_results = []
context_qa_results = []
cot_qa_results = []

# Run a evaluation
for i in index_range:
    qa_result = evaluator_qa.evaluate_strings(
        input = final_df['query'].iloc[i],
        prediction = final_df['predict'].iloc[i],
        reference = final_df['label'].iloc[i]
    )

    context_qa_result = evaluator_context_qa.evaluate_strings(
        input = final_df['query'].iloc[i],
        prediction = final_df['predict'].iloc[i],
        reference = final_df['label'].iloc[i]
    )

    cot_qa_result = evaluator_cot_qa.evaluate_strings(
        input = final_df['query'].iloc[i],
        prediction = final_df['predict'].iloc[i],
        reference = final_df['label'].iloc[i]
    )

    qa_results.append(qa_result)
    context_qa_results.append(context_qa_result)
    cot_qa_results.append(cot_qa_result)

## **Check Results**

In [17]:
# Extract scores
qa_results_score = [result['score'] for result in qa_results]
context_qa_results_score = [result['score'] for result in context_qa_results]
cot_qa_results_score = [result['score'] for result in cot_qa_results]

# Add columns into final_df
final_df['general_qa'] = qa_results_score
final_df['context_qa'] = context_qa_results_score
final_df['cot_qa_result'] = cot_qa_results_score

# Save to final result
final_df.to_csv('./data/evaluated_question_paper.csv')

# Select metrics
scores = final_df[['general_qa', 'context_qa', 'cot_qa_result']]

# Print metrics
scores.mean()

general_qa       0.80
context_qa       0.85
cot_qa_result    0.90
dtype: float64