# EXP10b: Multi-Document Generalization Rerun

**Phase B** of HANDOFF_v2_next_experiments.md

## 목표
- 실제 운영 RAG 체인으로 A/B/C 설정 비교
- dry-run proxy 제거, 실측 metrics 확보
- 다문서 testset(Phase A 30문항)으로 일반화 성능 정량화

## EXP09와의 차이점
| 항목 | EXP09 (기존) | Phase B (신규) |
|------|-------------|---------------|
| 평가 파이프라인 | 실험용 자동 생성기 | **운영 RAG 체인** |
| ops 지표 | dry-run proxy | **실측** (latency, timeout) |
| 문서당 문항 수 | 1문항 | **6문항** |
| 반복 횟수 | 1-run | **3-run** 평균 |
| testset | 자동 생성 | **수동 검증된** golden_testset |
| 평가 문서 | 100건 전체 | **5건 대표 문서** |

In [None]:
# ============================================================
# Cell 0: 환경 설정
# ============================================================
import os, sys, time, re, json, warnings
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
from collections import defaultdict

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Project root setup: if the process was started inside a directory named
# `notebooks`, use its parent as the root. The CWD is then moved to the root so
# that the relative paths used elsewhere in this file (e.g. 'data/...') resolve
# against it.
PROJECT_ROOT = Path(os.getcwd()).resolve()
if PROJECT_ROOT.name == 'notebooks':
    PROJECT_ROOT = PROJECT_ROOT.parent
os.chdir(PROJECT_ROOT)
# Put <root>/src at the front of the import path
# (presumably where the `bidflow` package lives — confirm).
sys.path.insert(0, str(PROJECT_ROOT / 'src'))

print(f"Project root: {PROJECT_ROOT}")
print(f"Working directory: {os.getcwd()}")

# Load .env and fail fast if the OpenAI key is still missing afterwards.
from dotenv import load_dotenv
load_dotenv()
assert os.getenv('OPENAI_API_KEY'), 'OPENAI_API_KEY not found in .env'
print('OpenAI API key loaded')

# Experiment output directory (created if it does not exist yet)
EXP_DIR = PROJECT_ROOT / 'data' / 'exp10b'
EXP_DIR.mkdir(parents=True, exist_ok=True)
print(f"Experiment output dir: {EXP_DIR}")

In [None]:
# ============================================================
# Cell 1: 문서 설정 및 Testset 로드
# ============================================================

# Five representative documents (selected in Phase A).
# Each entry carries: a display name, the HWP path relative to the project root,
# a doc_type label used for per-type overrides and analysis, and `source_doc`,
# the filename used to join against the testset's `source_doc` column.
DOC_CONFIGS = {
    "doc_A": {
        "name": "수협중앙회 (text_only)",
        "file_path": "data/raw/files/수협중앙회_수협중앙회 수산물사이버직매장 시스템 재구축 ISMP 수립 입.hwp",
        "doc_type": "text_only",
        "source_doc": "수협중앙회_수협중앙회 수산물사이버직매장 시스템 재구축 ISMP 수립 입.hwp",
    },
    "doc_B": {
        "name": "한국교육과정평가원 (table_simple)",
        "file_path": "data/raw/files/한국교육과정평가원_국가교육과정정보센터(NCIC) 시스템 운영 및 개선.hwp",
        "doc_type": "table_simple",
        "source_doc": "한국교육과정평가원_국가교육과정정보센터(NCIC) 시스템 운영 및 개선.hwp",
    },
    "doc_C": {
        "name": "국립중앙의료원 (table_complex)",
        "file_path": "data/raw/files/국립중앙의료원_(긴급)「2024년도 차세대 응급의료 상황관리시스템 구축.hwp",
        "doc_type": "table_complex",
        "source_doc": "국립중앙의료원_(긴급)「2024년도 차세대 응급의료 상황관리시스템 구축.hwp",
    },
    "doc_D": {
        "name": "한국철도공사 (mixed)",
        "file_path": "data/raw/files/한국철도공사 (용역)_예약발매시스템 개량 ISMP 용역.hwp",
        "doc_type": "mixed",
        "source_doc": "한국철도공사 (용역)_예약발매시스템 개량 ISMP 용역.hwp",
    },
    "doc_E": {
        "name": "스포츠윤리센터 (hwp_representative)",
        "file_path": "data/raw/files/재단법인스포츠윤리센터_스포츠윤리센터 LMS(학습지원시스템) 기능개선.hwp",
        "doc_type": "hwp_representative",
        "source_doc": "재단법인스포츠윤리센터_스포츠윤리센터 LMS(학습지원시스템) 기능개선.hwp",
    },
}

# Reverse lookup: source_doc filename -> doc_key
SOURCE_TO_KEY = {v["source_doc"]: k for k, v in DOC_CONFIGS.items()}

# Load the testset (path is relative to the current working directory) and
# print a quick sanity summary of its `source_doc` and `difficulty` columns.
testset = pd.read_csv('data/experiments/golden_testset_multi.csv')
print(f"Testset loaded: {len(testset)} questions")
print(f"Documents: {testset['source_doc'].nunique()}")
print(f"\nQuestions per document:")
print(testset.groupby('source_doc').size())
print(f"\nDifficulty distribution:")
print(testset['difficulty'].value_counts())

In [None]:
# ============================================================
# Cell 2: Helper Functions (kw_v2, 정규화 등)
# ============================================================

# Synonym map (the version validated in EXP06).
# Keys are surface variants, values are the canonical form. Insertion order is
# significant: replacements are applied sequentially in this order.
SYNONYM_MAP = {
    # ISMP
    '정보전략계획': 'ismp',
    'ismp 수립': 'ismp',
    '정보화전략계획': 'ismp',
    # SSO
    '통합로그인': 'sso',
    '단일 로그인': 'sso',
    '싱글사인온': 'sso',
    # Authentication
    '간편인증': '간편인증',
    '간편 인증': '간편인증',
    '2차인증': '2차인증',
    '2차 인증': '2차인증',
    '추가인증': '2차인증',
    # Roles
    'project manager': 'pm',
    '사업관리자': 'pm',
    '사업책임자': 'pm',
    '프로젝트 매니저': 'pm',
    'project leader': 'pl',
    '프로젝트 리더': 'pl',
    'quality assurance': 'qa',
    '품질관리': 'qa',
    '품질보증': 'qa',
    # Misc.
    '하자보수': '하자보수',
    '하자 보수': '하자보수',
    '발주처': '발주기관',
    '발주 기관': '발주기관',
}


def normalize_answer_v2(text):
    """Normalize an answer string for keyword matching (EXP06-validated v2).

    Non-string input is stringified, stripped and lower-cased, then returned
    without any further processing. String input is stripped, lower-cased and
    passed through the ordered rewrites below.

    Args:
        text: The answer or ground-truth value to normalize.

    Returns:
        The normalized, lower-cased string with collapsed whitespace.
    """
    if not isinstance(text, str):
        return str(text).strip().lower()

    # Ordered regex rewrites; each rule sees the output of the previous one.
    regex_steps = (
        # 1. punctuation: middle dots/bullets -> space, quotes/brackets dropped,
        #    hyphen and dashes -> space
        (r'[\u00b7\u2027\u2022\u2219]', ' '),
        (r'[\u201c\u201d\u2018\u2019\u300c\u300d\u300e\u300f]', ''),
        (r'[-\u2013\u2014]', ' '),
        # 2. thousands separators inside numbers
        (r'(\d),(?=\d{3})', r'\1'),
        # 3. percentage spellings unified to '%'
        (r'(\d+)\s*(%|퍼센트|percent)', r'\1%'),
        # 4. currency units glued to the number
        (r'(\d+)\s*원', r'\1원'),
        (r'(\d+)\s*억\s*원', r'\1억원'),
        (r'(\d+)\s*만\s*원', r'\1만원'),
    )

    cleaned = text.strip().lower()
    for pattern, replacement in regex_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    # 5. VAT spellings
    cleaned = cleaned.replace('v.a.t', 'vat')
    cleaned = cleaned.replace('vat 포함', 'vat포함')

    # 6. synonyms, applied in the map's insertion order
    for variant in SYNONYM_MAP:
        cleaned = cleaned.replace(variant.lower(), SYNONYM_MAP[variant])

    # 7. collapse whitespace runs
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()


def keyword_accuracy_v2(answer, ground_truth):
    """kw_v2: fraction of ground-truth keywords found in the normalized answer.

    Both inputs go through ``normalize_answer_v2``. The ground truth is split on
    whitespace and tokens of length 1 are discarded; each remaining token counts
    as matched when it occurs as a substring of the normalized answer.

    Returns:
        A float in [0, 1]; 1.0 when the ground truth yields no usable keyword.
    """
    normalized_answer = normalize_answer_v2(answer)
    normalized_truth = normalize_answer_v2(ground_truth)

    keywords = [token for token in normalized_truth.split() if len(token) > 1]
    if len(keywords) == 0:
        return 1.0

    hits = 0
    for token in keywords:
        if token in normalized_answer:
            hits += 1
    return hits / len(keywords)


def compute_metrics_batch(results_df):
    """Score every row with kw_v2 and store the result in a ``kw_v2`` column.

    Reads the ``answer`` and ``ground_truth`` columns row by row. The input
    DataFrame is modified in place and also returned.
    """
    results_df['kw_v2'] = [
        keyword_accuracy_v2(record['answer'], record['ground_truth'])
        for _, record in results_df.iterrows()
    ]
    return results_df

print('Helper functions loaded.')

In [None]:
# ============================================================
# Cell 3: 5건 대표 문서 인덱싱 (Per-Document ChromaDB)
# ============================================================
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from bidflow.parsing.hwp_parser import HWPParser
from bidflow.parsing.preprocessor import TextPreprocessor

# Experiment parameters (best settings from EXP04-v3)
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50
EMBEDDING_MODEL = 'text-embedding-3-small'

parser = HWPParser()
preprocessor = TextPreprocessor()
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=['\n\n', '\n', ' ', '']
)
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Build one ChromaDB per document
doc_vectordbs = {}  # doc_key -> Chroma instance
doc_chunk_counts = {}  # doc_key -> number of chunks in that document's index

for doc_key, doc_cfg in DOC_CONFIGS.items():
    persist_dir = str(EXP_DIR / f'vectordb_{doc_key}')
    file_path = str(PROJECT_ROOT / doc_cfg['file_path'])
    
    print(f"\n{'='*60}")
    print(f"Indexing {doc_key}: {doc_cfg['name']}")
    print(f"File: {doc_cfg['file_path']}")
    
    # Reuse an existing index if the persist directory is non-empty.
    # NOTE(review): the reuse check does not compare CHUNK_SIZE / CHUNK_OVERLAP /
    # EMBEDDING_MODEL with whatever built the stored index; delete the directory
    # after changing them.
    if os.path.exists(persist_dir) and os.listdir(persist_dir):
        print(f"  -> Using existing index at {persist_dir}")
        vdb = Chroma(
            persist_directory=persist_dir,
            embedding_function=embeddings,
            collection_name='bidflow_rfp'
        )
        # NOTE(review): `_collection` is a private attribute of the Chroma wrapper.
        count = vdb._collection.count()
        doc_vectordbs[doc_key] = vdb
        doc_chunk_counts[doc_key] = count
        print(f"  -> Loaded {count} chunks")
        continue
    
    # 1. Extract text from the HWP file (production parser): try hwp5txt first
    #    and fall back to olefile when it returns nothing.
    # NOTE(review): both calls are underscore-prefixed (private) parser methods.
    t0 = time.time()
    raw_text = parser._parse_with_hwp5txt(file_path)
    parse_method = 'hwp5txt'
    if not raw_text:
        raw_text = parser._parse_with_olefile(file_path)
        parse_method = 'olefile'
    parse_time = time.time() - t0
    
    # Both extractors returned nothing: skip this document. It then has no entry
    # in doc_vectordbs / doc_chunk_counts.
    if not raw_text:
        print(f"  !! FAILED to extract text from {doc_key}")
        continue
    
    # 2. Normalize
    normalized_text = preprocessor.normalize(raw_text)
    
    # 3. Chunk (experiment parameters)
    chunks = splitter.split_text(normalized_text)
    
    # 4. Wrap each chunk in a LangChain Document with its provenance metadata
    lc_docs = []
    for i, chunk_text in enumerate(chunks):
        lc_docs.append(Document(
            page_content=chunk_text,
            metadata={
                'doc_key': doc_key,
                'filename': doc_cfg['source_doc'],
                'chunk_index': i,
                'doc_type': doc_cfg['doc_type'],
            }
        ))
    
    # 5. Index into ChromaDB (embeds the chunks and persists to persist_dir)
    t1 = time.time()
    vdb = Chroma.from_documents(
        documents=lc_docs,
        embedding=embeddings,
        persist_directory=persist_dir,
        collection_name='bidflow_rfp'
    )
    index_time = time.time() - t1
    
    doc_vectordbs[doc_key] = vdb
    doc_chunk_counts[doc_key] = len(chunks)
    
    print(f"  -> Parse: {parse_method} ({parse_time:.1f}s), {len(raw_text):,} chars")
    print(f"  -> Chunks: {len(chunks)} (size={CHUNK_SIZE}, overlap={CHUNK_OVERLAP})")
    print(f"  -> Index: {index_time:.1f}s")

# Per-document and total chunk counts
print(f"\n{'='*60}")
print("\nIndexing Summary:")
for k, v in doc_chunk_counts.items():
    print(f"  {k}: {v} chunks")
print(f"  Total: {sum(doc_chunk_counts.values())} chunks")

In [None]:
# ============================================================
# Cell 4: RAG 체인 팩토리 (Per-Document Pipeline)
# ============================================================
from langchain_community.retrievers import BM25Retriever
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from typing import List, Any


class ExperimentRetriever(BaseRetriever):
    """Hybrid (BM25 + vector) retriever for experiments, with every knob exposed.

    Both retrievers are queried for ``2 * search_k`` candidates, where
    ``search_k`` is ``pool_size`` when reranking is enabled and ``top_k``
    otherwise. The two ranked lists are fused with weighted reciprocal-rank
    fusion and, if ``use_rerank`` is set, the fused pool is reranked down to
    ``top_k`` documents.

    A failure in one retriever is logged and treated as an empty result list so
    the other source can still answer; it is no longer swallowed silently.
    """
    # Dense retriever; this class sets `search_kwargs['k']` and calls `.invoke(query)`.
    vector_retriever: Any = None
    # Sparse retriever; this class sets `.k` and calls `.invoke(query)`.
    bm25_retriever: Any = None
    # Fusion weights as [bm25_weight, vector_weight].
    weights: List[float] = [0.3, 0.7]
    # Number of documents returned after reranking (or after fusion when not reranking).
    top_k: int = 15
    # Size of the fused candidate pool handed to the reranker.
    pool_size: int = 50
    use_rerank: bool = True
    rerank_model: str = 'BAAI/bge-reranker-v2-m3'

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Retrieve, fuse and optionally rerank documents for ``query``."""
        # Local import and local logger: a class-level attribute would be
        # picked up as a model field by the BaseRetriever base class.
        import logging
        log = logging.getLogger(__name__)

        search_k = self.pool_size if self.use_rerank else self.top_k

        # BM25
        try:
            self.bm25_retriever.k = search_k * 2
            bm25_docs = self.bm25_retriever.invoke(query)
        except Exception:
            log.warning(
                'BM25 retrieval failed; continuing with vector results only',
                exc_info=True,
            )
            bm25_docs = []

        # Vector
        try:
            self.vector_retriever.search_kwargs['k'] = search_k * 2
            vector_docs = self.vector_retriever.invoke(query)
        except Exception:
            log.warning(
                'Vector retrieval failed; continuing with BM25 results only',
                exc_info=True,
            )
            vector_docs = []

        # RRF merge: the fused list is capped at the same size as search_k.
        merged = self._rrf_merge(bm25_docs, vector_docs, k=60, limit=search_k)

        # Rerank
        if self.use_rerank and merged:
            from bidflow.retrieval.rerank import rerank
            merged = rerank(query, merged, top_k=self.top_k, model_name=self.rerank_model)

        return merged

    def _rrf_merge(self, list1, list2, k=60, limit=50):
        """Fuse two ranked lists with weighted reciprocal-rank fusion.

        Documents are identified by ``page_content``. Each occurrence adds
        ``weight / (rank + k)`` to the document's score, with ``rank`` counted
        from 0; ``list1`` uses the BM25 weight and ``list2`` the vector weight.
        When a text appears in both lists the ``list1`` Document object is kept.

        Returns:
            Up to ``limit`` documents, highest fused score first.
        """
        scores = defaultdict(float)
        doc_map = {}
        w_bm25, w_vec = self.weights
        for rank, doc in enumerate(list1):
            scores[doc.page_content] += w_bm25 * (1 / (rank + k))
            doc_map[doc.page_content] = doc
        for rank, doc in enumerate(list2):
            scores[doc.page_content] += w_vec * (1 / (rank + k))
            if doc.page_content not in doc_map:
                doc_map[doc.page_content] = doc
        sorted_contents = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)
        return [doc_map[c] for c in sorted_contents[:limit]]


def build_retriever(vdb, alpha=0.7, top_k=15, pool_size=50, use_rerank=True):
    """Build a hybrid retriever from a ChromaDB instance.

    Args:
        vdb: Vector store exposing ``as_retriever(...)`` and ``get()``.
        alpha: Weight of the vector retriever in the fusion; BM25 gets ``1 - alpha``.
        top_k: Number of documents the retriever finally returns.
        pool_size: Size of the fused candidate pool before reranking.
        use_rerank: Whether the fused pool is reranked down to ``top_k``.

    Returns:
        An ``ExperimentRetriever`` wired to a vector retriever over ``vdb`` and a
        BM25 retriever built from every chunk stored in ``vdb``.
    """
    vector_retriever = vdb.as_retriever(search_kwargs={'k': pool_size * 2})

    # BM25 initialisation: load every stored chunk back out of ChromaDB.
    result = vdb.get() or {}
    texts = result.get('documents') or []
    metadatas = result.get('metadatas') or []
    all_docs = []
    for i, text in enumerate(texts):
        # A stored chunk may have no metadata (a None entry); Document requires
        # a dict, so fall back to an empty one instead of failing validation.
        meta = metadatas[i] if i < len(metadatas) else None
        all_docs.append(Document(page_content=text, metadata=meta or {}))

    if not all_docs:
        # An empty store still gets a one-document placeholder corpus
        # so a BM25 retriever can be constructed.
        all_docs = [Document(page_content='empty')]
    bm25_retriever = BM25Retriever.from_documents(all_docs)
    bm25_retriever.k = pool_size * 2

    return ExperimentRetriever(
        vector_retriever=vector_retriever,
        bm25_retriever=bm25_retriever,
        # Order is [bm25_weight, vector_weight]; rounding avoids values like 0.30000000000000004.
        weights=[round(1 - alpha, 2), round(alpha, 2)],
        top_k=top_k,
        pool_size=pool_size,
        use_rerank=use_rerank,
    )


def build_rag_chain(retriever, model_name='gpt-5-mini'):
    """Create a RAG callable (per the original note: same prompt as the production chain).

    Args:
        retriever: Object whose ``invoke(question)`` returns documents with a
            ``page_content`` attribute.
        model_name: Chat model name passed to ``ChatOpenAI``.

    Returns:
        A function ``invoke_chain(question) -> dict`` with keys ``answer``,
        ``retrieved_contexts``, ``n_retrieved``, ``retrieval_time``,
        ``generation_time`` and ``total_time`` (times in seconds).
    """
    # gpt-5-mini is called with temperature 1, every other model with 0
    # (presumably because that model only accepts its default temperature — confirm).
    temp = 1 if model_name == 'gpt-5-mini' else 0
    llm = ChatOpenAI(model=model_name, temperature=temp, timeout=60, max_retries=2)

    prompt = ChatPromptTemplate.from_template(
        '아래 문맥(Context)만을 근거로 질문에 답하세요.\n'
        '반드시 원문에 있는 사업명, 기관명, 금액, 날짜 등의 표현을 그대로(Verbatim) 사용하세요.\n'
        '문맥에 답이 없으면 \'해당 정보를 찾을 수 없습니다\'라고 답하세요.\n\n'
        '## 문맥 (Context)\n{context}\n\n'
        '## 질문\n{question}\n\n'
        '## 답변\n'
    )

    # The chain does not depend on the question, so build it once here rather
    # than on every call (and keep its construction out of the timed section).
    chain = prompt | llm | StrOutputParser()

    def invoke_chain(question):
        """Generate an answer and return it with the retrieved contexts and timings."""
        # perf_counter is monotonic, so the measured durations are not affected
        # by system clock adjustments during a long run.
        t0 = time.perf_counter()
        docs = retriever.invoke(question)
        retrieval_time = time.perf_counter() - t0

        contexts = [doc.page_content for doc in docs]
        context_text = '\n\n'.join(contexts)

        t1 = time.perf_counter()
        answer = chain.invoke({'context': context_text, 'question': question})
        generation_time = time.perf_counter() - t1

        return {
            'answer': answer,
            'retrieved_contexts': contexts,
            'n_retrieved': len(docs),
            'retrieval_time': retrieval_time,
            'generation_time': generation_time,
            'total_time': retrieval_time + generation_time,
        }

    return invoke_chain

print('RAG chain factory loaded.')

In [None]:
# ============================================================
# Cell 5: 실험 설정 정의 (A/B/C Configs)
# ============================================================

# Each config has: `name`, a human-readable `description` (also written to the
# report), default retriever `params`, and `doc_type_overrides` mapping a
# doc_type to the params that replace the defaults for documents of that type.

# Config A: current best settings (EXP04-v3 best) - identical for every document
CONFIG_A = {
    'name': 'A_single_pipeline',
    'description': '현재 최적 설정 그대로 (alpha=0.7, rerank, pool=50, top_k=15)',
    'params': {
        'alpha': 0.7,
        'top_k': 15,
        'pool_size': 50,
        'use_rerank': True,
    },
    'doc_type_overrides': {},  # no per-doc-type overrides
}

# Config B: single routing rule per document type
CONFIG_B = {
    'name': 'B_rule_single_route',
    # The description lists every override below; it previously omitted the
    # table_complex rule and claimed all non-text_only types used alpha=0.7.
    'description': '문서 유형별 alpha 조정 (text_only→0.5, table_complex→0.8+pool=60, 나머지→0.7)',
    'params': {
        'alpha': 0.7,  # default
        'top_k': 15,
        'pool_size': 50,
        'use_rerank': True,
    },
    'doc_type_overrides': {
        'text_only': {'alpha': 0.5},  # more weight on BM25
        'table_complex': {'alpha': 0.8, 'pool_size': 60},  # more weight on vectors + larger pool
    },
}

# Config C: conservative widening (wider net)
CONFIG_C = {
    'name': 'C_conservative_wide',
    'description': '넓은 검색 범위 (top_k=20, pool=75) - multi-route 프록시',
    'params': {
        'alpha': 0.7,
        'top_k': 20,
        'pool_size': 75,
        'use_rerank': True,
    },
    'doc_type_overrides': {},
}

ALL_CONFIGS = [CONFIG_A, CONFIG_B, CONFIG_C]
N_RUNS = 3  # every config is evaluated this many times over the full testset

print('Experiment configs defined:')
for cfg in ALL_CONFIGS:
    print(f"  {cfg['name']}: {cfg['description']}")
print(f"\nRuns per config: {N_RUNS}")
print(f"Total evaluations: {len(ALL_CONFIGS)} configs × {N_RUNS} runs × {len(testset)} questions = {len(ALL_CONFIGS) * N_RUNS * len(testset)}")

In [None]:
# ============================================================
# Cell 6: RAG 실행 루프 (3 configs × 3 runs × 30 questions)
# ============================================================

all_results = []  # one dict per evaluated (config, run, question)
errors = []  # error log

total_evals = len(ALL_CONFIGS) * N_RUNS * len(testset)
eval_count = 0
exp_start = time.time()

for config in ALL_CONFIGS:
    config_name = config['name']
    print(f"\n{'#'*70}")
    print(f"# Config: {config_name}")
    print(f"# {config['description']}")
    print(f"{'#'*70}")

    # Build one chain per document, applying this config's parameters
    doc_chains = {}
    for doc_key in DOC_CONFIGS:
        # A document whose indexing was skipped has no vector DB. Skip it here so
        # its questions are logged by the "No chain" check below instead of this
        # loop aborting the whole experiment with a KeyError.
        if doc_key not in doc_vectordbs:
            print(f"  !! No vector index for {doc_key}; skipping chain build")
            continue

        doc_type = DOC_CONFIGS[doc_key]['doc_type']
        params = dict(config['params'])  # copy the defaults
        # Apply the per-doc-type override, if any
        if doc_type in config.get('doc_type_overrides', {}):
            params.update(config['doc_type_overrides'][doc_type])

        retriever = build_retriever(
            doc_vectordbs[doc_key],
            alpha=params['alpha'],
            top_k=params['top_k'],
            pool_size=params['pool_size'],
            use_rerank=params['use_rerank'],
        )
        doc_chains[doc_key] = build_rag_chain(retriever)

    for run_idx in range(N_RUNS):
        run_start = time.time()
        print(f"\n--- Run {run_idx + 1}/{N_RUNS} ---")

        for q_idx, row in testset.iterrows():
            eval_count += 1
            question = row['question']
            ground_truth = row['ground_truth']
            source_doc = row['source_doc']
            doc_key = SOURCE_TO_KEY.get(source_doc)

            if doc_key is None or doc_key not in doc_chains:
                errors.append({'config': config_name, 'run': run_idx, 'question': question, 'error': f'No chain for {source_doc}'})
                continue

            call_start = time.time()
            try:
                result = doc_chains[doc_key](question)
                all_results.append({
                    'config': config_name,
                    'run': run_idx,
                    'doc_key': doc_key,
                    'doc_type': DOC_CONFIGS[doc_key]['doc_type'],
                    'question': question,
                    'ground_truth': ground_truth,
                    'answer': result['answer'],
                    'category': row.get('category', ''),
                    'difficulty': row.get('difficulty', ''),
                    'n_retrieved': result['n_retrieved'],
                    'retrieval_time': result['retrieval_time'],
                    'generation_time': result['generation_time'],
                    'total_time': result['total_time'],
                    'retrieved_contexts': result['retrieved_contexts'],
                    'timeout': result['total_time'] > 120,
                })
                # Progress
                if eval_count % 10 == 0:
                    elapsed = time.time() - exp_start
                    eta = (elapsed / eval_count) * (total_evals - eval_count)
                    print(f"  [{eval_count}/{total_evals}] elapsed={elapsed:.0f}s, ETA={eta:.0f}s")

            except Exception as e:
                # A failed call still consumed wall-clock time, and a client-side
                # timeout surfaces as an exception. Record both so the latency and
                # timeout-rate statistics are not biased towards zero.
                call_elapsed = time.time() - call_start
                timed_out = (
                    'timeout' in type(e).__name__.lower()
                    or 'timed out' in str(e).lower()
                    or call_elapsed > 120
                )
                errors.append({'config': config_name, 'run': run_idx, 'question': question[:50], 'error': str(e)})
                all_results.append({
                    'config': config_name, 'run': run_idx, 'doc_key': doc_key,
                    'doc_type': DOC_CONFIGS[doc_key]['doc_type'],
                    'question': question, 'ground_truth': ground_truth,
                    'answer': 'ERROR', 'category': row.get('category', ''),
                    'difficulty': row.get('difficulty', ''),
                    # The retrieval/generation split is unknown for a failed call.
                    'n_retrieved': 0, 'retrieval_time': np.nan, 'generation_time': np.nan,
                    'total_time': call_elapsed, 'retrieved_contexts': [], 'timeout': timed_out,
                })
                print(f"  ERROR: {question[:40]}... -> {e}")

        run_time = time.time() - run_start
        print(f"  Run {run_idx + 1} completed in {run_time:.0f}s")

total_time = time.time() - exp_start
print(f"\n{'='*70}")
print(f"Total experiment time: {total_time:.0f}s ({total_time/60:.1f} min)")
print(f"Total evaluations: {eval_count}")
print(f"Errors: {len(errors)}")

# Convert to a DataFrame
results_df = pd.DataFrame(all_results)
print(f"\nResults DataFrame: {results_df.shape}")

In [None]:
# ============================================================
# Cell 7: kw_v2 계산 및 기본 분석
# ============================================================

# Compute kw_v2 for every row (adds a `kw_v2` column)
results_df = compute_metrics_batch(results_df)

# Intermediate save (before RAGAS); the list-valued `retrieved_contexts`
# column is left out of the CSV.
results_df.drop(columns=['retrieved_contexts']).to_csv(
    str(EXP_DIR / 'exp10b_raw_results.csv'), index=False, encoding='utf-8-sig'
)
print('Raw results saved.\n')

# ── Overall mean per config (averaged over the 3 runs) ──
# All rows of a config are pooled, so the std covers both question-to-question
# and run-to-run variation.
print('='*60)
print('Config별 Overall Mean (3-run 평균)')
print('='*60)

config_summary = results_df.groupby('config').agg(
    kw_v2_mean=('kw_v2', 'mean'),
    kw_v2_std=('kw_v2', 'std'),
    total_time_mean=('total_time', 'mean'),
    total_time_p95=('total_time', lambda x: np.percentile(x, 95)),
    timeout_rate=('timeout', 'mean'),
    n_evals=('kw_v2', 'count'),
).round(4)
print(config_summary)

# ── Macro group mean per config × document ──
print('\n' + '='*60)
print('Config × Document별 kw_v2 (3-run 평균)')
print('='*60)

doc_pivot = results_df.groupby(['config', 'doc_key'])['kw_v2'].mean().unstack()
doc_pivot['macro_mean'] = doc_pivot.mean(axis=1)  # Macro group mean: unweighted mean of the per-document means
print(doc_pivot.round(4))

# ── Worst-group analysis ──
print('\n' + '='*60)
print('Config별 Worst Group (최저 성능 문서)')
print('='*60)

for cfg_name in results_df['config'].unique():
    cfg_data = results_df[results_df['config'] == cfg_name]
    doc_means = cfg_data.groupby('doc_key')['kw_v2'].mean()
    worst = doc_means.idxmin()  # doc_key with the lowest mean kw_v2
    print(f"  {cfg_name}: worst={worst} (kw_v2={doc_means[worst]:.4f})")

# ── Analysis by difficulty ──
print('\n' + '='*60)
print('Config × Difficulty별 kw_v2')
print('='*60)

diff_pivot = results_df.groupby(['config', 'difficulty'])['kw_v2'].mean().unstack()
print(diff_pivot.round(4))

In [None]:
# ============================================================
# Cell 8: RAGAS 평가 (faithfulness, context_recall)
# ============================================================
# 비용 절약: Config별 Run 0만 RAGAS 평가 (각 30문항 × 3 configs = 90 evals)

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import Faithfulness, ContextRecall
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from bidflow.eval.ragas_runner import FixedTempChatOpenAI

# RAGAS LLM / embeddings setup
ragas_llm = LangchainLLMWrapper(
    FixedTempChatOpenAI(model='gpt-5-mini', timeout=180, max_retries=3)
)
ragas_emb = LangchainEmbeddingsWrapper(
    OpenAIEmbeddings(model='text-embedding-3-small')
)

ragas_results_list = []  # one scored DataFrame per successfully evaluated config

for config in ALL_CONFIGS:
    config_name = config['name']
    print(f"\nRAGAS evaluating: {config_name} (Run 0 only)")

    # Keep only the run-0 rows of this config
    run0 = results_df[(results_df['config'] == config_name) & (results_df['run'] == 0)].copy()

    if run0.empty:
        print(f"  No data for {config_name} run 0, skipping")
        continue

    # Build the RAGAS dataset: one list per field, in run0's row order
    eval_dict = {
        'user_input': run0['question'].tolist(),
        'response': run0['answer'].tolist(),
        'retrieved_contexts': run0['retrieved_contexts'].tolist(),
        'reference': run0['ground_truth'].tolist(),
    }
    hf_dataset = Dataset.from_dict(eval_dict)

    # Run the evaluation. A failure for one config is printed and does not stop
    # the remaining configs.
    try:
        ragas_result = evaluate(
            dataset=hf_dataset,
            metrics=[
                Faithfulness(llm=ragas_llm),
                ContextRecall(llm=ragas_llm),
            ],
            llm=ragas_llm,
            embeddings=ragas_emb,
            raise_exceptions=False,
        )
        ragas_df = ragas_result.to_pandas()
        ragas_df['config'] = config_name
        # Positional assignment: assumes to_pandas() keeps the input row order — TODO confirm
        ragas_df['doc_key'] = run0['doc_key'].values
        ragas_df['doc_type'] = run0['doc_type'].values
        ragas_results_list.append(ragas_df)

        print(f"  Faithfulness: {ragas_df['faithfulness'].mean():.4f}")
        print(f"  Context Recall: {ragas_df['context_recall'].mean():.4f}")
    except Exception as e:
        print(f"  RAGAS evaluation failed: {e}")

if ragas_results_list:
    ragas_all = pd.concat(ragas_results_list, ignore_index=True)
    ragas_all.to_csv(str(EXP_DIR / 'exp10b_ragas_results.csv'), index=False, encoding='utf-8-sig')
    print('\nRAGAS results saved.')

    # RAGAS summary
    print('\n' + '='*60)
    print('RAGAS Summary (Run 0)')
    print('='*60)
    ragas_summary = ragas_all.groupby('config')[['faithfulness', 'context_recall']].mean()
    print(ragas_summary.round(4))
else:
    # Keep `ragas_all` defined (empty) so later code can test `ragas_all.empty`.
    ragas_all = pd.DataFrame()
    print('No RAGAS results generated.')

In [None]:
# ============================================================
# Cell 9: 종합 결과 집계 및 Quality Floor 판정
# ============================================================

# Quality Floor (HANDOFF v2 section 5.5): minimum overall scores
QUALITY_FLOOR = {
    'kw_v2': 0.50,
    'faithfulness': 0.80,
    'context_recall': 0.60,
}
# Minimum scores for the worst-performing document.
# NOTE(review): only the 'kw_v2' entry is enforced below; the 'faithfulness' and
# 'context_recall' worst-group floors are defined but never checked.
WORST_GROUP_FLOOR = {
    'kw_v2': 0.35,
    'faithfulness': 0.70,
    'context_recall': 0.45,
}
# Operational upper bounds
OPS_CEILING = {
    'timeout_rate': 0.10,
    'p95_latency_sec': 120,
}

print('='*70)
print('PHASE B: 종합 결과 보고')
print('='*70)

report = {}
for config in ALL_CONFIGS:
    config_name = config['name']
    cfg_data = results_df[results_df['config'] == config_name]

    # Overall metrics (all runs pooled)
    overall_kw = cfg_data['kw_v2'].mean()
    overall_kw_std = cfg_data['kw_v2'].std()
    timeout_rate = cfg_data['timeout'].mean()
    p95_latency = np.percentile(cfg_data['total_time'], 95)

    # RAGAS (Run 0); NaN when this config has no RAGAS rows
    if not ragas_all.empty and config_name in ragas_all['config'].values:
        ragas_cfg = ragas_all[ragas_all['config'] == config_name]
        faith = ragas_cfg['faithfulness'].mean()
        cr = ragas_cfg['context_recall'].mean()
    else:
        faith = np.nan
        cr = np.nan

    # Worst group: the document with the lowest mean kw_v2
    doc_means = cfg_data.groupby('doc_key')['kw_v2'].mean()
    worst_doc = doc_means.idxmin()
    worst_kw = doc_means.min()

    # Macro group mean: unweighted mean of the per-document means
    macro_kw = doc_means.mean()

    # Quality floor check. Comparisons on numpy scalars yield np.bool_, which
    # json cannot encode (a `default=str` fallback would store the *string*
    # "False", which is truthy when read back), so every flag is cast to bool.
    # A missing RAGAS score counts as a failure.
    pass_kw = bool(overall_kw >= QUALITY_FLOOR['kw_v2'])
    pass_faith = bool(faith >= QUALITY_FLOOR['faithfulness']) if not np.isnan(faith) else False
    pass_cr = bool(cr >= QUALITY_FLOOR['context_recall']) if not np.isnan(cr) else False
    pass_worst_kw = bool(worst_kw >= WORST_GROUP_FLOOR['kw_v2'])
    pass_ops = bool(
        timeout_rate <= OPS_CEILING['timeout_rate']
        and p95_latency <= OPS_CEILING['p95_latency_sec']
    )
    all_pass = all([pass_kw, pass_faith, pass_cr, pass_worst_kw, pass_ops])

    report[config_name] = {
        'kw_v2_overall': round(float(overall_kw), 4),
        'kw_v2_std': round(float(overall_kw_std), 4),
        'kw_v2_macro': round(float(macro_kw), 4),
        'kw_v2_worst': round(float(worst_kw), 4),
        'worst_doc': worst_doc,
        'faithfulness': round(float(faith), 4) if not np.isnan(faith) else None,
        'context_recall': round(float(cr), 4) if not np.isnan(cr) else None,
        'timeout_rate': round(float(timeout_rate), 4),
        'p95_latency_sec': round(float(p95_latency), 1),
        'quality_floor_pass': all_pass,
        'detail': {
            'pass_kw': pass_kw,
            'pass_faith': pass_faith,
            'pass_cr': pass_cr,
            'pass_worst_kw': pass_worst_kw,
            'pass_ops': pass_ops,
        },
    }

    status = '✅ PASS' if all_pass else '❌ FAIL'
    print(f"\n{config_name} {status}")
    print(f"  kw_v2: {overall_kw:.4f} (±{overall_kw_std:.4f}) [floor={QUALITY_FLOOR['kw_v2']}] {'✅' if pass_kw else '❌'}")
    print(f"  faithfulness: {faith:.4f} [floor={QUALITY_FLOOR['faithfulness']}] {'✅' if pass_faith else '❌'}" if not np.isnan(faith) else f"  faithfulness: N/A")
    print(f"  context_recall: {cr:.4f} [floor={QUALITY_FLOOR['context_recall']}] {'✅' if pass_cr else '❌'}" if not np.isnan(cr) else f"  context_recall: N/A")
    print(f"  worst_group: {worst_doc}={worst_kw:.4f} [floor={WORST_GROUP_FLOOR['kw_v2']}] {'✅' if pass_worst_kw else '❌'}")
    print(f"  macro_group_mean: {macro_kw:.4f}")
    print(f"  timeout_rate: {timeout_rate:.4f} [ceil={OPS_CEILING['timeout_rate']}]")
    print(f"  p95_latency: {p95_latency:.1f}s [ceil={OPS_CEILING['p95_latency_sec']}s]")

# Verdict summary
print('\n' + '='*70)
print('판정 요약')
print('='*70)
for name, r in report.items():
    status = '✅ PASS' if r['quality_floor_pass'] else '❌ FAIL'
    print(f"  {name}: {status} (kw_v2={r['kw_v2_overall']}, faith={r['faithfulness']}, cr={r['context_recall']})")

In [None]:
# ============================================================
# Cell 10: 결과 저장 (report.json + metrics.csv)
# ============================================================

def _json_default(obj):
    """Fallback encoder for json.dump: numpy scalars -> Python scalars, else str.

    A plain ``default=str`` would write numpy booleans as the strings
    "True"/"False", which are always truthy when the report is read back.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    return str(obj)


# 1. Report JSON: experiment metadata, configs, thresholds and per-config results
exp_report = {
    'experiment': 'exp10b_generalization_rerun',
    'phase': 'B',
    'date': datetime.now().isoformat(),
    'testset': 'golden_testset_multi.csv',
    'n_questions': len(testset),
    'n_documents': len(DOC_CONFIGS),
    'n_runs': N_RUNS,
    'chunk_size': CHUNK_SIZE,
    'chunk_overlap': CHUNK_OVERLAP,
    'embedding_model': EMBEDDING_MODEL,
    'llm_model': 'gpt-5-mini',
    'doc_chunk_counts': doc_chunk_counts,
    'configs': {c['name']: c for c in ALL_CONFIGS},
    'quality_floor': QUALITY_FLOOR,
    'worst_group_floor': WORST_GROUP_FLOOR,
    'ops_ceiling': OPS_CEILING,
    'results': report,
    'total_evals': eval_count,
    'total_errors': len(errors),
    'errors': errors[:10],  # store at most 10 in the report
}

# Path is relative to the current working directory
report_path = 'data/experiments/exp10b_report.json'
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(exp_report, f, ensure_ascii=False, indent=2, default=_json_default)
print(f'Report saved: {report_path}')

# 2. Metrics CSV (without the list-valued retrieved_contexts column)
metrics_path = 'data/experiments/exp10b_metrics.csv'
results_df.drop(columns=['retrieved_contexts']).to_csv(
    metrics_path, index=False, encoding='utf-8-sig'
)
print(f'Metrics saved: {metrics_path}')

# 3. Full error log (written only when there were errors)
if errors:
    error_path = str(EXP_DIR / 'exp10b_errors.json')
    with open(error_path, 'w', encoding='utf-8') as f:
        json.dump(errors, f, ensure_ascii=False, indent=2)
    print(f'Errors saved: {error_path}')

print(f'\n실험 완료! Phase B 결과 저장 완료.')
print(f'다음 단계: HISTORY_v2_execution.md 업데이트')

In [None]:
# ============================================================
# Cell 11: doc_type 별 text vs table 성능 비교 (Gap 분석)
# ============================================================

print('='*70)
print('Document Type 별 성능 비교 (text vs table Gap)')
print('='*70)

# Compare text_only against the table-heavy documents
for config_name in results_df['config'].unique():
    cfg_data = results_df[results_df['config'] == config_name]

    text_kw = cfg_data[cfg_data['doc_type'] == 'text_only']['kw_v2'].mean()
    # Every doc_type other than text_only is pooled into the "table" group,
    # including 'mixed' and 'hwp_representative'.
    table_types = ['table_simple', 'table_complex', 'mixed', 'hwp_representative']
    table_kw = cfg_data[cfg_data['doc_type'].isin(table_types)]['kw_v2'].mean()
    gap = text_kw - table_kw  # positive: text_only scores higher

    print(f"\n{config_name}:")
    print(f"  text_only kw_v2: {text_kw:.4f}")
    print(f"  table-docs kw_v2: {table_kw:.4f}")
    print(f"  text-table gap: {gap:.4f} ({gap*100:.1f}%p)")

# Analysis by category (all configs and runs pooled)
print('\n' + '='*70)
print('Category별 kw_v2 (전체 config 평균)')
print('='*70)

cat_analysis = results_df.groupby('category')['kw_v2'].agg(['mean', 'std', 'count'])
cat_analysis = cat_analysis.sort_values('mean', ascending=False)
print(cat_analysis.round(4))