In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Single root for every input file of this notebook, so the location only
# has to be changed in one place when the notebook is run elsewhere.
EVAL_DIR = '/home/cptaswadu/new-rescue/RESCUE-n8n/eval/insurance'
RAG_RESULTS_DIR = f'{EVAL_DIR}/results/LLM_QnA/RAG/final'
ONE_DOC_EVAL_DIR = f'{EVAL_DIR}/results/LLM_QnA/one_doc/final/eval'

# Per-case result tables for the RAG and baseline runs.
rag_df = pd.read_csv(f'{RAG_RESULTS_DIR}/all_rag_matched_results_merged_with_accuracy.csv')
baseline_df = pd.read_csv(f'{RAG_RESULTS_DIR}/all_baseline_matched_results_merged_with_accuracy.csv')

# JSON is UTF-8 by specification; pass the encoding explicitly instead of
# relying on the locale default.
with open(f'{EVAL_DIR}/dataset/ground_truth.json', 'r', encoding='utf-8') as f:
    gt = json.load(f)

# One-document evaluation tables, one file per QnA model.
gpt5mini_onedoc = pd.read_csv(f'{ONE_DOC_EVAL_DIR}/gpt-5-mini_evaluation.csv')
gpt5_onedoc = pd.read_csv(f'{ONE_DOC_EVAL_DIR}/gpt-5_evaluation.csv')

In [11]:
# Keep only the top_k == 1 rows of each result table.
rag_top1 = rag_df.loc[rag_df['top_k'] == 1]
baseline_top1 = baseline_df.loc[baseline_df['top_k'] == 1]

# One set of case_ids per (QnA model, retrieval model, retrieval_count)
# combination, appended in loop order.
case_sets = []
for qna_model in ['gpt-5-mini', 'gpt-5']:
    for retrieval_model in ['gpt-5-mini', 'gpt-5']:
        for threshold in [10, 30]:
            cases = rag_top1.loc[
                (rag_top1['retrieval_count'] == threshold)
                & (rag_top1['retrieval_model'] == retrieval_model)
                & (rag_top1['qna_model'] == qna_model),
                'case_id',
            ].tolist()
            case_sets.append(set(cases))
            # The printed count is the number of matching rows, not the set size.
            print(f"QnA: {qna_model}, Retrieval: {retrieval_model}, Threshold: {threshold} -> {len(cases)} cases")
print(case_sets)

QnA: gpt-5-mini, Retrieval: gpt-5-mini, Threshold: 10 -> 36 cases
QnA: gpt-5-mini, Retrieval: gpt-5-mini, Threshold: 30 -> 40 cases
QnA: gpt-5-mini, Retrieval: gpt-5, Threshold: 10 -> 37 cases
QnA: gpt-5-mini, Retrieval: gpt-5, Threshold: 30 -> 39 cases
QnA: gpt-5, Retrieval: gpt-5-mini, Threshold: 10 -> 37 cases
QnA: gpt-5, Retrieval: gpt-5-mini, Threshold: 30 -> 39 cases
QnA: gpt-5, Retrieval: gpt-5, Threshold: 10 -> 36 cases
QnA: gpt-5, Retrieval: gpt-5, Threshold: 30 -> 39 cases
[{'Case811', 'Case2007', 'Case2162', 'Case15570', 'Case15202', 'Case2666', 'Case19162', 'Case13983', 'Case2847', 'Case6747', 'Case5674', 'Case7447', 'Case14195', 'Case5834', 'Case22', 'Case14226', 'Case3223', 'Case4604', 'Case9773', 'Case8051', 'Case3494', 'Case10075', 'Case11795', 'Case12792', 'Case4196', 'Case6867', 'Case9349', 'Case4262', 'Case15458', 'Case17595', 'Case14017', 'Case18821', 'Case9408', 'Case3581', 'Case16126', 'Case17196'}, {'Case811', 'Case2007', 'Case2162', 'Case15570', 'Case14408', 'Ca

In [14]:
# Intersect every case set; start from a copy so case_sets[0] is left intact.
common_cases = set(case_sets[0])
for case_set in case_sets[1:]:
    common_cases.intersection_update(case_set)
print(f"Common case_ids across all {len(case_sets)} combinations: {len(common_cases)}")

Common case_ids across all 8 combinations: 30


In [13]:
def get_accuracies(threshold, retrieval_model, qna_model, case_ids):
    """Mean accuracy of the three settings for one configuration.

    Args:
        threshold: value matched against the ``retrieval_count`` column.
        retrieval_model: value matched against the ``retrieval_model`` column.
        qna_model: value matched against the ``qna_model`` column; ``'gpt-5'``
            selects ``gpt5_onedoc``, anything else ``gpt5mini_onedoc``.
        case_ids: collection of case ids; only rows whose ``case_id`` is in it
            are used.

    Returns:
        ``(baseline_acc, internal_qa_acc, onedoc_acc)``, each the mean of the
        ``accuracy`` column of the selected rows (NaN when nothing matches).
    """
    def _mean_accuracy(frame):
        # Same four-way filter for the RAG and the baseline tables.
        selected = (
            frame['case_id'].isin(case_ids)
            & (frame['retrieval_count'] == threshold)
            & (frame['retrieval_model'] == retrieval_model)
            & (frame['qna_model'] == qna_model)
        )
        return frame.loc[selected, 'accuracy'].mean()

    internal_qa_acc = _mean_accuracy(rag_top1)
    baseline_acc = _mean_accuracy(baseline_top1)

    # The one-document tables are per QnA model and are filtered by case only.
    onedoc_df = gpt5_onedoc if qna_model == 'gpt-5' else gpt5mini_onedoc
    onedoc_acc = onedoc_df.loc[onedoc_df['case_id'].isin(case_ids), 'accuracy'].mean()

    return baseline_acc, internal_qa_acc, onedoc_acc

# One table row per (QnA model, retrieval model, Top C) combination,
# evaluated on the shared case set in `common_cases`.
results = []
for qna_model in ['gpt-5-mini', 'gpt-5']:
    # Display label for the QnA model column.
    qna_name = 'GPT-5-Mini' if qna_model == 'gpt-5-mini' else 'GPT-5'
    for retrieval_model in ['gpt-5-mini', 'gpt-5']:
        for threshold in [10, 30]:
            baseline, internal, onedoc = get_accuracies(threshold, retrieval_model, qna_model, common_cases)
            row = {
                'QnA Model': qna_name,
                'Retrieval Model': retrieval_model,
                'Top K': 1,
                'Top C': threshold,
                'Baseline': f"{baseline:.2f}%",
                'Internal-QA': f"{internal:.2f}%",
                'One-Document-QA': f"{onedoc:.2f}%"
            }
            results.append(row)

# Assemble the rows into a table and show it.
df = pd.DataFrame(results)
print(df)

    QnA Model Retrieval Model  Top K  Top C Baseline Internal-QA  \
0  GPT-5-Mini      gpt-5-mini      1     10   60.37%      71.11%   
1  GPT-5-Mini      gpt-5-mini      1     30   61.11%      73.70%   
2  GPT-5-Mini           gpt-5      1     10   62.59%      71.11%   
3  GPT-5-Mini           gpt-5      1     30   61.48%      75.56%   
4       GPT-5      gpt-5-mini      1     10   60.37%      67.04%   
5       GPT-5      gpt-5-mini      1     30   59.63%      68.52%   
6       GPT-5           gpt-5      1     10   59.63%      68.15%   
7       GPT-5           gpt-5      1     30   57.78%      66.67%   

  One-Document-QA  
0          72.96%  
1          72.96%  
2          72.96%  
3          72.96%  
4          68.15%  
5          68.15%  
6          68.15%  
7          68.15%  


In [15]:
# One set of case_ids per (model, retrieval_count) pair, keeping only rows
# where the retrieval model and the QnA model are the same.
case_sets = []
for model in ['gpt-5-mini', 'gpt-5']:
    for threshold in [10, 30]:
        cases = rag_top1.loc[
            (rag_top1['retrieval_count'] == threshold)
            & (rag_top1['retrieval_model'] == model)
            & (rag_top1['qna_model'] == model),
            'case_id',
        ].tolist()
        case_sets.append(set(cases))
        # The printed count is the number of matching rows, not the set size.
        print(f"Model: {model:10s}, Threshold: {threshold} -> {len(cases):2d} cases")


Model: gpt-5-mini, Threshold: 10 -> 36 cases
Model: gpt-5-mini, Threshold: 30 -> 40 cases
Model: gpt-5     , Threshold: 10 -> 36 cases
Model: gpt-5     , Threshold: 30 -> 39 cases


In [16]:
# Intersect every case set; start from a copy so case_sets[0] is left intact.
common_cases = set(case_sets[0])
for case_set in case_sets[1:]:
    common_cases.intersection_update(case_set)
print(f"Common case_ids across all {len(case_sets)} combinations: {len(common_cases)}")

Common case_ids across all 4 combinations: 33


In [17]:
def get_accuracies(threshold, model, case_ids):
    """Mean accuracy of the three settings when one model does both jobs.

    Args:
        threshold: value matched against the ``retrieval_count`` column.
        model: used for both ``retrieval_model`` and ``qna_model`` (same
            value); ``'gpt-5'`` selects ``gpt5_onedoc``, anything else
            ``gpt5mini_onedoc``.
        case_ids: collection of case ids; only rows whose ``case_id`` is in it
            are used.

    Returns:
        ``(baseline_acc, internal_qa_acc, onedoc_acc)``, each the mean of the
        ``accuracy`` column of the selected rows (NaN when nothing matches).
    """
    def _mean_accuracy(frame):
        # Same filter for the RAG and the baseline tables.
        selected = (
            frame['case_id'].isin(case_ids)
            & (frame['retrieval_count'] == threshold)
            & (frame['retrieval_model'] == model)
            & (frame['qna_model'] == model)
        )
        return frame.loc[selected, 'accuracy'].mean()

    internal_qa_acc = _mean_accuracy(rag_top1)
    baseline_acc = _mean_accuracy(baseline_top1)

    # The one-document tables are per model and are filtered by case only.
    onedoc_df = gpt5_onedoc if model == 'gpt-5' else gpt5mini_onedoc
    onedoc_acc = onedoc_df.loc[onedoc_df['case_id'].isin(case_ids), 'accuracy'].mean()

    return baseline_acc, internal_qa_acc, onedoc_acc

# Build results
results = []
for model in ['gpt-5-mini', 'gpt-5']:
    for threshold in [10, 30]:
        baseline, internal, onedoc = get_accuracies(threshold, model, common_cases)
        model_name = 'GPT-5-Mini' if model == 'gpt-5-mini' else 'GPT-5'
        results.append({
            'Model': model_name,
            'Top K': 1,
            'Top C': threshold,
            'Baseline': f"{baseline:.2f}%",
            'Internal-QA': f"{internal:.2f}%",
            'One-Document-QA': f"{onedoc:.2f}%"
        })

# Create DataFrame
df = pd.DataFrame(results)
print(df)

        Model  Top K  Top C Baseline Internal-QA One-Document-QA
0  GPT-5-Mini      1     10   61.62%      72.05%          73.40%
1  GPT-5-Mini      1     30   61.95%      74.07%          73.40%
2       GPT-5      1     10   60.27%      69.36%          69.70%
3       GPT-5      1     30   59.26%      68.01%          69.70%


In [18]:
def get_accuracies(threshold, model):
    """Mean accuracies on the cases selected by the Internal-QA (RAG) filter.

    The case ids of the RAG rows matching ``threshold`` and ``model`` are
    reused as-is to restrict the baseline and one-document tables.

    Args:
        threshold: value matched against the ``retrieval_count`` column.
        model: used for both ``retrieval_model`` and ``qna_model``; ``'gpt-5'``
            selects ``gpt5_onedoc``, anything else ``gpt5mini_onedoc``.

    Returns:
        ``(baseline_acc, internal_qa_acc, onedoc_acc, n_cases)`` where
        ``n_cases`` is the number of matching RAG rows.
    """
    # The RAG rows decide which cases are evaluated.
    rag_mask = (
        (rag_top1['retrieval_count'] == threshold)
        & (rag_top1['retrieval_model'] == model)
        & (rag_top1['qna_model'] == model)
    )
    rag_subset = rag_top1.loc[rag_mask]
    case_ids = rag_subset['case_id'].tolist()

    # Baseline on the same cases and the same configuration.
    baseline_mask = (
        baseline_top1['case_id'].isin(case_ids)
        & (baseline_top1['retrieval_count'] == threshold)
        & (baseline_top1['retrieval_model'] == model)
        & (baseline_top1['qna_model'] == model)
    )
    baseline_acc = baseline_top1.loc[baseline_mask, 'accuracy'].mean()

    # One-Document-QA on the same cases, from the table of the given model.
    onedoc_df = gpt5_onedoc if model == 'gpt-5' else gpt5mini_onedoc
    onedoc_acc = onedoc_df.loc[onedoc_df['case_id'].isin(case_ids), 'accuracy'].mean()

    return baseline_acc, rag_subset['accuracy'].mean(), onedoc_acc, len(case_ids)

# One table row per (model, Top C) pair; each pair uses its own case set,
# whose size is reported in the 'N Cases' column.
results = []
for model in ['gpt-5-mini', 'gpt-5']:
    # Display label for the model column.
    model_name = 'GPT-5-Mini' if model == 'gpt-5-mini' else 'GPT-5'
    for threshold in [10, 30]:
        baseline, internal, onedoc, n_cases = get_accuracies(threshold, model)
        row = {
            'Model': model_name,
            'Top K': 1,
            'Top C': threshold,
            'Baseline': f"{baseline:.2f}%",
            'Internal-QA': f"{internal:.2f}%",
            'One-Document-QA': f"{onedoc:.2f}%",
            'N Cases': n_cases
        }
        results.append(row)

# Assemble the rows into a table and show it.
df = pd.DataFrame(results)
print(df)

        Model  Top K  Top C Baseline Internal-QA One-Document-QA  N Cases
0  GPT-5-Mini      1     10   61.73%      71.60%          72.84%       36
1  GPT-5-Mini      1     30   61.94%      73.33%          71.67%       40
2       GPT-5      1     10   60.19%      69.14%          69.75%       36
3       GPT-5      1     30   59.83%      68.09%          70.09%       39


In [10]:
def _pair_accuracies(threshold, retrieval_model, qna_model):
    """Accuracies for one (Top C, retrieval model, QnA model) combination.

    NOTE(review): this cell originally called
    ``get_accuracies(threshold, retrieval_model, qna_model)`` and read the
    result as a dict, but no ``get_accuracies`` definition in this notebook
    has that signature or return type, so the cell failed on a fresh kernel.
    This helper is a reconstruction: the case selection reproduces the case
    counts recorded in the cell output; the baseline and one-document
    computation follows the other per-configuration helpers in the notebook
    and should be confirmed against the intended analysis.

    Args:
        threshold: value matched against the ``retrieval_count`` column.
        retrieval_model: value matched against the ``retrieval_model`` column.
        qna_model: value matched against the ``qna_model`` column; ``'gpt-5'``
            selects ``gpt5_onedoc``, anything else ``gpt5mini_onedoc``.

    Returns:
        A dict with keys ``'baseline'``, ``'internal_qa'``, ``'one_doc'`` and
        ``'n_cases'``, or ``None`` when no RAG row matches the combination.
    """
    # The RAG rows decide which cases are evaluated.
    rag_subset = rag_top1.loc[
        (rag_top1['retrieval_count'] == threshold)
        & (rag_top1['retrieval_model'] == retrieval_model)
        & (rag_top1['qna_model'] == qna_model)
    ]
    if rag_subset.empty:
        return None
    case_ids = rag_subset['case_id'].tolist()

    baseline_mask = (
        baseline_top1['case_id'].isin(case_ids)
        & (baseline_top1['retrieval_count'] == threshold)
        & (baseline_top1['retrieval_model'] == retrieval_model)
        & (baseline_top1['qna_model'] == qna_model)
    )
    onedoc_df = gpt5_onedoc if qna_model == 'gpt-5' else gpt5mini_onedoc

    return {
        'baseline': baseline_top1.loc[baseline_mask, 'accuracy'].mean(),
        'internal_qa': rag_subset['accuracy'].mean(),
        'one_doc': onedoc_df.loc[onedoc_df['case_id'].isin(case_ids), 'accuracy'].mean(),
        'n_cases': len(case_ids),
    }


results = []

# Same sweep over retrieval models and Top C for each QnA model; the header
# strings are the ones the two original copy-pasted loops printed.
for qna_model, qna_name, header in [
    ('gpt-5-mini', 'GPT-5-Mini', "=== GPT-5-Mini QnA ==="),
    ('gpt-5', 'GPT-5', "\n=== GPT-5 QnA ==="),
]:
    print(header)
    for retrieval_model in ['gpt-5-mini', 'gpt-5']:
        for threshold in [10, 30]:
            result = _pair_accuracies(threshold, retrieval_model, qna_model)
            if result is None:
                # No rows for this combination: nothing to report.
                continue
            results.append({
                'QnA Model': qna_name,
                'Retrieval Model': retrieval_model,
                'Top K': 1,
                'Top C': threshold,
                'Baseline': f"{result['baseline']:.2f}%",
                'Internal-QA': f"{result['internal_qa']:.2f}%",
                'One-Document-QA': f"{result['one_doc']:.2f}%",
                'N Cases': result['n_cases']
            })
            print(f"Retrieval: {retrieval_model}, Threshold: {threshold}, Cases: {result['n_cases']}")

=== GPT-5-Mini QnA ===
Retrieval: gpt-5-mini, Threshold: 10, Cases: 36
Retrieval: gpt-5-mini, Threshold: 30, Cases: 40
Retrieval: gpt-5, Threshold: 10, Cases: 37
Retrieval: gpt-5, Threshold: 30, Cases: 39

=== GPT-5 QnA ===
Retrieval: gpt-5-mini, Threshold: 10, Cases: 37
Retrieval: gpt-5-mini, Threshold: 30, Cases: 39
Retrieval: gpt-5, Threshold: 10, Cases: 36
Retrieval: gpt-5, Threshold: 30, Cases: 39


In [None]:
def get_accuracies(threshold, qna_model, case_ids):
    """Mean accuracy of the three settings for one QnA model.

    Rows are not filtered by ``retrieval_model``, so every retrieval model
    present in the tables contributes to the RAG and baseline means.

    Args:
        threshold: value matched against the ``retrieval_count`` column.
        qna_model: value matched against the ``qna_model`` column; ``'gpt-5'``
            selects ``gpt5_onedoc``, anything else ``gpt5mini_onedoc``.
        case_ids: collection of case ids; only rows whose ``case_id`` is in it
            are used.

    Returns:
        ``(baseline_acc, internal_qa_acc, onedoc_acc)``, each the mean of the
        ``accuracy`` column of the selected rows (NaN when nothing matches).
    """
    def _mean_accuracy(frame):
        # Same filter for the RAG and the baseline tables.
        selected = (
            frame['case_id'].isin(case_ids)
            & (frame['retrieval_count'] == threshold)
            & (frame['qna_model'] == qna_model)
        )
        return frame.loc[selected, 'accuracy'].mean()

    internal_qa_acc = _mean_accuracy(rag_top1)
    baseline_acc = _mean_accuracy(baseline_top1)

    # The one-document tables are per QnA model and are filtered by case only.
    onedoc_df = gpt5_onedoc if qna_model == 'gpt-5' else gpt5mini_onedoc
    onedoc_acc = onedoc_df.loc[onedoc_df['case_id'].isin(case_ids), 'accuracy'].mean()

    return baseline_acc, internal_qa_acc, onedoc_acc

In [3]:
# Keep only the top_k == 1 rows of each result table.
rag_top1 = rag_df.loc[rag_df['top_k'] == 1]
baseline_top1 = baseline_df.loc[baseline_df['top_k'] == 1]

In [4]:
def get_accuracies(threshold, qna_model):
    """Mean accuracies on the cases selected by the Internal-QA (RAG) filter.

    The RAG rows matching ``threshold`` and ``qna_model`` (any retrieval
    model) supply the case ids used to restrict the other two tables.

    NOTE(review): the baseline filter also ignores ``retrieval_model``, so a
    case that appears in the RAG rows under one retrieval model pulls in the
    baseline rows of every retrieval model for that case — confirm this
    pooling is intended.

    Args:
        threshold: value matched against the ``retrieval_count`` column.
        qna_model: value matched against the ``qna_model`` column; ``'gpt-5'``
            selects ``gpt5_onedoc``, anything else ``gpt5mini_onedoc``.

    Returns:
        ``(baseline_acc, internal_qa_acc, onedoc_acc)``, each the mean of the
        ``accuracy`` column of the selected rows (NaN when nothing matches).
    """
    # The RAG rows decide which cases are evaluated.
    rag_mask = (
        (rag_top1['retrieval_count'] == threshold)
        & (rag_top1['qna_model'] == qna_model)
    )
    rag_subset = rag_top1.loc[rag_mask]
    case_ids = rag_subset['case_id'].tolist()

    # Baseline on the same cases.
    baseline_mask = (
        baseline_top1['case_id'].isin(case_ids)
        & (baseline_top1['retrieval_count'] == threshold)
        & (baseline_top1['qna_model'] == qna_model)
    )
    baseline_acc = baseline_top1.loc[baseline_mask, 'accuracy'].mean()

    # One-Document-QA on the same cases, from the table of the QnA model.
    onedoc_df = gpt5_onedoc if qna_model == 'gpt-5' else gpt5mini_onedoc
    onedoc_acc = onedoc_df.loc[onedoc_df['case_id'].isin(case_ids), 'accuracy'].mean()

    return baseline_acc, rag_subset['accuracy'].mean(), onedoc_acc

In [5]:
# Tab-separated row per Top C value for the GPT-5-Mini QnA model:
# Top K, Top C, baseline, Internal-QA, One-Document-QA.
for threshold in (10, 30):
    baseline, internal, onedoc = get_accuracies(threshold, 'gpt-5-mini')
    print(f"1\t{threshold}\t{baseline:.2f}%\t{internal:.2f}%\t{onedoc:.2f}%")

1	10	62.40%	71.39%	72.07%
1	30	62.45%	73.70%	71.54%


In [6]:
# Tab-separated row per Top C value for the GPT-5 QnA model:
# Top K, Top C, baseline, Internal-QA, One-Document-QA.
for threshold in (10, 30):
    baseline, internal, onedoc = get_accuracies(threshold, 'gpt-5')
    print(f"1\t{threshold}\t{baseline:.2f}%\t{internal:.2f}%\t{onedoc:.2f}%")

1	10	60.73%	68.95%	69.67%
1	30	59.97%	68.95%	70.19%
