In [None]:
import numpy as np
import pandas as pd
from doraemon import Doraemon
from kg_rag import Judgment, Fusion, KG_RAG_Tool, Inference


class PromptBuilder:
    """
    Prompt templates and message helpers for the KG-RAG pipeline.

    ``PROMPT`` maps a stage name to the system prompt used for it:

    * ``'rag'``        -- answer the question from the provided contexts.
    * ``'cf_use'``     -- regenerate, assuming the contexts were used improperly.
    * ``'cf_quality'`` -- regenerate, assuming the referred contexts were poor.

    Every template ends with the same instructions: report a confidence in
    [0.00, 1.00] and reply on a single boxed "answer, confidence" line.
    Only ``PROMPT`` is defined in this class; no fusion-stage template
    dictionary is declared here.
    """

    # Instructions shared verbatim by all three templates.
    # The doubled braces are kept exactly as written. They collapse to single
    # braces only if a template is passed through str.format();
    # NOTE(review): confirm whether consumers format these templates or send
    # them to the model as-is.
    _ANSWER_FORMAT = (
        "Always return a confidence score between 0.00 and 1.00 reflecting how confident you are that the final answer is correct. "
        "Output MUST be exactly one line in this format:\n"
        "\\boxed{{final answer, confidence score}}\n"
        "Do not include any other text. Examples:\n"
    )

    PROMPT = {
        'rag': (
            "Use the provided contexts to answer the fact checking question. "
            "If the contexts are incomplete or weak, still provide your best possible answer. "
            + _ANSWER_FORMAT
            + "\\boxed{{answer, 0.92}}\n"
        ),
        'cf_use': (
            "Assume your previous answer is wrong due to improper use of the retrieved contexts. "
            "Carefully re-check the provided contexts and regenerate the answer using one or a few words. "
            + _ANSWER_FORMAT
            + "\\boxed{{answer, 0.95}}\n"
        ),
        'cf_quality': (
            "Assume your previous answer is wrong because the quality of the referred contexts is poor. "
            "Re-select the most relevant parts from the given contexts and regenerate the answer using one or a few words. "
            + _ANSWER_FORMAT
            + "\\boxed{{answer, 0.88}}\n"
        ),
    }

    @classmethod
    def build_user_multi_contents(cls, q, ctxs):
        """
        Render the user message: the question followed by numbered contexts.

        Args:
            q: Question text (interpolated with ``str.format`` semantics).
            ctxs: ``None`` (no contexts), a list/tuple of contexts, or any
                other single object, which is treated as one context.

        Returns:
            ``"Question: <q>"`` followed by one ``"\\nContext<i>: <ctx>"``
            segment per context, numbered from 1.
        """
        if ctxs is None:
            contexts = []
        elif isinstance(ctxs, (list, tuple)):
            contexts = ctxs
        else:
            # Anything that is not a list/tuple counts as a single context.
            contexts = [ctxs]

        message = f"Question: {q}"
        for position, context in enumerate(contexts, start=1):
            message += f"\nContext{position}: {context}"
        return message


def classify(x: str, n: int = 1200, dataset: str = "metaqa") -> pd.DataFrame:
    """
    Load the first ``n`` rows of the pickled question set for a dataset/hop pair.

    Args:
        x: Hop selector, matched case-insensitively after stripping whitespace
            (``None`` is treated as an empty string).
            For ``"metaqa"``: ``"one"`` selects the one-hop file; every other
            value (including ``"two"`` and the empty string) falls back to the
            three-hop file.
            For ``"webqsp"``: only ``"one"`` and ``"three"`` are accepted.
        n: Maximum number of rows to return (passed to ``DataFrame.head``).
        dataset: ``"metaqa"`` or ``"webqsp"``.

    Returns:
        The leading ``n`` rows of the loaded frame. For ``"webqsp"`` the
        columns ``ground_truth`` and ``contexts`` are renamed to ``Label`` and
        ``ctx_topk``.

    Raises:
        ValueError: If ``dataset`` is unknown, or if ``x`` is not a supported
            hop for ``"webqsp"``.
    """
    BASE_PATHS = {
        "metaqa": "/kaggle/input/filtered-multiple-hops-metaqa",
        "webqsp": "/kaggle/input/webqsp"
    }
    # File name per (dataset, hop).
    HOP_FILES = {
        "metaqa": {
            "one": "one_hop_supported.pickle",
            "three": "three_hop_supported.pickle",
        },
        "webqsp": {
            "one": "webqsp_ctxstyle_1200_hop1_nl.pkl",
            "three": "webqsp_ctxstyle_1200_hop3_nl.pkl",
        },
    }

    # Validate dataset input
    if dataset not in BASE_PATHS:
        raise ValueError(f"Invalid dataset '{dataset}'. Must be one of: {list(BASE_PATHS.keys())}")

    hop = (x or "").strip().lower()
    available = HOP_FILES[dataset]

    if hop in available:
        filename = available[hop]
    elif dataset == "metaqa":
        # Existing behaviour: any unrecognised hop silently means three-hop.
        filename = available["three"]
    else:
        # Report the actual dataset and hop value in the error.
        raise ValueError(
            f"❌ No '{hop}' hop file exists for dataset '{dataset}'. "
            f"Must be one of: {list(available.keys())}"
        )

    path = f"{BASE_PATHS[dataset]}/{filename}"

    # NOTE: read_pickle can execute arbitrary code while unpickling; only
    # point this at trusted files.
    df = pd.read_pickle(path)

    if dataset == "webqsp":
        # Align webqsp column names with the ones the notebook reads.
        df = df.rename(columns={
            "ground_truth": "Label",
            "contexts": "ctx_topk"
        })

    return df.head(n)

In [None]:
dataset="metaqa"

df = classify("one", n = 1, dataset=dataset) 

df['query'] = df.apply(lambda row: [
    {"role": "system", "content": PromptBuilder.PROMPT['rag']},
    {"role": "user", "content": PromptBuilder.build_user_multi_contents(row['question'], row['ctx_topk'])}
], axis=1)

df['cf_use'] = df.apply(lambda row: [
    {"role": "system", "content": PromptBuilder.PROMPT['cf_use']},
    {"role": "user", "content": PromptBuilder.build_user_multi_contents(row['question'], row['ctx_topk'])}
], axis=1)

df['cf_quality'] = df.apply(lambda row: [
    {"role": "system", "content": PromptBuilder.PROMPT['cf_quality']},
    {"role": "user", "content": PromptBuilder.build_user_multi_contents(row['question'], row['ctx_topk'])}
], axis=1)

Doraemon.set_provider('gpt3')
logger = Doraemon.get_logger(logfile='rkag_emnlp.log')
tasks = df.to_dict(orient='records')

init_a = await Inference.process_batches(tasks, logger, 'query')
df['init_a'] = pd.Series(init_a, dtype='object')
cf_use_a = await Inference.process_batches(tasks, logger, 'cf_use')
df['cf_use_a'] = pd.Series(cf_use_a, dtype='object')
cf_quality_a = await Inference.process_batches(tasks, logger, 'cf_quality')
df['cf_qual_a'] = pd.Series(cf_quality_a, dtype='object')

In [None]:
# Post-process the collected answers; each step takes the frame and returns
# its replacement, so the statements must run in this order.
# NOTE(review): what each kg_rag helper adds to the frame is not visible in
# this notebook -- presumably the columns read later ('fusion_prob',
# 'is_correct') originate here or in KG_RAG_Tool.eval_accuracy; confirm.
df = Judgment.apply_judgments(df)
# The two Fusion steps are coroutines and also receive the shared logger.
df = await Fusion.resolve_f_cases(df, logger)
df = await Fusion.compute_probabilities(df, logger)

In [None]:
from sklearn.metrics import brier_score_loss
from torchmetrics.classification import BinaryCalibrationError
from calibration_metrics import CalibrationMetrics


def selective_auc_trapz(df, prob_col="final_P", correct_col="is_correct"):
    """
    Area under the accuracy-vs-coverage curve (trapezoidal rule).

    Rows are ranked by descending confidence. For each prefix of the ranking,
    coverage is the fraction of rows kept and accuracy is the mean correctness
    of those rows.

    Args:
        df: Frame holding the confidence and correctness columns.
        prob_col: Confidence column; values are coerced to numeric
            (unparsable -> NaN) and clipped to [0, 1]. NaN confidences are
            ranked last.
        correct_col: Correctness column, cast to int (0/1).

    Returns:
        ``(auc, coverage, accuracy_curve)``. For an empty frame this is
        ``(nan, empty array, empty array)``. The curve starts at coverage
        ``1/n``, so a single-row frame integrates to ``0.0``.
    """
    y = df[correct_col].astype(int).to_numpy()
    p = pd.to_numeric(df[prob_col], errors="coerce").clip(0, 1).to_numpy()
    if len(y) == 0:
        return np.nan, np.array([]), np.array([])

    # Stable sort: tied confidences keep their row order, so the result
    # depends on row order when ties exist.
    order = np.argsort(-p, kind="mergesort")
    y_sorted = y[order]

    n = len(y_sorted)
    kept = np.arange(1, n + 1)
    coverage = kept / n
    accuracy_curve = np.cumsum(y_sorted) / kept

    # NumPy 2.0 renamed trapz to trapezoid and deprecated the old name; use
    # whichever this NumPy provides.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    auc = integrate(accuracy_curve, coverage)
    return float(auc), coverage, accuracy_curve

def ece_l1(df, prob_col="final_P", correct_col="is_correct", n_bins=10):
    """
    Expected calibration error (L1) over equal-width confidence bins.

    Confidences are coerced to numeric and clipped to [0, 1]. Bins are
    ``(k/n_bins, (k+1)/n_bins]``, with a confidence of exactly 0 placed in the
    first bin. The result is the sum over non-empty bins of
    ``bin_fraction * |accuracy - mean_confidence|``.

    Rows whose confidence cannot be parsed (NaN) are dropped before binning,
    matching ``brier``. Otherwise they would fall into the last bin and make
    the whole result NaN.

    Args:
        df: Frame holding the confidence and correctness columns.
        prob_col: Confidence column.
        correct_col: Correctness column, cast to int (0/1).
        n_bins: Number of equal-width bins.

    Returns:
        The ECE as a float, or ``nan`` when no row has a usable confidence.
    """
    y = df[correct_col].astype(int).to_numpy()
    p = pd.to_numeric(df[prob_col], errors="coerce").clip(0, 1).to_numpy(dtype=float)

    usable = ~np.isnan(p)
    y, p = y[usable], p[usable]
    if len(p) == 0:
        return np.nan

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    # right=True gives bins (lo, hi]; index 0 (p == 0) is folded into bin 1.
    bin_of = np.clip(np.digitize(p, edges, right=True), 1, n_bins)

    ece = 0.0
    for b in range(1, n_bins + 1):
        in_bin = bin_of == b
        if not in_bin.any():
            continue
        gap = abs(y[in_bin].mean() - p[in_bin].mean())
        ece += in_bin.mean() * gap
    return float(ece)

def brier(df, prob_col="final_P", correct_col="is_correct"):
    """
    Brier score of the confidences against the 0/1 correctness labels.

    Rows with an unparsable confidence or a missing label are excluded.
    The remaining confidences are clipped to [0, 1] before scoring with
    ``sklearn.metrics.brier_score_loss``.

    Returns:
        The score as a float, or ``nan`` when no row is usable.
    """
    confidences = pd.to_numeric(df[prob_col], errors="coerce")
    labels = df[correct_col].astype("Int64")  # nullable int keeps missing labels as <NA>

    usable = confidences.notna() & labels.notna()
    if usable.any():
        y_true = labels[usable].astype(int).to_numpy()
        y_prob = confidences[usable].clip(0, 1).to_numpy()
        return float(brier_score_loss(y_true, y_prob))
    return np.nan


# Report accuracy and calibration of the fused confidence ('fusion_prob').
# NOTE(review): the metrics below read an 'is_correct' column; whether
# eval_accuracy creates it as a side effect is not visible here -- confirm
# before reordering these statements.
print(KG_RAG_Tool.eval_accuracy(df, pred='fusion_prob', g_t='Label'))
# cov and acc (the curve points) are returned but not used in this cell.
auc, cov, acc = selective_auc_trapz(df, prob_col="fusion_prob", correct_col="is_correct")
# Disabled: L1 ECE via the local ece_l1 helper (note it targets 'final_P', not 'fusion_prob').
# ece = ece_l1(df, prob_col="final_P", correct_col="is_correct", n_bins=10)
bs = brier(df, prob_col="fusion_prob", correct_col="is_correct")

# L2-norm ECE with 10 bins, computed by the project's CalibrationMetrics helper.
ece_l2 = CalibrationMetrics.ece_torchmetrics_binary(
    df, prob_col="fusion_prob", correct_col="is_correct", n_bins=10, norm="l2"
)

print("ECE (L2)",ece_l2)
print("Brier score:", bs)
print("Selective AUC (trapz):", auc)

In [None]:
# Persist the final frame (prompts, answers and derived columns) as a pickle
# at a relative path, i.e. in the kernel's current working directory.
df.to_pickle('rkag_emnlp.pkl')