In [None]:
import re
import pandas as pd
from doraemon import Doraemon
from typing import List, Optional, Sequence, Dict, Any
from kg_rag import Inference, KG_RAG_Tool

class Verb2sCoT:
    """
    Two-stage prompting for Verb_2s_CoT:
      - Stage 1 (answer): output = "<reasoning_text> \\boxed{<final answer>}"
      - Stage 2 (prob):   output = "\\boxed{<probability>}"

    Notes:
    - SYSTEM_PROMPT is a single dict with two keys: 'answer' and 'prob'.
    - The old parse_final_triplet is removed (stage 1 no longer includes probability).
    - All prompt strings are used verbatim by msgs_stage1 / msgs_stage2 (they are
      plain literals, never passed through str.format), so braces in the
      \\boxed{...} examples are written singly.
    """

    # -------- Single SYSTEM_PROMPT with two keys --------
    SYSTEM_PROMPT: Dict[str, str] = {
        # =========================
        # Stage 1: Answer prompt
        # =========================
        "answer": (
            "Provide your best guess for the following question.\n"
            "First give ONE short sentence explaining the decisive clue(s) (no step-by-step reasoning).\n"
            "Then on a new line give ONLY the guess with no other words or explanation.\n\n"
            "For example:\n\n"
            "reasoning\\boxed{guess}\n\n"
        ),

        # =========================
        # Stage 2: Probability prompt
        # =========================
        "prob": (
            "Provide the probability that your guess is correct. "
            "Give ONLY the probability, no other words or explanation.\n\n"
            "For example:\n\n"
            "\\boxed{0.85}\n"
        ),
    }

    # ----------- Format contexts helper -----------
    @staticmethod
    def _format_contexts(ctxs: Optional[Sequence[str]]) -> str:
        """
        Render contexts as numbered lines: "Context1: ...", "Context2: ...".

        Returns "" for None or an empty sequence. Emptiness is decided by the
        join itself rather than by a truth test, so array-like inputs whose
        truth value is ambiguous are accepted as well.
        """
        if ctxs is None:
            return ""
        return "\n".join(f"Context{i}: {c}" for i, c in enumerate(ctxs, start=1))

    @classmethod
    def _context_section(cls, contexts: Optional[Sequence[str]]) -> str:
        """Return the "## Provided Contexts" section, or "" when there are no contexts."""
        formatted = cls._format_contexts(contexts)
        if not formatted:
            return ""
        return "## Provided Contexts\n" + formatted + "\n\n"

    # ----------- Stage 1 messages (answer) -----------
    @classmethod
    def msgs_stage1(cls, question: str, contexts: Optional[Sequence[str]] = None) -> List[Dict[str, str]]:
        """
        Build OpenAI-style messages for Stage 1 (answer):
        - System prompt = SYSTEM_PROMPT["answer"].
        - User prompt includes contexts (if any), the question, and a format reminder.
        """
        user_prompt = (
            cls._context_section(contexts)
            # The newline keeps the format reminder from being glued onto the
            # end of the question text.
            + f"Question: {question}\n"
            + "return ONLY like: reasoning\\boxed{guess}\n\n"
        )
        return [
            {"role": "system", "content": cls.SYSTEM_PROMPT["answer"]},
            {"role": "user", "content": user_prompt},
        ]

    # ----------- Stage 2 messages (probability) -----------
    @classmethod
    def msgs_stage2(
        cls,
        final_answer: str,
        question: str,
        contexts: Optional[Sequence[str]] = None,
    ) -> List[Dict[str, str]]:
        """
        Build OpenAI-style messages for Stage 2 (verbalised probability):
        - System prompt = SYSTEM_PROMPT["prob"].
        - User prompt includes contexts (if any), the original question, the
          final answer from Stage 1, and a format reminder.
        """
        user_prompt = (
            cls._context_section(contexts)
            + f"Question: {question}\n"
            # The newline keeps the format reminder from being read as part of
            # the Stage 1 answer.
            + f"Final Answer: {final_answer}\n"
            + "Return ONLY like: \\boxed{0.85}\n"
        )
        return [
            {"role": "system", "content": cls.SYSTEM_PROMPT["prob"]},
            {"role": "user", "content": user_prompt},
        ]



def classify(x: str, n: int = 1200, dataset: str = "metaqa") -> pd.DataFrame:
    """
    Load the pickled question set for a hop count and return its first rows.

    Args:
        x: Hop selector, matched after stripping whitespace and lower-casing.
            - metaqa: "one" loads the one-hop file; every other value
              (including "" / None) falls back to the three-hop file.
            - webqsp: only "one" and "three" are accepted.
        n: Maximum number of rows to return (passed to ``DataFrame.head``).
        dataset: Either "metaqa" or "webqsp".

    Returns:
        The first ``n`` rows of the loaded frame. For "webqsp" the columns
        "ground_truth" and "contexts" are renamed to "Label" and "ctx_topk".

    Raises:
        ValueError: If ``dataset`` is unknown, or if ``x`` is not a supported
            hop selector for "webqsp".
    """
    BASE_PATHS = {
        "metaqa": "/kaggle/input/filtered-multiple-hops-metaqa",
        "webqsp": "/kaggle/input/webqsp"
    }
    # File name per (dataset, hop selector).
    HOP_FILES = {
        "metaqa": {
            "one": "one_hop_supported.pickle",
            "three": "three_hop_supported.pickle",
        },
        "webqsp": {
            "one": "webqsp_ctxstyle_1200_hop1_nl.pkl",
            "three": "webqsp_ctxstyle_1200_hop3_nl.pkl",
        },
    }

    # Validate dataset input
    if dataset not in BASE_PATHS:
        raise ValueError(f"Invalid dataset '{dataset}'. Must be one of: {list(BASE_PATHS.keys())}")

    base = BASE_PATHS[dataset]
    hop = (x or "").strip().lower()
    files = HOP_FILES[dataset]

    # Select file path based on dataset type
    if hop in files:
        filename = files[hop]
    elif dataset == "metaqa":
        # Kept for backward compatibility: metaqa has always treated any
        # unrecognised selector as a request for the three-hop file.
        filename = files["three"]
    else:
        raise ValueError(
            f"Unsupported hop '{hop}' for dataset '{dataset}'. "
            f"Must be one of: {list(files.keys())}"
        )
    path = f"{base}/{filename}"

    # Load and return dataframe
    df = pd.read_pickle(path)

    if dataset == "webqsp":
        # Bring webqsp column names in line with the names the rest of the
        # notebook reads ('Label', 'ctx_topk').
        df = df.rename(columns={
            "ground_truth": "Label",
            "contexts": "ctx_topk"
        })

    return df.head(n)

In [None]:
dataset="webqsp"

df = classify("three", n = 1200, dataset=dataset)

df['query']=df.apply(
    lambda x: Verb2sCoT.msgs_stage1(
        question=x['question'],
        contexts=x['ctx_topk']
    ),
    axis=1
)

Doraemon.set_provider('llama3')
logger=Doraemon.get_logger(logfile='verb_2s_cot.log')
tasks = df.to_dict(orient='records')

q_a = await Inference.process_batches(tasks, logger, 'query')
df['q_a'] = pd.Series(q_a, dtype='object')
df['q_a'] = df['q_a'].apply(KG_RAG_Tool.extract_boxed_answer)

In [None]:
df['query_prob']=df.apply(
    lambda x: Verb2sCoT.msgs_stage2(
        final_answer=x['q_a'],
        question=x['question'],
        contexts=x['ctx_topk']
    ), 
    axis=1
)

tasks = df.to_dict(orient='records')

query_prob_a = await Inference.process_batches(tasks, logger, 'query_prob')
df['q_a_prob'] = pd.Series(query_prob_a, dtype='object')

df['final_a']=df['q_a'].copy()
df['final_prob']=df['q_a_prob'].apply(KG_RAG_Tool.extract_boxed_answer)

print(KG_RAG_Tool.eval_accuracy(df, pred='final_a', g_t='Label'))

In [None]:
from calibration_metrics import CalibrationMetrics

# Summarise calibration of the verbalised probabilities against the
# per-row correctness column, using 10 bins and the "l2" norm setting.
std_summary = CalibrationMetrics.summarize(
    df,
    prob_col="final_prob",
    correct_col="is_correct",
    n_bins=10,
    norm="l2",
)

# Headline numbers on one line, in the order: ece, brier, selective_auc.
print(*(std_summary[metric] for metric in ("ece", "brier", "selective_auc")))

# Keep the reliability table around for inspection.
tbl_std = std_summary["reliability_table"]

In [None]:
# Persist the results frame (prompts, raw and extracted Stage 1/Stage 2 outputs)
# to 'verb2s_cot.pickle' in the current working directory.
df.to_pickle('verb2s_cot.pickle')