In [1]:
import pandas as pd

# Load the StrategyQA reasoning-path table from the Kaggle input directory
# `building-strategyqa-sots-dataset`.
# NOTE(review): the unpickled object is converted with `.to_pandas()`, so the
# pickle presumably does not hold a pandas DataFrame directly — confirm the
# upstream format.
# NOTE(security): unpickling can execute arbitrary code; only load pickles
# that come from a trusted source.
df=pd.read_pickle('/kaggle/input/building-strategyqa-sots-dataset/strategy-qa-sots-reasoning-path.pkl').to_pandas()
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(10605, 8)

In [2]:
from typing import List
from relaxed_fda import RelaxedFDA
from prompt_template_collection import PromptTemplate
from doraemon import Doraemon

def context_to_str(context):
    """Flatten a context into a single space-separated string.

    Args:
        context: An iterable of strings, or ``None`` when no context exists.

    Returns:
        The items of ``context`` joined by single spaces, or ``''`` when
        ``context`` is ``None``.
    """
    return '' if context is None else ' '.join(context)

# Logger for this run; messages are also written to the named log file.
logger = Doraemon.get_logger(name=__name__, logfile="relaxed_FDA_on_strategy_qa.log")

# Collapse the long-format table (one row per reasoning path) into one row
# per question.
grouped = (
    df
    .groupby('question', sort=False)   # sort=False keeps questions in first-seen order
    .agg(
        reasoning_paths=('reason', list),         # every 'reason' of the group, as a list
        context=('evidence', 'first'),            # first 'evidence' value of the group
        ground_truth=('ground_truth', 'first'),   # first 'ground_truth' value of the group
    )
    .reset_index()                     # turn 'question' back into a regular column
    # Flatten each context into one string ('' when the context is None).
    .assign(context=lambda frame: frame['context'].apply(context_to_str))
)

2025-06-19 13:12:40.624668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750338760.907711      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750338761.012598      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
from typing import List, Dict, Tuple, Optional

# Task label: passed to the system-prompt lookup below and interpolated into
# the accuracy log message at the end of the notebook.
task_name = "StrategyQA"

# Few-shot examples returned by PromptTemplate.sot_construct_inter_strategyqa().
# Original author's note: "correct reasoning with wrong answer".
# NOTE(review): presumably demonstrations whose reasoning is right but whose
# final answer is wrong — confirm against PromptTemplate.
D: List[Dict] = PromptTemplate.sot_construct_inter_strategyqa()

# System prompt looked up for this task; logged so the run records the exact
# prompt text that was used.
op_system_prompt = RelaxedFDA.get_optimize_system_prompt(task_name)
logger.info(op_system_prompt)

# Encoder object handed to RelaxedFDA.evaluate() in the next cell.
# NOTE(review): presumably a text-embedding model (the run log mentions
# "embeddings") — confirm in RelaxedFDA.get_encoder.
encoder=RelaxedFDA.get_encoder()

2025-06-19 13:13:02,376 INFO You are a concise and helpful assistant for commonsense reasoning tasks. Provide step-by-step solutions, ensuring accuracy. Conclude with the final answer in the format: \boxed{answer}. When given a reasoning process, refine it to be both correct and succinct.


In [4]:
# Run the RelaxedFDA evaluation over the per-question table and keep the
# returned results frame for the accuracy computation below.
# Positional inputs: grouped questions, logger, system prompt, encoder and
# the few-shot examples D.
# NOTE(review): `enable_logger_rs=False` presumably turns off some part of the
# logging — confirm its meaning in RelaxedFDA.evaluate.
# NOTE(review): expensive cell — the recorded log timestamps suggest the full
# run took several hours; consider caching `result_pd` before re-running.
result_pd: pd.DataFrame = RelaxedFDA.evaluate(
    grouped,
    logger,
    op_system_prompt,
    encoder,
    D,
    enable_logger_rs=False
)

2025-06-19 13:13:08,257 INFO Generated 9 valid reasoning path and embeddings.
2025-06-19 13:13:08,328 INFO Cluster 1: |C_0| = 1, P(r_0|do(X)) ≈ 0.11
2025-06-19 13:13:11,799 INFO Cluster 2: |C_1| = 3, P(r_1|do(X)) ≈ 0.33
2025-06-19 13:13:13,503 INFO Cluster 3: |C_2| = 2, P(r_2|do(X)) ≈ 0.22
2025-06-19 13:13:15,218 INFO Cluster 4: |C_3| = 3, P(r_3|do(X)) ≈ 0.33
2025-06-19 13:13:16,712 INFO Aggregated candidate votes (weighted): {'False': 1.0}
2025-06-19 13:13:16,713 INFO Final aggregated answer is False and weight 1.0
2025-06-19 13:13:16,714 INFO Overall estimated probability (aggregated): 1.00
2025-06-19 13:13:16,938 INFO Generated 9 valid reasoning path and embeddings.
2025-06-19 13:13:16,954 INFO Cluster 1: |C_0| = 3, P(r_0|do(X)) ≈ 0.33
2025-06-19 13:13:18,253 INFO Cluster 2: |C_1| = 3, P(r_1|do(X)) ≈ 0.33
2025-06-19 13:13:21,840 INFO Cluster 3: |C_2| = 2, P(r_2|do(X)) ≈ 0.22
2025-06-19 13:13:25,545 INFO Cluster 4: |C_3| = 1, P(r_3|do(X)) ≈ 0.11
2025-06-19 13:13:30,148 INFO Aggregate

In [5]:
# Log the accuracy computed by RelaxedFDA.calculate_accuracy over the
# evaluation results, formatted as a percentage with two decimal places.
logger.info(f"Accuracy on {task_name} Dataset:{RelaxedFDA.calculate_accuracy(result_pd):.2%}")

2025-06-19 19:20:36,845 INFO Accuracy on StrategyQA Dataset:84.45%
