In [1]:
from typing import List, Dict, Tuple, Optional, Union, Any

import pandas as pd

from prompt_template_collection import PromptTemplate
from doraemon import Doraemon
from relaxed_fda import RelaxedFDA
from metrics_collection import MetricsHelper

task_name = "HotpotQA"

# One logger for the whole notebook; the logfile name is derived from the task name.
logger = Doraemon.get_logger(
    name=task_name,
    logfile=f"cp_on_{task_name}.log",
)

# Build the working dataset from the pickled frame in 'r_s' mode,
# passing the logger so the helper can report what it loaded.
file_path = '/kaggle/input/building-sots-on-hotpotqa-injected/sots_df.pkl'
grouped_data = RelaxedFDA.prepare_dataset(
    file_path=file_path,
    mode='r_s',
    logger=logger,
)

encoder = RelaxedFDA.get_encoder()

# Demo prompts: question, correct, and wrong examples.
D: List[Dict] = PromptTemplate.sot_construct_inter_hotpotqa()
logger.info(len(D))  # record how many demo prompts were loaded

# System prompt used for this task; logged in full for traceability.
op_system_prompt = RelaxedFDA.get_optimize_system_prompt(task_name)
logger.info(op_system_prompt)

2025-06-24 03:02:24.133608: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750734144.398570      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750734144.477920      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-24 03:02:42,995 INFO Loaded dataset from /kaggle/input/building-sots-on-hotpotqa-injected/sots_df.pkl with shape (10709, 7)
2025-06-24 03:02:42,997 INFO In r_s mode.
2025-06-24 03:02:43,079 INFO Dataset preparation completed with 1197 entries.
2025-06-24 03:02:47,730 INFO 2
2025-06-24 03:02:47,731 INFO You are a concise and helpful assistant for multihop reasoning tasks. Provide step-by-step solutions, ensuring accuracy. Conc

In [2]:
# Separate the combined 'question' text into a question part and a context part.
# The pattern is longer than one character, so pandas treats it as a regular
# expression (newline followed by "Context:"); n=1 splits on the first match only,
# so any later occurrence stays inside the context text.
question_parts = grouped_data['question'].str.split(r'\nContext:', n=1, expand=True)

# Column 0 always exists: it is the text before the marker (or the whole string).
grouped_data['question'] = question_parts[0].str.strip()

if 1 in question_parts.columns:
    # At least one row carried the marker; rows without it get a missing value.
    grouped_data['context'] = question_parts[1].str.strip()
elif 'context' not in grouped_data.columns:
    # No row carried the marker and no context was extracted earlier:
    # still provide the column so later cells can rely on it.
    grouped_data['context'] = None
# Otherwise no marker was found but a 'context' column already exists (e.g. this
# cell was re-run after 'question' had already been split). Keep it as is: the
# original two-column assignment raised "Columns must be same length as key"
# here, because expand=True yields a single column when nothing is split.

In [3]:
# Run the evaluation over the prepared dataset. Positional arguments are kept in
# their original order because the callee's signature is not visible here.
# NOTE: in the recorded run the log timestamps span roughly five hours
# (03:02 -> 08:05) for 1197 entries, so expect this cell to be slow.
result_pd: pd.DataFrame = RelaxedFDA.evaluate(
    grouped_data, logger, op_system_prompt, encoder, D,
    enable_logger_rs=False,
)

2025-06-24 03:02:48,556 INFO Generated 9 valid reasoning path and embeddings.
2025-06-24 03:02:48,637 INFO Cluster 1: |C_0| = 3, P(r_0|do(X)) ≈ 0.33
2025-06-24 03:02:52,535 INFO Cluster 2: |C_1| = 3, P(r_1|do(X)) ≈ 0.33
2025-06-24 03:02:55,510 INFO Cluster 3: |C_2| = 2, P(r_2|do(X)) ≈ 0.22
2025-06-24 03:02:58,389 INFO Cluster 4: |C_3| = 1, P(r_3|do(X)) ≈ 0.11
2025-06-24 03:03:01,323 INFO Aggregated candidate votes (weighted): {'Irrelevant': 0.3333333333333333, 'Not provided': 0.4444444444444444, 'Gesellschaft mit beschränkter Haftung': 0.14814814814814814}
2025-06-24 03:03:01,324 INFO Final aggregated answer is Not provided and weight 0.4444444444444444
2025-06-24 03:03:01,325 INFO Overall estimated probability (aggregated): 0.48
2025-06-24 03:03:01,598 INFO Generated 9 valid reasoning path and embeddings.
2025-06-24 03:03:01,613 INFO Cluster 1: |C_0| = 2, P(r_0|do(X)) ≈ 0.22
2025-06-24 03:03:04,277 INFO Cluster 2: |C_1| = 5, P(r_1|do(X)) ≈ 0.56
2025-06-24 03:03:05,487 INFO Cluster 3: 

In [4]:
# Score the evaluation results; in the recorded run the logger reports the number
# of evaluated examples, Exact Match, and average F1.
MetricsHelper.evaluate(
    result_pd,
    logger,
)

2025-06-24 08:05:49,949 INFO Evaluated 1197 examples
2025-06-24 08:05:49,951 INFO Exact Match: 451/1197 = 37.68%
2025-06-24 08:05:49,951 INFO Average F1 Score: 48.78%
