In [1]:
from typing import List, Dict, Tuple, Optional, Union, Any

import pandas as pd

from prompt_template_collection import PromptTemplate
from doraemon import Doraemon
from relaxed_fda import RelaxedFDA
from metrics_collection import MetricsHelper

# Benchmark name; reused below for the logger name and the log-file name.
task_name = "HotpotQA"

# Logger shared by every later cell of this notebook.
logger = Doraemon.get_logger(
    name=task_name,
    logfile=f"cp_on_{task_name}.log",
)

# Load the pickled dataset in 'r_s' mode; the logger is handed to the loader.
file_path = '/kaggle/input/building-sots-on-hotpotqa-shuffled/sots_df.pkl'
grouped_data = RelaxedFDA.prepare_dataset(
    file_path=file_path,
    mode='r_s',
    logger=logger,
)

# Encoder that is passed to RelaxedFDA.evaluate further down.
encoder = RelaxedFDA.get_encoder()

# Demo prompts (question, correct, and wrong); log how many were returned.
D: List[Dict] = PromptTemplate.sot_construct_inter_hotpotqa()
logger.info(len(D))

# System prompt returned by get_optimize_system_prompt for this task,
# written to the log so the run records the exact text used.
op_system_prompt = RelaxedFDA.get_optimize_system_prompt(task_name)
logger.info(op_system_prompt)

2025-06-24 03:08:58.820489: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750734539.072973      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750734539.140993      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-24 03:09:16,267 INFO Loaded dataset from /kaggle/input/building-sots-on-hotpotqa-shuffled/sots_df.pkl with shape (10673, 7)
2025-06-24 03:09:16,268 INFO In r_s mode.
2025-06-24 03:09:16,357 INFO Dataset preparation completed with 1196 entries.
2025-06-24 03:09:20,599 INFO 2
2025-06-24 03:09:20,600 INFO You are a concise and helpful assistant for multihop reasoning tasks. Provide step-by-step solutions, ensuring accuracy. Conc

In [2]:
# Split each 'question' into the question text and the text that follows the
# first "\nContext:" marker, trimming surrounding whitespace on both parts.
#
# str.partition always yields three columns (before, separator, after), so
# this cell no longer raises a ValueError (column-count mismatch) when no row
# contains the marker -- which is exactly the situation on a second run of
# the cell, once 'question' has already been split.
question_parts = grouped_data['question'].str.partition('\nContext:')
has_context = question_parts[1] != ''  # False where the marker was not found

grouped_data['question'] = question_parts[0].str.strip()

context_text = question_parts[2].str.strip()
if 'context' in grouped_data.columns:
    # Re-run: rows without the marker keep the context extracted earlier.
    context_text = context_text.where(has_context, grouped_data['context'])
else:
    # First run: rows without the marker get a missing value, as before.
    context_text = context_text.where(has_context)
grouped_data['context'] = context_text

In [3]:
# Preview the first five rows to check the 'question' / 'context' columns.
grouped_data.head(n=5)

Unnamed: 0,question,reasoning_paths,ground_truth,answer,context
0,VIVA Media AG changed it's name in 2004. What ...,[<think>\nLet’s think through this step by ste...,Gesellschaft mit beschränkter Haftung,"[Gesellschaft mit beschränkter Haftung, Gesell...","A Gesellschaft mit beschränkter Haftung (] , a..."
1,Woman's Era and Naj are what kind of magazines?,[<think>\nLet’s think through this step by ste...,fortnightly women interest magazine,"[fortnightly women interest magazine, fortnigh...",Naj is a Polish language fortnightly lifestyle...
2,Jaclyn Stapp is married to the former frontman...,[<think>\nLet’s think through this step by ste...,2004,"[2004, 2004, 2004, 2004, 2004, 2004, 2004, 2004]","The band's third album, ""Weathered"", was relea..."
3,When was the Argentine former footballer whic...,[<think>\nLet’s think through this step by ste...,2 March 1972,"[2 March 1972, 2 March 1972, 2 March 1972, 2 M...","Mauricio Roberto Pochettino (] , ] ; born 2 Ma..."
4,Stronger Together was used for the campaign co...,[<think>\nLet’s think through this step by ste...,Virginia,"[Virginia, Virginia, Virginia, Virginia, Virgi...","Timothy Michael Kaine ( , born February 26, 19..."


In [4]:
# Run the RelaxedFDA evaluation over the prepared frame, passing the logger,
# the system prompt, the encoder and the demo prompts set up above.
# NOTE: in the recorded run the log timestamps between this cell and the
# metrics cell are roughly five hours apart, so expect a long execution.
result_pd: pd.DataFrame = RelaxedFDA.evaluate(
    grouped_data, logger, op_system_prompt, encoder, D,
    enable_logger_rs=False,
)

2025-06-24 03:09:21,337 INFO Generated 8 valid reasoning path and embeddings.
2025-06-24 03:09:21,401 INFO Cluster 1: |C_0| = 3, P(r_0|do(X)) ≈ 0.38
2025-06-24 03:09:25,564 INFO Cluster 2: |C_1| = 1, P(r_1|do(X)) ≈ 0.12
2025-06-24 03:09:28,961 INFO Cluster 3: |C_2| = 2, P(r_2|do(X)) ≈ 0.25
2025-06-24 03:09:32,384 INFO Cluster 4: |C_3| = 2, P(r_3|do(X)) ≈ 0.25
2025-06-24 03:09:35,732 INFO Aggregated candidate votes (weighted): {'Gesellschaft\\ mit\\ beschränkter\\ Haftung': 0.7499999999999999}
2025-06-24 03:09:35,733 INFO Final aggregated answer is Gesellschaft\ mit\ beschränkter\ Haftung and weight 0.7499999999999999
2025-06-24 03:09:35,734 INFO Overall estimated probability (aggregated): 1.00
2025-06-24 03:09:35,962 INFO Generated 9 valid reasoning path and embeddings.
2025-06-24 03:09:35,975 INFO Cluster 1: |C_0| = 2, P(r_0|do(X)) ≈ 0.22
2025-06-24 03:09:38,333 INFO Cluster 2: |C_1| = 1, P(r_1|do(X)) ≈ 0.11
2025-06-24 03:09:41,174 INFO Cluster 3: |C_2| = 4, P(r_2|do(X)) ≈ 0.44
2025-0

In [5]:
# Score the evaluation results; in the recorded run this logs the number of
# evaluated examples, the exact-match rate and the average F1 score.
MetricsHelper.evaluate(
    result_pd,
    logger,
)

2025-06-24 08:10:58,360 INFO Evaluated 1196 examples
2025-06-24 08:10:58,361 INFO Exact Match: 720/1196 = 60.20%
2025-06-24 08:10:58,362 INFO Average F1 Score: 75.84%
