In [1]:
from typing import List, Dict, Tuple, Optional, Union, Any

import pandas as pd

from prompt_template_collection import PromptTemplate
from doraemon import Doraemon
from relaxed_fda import RelaxedFDA
from metrics_collection import MetricsHelper

# --- Run configuration -------------------------------------------------------
task_name = "CommonsenseQA"
# Maximum number of rows kept per query (see the groupby/head below).
# Original note: "SoTs 4,6,8,10,12" -- presumably the values swept; TODO confirm.
hp_m = 8
# Passed as `K` to RelaxedFDA.evaluate in the next cell.
# Original note: "cluster 1,3,4,5,7" -- presumably the cluster counts swept; TODO confirm.
hp_k = 4
logger = Doraemon.get_logger(name=task_name, logfile=f"relaxed_fda_on_{task_name}.log")

# Source pickle (read-only Kaggle input) and the location of the truncated copy.
file_path = '/kaggle/input/llama-3-building-sots-on-commonsenseqa/sots_df.pkl'
# One path for both the write and the read below, so the file handed to
# RelaxedFDA.prepare_dataset is always the one just written, regardless of the
# notebook's current working directory.
working_path = '/kaggle/working/sots_df.pkl'

# --- Load, normalise column names, truncate ----------------------------------
# NOTE(security): unpickling can execute arbitrary code; only load pickles from
# a trusted dataset.
df = pd.read_pickle(file_path).rename(
    columns={
        'question': 'query',
        'reason': 'r_s',
        'ground_truth': 'g_t',
        'temperature': 't_p',
    }
)
if hp_m:
    # Keep the first `hp_m` rows of every query, in their original order.
    # A falsy hp_m (0 / None) disables truncation.
    df = df.groupby('query').head(hp_m).reset_index(drop=True)
df.to_pickle(working_path)

# --- Build the inputs consumed by RelaxedFDA.evaluate -------------------------
grouped_data = RelaxedFDA.prepare_dataset(file_path=working_path, logger=logger)

D: List[Dict] = PromptTemplate.sot_construct_inter_commonsenseqa()
op_system_prompt = RelaxedFDA.get_optimize_system_prompt(task_name)
encoder = RelaxedFDA.get_encoder()

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.9/124.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h

2025-07-04 07:19:55.042271: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751613595.277406      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751613595.347696      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-04 07:20:13,990 INFO Loaded dataset from /kaggle/working/sots_df.pkl with shape (9621, 8)
2025-07-04 07:20:13,991 INFO In r_s mode.
2025-07-04 07:20:14,074 INFO Dataset preparation completed with 1204 entries.


In [2]:
# Run the RelaxedFDA evaluation over every prepared entry using the objects
# built in the setup cell. This is the long-running step: the recorded logs
# span several hours between the start of this cell and the metrics cell.
result_pd: pd.DataFrame = RelaxedFDA.evaluate(
    grouped_data, logger, op_system_prompt, encoder, D,
    enable_logger_rs=False,
    ablation='all',
    K=hp_k,  # cluster hyperparameter chosen in the setup cell
)

  0%|          | 0/1204 [00:00<?, ?it/s]2025-07-04 07:20:20,540 INFO Cluster 1: |C_0| = 5, P(r_0|do(X)) ≈ 0.62
2025-07-04 07:20:22,288 INFO Cluster 2: |C_1| = 1, P(r_1|do(X)) ≈ 0.12
2025-07-04 07:20:27,850 ERROR Error while querying LLM: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-05-01-preview have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 1 second. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit. For Free Account customers, upgrade to Pay as you Go here: https://aka.ms/429TrialUpgrade.'}}
Traceback (most recent call last):
  File "/kaggle/usr/lib/relaxed_fda/relaxed_fda.py", line 56, in optimize_rs
    improved_rs, tokens = Doraemon.inference(logger=logger, messages=messages)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/kaggle/usr/lib/doraemon

In [3]:
# Score the evaluation results. In the recorded run this reports Exact Match,
# Average F1 Score and Accuracy through `logger` (see the output below).
MetricsHelper.evaluate(result_pd, logger)

2025-07-04 13:01:16,205 INFO Exact Match: 74.83%
2025-07-04 13:01:16,207 INFO Average F1 Score: 59.36%
2025-07-04 13:01:16,208 INFO Accuracy: 74.75%
