In [None]:
from typing import List, Dict, Any

import re
import ast
import tiktoken
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

from doraemon import Doraemon

# Shared logger for this notebook; the log file name is handed to Doraemon.get_logger.
# (The literal needs no f-prefix because it contains no placeholders.)
logger = Doraemon.get_logger(name=__name__, logfile="efficiency_comparison.log")

# Pickled chain-of-thought (CoT) tables, one per benchmark.
gsm8k_cot = '/kaggle/input/building-cots-on-gsm8k/cots_df.pkl'
math_cot = '/kaggle/input/building-cots-on-math/cots_df.pkl'
coqa_cot = '/kaggle/input/llama-3-building-cots-on-commonsenseqa/cots_df.pkl'
strqa_cot = '/kaggle/input/deprecated-building-cots-on-strategyqa/cots_df.pkl'
hotpotqa_cot = '/kaggle/input/llama-3-building-cots-on-hotpotqa/cots_df.pkl'
musique_cot = '/kaggle/input/llama-3-building-cots-on-musique/cots_df.pkl'
fever_cot = '/kaggle/input/llama-3-building-cots-on-fever/cots_df.pkl'

# Pickled SoT tables for the same benchmarks, in the same order.
gsm8k_sot = '/kaggle/input/building-gsm8k-sots-dataset/sots.pkl'
math_sot = '/kaggle/input/gpt35-building-sots-on-math/sots_df.pkl'
coqa_sot = '/kaggle/input/building-commonsenseqa-sots-dataset/commonsense-qa-reasoning-path.pkl'
strqa_sot = '/kaggle/input/building-strategyqa-sots-dataset/strategy-qa-sots-reasoning-path.pkl'
hotpotqa_sot = '/kaggle/input/building-hotpotqa-sots-dataset/sots_df.pkl'
musique_sot = '/kaggle/input/building-musique-sots-dataset/musique-sots-dataset.pkl'
fever_sot = '/kaggle/input/building-sot-on-fever/sots_df.pkl'

def count_tokens(text):
    """Return how many tokens ``text`` encodes to with the module-level ``enc``.

    Args:
        text: The string to tokenize.

    Returns:
        The token count as an ``int``, or ``None`` when ``text`` is not a
        string (e.g. ``None`` or NaN from a missing cell). This mirrors
        ``count_steps_sot`` / ``count_steps_cot`` so that a missing value is
        skipped by ``Series.mean()`` instead of raising inside ``apply``.
    """
    if not isinstance(text, str):
        return None
    return len(enc.encode(text))

def count_steps_sot(text):
    """Count the arrow-separated steps inside the first <think>...</think> span.

    Returns ``None`` when ``text`` is not a string or contains no
    ``<think>...</think>`` pair; otherwise the number of non-blank segments
    obtained by splitting the enclosed text on '→'.
    """
    if not isinstance(text, str):
        return None
    found = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
    if found is None:
        return None
    # Blank segments (e.g. from consecutive arrows) are not counted as steps.
    return sum(1 for piece in found.group(1).split('→') if piece.strip())

def count_steps_cot(text):
    """Count reasoning steps (sentences) in a natural-language CoT string.

    The text is cut at the first answer marker (``\\boxed{``, a standalone
    ``A:``, or ``Answer:``); what precedes it is split into sentences at
    whitespace that follows ``.``, ``?`` or ``!``.

    Args:
        text: The chain-of-thought text.

    Returns:
        The number of non-blank sentences before the answer marker, or
        ``None`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None
    # ``\bA:`` requires a word boundary so that words merely ending in "A:"
    # (e.g. "NASA:", "USA:") are not mistaken for the answer marker.
    reasoning = re.split(r'\\boxed\{|\bA:|Answer:', text)[0]
    sentences = re.split(r'(?<=[.?!])\s+', reasoning)
    return sum(1 for sentence in sentences if sentence.strip())

# Tokenizer read as a global by count_tokens; it is created after that function is
# defined but before the function is first called in a later cell.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")  # or "gpt-4"

# CoT and SoT

In [None]:
# Loads every CoT / SoT table, then records per-row token and step counts and
# their dataset-level means. All module-level names (df_*, avg_tokens_*,
# avg_steps_*) and column names are the ones later cells read.
#
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file;
# only load pickles from datasets you trust.


def _load_frame(path, dedup_col=None, to_pandas=False):
    """Read a pickled table and optionally convert and de-duplicate it.

    Args:
        path: Location of the pickle file.
        dedup_col: When given, keep only the first row for each value of this column.
        to_pandas: When true, call ``.to_pandas()`` on the unpickled object first.

    Returns:
        The resulting DataFrame.
    """
    frame = pd.read_pickle(path)
    if to_pandas:
        frame = frame.to_pandas()
    if dedup_col is not None:
        frame = frame.drop_duplicates(subset=[dedup_col], keep='first')
    return frame


def _add_metric(frame, text_col, metric_col, metric_fn, label):
    """Store ``metric_fn`` of ``text_col`` in ``metric_col``; log and return its mean.

    Args:
        frame: DataFrame to annotate (a column is added to it).
        text_col: Column holding the reasoning text.
        metric_col: Name of the column that receives the per-row metric.
        metric_fn: Function applied to every value of ``text_col``.
        label: Prefix of the log message.

    Returns:
        The mean of ``metric_col``.
    """
    frame[metric_col] = frame[text_col].apply(metric_fn)
    average = frame[metric_col].mean()
    logger.info(f"{label}: {average}")
    return average


# ---- Average tokens: CoT ------------------------------------------------------------
df_gsm8k_cot = _load_frame(gsm8k_cot)
avg_tokens_gsm8k_cot = _add_metric(df_gsm8k_cot, 'cots', 'num_tokens', count_tokens, "GSM8K CoT avg tokens")

df_math_cot = _load_frame(math_cot)
avg_tokens_math_cot = _add_metric(df_math_cot, 'cots', 'num_tokens', count_tokens, "MATH CoT avg tokens")

df_coqa_cot = _load_frame(coqa_cot)
avg_tokens_coqa_cot = _add_metric(df_coqa_cot, 'r_s', 'num_tokens', count_tokens, "ComQA CoT avg tokens")

df_strqa_cot = _load_frame(strqa_cot)
avg_tokens_strqa_cot = _add_metric(df_strqa_cot, 'r_s', 'num_tokens', count_tokens, "StrQA CoT avg tokens")

df_hotpotqa_cot = _load_frame(hotpotqa_cot)
avg_tokens_hotpotqa_cot = _add_metric(df_hotpotqa_cot, 'r_s', 'num_tokens', count_tokens, "HotpotQA CoT avg tokens")

df_musique_cot = _load_frame(musique_cot)
avg_tokens_musique_cot = _add_metric(df_musique_cot, 'r_s', 'num_tokens', count_tokens, "MuSiQue CoT avg tokens")

df_fever_cot = _load_frame(fever_cot)
avg_tokens_fever_cot = _add_metric(df_fever_cot, 'r_s', 'num_tokens', count_tokens, "FEVER CoT avg tokens")

# ---- Average tokens: SoT (one row per question; some sources need .to_pandas()) ------
df_gsm8k_sot = _load_frame(gsm8k_sot, dedup_col='question')
avg_tokens_gsm8k_sot = _add_metric(df_gsm8k_sot, 'reason', 'num_tokens', count_tokens, "GSM8K SoT avg tokens")

# The MATH SoT table is de-duplicated on 'query' rather than 'question'.
df_math_sot = _load_frame(math_sot, dedup_col='query')
avg_tokens_math_sot = _add_metric(df_math_sot, 'r_s', 'num_tokens', count_tokens, "MATH SoT avg tokens")

df_coqa_sot = _load_frame(coqa_sot, dedup_col='question', to_pandas=True)
avg_tokens_coqa_sot = _add_metric(df_coqa_sot, 'reason', 'num_tokens', count_tokens, "ComQA SoT avg tokens")

df_strqa_sot = _load_frame(strqa_sot, dedup_col='question', to_pandas=True)
avg_tokens_strqa_sot = _add_metric(df_strqa_sot, 'reason', 'num_tokens', count_tokens, "StrQA SoT avg tokens")

df_hotpotqa_sot = _load_frame(hotpotqa_sot, dedup_col='question')
avg_tokens_hotpotqa_sot = _add_metric(df_hotpotqa_sot, 'reason', 'num_tokens', count_tokens, "HotpotQA SoT avg tokens")

df_musique_sot = _load_frame(musique_sot, dedup_col='question', to_pandas=True)
avg_tokens_musique_sot = _add_metric(df_musique_sot, 'reason', 'num_tokens', count_tokens, "MuSiQue SoT avg tokens")

df_fever_sot = _load_frame(fever_sot, dedup_col='question')
avg_tokens_fever_sot = _add_metric(df_fever_sot, 'r_s', 'num_tokens', count_tokens, "FEVER SoT avg tokens")

# ---- Average steps: CoT -------------------------------------------------------------
avg_steps_gsm8k_cot = _add_metric(df_gsm8k_cot, 'cots', 'cot_steps', count_steps_cot, "GSM8K CoT avg steps")
avg_steps_math_cot = _add_metric(df_math_cot, 'cots', 'cot_steps', count_steps_cot, "MATH CoT avg steps")
avg_steps_coqa_cot = _add_metric(df_coqa_cot, 'r_s', 'cot_steps', count_steps_cot, "ComQA CoT avg steps")
avg_steps_strqa_cot = _add_metric(df_strqa_cot, 'r_s', 'cot_steps', count_steps_cot, "StrQA CoT avg steps")
avg_steps_hotpotqa_cot = _add_metric(df_hotpotqa_cot, 'r_s', 'cot_steps', count_steps_cot, "HotpotQA CoT avg steps")
avg_steps_musique_cot = _add_metric(df_musique_cot, 'r_s', 'cot_steps', count_steps_cot, "MuSiQue CoT avg steps")
avg_steps_fever_cot = _add_metric(df_fever_cot, 'r_s', 'cot_steps', count_steps_cot, "FEVER CoT avg steps")

# ---- Average steps: SoT -------------------------------------------------------------
avg_steps_gsm8k_sot = _add_metric(df_gsm8k_sot, 'reason', 'num_steps', count_steps_sot, "GSM8K SoT avg steps")
avg_steps_math_sot = _add_metric(df_math_sot, 'r_s', 'num_steps', count_steps_sot, "MATH SoT avg steps")
avg_steps_coqa_sot = _add_metric(df_coqa_sot, 'reason', 'num_steps', count_steps_sot, "ComQA SoT avg steps")
avg_steps_strqa_sot = _add_metric(df_strqa_sot, 'reason', 'num_steps', count_steps_sot, "StrQA SoT avg steps")
avg_steps_hotpotqa_sot = _add_metric(df_hotpotqa_sot, 'reason', 'num_steps', count_steps_sot, "HotpotQA SoT avg steps")
avg_steps_musique_sot = _add_metric(df_musique_sot, 'reason', 'num_steps', count_steps_sot, "MuSiQue SoT avg steps")
avg_steps_fever_sot = _add_metric(df_fever_sot, 'r_s', 'num_steps', count_steps_sot, "FEVER SoT avg steps")

In [None]:
# Grouped bar chart of the average token count per dataset for CoT vs. SoT.
datasets = ['GSM8K', 'MATH', 'ComQA', 'StrQA', 'HotpotQA', 'MuSiQue', 'FEVER']
methods_plot1 = ['CoT', 'SoT']

# Rows follow methods_plot1, columns follow datasets.
avg_tokens_plot1 = np.array([
    [
        avg_tokens_gsm8k_cot,
        avg_tokens_math_cot,
        avg_tokens_coqa_cot,
        avg_tokens_strqa_cot,
        avg_tokens_hotpotqa_cot,
        avg_tokens_musique_cot,
        avg_tokens_fever_cot
    ],  # CoT
    [
        avg_tokens_gsm8k_sot,
        avg_tokens_math_sot,
        avg_tokens_coqa_sot,
        avg_tokens_strqa_sot,
        avg_tokens_hotpotqa_sot,
        avg_tokens_musique_sot,
        avg_tokens_fever_sot
    ],   # SoT
])

n_methods = len(methods_plot1)
x = np.arange(len(datasets))
bar_width = 0.3
colors = plt.get_cmap('tab10').colors

fig, ax = plt.subplots(figsize=(8, 5))
for i, method in enumerate(methods_plot1):
    # ax.bar centres each bar on its x value, so the offsets must be symmetric
    # around 0 for the group to sit centred on the dataset's tick.
    offset = (i - (n_methods - 1) / 2) * bar_width
    ax.bar(x + offset, avg_tokens_plot1[i], width=bar_width,
           color=colors[i], label=method)
ax.set_xticks(x)
ax.set_xticklabels(datasets, rotation=15, fontsize=20)
ax.set_ylabel('Avg. Token', fontsize=20)
# ax.set_title('Avg. Token$\\downarrow$ per Method (CoT, SoT)')
ax.legend(title="Method")
fig.tight_layout()
fig.savefig("avg_token_sot_cot.png", dpi=300)
plt.show()

In [None]:
# Grouped bar chart of the average step count per dataset for CoT vs. SoT.
# Relies on `datasets` from the previous plotting cell.
methods_plot1 = ['CoT', 'SoT']

# Rows follow methods_plot1, columns follow datasets.
avg_steps_plot1 = np.array([
    [
        avg_steps_gsm8k_cot,
        avg_steps_math_cot,
        avg_steps_coqa_cot,
        avg_steps_strqa_cot,
        avg_steps_hotpotqa_cot,
        avg_steps_musique_cot,
        avg_steps_fever_cot
    ],  # CoT

    [
        avg_steps_gsm8k_sot,
        avg_steps_math_sot,
        avg_steps_coqa_sot,
        avg_steps_strqa_sot,
        avg_steps_hotpotqa_sot,
        avg_steps_musique_sot,
        avg_steps_fever_sot
    ]  # SoT
])

n_methods = len(methods_plot1)
x = np.arange(len(datasets))
bar_width = 0.3
colors = plt.get_cmap('tab10').colors

fig, ax = plt.subplots(figsize=(8, 5))
for i, method in enumerate(methods_plot1):
    # ax.bar centres each bar on its x value, so the offsets must be symmetric
    # around 0 for the group to sit centred on the dataset's tick.
    offset = (i - (n_methods - 1) / 2) * bar_width
    ax.bar(x + offset, avg_steps_plot1[i], width=bar_width,
           color=colors[i], label=method)
ax.set_xticks(x)
ax.set_xticklabels(datasets, rotation=15, fontsize=20)
ax.set_ylabel('Avg. Step', fontsize=20)
# ax.set_title('Avg. Step$\\downarrow$ per Method (CoT, SoT)')
ax.legend(title="Method")
fig.tight_layout()
fig.savefig("avg_steps_cot_sot.png", dpi=300)
plt.show()

# Causality-based Prompting Framework

In [None]:
# DeCoT token and step averages for the four datasets that have DeCoT results.
# Each pickle is read once, so every frame ends up with both the 'cad_tokens'
# and the 'cot_steps' column.
#
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the file;
# only load pickles from datasets you trust.
decot_strqa = '/kaggle/input/generate-alter-decot-on-strategyqa/alternative_decots.pkl'
deco_hotpotqa = '/kaggle/input/generate-alter-decot-on-hotpotqa/alternative_decots.pkl'
decot_musiqueqa = '/kaggle/input/generate-alter-decot-on-musiqueqa/alternative_decots.pkl'
decot_fever = '/kaggle/input/generate-alter-decot-on-fever/alternative_decots.pkl'


def _decot_avg_tokens(frame, label, max_entities=None):
    """Add 'cad_tokens' (= number of entities * 'tokens') to ``frame``; log and return its mean.

    Args:
        frame: DataFrame with 'entities' and 'tokens' columns.
        label: Dataset name used as the log-message prefix.
        max_entities: When given, at most this many entities are counted per row.

    Returns:
        The mean of the new 'cad_tokens' column.
    """
    def _entity_count(entities):
        if max_entities is None:
            return len(entities)
        return len(entities[:max_entities])

    frame['cad_tokens'] = frame['entities'].map(_entity_count) * frame['tokens']
    average = frame['cad_tokens'].mean()
    logger.info(f"{label} DeCoT avg tokens: {average}")
    return average


def _decot_avg_steps(frame, label):
    """Add 'cot_steps' (steps of the first entry of 'cots') to ``frame``; log and return its mean."""
    frame['cot_steps'] = frame['cots'].apply(lambda cots: count_steps_cot(cots[0]))
    average = frame['cot_steps'].mean()
    logger.info(f"{label} DeCoT avg steps: {average}")
    return average


# ---- Average tokens -----------------------------------------------------------------
# NOTE(review): only StrategyQA caps the entity count at 2; the other datasets
# use every entity. Confirm this asymmetry is intended.
df_decot_strqa = pd.read_pickle(decot_strqa)
avg_tokens_decot_strqa = _decot_avg_tokens(df_decot_strqa, "STRQA", max_entities=2)

df_decot_hotpotqa = pd.read_pickle(deco_hotpotqa)
avg_tokens_decot_hotpotqa = _decot_avg_tokens(df_decot_hotpotqa, "HotpotQA")

df_decot_musiqueqa = pd.read_pickle(decot_musiqueqa)
avg_tokens_decot_musiqueqa = _decot_avg_tokens(df_decot_musiqueqa, "MuSiQue")

df_decot_fever = pd.read_pickle(decot_fever)
avg_tokens_decot_fever = _decot_avg_tokens(df_decot_fever, "FEVER")

# ---- Average steps ------------------------------------------------------------------
avg_steps_decot_strqa = _decot_avg_steps(df_decot_strqa, "STRQA")
avg_steps_decot_hotpotqa = _decot_avg_steps(df_decot_hotpotqa, "HotpotQA")
avg_steps_decot_musiqueqa = _decot_avg_steps(df_decot_musiqueqa, "MuSiQue")
avg_steps_decot_fever = _decot_avg_steps(df_decot_fever, "FEVER")

In [None]:
# Grouped bar chart of the average token cost per dataset for all seven methods.
# Relies on `datasets`, `avg_tokens_plot1` and `avg_steps_plot1` from earlier cells.
methods_plot2 = ['CoT', 'CoT-SC', 'SoT', 'CAD', 'DeCoT', 'CP', 'ACPS']

# Rows follow methods_plot2, columns follow datasets.
avg_tokens_plot2 = np.array([
    # CoT (raw, no multiplier),
    avg_tokens_plot1[0],
    # CoT-SC (×9 cot every question)
    [
        9 * avg_tokens_gsm8k_cot,
        9 * avg_tokens_math_cot,
        9 * avg_tokens_coqa_cot,
        9 * avg_tokens_strqa_cot,
        9 * avg_tokens_hotpotqa_cot,
        9 * avg_tokens_musique_cot,
        9 * avg_tokens_fever_cot
    ],
    # SoT (raw) 20 times data generalization for visualization
    # NOTE(review): this row is the SoT average *step* count (avg_steps_plot1[1])
    # scaled by 20, not the measured SoT token averages (avg_tokens_plot1[1]),
    # although the axis is labelled 'Avg. Token' — confirm this is intended.
    20 * avg_steps_plot1[1],
    # CAD (×2 for get logprob with/without context, only for multi-hop QA & fact checking)
    [
        0,
        0,
        0,
        2 * avg_tokens_strqa_cot,
        2 * avg_tokens_hotpotqa_cot,
        2 * avg_tokens_musique_cot,
        2 * avg_tokens_fever_cot
    ],
    # DeCoT (direct values)
    [
        0,
        0,
        0,
        avg_tokens_decot_strqa,
        avg_tokens_decot_hotpotqa,
        avg_tokens_decot_musiqueqa,
        avg_tokens_decot_fever
    ],
    # CP (9 times reasoning paths + 3 times intervening)
    [
        12 * avg_tokens_gsm8k_cot,
        12 * avg_tokens_math_cot,
        12 * avg_tokens_coqa_cot,
        12 * avg_tokens_strqa_cot,
        12 * avg_tokens_hotpotqa_cot,
        12 * avg_tokens_musique_cot,
        12 * avg_tokens_fever_cot
    ],
    # ACPS (same to cp)
    [
        12 * avg_tokens_gsm8k_sot,
        12 * avg_tokens_math_sot,
        12 * avg_tokens_coqa_sot,
        12 * avg_tokens_strqa_sot,
        12 * avg_tokens_hotpotqa_sot,
        12 * avg_tokens_musique_sot,
        12 * avg_tokens_fever_sot
    ]
])


n_methods = len(methods_plot2)
x = np.arange(len(datasets))
bar_width = 0.12  # Narrower bars to fit 7 methods
colors = plt.get_cmap('tab10').colors  # Optional: extend with + plt.get_cmap('Set2').colors

fig, ax = plt.subplots(figsize=(12, 5))
for i, method in enumerate(methods_plot2):
    # ax.bar centres each bar on its x value, so the offsets must be symmetric
    # around 0 for the group to sit centred on the dataset's tick.
    offset = (i - (n_methods - 1) / 2) * bar_width
    ax.bar(x + offset, avg_tokens_plot2[i], width=bar_width,
           color=colors[i % len(colors)], label=method)

ax.set_xticks(x)
ax.set_xticklabels(datasets, rotation=15, fontsize=20)
ax.set_ylabel('Avg. Token', fontsize=20)
# ax.set_title('Avg. Token$\\downarrow$ per Method (CoT, CoT-SC, SoT, CAD, DeCoT, CP, ACPS)', fontsize=20)
ax.legend(title="Method")
fig.tight_layout()
fig.savefig("avg_token_causality.png", dpi=300)
plt.show()


In [None]:
import ast

# Pickled CAD result tables, one per dataset.
cad_on_strqa = '/kaggle/input/cad-on-strategyqa/cleaned_results.pkl'
cad_on_hotpotqa = '/kaggle/input/cad-on-hotpotqa/cleaned_results.pkl'
cad_on_musique = '/kaggle/input/cad-on-musique/cleaned_results.pkl'
cad_on_fever = '/kaggle/input/cad-on-fever/cad.pkl'

def extract_final_answer(logprobs_with_ctx):
    """Concatenate the 'token' field of every entry in ``logprobs_with_ctx``, in order."""
    pieces = [entry['token'] for entry in logprobs_with_ctx]
    return ''.join(pieces)

# CAD step counts for the datasets whose results store per-token logprobs: the
# generated text is rebuilt from 'logprobs_with_ctx' and its steps are counted.

# STRQA
df_strqa_cad = pd.read_pickle(cad_on_strqa)
df_strqa_cad['final_answer'] = df_strqa_cad['logprobs_with_ctx'].apply(extract_final_answer)
df_strqa_cad['cot_steps'] = df_strqa_cad['final_answer'].apply(count_steps_cot)
strqa_cad_avg_steps = df_strqa_cad['cot_steps'].mean()
logger.info(f"STRQA CAD average steps: {strqa_cad_avg_steps:.2f}")

# MuSiQue (uses the cad_on_musique path defined above instead of repeating the literal)
df_musique_cad = pd.read_pickle(cad_on_musique)
df_musique_cad['final_answer'] = df_musique_cad['logprobs_with_ctx'].apply(extract_final_answer)
df_musique_cad['cot_steps'] = df_musique_cad['final_answer'].apply(count_steps_cot)
# cad requires to get logprob with/without context
musique_cad_avg_steps = df_musique_cad['cot_steps'].mean()
logger.info(f"MuSiQue CAD average steps: {musique_cad_avg_steps:.2f}")

# Convert string to dict and extract the key (reasoning)
def extract_key_from_str_dict(x):
    """Return the first key of a dict given either as a dict or as its literal string.

    Args:
        x: A ``dict``, or a string holding a Python dict literal.

    Returns:
        The first key of the dict, or ``None`` when ``x`` is neither a dict nor
        a string, the string cannot be parsed as a literal, the parsed value is
        not a dict, or the dict is empty.
    """
    if isinstance(x, dict):
        parsed = x
    elif isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Only parse failures are swallowed; unrelated errors still surface.
            return None
        if not isinstance(parsed, dict):
            return None
    else:
        return None
    return next(iter(parsed), None)

# CAD step counts for the datasets whose results store 'answer_with_context':
# the first key of that dict string is taken as the answer text, then its steps are counted.

# HotpotQA
df_hotpotqa_cad = pd.read_pickle(cad_on_hotpotqa).assign(
    final_answer=lambda frame: frame['answer_with_context'].apply(extract_key_from_str_dict),
    cot_steps=lambda frame: frame['final_answer'].apply(count_steps_cot),
)
hotpotqa_cad_avg_steps = df_hotpotqa_cad['cot_steps'].mean()
logger.info(f"HotpotQA CAD average steps: {hotpotqa_cad_avg_steps:.2f}")

# FEVER
df_fever_cad = pd.read_pickle(cad_on_fever).assign(
    final_answer=lambda frame: frame['answer_with_context'].apply(extract_key_from_str_dict),
    cot_steps=lambda frame: frame['final_answer'].apply(count_steps_cot),
)
fever_cad_avg_steps = df_fever_cad['cot_steps'].mean()
logger.info(f"FEVER CAD average steps: {fever_cad_avg_steps:.2f}")

# CoT-SC Avg Step

In [None]:
# CoT-SC step averages: steps are counted per CoT row, averaged within each
# question, and those per-question means are averaged again.


def _mean_steps_per_question(frame, text_col, label):
    """(Re)compute 'cot_steps' on ``frame``; log and return the mean of per-question means.

    Args:
        frame: CoT DataFrame with a 'question' column and the text column.
        text_col: Column holding the chain-of-thought text.
        label: Dataset name used as the log-message prefix.

    Returns:
        The mean over questions of each question's mean step count.
    """
    frame['cot_steps'] = frame[text_col].apply(count_steps_cot)
    average = frame.groupby('question')['cot_steps'].mean().mean()
    logger.info(f"{label} CoT-SC avg steps: {average}")
    return average


final_avg_steps_gsm8k = _mean_steps_per_question(df_gsm8k_cot, 'cots', "GSM8K")
final_avg_steps_math = _mean_steps_per_question(df_math_cot, 'cots', "MATH")
final_avg_steps_coqa = _mean_steps_per_question(df_coqa_cot, 'r_s', "ComQA")
final_avg_steps_strqa = _mean_steps_per_question(df_strqa_cot, 'r_s', "StrQA")
final_avg_steps_hotpotqa = _mean_steps_per_question(df_hotpotqa_cot, 'r_s', "HotpotQA")
final_avg_steps_musique = _mean_steps_per_question(df_musique_cot, 'r_s', "MuSiQue")
final_avg_steps_fever = _mean_steps_per_question(df_fever_cot, 'r_s', "FEVER")

In [None]:
# Grouped bar chart of the average step count per dataset for all seven methods.
# Relies on `datasets` and `avg_steps_plot1` from earlier cells.
methods_plot2 = ['CoT', 'CoT-SC', 'SoT', 'CAD', 'DeCoT', 'CP', 'ACPS']

# Rows follow methods_plot2, columns follow datasets.
avg_steps_plot2 = np.array([
    # CoT
    avg_steps_plot1[0],
    # CoT-SC (same as CoT)
    [
        final_avg_steps_gsm8k,
        final_avg_steps_math,
        final_avg_steps_coqa,
        final_avg_steps_strqa,
        final_avg_steps_hotpotqa,
        final_avg_steps_musique,
        final_avg_steps_fever
    ],
    # SoT
    avg_steps_plot1[1],
    # CAD (only multi-hop/fact-checking have values)
    [
        0,
        0,
        0,
        strqa_cad_avg_steps,
        hotpotqa_cad_avg_steps,
        musique_cad_avg_steps,
        fever_cad_avg_steps
    ],
    # DeCoT
    [
        0,
        0,
        0,
        avg_steps_decot_strqa,
        avg_steps_decot_hotpotqa,
        avg_steps_decot_musiqueqa,
        avg_steps_decot_fever
    ],
    # CP (same as CoT)
    avg_steps_plot1[0],
    # ACPS (same as SoT)
    avg_steps_plot1[1]
])



n_methods = len(methods_plot2)
x = np.arange(len(datasets))
bar_width = 0.12  # narrower for 7 methods
colors = plt.get_cmap('tab10').colors + plt.get_cmap('Set2').colors  # safe for more methods

fig, ax = plt.subplots(figsize=(12, 5))
for i, method in enumerate(methods_plot2):
    # ax.bar centres each bar on its x value, so the offsets must be symmetric
    # around 0 for the group to sit centred on the dataset's tick.
    offset = (i - (n_methods - 1) / 2) * bar_width
    ax.bar(x + offset, avg_steps_plot2[i], width=bar_width,
           color=colors[i % len(colors)], label=method)

ax.set_xticks(x)
ax.set_xticklabels(datasets, rotation=15, fontsize=20)
ax.set_ylabel('Avg. Step', fontsize=20)
# ax.set_title('Avg. Step$\\downarrow$ per Method (CoT, CoT-SC, SoT, CAD, DeCoT, CP, ACPS)', fontsize=20)
ax.legend(title="Method", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig("avg_step_causality.png", dpi=300)
plt.show()