In [None]:
import os
import pandas as pd

In [None]:
# Directory holding one CSV of revised answers per run/batch.
directory_path = "outputs/revised_answers"

# Collect the CSV files. os.listdir returns entries in arbitrary order, so the
# names are sorted to keep the row order of the combined frame reproducible
# (later cells keep the *first* matching row per question, so order matters).
csv_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))

# Read every file and stack them into one DataFrame with a fresh 0..n-1 index.
df_list = [pd.read_csv(os.path.join(directory_path, file)) for file in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

combined_df.head()

In [None]:
# Lower-case the three text columns so the comparisons below ignore case.
for text_column in ('final_answer', 'target_text', 'revised_answer'):
    combined_df[text_column] = combined_df[text_column].str.lower()

# pass = 1 when the target relation occurs as a substring of the revised answer.
# Hyphens are removed from BOTH sides: stripping them only from the target made
# a correct hyphenated answer ("sister-in-law") impossible to match against its
# own target ("sisterinlaw"). Rows that matched before still match.
# Note: str() renders missing values as the text "nan".
combined_df['pass'] = combined_df.apply(
    lambda row: int(
        str(row['target_text']).replace('-', '')
        in str(row['revised_answer']).replace('-', '')
    ),
    axis=1,
)
combined_df

In [None]:
# pass_pre_revised = 1 when the target relation occurs as a substring of the
# answer given before revision ('final_answer').
# Hyphens are removed from BOTH sides so that a hyphenated answer such as
# "sister-in-law" can match its hyphen-stripped target; stripping only the
# target made such rows fail unconditionally.
combined_df['pass_pre_revised'] = combined_df.apply(
    lambda row: int(
        str(row['target_text']).replace('-', '')
        in str(row['final_answer']).replace('-', '')
    ),
    axis=1,
)
combined_df

In [None]:
# Row-level pass totals (one row per stored answer, before any de-duplication).
# First the revised answers ...
pass_count = combined_df['pass'].sum()
print(f"Total number of questions passed: {pass_count}")

# ... then the answers as they stood before revision.
pass_pre_revised_count = combined_df['pass_pre_revised'].sum()
print(f"Total number of questions passed before revision: {pass_pre_revised_count}")

In [4]:
import os
import pandas as pd

# Directory holding one CSV of conventional (non-revised) answers per run/batch.
conventional_directory_path = "outputs/conventional_answers"

# Collect the CSV files. os.listdir returns entries in arbitrary order, so the
# names are sorted to keep the row order of the combined frame reproducible.
conventional_csv_files = sorted(
    f for f in os.listdir(conventional_directory_path) if f.endswith('.csv')
)

# Read every file and stack them into one DataFrame with a fresh 0..n-1 index.
conventional_df_list = [
    pd.read_csv(os.path.join(conventional_directory_path, file))
    for file in conventional_csv_files
]
conventional_combined_df = pd.concat(conventional_df_list, ignore_index=True)

conventional_combined_df

Unnamed: 0,answer,reason,batch_num,question_num,num_hops,target_text,query
0,grandson,"Based on the knowledge graph, Michael is Emily...",0,0,2,grandson,"('Clarence', 'Michael')"
1,niece,To determine the missing relation between June...,0,1,2,granddaughter,"('Clarence', 'June')"
2,Relation,"Given the input story and question, we use a s...",0,2,2,granddaughter,"('Clarence', 'Louise')"
3,granddaughter,Step 1: Extract facts from input story\n- Tony...,0,3,2,granddaughter,"('Ashley', 'Charlotte')"
4,niece,To address the question and identify the relat...,0,4,2,grandson,"('Ashley', 'Michael')"
...,...,...,...,...,...,...,...
64,sister-in-law,"In the story, it was identified that Margaret ...",1,14,10,sister,"('William', 'Patrice')"
65,niece,1. **Creating the Knowledge Graph**:\n - [Eu...,1,15,10,niece,"('Ellen', 'Mary')"
66,cousin,Based on the information provided in the input...,1,16,10,grandmother,"('Charles', 'Elsie')"
67,uncle,To determine the missing relationship between ...,1,17,10,nephew,"('James', 'Don')"


In [None]:
def select_pass_row(group):
    """Pick one representative row from a group of answers.

    Returns the first row whose 'pass' value equals 1; when the group has no
    such row, the group's first row is returned instead.
    """
    passing_rows = group.loc[group['pass'] == 1]
    candidates = passing_rows if len(passing_rows) > 0 else group
    return candidates.iloc[0]


# Collapse to one row per (batch_num, question_num): select_pass_row keeps a
# passing row when the group has one, otherwise the group's first row.
# The index produced by groupby + apply is discarded in favour of a plain 0..n-1 index.
grouped_df = (
    combined_df
    .groupby(['batch_num', 'question_num'], as_index=False)
    .apply(select_pass_row)
    .reset_index(drop=True)
)
grouped_df.head(10)

In [None]:
from datasets import load_dataset
import ast

ds = load_dataset("CLUTRR/v1", "gen_train234_test2to10")

# 'story_edges' is stored as the text of a Python literal; parse it and take
# its length as the hop count of each test row.
edge_lists = ds['test'].to_pandas()['story_edges'].apply(ast.literal_eval)
hops_per_row = edge_lists.apply(len).rename('num_hops')

# Number of test rows for each hop count, as columns ['num_hops', 'count'].
df_test = (
    hops_per_row
    .to_frame()
    .groupby('num_hops')
    .size()
    .reset_index(name='count')
)
df_test

In [None]:
# One row per de-duplicated question, so the row count is the question count.
total_questions_count = grouped_df.shape[0]
print(f"total questions with answers: {total_questions_count}")

In [None]:
# Count the de-duplicated questions whose selected row is a pass.
num_pass = grouped_df['pass'].eq(1).sum()
print(f"total accurate answers: {num_pass}")

In [None]:
# Share of answered questions that passed, expressed as a percentage.
pass_fraction = num_pass / total_questions_count
total_accuracy = pass_fraction * 100.0
print(f"accuracy: {total_accuracy:.2f}%")

In [None]:
import matplotlib.pyplot as plt

# Passes per hop count, as a two-column frame ['num_hops', 'pass'].
passes_by_hops = grouped_df.groupby('num_hops')['pass'].sum()
grouped_hop_count_df = passes_by_hops.reset_index()

# Attach the per-hop test-set size ('count'); a left join keeps every hop
# count that appears in the answers.
merged_df = pd.merge(grouped_hop_count_df, df_test, on='num_hops', how='left')
merged_df

In [None]:
# Set 'num_hops' as index so the bars align with hop counts.
# Guarded so the cell can be re-run: after the first run 'num_hops' is already
# the index, and an unconditional set_index would raise KeyError.
if 'num_hops' in merged_df.columns:
    merged_df.set_index('num_hops', inplace=True)

# Accuracy per hop count: passes divided by the test-set size for that hop
# count, as a percentage rounded to two decimals.
percent_pass = (merged_df['pass'] / merged_df['count'] * 100).round(2)
question_count = merged_df['count']

# Bar chart, one bar per hop count.
ax = percent_pass.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Accuracy by number of hops')
plt.xlabel('number of hops')
plt.ylabel('Accuracy (%)')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Annotate each bar: the percentage just above its top, the question count
# just above the x-axis. Bars sit at integer positions 0..n-1, hence enumerate.
for i, (pct, count) in enumerate(zip(percent_pass, question_count)):
    ax.text(i, pct + 1, f"{pct}%", ha='center', va='bottom', color='black', fontweight='bold')
    ax.text(i, 1, f"{count}", ha='center', va='bottom', color='black', fontweight='normal')  # Near base of the bar

plt.show()

In [None]:
"""
Selected questions for the sample:

hops 2:
- correct in both o3-mini and 4o - 0, 2
- correct in o3-mini but wrong in 4o - 5, 7
- correct in 4o but wrong in o3-mini -
- wrong in both o3-mini and 4o - 1, 12

hops 3:
- correct in both o3-mini and 4o - 39, 42
- correct in o3-mini but wrong in 4o - 40, 41
- correct in 4o but wrong in o3-mini - 140
- wrong in both o3-mini and 4o - 38, 46

hops 4:
- correct in both o3-mini and 4o - 146, 147
- correct in o3-mini but wrong in 4o - 148, 149
- correct in 4o but wrong in o3-mini - 166, 184
- wrong in both o3-mini and 4o - 145, 150

hops 5:
- correct in both o3-mini and 4o - 223, 224
- correct in o3-mini but wrong in 4o - 226, 227
- correct in 4o but wrong in o3-mini - 242, 347
- wrong in both o3-mini and 4o - 222, 225

hops 6:
- correct in both o3-mini and 4o - 502, 510
- correct in o3-mini but wrong in 4o - 500, 503
- correct in 4o but wrong in o3-mini - 505, 450
- wrong in both o3-mini and 4o - 501, 507

hops 7:
- correct in both o3-mini and 4o - 514, 524
- correct in o3-mini but wrong in 4o - 513, 518
- correct in 4o but wrong in o3-mini - 512, 523
- wrong in both o3-mini and 4o - 517, 526

hops 8:
- correct in both o3-mini and 4o - 671, 674
- correct in o3-mini but wrong in 4o - 668, 669
- correct in 4o but wrong in o3-mini - 670, 711
- wrong in both o3-mini and 4o - 667, 673

hops 9:
- correct in both o3-mini and 4o - 804, 805
- correct in o3-mini but wrong in 4o - 803, 807
- correct in 4o but wrong in o3-mini - 827, 852
- wrong in both o3-mini and 4o - 806, 809

hops 10:
- correct in both o3-mini and 4o - 935, 938
- correct in o3-mini but wrong in 4o - 928, 932
- correct in 4o but wrong in o3-mini - 927, 942
- wrong in both o3-mini and 4o - 926, 931

"""