In [1]:
import os
import json
import subprocess
import pandas as pd

def get_text_from_file(file_path: str) -> str:
  '''
  Read a UTF-8 encoded text file and return its entire content.

  Args:
    file_path: Path of the file to read.

  Returns:
    The full text of the file as a single string.
  '''
  with open(file_path, encoding='utf-8') as handle:
    return handle.read()
  
# Name of the experiment; used to build the ../output/<exp_name>/ paths below.
exp_name = 'baseline_topk'

# Question types to evaluate, one evaluation run and one result file per entry.
question_types = [
    'Simple Question (Direct)',
    'Simple Question (Coreferenced)',
    'Comparative Reasoning (All)',
    'Comparative Reasoning (Count) (All)',
    'Quantitative Reasoning (Count) (All)',
    'Logical Reasoning (All)',
    'Verification (Boolean) (All)',
    'Simple Question (Ellipsis)',
    'Quantitative Reasoning (All)',
]

In [2]:
# Run the external evaluation script once per question type. Every run reads the
# same prediction file and is given its own per-type output path via --out_eval_file.
file_path = f"../output/{exp_name}/prediction.json"  # loop-invariant, so built once

# Question types whose evaluation subprocess exited with a non-zero status.
failed_question_types = []

for question_type in question_types:
    out_eval_file = f"../output/{exp_name}/Evaluation/{question_type}.json"
    out_eval_folder = os.path.dirname(out_eval_file)

    # Create the output folder if needed; exist_ok avoids the check-then-create race
    os.makedirs(out_eval_folder, exist_ok=True)

    # Execute evaluation script
    print(f"Executing evaluation script for {question_type}")

    # If it consumes too much memory or gets stuck, try to run the command in the terminal
    cmd = [
        'python3',
        '../../SPICE_code/evaluation/run_subtype_lf.py',
        '--file_path', file_path,
        '--question_type', question_type,
        '--em_only', 'False',
        '--server_link', 'http://localhost:9999/blazegraph/sparql',
        '--out_eval_file', out_eval_file,
        '--context_dist_file', '../../SPICE_dataset/context_distance_test.log'
    ]
    completed = subprocess.run(cmd)

    # Keep going so the remaining types are still evaluated, but remember the failure
    if completed.returncode != 0:
        print(f"Evaluation script failed for {question_type} (exit code {completed.returncode})")
        failed_question_types.append(question_type)

# Fail loudly so later cells do not summarise missing or stale result files.
if failed_question_types:
    raise RuntimeError(
        "Evaluation failed for question type(s): " + ", ".join(failed_question_types)
    )

Executing evaluation script for Simple Question (Direct)
*	Evaluating: 
['../output/baseline_topk/prediction.json']
{'turnID': '100.61.0', 'question_type': 'Simple Question (Direct)', 'description': 'Simple Question', 'question': 'Which male person was the parent of Ludovico II, Marquess of Saluzzo ? [CTX]', 'answer': 'Ludovico I, Marquess of Saluzzo', 's_expression': '(AND (JOIN (R P22) Q1063295) (JOIN P31 Q502895))', 's_expression_cores': ['(AND (JOIN (R P22) Q1063295) (JOIN P31 Q502895))'], 'sparql_delex': 'SELECT ?x WHERE { wd:Q1063295 wdt:P22 ?x . ?x wdt:P31 wd:Q502895 .  }', 'results': ['Q3839126'], 'coreference_resolved_question': 'Which male person was the parent of Ludovico II, Marquess of Saluzzo?', 's_expression_fn': '(AND (JOIN (R father) Ludovico_II,_Marquess_of_Saluzzo) (JOIN instance_of common_name))', 'predicted_s_expression_fn': '(AND (JOIN (R father) Ludovico_II,_Marquess_of_Saluzzo) (JOIN sex_or_gender male))', 'predicted_s_expression': '(AND (JOIN (R P22) Q1063295) 

In [3]:
# Gather one metrics record per question type from the per-type evaluation files.
summary_data = []

for question_type in question_types:
    file_path = f"../output/{exp_name}/Evaluation/{question_type}.json"
    data = json.loads(get_text_from_file(file_path))

    # Each file keeps its metrics under the question-type key; tag the record
    # with that key so it can serve as the row label.
    summary_data.append({**data[question_type], 'question_type': question_type})

# One row per question type, indexed by 'question_type'
summary_df = pd.DataFrame(summary_data).set_index(['question_type'])

# Export to csv, keeping the 'question_type' index as the first column
summary_df.to_csv(f'../output/{exp_name}/Evaluation/summary.csv', index=True)
summary_df.head(10)

Unnamed: 0_level_0,instances,precision,recall,f1score,macro-f1score,em,accuracy
question_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Simple Question (Direct),739.0,0.782356,0.416245,0.543387,0.668827,0.0,
Simple Question (Coreferenced),698.0,0.60687,0.48788,0.540908,0.622774,0.0,
Comparative Reasoning (All),329.0,0.67941,0.097197,0.170065,0.092498,0.0,
Comparative Reasoning (Count) (All),336.0,,,,,0.0,0.044643
Quantitative Reasoning (Count) (All),482.0,,,,,0.0,0.30083
Logical Reasoning (All),421.0,0.977284,0.733341,0.837919,0.444329,0.0,
Verification (Boolean) (All),385.0,,,,,0.0,0.677922
Simple Question (Ellipsis),181.0,0.124539,0.347938,0.183424,0.519337,0.0,
Quantitative Reasoning (All),220.0,0.836179,0.089661,0.161955,0.147124,0.0,
