In [1]:
import numpy as np
import pandas as pd
import re
from difflib import SequenceMatcher
from pathlib import Path
import itertools

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from rapidfuzz import fuzz



In [2]:
# Result sets to evaluate: each method name and run id select one
# competition_results/<method>/<run_id>/submission.csv file.
METHODS = ["agent", "mrr", "sc", "default", "cot", "cot2"]
RUN_IDS = [1, 2, 3]

# Minimum similarity at which a text-matched option overrides the submitted
# choice: THRESH_LCS for the difflib-based match, THRESH_FUZZY for the
# rapidfuzz-based match.
THRESH_LCS = 0.9
THRESH_FUZZY = 0.95

# Every (method, run_id) pair, methods varying slowest.
PARAM_GRID = [(method, run_id) for method in METHODS for run_id in RUN_IDS]

In [3]:
# Validation-set questions, read from a JSON Lines file (one record per line).
questions = pd.read_json(path_or_buf='data/curebench_valset_pharse1.jsonl', lines=True)

def _norm(s: str) -> str:
    s = "" if s is None else str(s)
    s = s.lower().strip()
    s = re.sub(r"\s+", " ", s)
    return s

def best_option_by_fuzzy(prediction: str, options_dict: dict) -> tuple[str, float]:
    """Find the option whose text is most similar to ``prediction``.

    Both the prediction and each option are normalised with ``_norm`` and then
    scored with a fixed blend of two rapidfuzz ratios (each rescaled from
    0-100 to 0-1): 70% ``token_set_ratio`` plus 30% ``partial_ratio``.

    Args:
        prediction: Free-text answer to match.
        options_dict: Mapping of option key to option text.

    Returns:
        ``(key, score)`` for the highest-scoring option. On a tie the key that
        comes first in ``options_dict`` wins.

    Raises:
        ValueError: If ``options_dict`` is empty (raised by ``max``).
    """
    normalized_prediction = _norm(prediction)

    def blended_score(option_text) -> float:
        candidate = _norm(option_text)
        # Token-set ratio: tolerant of extra words and of word reordering.
        token_set = fuzz.token_set_ratio(normalized_prediction, candidate) / 100.0
        # Partial ratio: rewards a prediction that embeds a snippet of the option.
        partial = fuzz.partial_ratio(normalized_prediction, candidate) / 100.0
        return 0.7 * token_set + 0.3 * partial

    scored = [(key, blended_score(text)) for key, text in options_dict.items()]
    winner, top_score = max(scored, key=lambda pair: pair[1])
    return winner, float(top_score)

def _as_text(value) -> str:
    """Return ``value`` as ``str``; ``None`` and float NaN become ``""``."""
    if isinstance(value, str):
        return value
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _best_option_by_lcs(prediction, options_dict: dict) -> tuple:
    """Return ``(key, ratio)`` of the option most similar to ``prediction``.

    Similarity is ``difflib.SequenceMatcher(None, prediction, option).ratio()``
    on the raw (case-sensitive, un-normalised) text. On a tie the key that
    comes first in ``options_dict`` wins.
    """
    text = _as_text(prediction)
    ratios = {
        key: SequenceMatcher(None, text, _as_text(option)).ratio()
        for key, option in options_dict.items()
    }
    best = max(ratios, key=ratios.get)
    return best, ratios[best]


def _resolve_choices(subset, matcher, threshold, inclusive=False):
    """Score one question-type subset and settle the final choice per row.

    Adds ``pred_LCS`` / ``LCS_similarity`` (best-matching option key and its
    score, from ``matcher``), ``final_choice`` (the matched key when the score
    passes ``threshold``, otherwise the submitted ``choice``) and
    ``is_correct`` to a copy of ``subset``. Works on an empty subset too.

    Args:
        subset: Rows with ``prediction``, ``options``, ``choice`` and
            ``correct_answer`` columns.
        matcher: Callable ``(prediction, options_dict) -> (key, score)``.
        threshold: Similarity threshold for trusting the matched key.
        inclusive: Compare with ``>=`` when true, with ``>`` otherwise.

    Returns:
        ``(scored_frame, n_correct, n_valid, n_invalid)`` where invalid rows
        are those whose final choice is the literal ``'NOTAVALUE'``.
    """
    scored = subset.copy()
    matches = [
        matcher(row["prediction"], dict(row["options"]))
        for _, row in scored.iterrows()
    ]
    # Explicit dtypes keep the columns well-typed even when there are no rows.
    scored["pred_LCS"] = pd.Series(
        [key for key, _ in matches], index=scored.index, dtype=object
    )
    scored["LCS_similarity"] = pd.Series(
        [score for _, score in matches], index=scored.index, dtype=float
    )

    similarity = scored["LCS_similarity"]
    accept = similarity >= threshold if inclusive else similarity > threshold
    scored["final_choice"] = np.where(accept, scored["pred_LCS"], scored["choice"])
    scored["is_correct"] = scored["final_choice"] == scored["correct_answer"]

    n_invalid = int((scored["final_choice"] == 'NOTAVALUE').sum())
    n_valid = len(scored) - n_invalid
    n_correct = int(scored["is_correct"].sum())
    return scored, n_correct, n_valid, n_invalid


def _safe_ratio(numerator, denominator) -> float:
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    return numerator / denominator if denominator else float("nan")


def run(METHOD, RUN_ID, THRESH_LCS, THRESH_FUZZY):
    """Evaluate one submission file against the module-level ``questions``.

    Reads ``competition_results/<METHOD>/<RUN_ID>/submission.csv`` and
    right-merges ``questions`` onto it by ``id``. The rows are split by
    ``question_type``; answers with no matching question keep a NaN type and
    so fall into none of the three subsets. For every row the best-matching
    option is found from the free-text ``prediction``:

    * ``multi_choice`` and ``open_ended_multi_choice``: difflib
      ``SequenceMatcher`` ratio, accepted when strictly greater than
      ``THRESH_LCS``.
    * ``open_ended``: ``best_option_by_fuzzy``, accepted when greater than or
      equal to ``THRESH_FUZZY``.

    When the match is not accepted the submitted ``choice`` is used. Rows
    whose final choice is the literal ``'NOTAVALUE'`` count as invalid and are
    excluded from the accuracy denominator. Accuracy is NaN for a subset with
    no valid answers.

    Args:
        METHOD: Method name (directory under ``competition_results``).
        RUN_ID: Run identifier (sub-directory of the method).
        THRESH_LCS: Acceptance threshold for the SequenceMatcher match.
        THRESH_FUZZY: Acceptance threshold for the fuzzy match.

    Returns:
        ``(acc, det)``. ``acc`` has four rows (the three question types plus
        ``'total'``, which covers the two multi-choice types only) with
        accuracy, valid/invalid counts, proportion valid and the threshold
        used. ``det`` has one row per evaluated answer with the matched
        option, its score, the final choice and whether it is correct.
    """
    # Read data
    path = Path("competition_results", METHOD, str(RUN_ID), "submission.csv")
    answers = pd.read_csv(path)

    # Split into MC, open-ended MC and open questions.
    data = questions.merge(answers, on="id", how="right")

    def rows_of(question_type):
        return data[data["question_type"] == question_type]

    data_mc, correct_mc, valid_mc, NaN_mc = _resolve_choices(
        rows_of("multi_choice"), _best_option_by_lcs, THRESH_LCS
    )
    data_open_mc, correct_open_mc, valid_open_mc, NaN_open_mc = _resolve_choices(
        rows_of("open_ended_multi_choice"), _best_option_by_lcs, THRESH_LCS
    )
    data_open, correct_open, valid_open, NaN_open = _resolve_choices(
        rows_of("open_ended"), best_option_by_fuzzy, THRESH_FUZZY, inclusive=True
    )

    ACC_mc = _safe_ratio(correct_mc, valid_mc)
    ACC_open_mc = _safe_ratio(correct_open_mc, valid_open_mc)
    ACC_open = _safe_ratio(correct_open, valid_open)

    # Total accuracy (excl. open questions), computed from the raw counts so
    # that one empty subset does not turn the total into NaN.
    ACC_total = _safe_ratio(correct_mc + correct_open_mc, valid_mc + valid_open_mc)

    # Combine data into DataFrames:
    #   det: detailed predictions
    #   acc: summary statistics
    det = pd.concat([data_mc, data_open_mc, data_open], axis=0)
    det['method'] = METHOD
    det['run_id'] = RUN_ID
    det = det.rename(columns={'pred_LCS': 'pred_SIM',
                              'LCS_similarity': 'SIM_score',
                              'id': 'question_id'})
    det = det[
        ['method', 'run_id', 'question_type', 'question_id', 'question',
         'correct_answer', 'options', 'prediction', 'choice', 'reasoning', 'pred_SIM',
         'SIM_score', 'final_choice', 'is_correct']
    ]

    acc = pd.DataFrame(
        {'method': [METHOD] * 4,
         'run_id': [RUN_ID] * 4,
         'question_type': ['multi_choice', 'open_ended_multi_choice', 'open_ended', 'total'],
         })
    acc['accuracy'] = [ACC_mc, ACC_open_mc, ACC_open, ACC_total]
    acc['valid_answers'] = [valid_mc, valid_open_mc, valid_open, valid_mc + valid_open_mc]
    acc['invalid_answers'] = [NaN_mc, NaN_open_mc, NaN_open, NaN_mc + NaN_open_mc]
    acc['prop_valid'] = acc['valid_answers'] / (acc['valid_answers'] + acc['invalid_answers'])
    acc['similarity_threshold'] = [THRESH_LCS, THRESH_LCS, THRESH_FUZZY, THRESH_LCS]

    return acc, det
    


In [4]:
# Evaluate every (method, run_id) pair and stack the per-run results into
# ACC (accuracy summaries) and DET (per-question details).
ACC_COLUMNS = ['method', 'run_id', 'question_type', 'accuracy', 'valid_answers',
               'invalid_answers', 'prop_valid', 'similarity_threshold']
DET_COLUMNS = ['method', 'run_id', 'question_type', 'question_id', 'question',
               'correct_answer', 'options', 'prediction', 'choice', 'reasoning', 'pred_SIM',
               'SIM_score', 'final_choice', 'is_correct']

acc_frames = []
det_frames = []
for METHOD, RUN_ID in PARAM_GRID:
    acc, det = run(METHOD=METHOD,
                   RUN_ID=RUN_ID,
                   THRESH_LCS=THRESH_LCS,
                   THRESH_FUZZY=THRESH_FUZZY)
    acc_frames.append(acc)
    det_frames.append(det)

# Concatenate once, after the loop: each run contributes its rows exactly one
# time and column dtypes are not degraded by an empty seed frame. An empty
# parameter grid still yields empty frames with the expected columns.
ACC = pd.concat(acc_frames, axis=0) if acc_frames else pd.DataFrame(columns=ACC_COLUMNS)
DET = pd.concat(det_frames, axis=0) if det_frames else pd.DataFrame(columns=DET_COLUMNS)


In [10]:
# Summary without the 'open_ended' rows, ordered by question type and then by
# proportion of valid answers (both descending).
ACC_f = (
    ACC.loc[ACC['question_type'].ne('open_ended')]
    .sort_values(by=['question_type', 'prop_valid'], ascending=False)
)


In [11]:
#Save results
ACC.to_csv('accuracy_summary.csv',index=False)
ACC_f.to_csv('accuracy_summary_excl_open.csv',index=False)