In [1]:
import numpy as np
import pandas as pd
from itertools import product
from collections import Counter
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
def metric_calculation(pred, gt):
    """Score predicted labels against ground-truth labels.

    Args:
        pred: Predicted labels, one per sample. Besides 0 and 1, other values
            (e.g. a sentinel for unparseable outputs) may be present.
        gt: Ground-truth labels, aligned with ``pred``.
            NOTE(review): assumed to contain only 0/1 -- confirm against the dataset.

    Returns:
        A tuple ``(acc, f1, fpr, fnr)``:
            acc -- ``accuracy_score(gt, pred)``.
            f1  -- macro-averaged F1 as computed by ``f1_score``.
            fpr -- number of samples with ``gt == 0`` and ``pred == 1``,
                   divided by the total number of samples.
            fnr -- number of samples with ``gt == 1`` and ``pred == 0``,
                   divided by the total number of samples.

        Note that ``fpr``/``fnr`` are normalised by ``len(gt)``, i.e. they are
        shares of the whole test set rather than the conventional rates
        FP / (FP + TN) and FN / (FN + TP).
    """
    acc = accuracy_score(gt, pred)
    f1 = f1_score(gt, pred, average='macro')
    # Pin the label order. Without `labels`, the matrix is indexed by the sorted
    # labels that happen to occur in the data, so a missing class (or an extra
    # one) would shift the indices or make the [0, 1] / [1, 0] lookups fail.
    confusion = confusion_matrix(gt, pred, labels=[0, 1])
    total = len(gt)
    fpr = confusion[0, 1] / total  # predicted 1, actual 0
    fnr = confusion[1, 0] / total  # predicted 0, actual 1
    return acc, f1, fpr, fnr

In [3]:
def _extract_label(text):
    """Parse one raw model output into 0 or 1, or 2 when no valid label is found."""
    text = text.lower()
    if 'response' in text:
        # Use the second whitespace-separated token after the first 'response';
        # the first token is skipped (presumably the ':' separator -- confirm
        # against the prompt format).
        tokens, position = text.split('response')[1].split(), 1
    elif 'output' in text:
        tokens, position = text.split('output')[1].split(), 1
    else:
        # No marker: the answer is expected to be the very first token.
        tokens, position = text.split(), 0

    if len(tokens) <= position:
        return 2
    token = tokens[position].replace('</s>', '')
    return {'0': 0, '0.0': 0, '1': 1, '1.0': 1}.get(token, 2)


def post_processing(pred):
    """Convert raw model outputs into integer class labels.

    Each output is lower-cased and searched for a 'response' marker, then an
    'output' marker; if neither is present the first token is used. The
    candidate token has any '</s>' removed and is accepted only if it is
    exactly '0', '0.0', '1' or '1.0'.

    Args:
        pred: Iterable of strings (raw generations).

    Returns:
        A NumPy integer array with one entry per input: 0 or 1 for a parsed
        label, 2 for anything that could not be parsed.
    """
    # dtype is fixed so that an empty input still yields an integer array.
    return np.array([_extract_label(text) for text in pred], dtype=int)

In [3]:
# Load the benchmark and keep the test-split labels as the ground truth.
ds = load_dataset("beanham/spatial_union_dataset")
test = ds['test']
gt = np.array(test['label'])

# Prompt configurations, named '<shot>_<heuristic variant>_cot'.
# For each shot setting the order is: 'no_heur' first, then every
# (hint | value) x (angle | area | angle_area) heuristic combination.
configs = [
    f'{shot}_{variant}_cot'
    for shot in ('zero_shot', 'few_shot')
    for variant in (
        'no_heur',
        *(
            f'with_heur_{kind}_{feature}'
            for kind, feature in product(('hint', 'value'), ('angle', 'area', 'angle_area'))
        ),
    )
]

In [27]:
# NOTE(review): ad-hoc load of a single prediction file. `pred` is reassigned
# inside the evaluation loop, so this looks like a leftover inspection cell.
pred = np.load('base/4o_mini/4o_mini_zero_shot_with_heur_value_angle_area_cot.npy')

In [5]:
# Score every (model, prompt configuration) pair against the ground truth.
models = ['llama3', 'mistral', '4o_mini', 'qwen_plus', '4o']
results = []
for model in models:
    print(f'Model: {model}...')
    for config in configs:
        # Load the saved outputs for this pair and map them to integer labels.
        pred = post_processing(np.load(f'base/{model}/{model}_{config}.npy'))
        metrics = metric_calculation(pred, gt)
        results.append({
            'config': config,
            'model': model,
            'acc': round(metrics[0], 3),  # accuracy, rounded to 3 decimals
            'f1': metrics[1],             # macro F1, left unrounded
        })
# One row per (config, model) pair, in evaluation order.
results = pd.DataFrame(results, columns=['config', 'model', 'acc', 'f1'])

Model: llama3...
Model: mistral...
Model: 4o_mini...
Model: qwen_plus...
Model: 4o...


In [6]:
results[results['model']=='4o']

Unnamed: 0,config,model,acc,f1
56,zero_shot_no_heur,4o,0.817,0.813881
57,zero_shot_with_heur_hint_angle,4o,0.784,0.772635
58,zero_shot_with_heur_hint_area,4o,0.777,0.775498
59,zero_shot_with_heur_hint_angle_area,4o,0.524,0.505389
60,zero_shot_with_heur_value_angle,4o,0.837,0.797738
61,zero_shot_with_heur_value_area,4o,0.862,0.83652
62,zero_shot_with_heur_value_angle_area,4o,0.955,0.950643
63,few_shot_no_heur,4o,0.802,0.767347
64,few_shot_with_heur_hint_angle,4o,0.777,0.723057
65,few_shot_with_heur_hint_area,4o,0.779,0.729825
