In [1]:
import numpy as np
import pandas as pd

from itertools import product
from collections import Counter
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
def metric_calculation(pred, gt):
    """Compute accuracy, macro-F1 and false-positive / false-negative shares.

    Parameters
    ----------
    pred : array-like
        Predicted labels. Values other than 0 and 1 (e.g. the placeholder 2
        used elsewhere in this notebook for unparseable outputs) still count
        as errors in accuracy/F1 but are ignored by the confusion-matrix
        counts below.
    gt : array-like
        Ground-truth labels; must have the same length as ``pred``.

    Returns
    -------
    tuple
        ``(acc, f1, fpr, fnr)``.

        NOTE: ``fpr`` and ``fnr`` are normalised by ``len(gt)`` (the total
        number of samples), i.e. they are the share of *all* samples that are
        false positives / false negatives. They are not the conventional
        rates FP/(FP+TN) and FN/(FN+TP).
    """
    acc = accuracy_score(gt, pred)
    f1 = f1_score(gt, pred, average='macro')
    # Pin the label set so the matrix is always 2x2 with row = actual and
    # column = predicted. Without it the shape depends on which labels happen
    # to occur, and a single-class input would make the indexing below fail.
    confusion = confusion_matrix(gt, pred, labels=[0, 1])
    n_samples = len(gt)
    fpr = confusion[0, 1] / n_samples  # predicted 1, actual 0
    fnr = confusion[1, 0] / n_samples  # predicted 0, actual 1
    return acc, f1, fpr, fnr
    
def post_processing(pred):
    """Parse raw model generations into integer class labels.

    Each element of ``pred`` is lower-cased and a single candidate token is
    extracted from it:

    * if the text contains ``'response'``: the second whitespace-separated
      token after the first occurrence of that word;
    * otherwise, if it contains ``'output'``: the second whitespace-separated
      token after the first occurrence of that word;
    * otherwise: the first whitespace-separated token of the text.

    Any ``'</s>'`` substring is stripped from the candidate token
    (presumably an end-of-sequence marker emitted by the model).

    Parameters
    ----------
    pred : iterable of str
        Raw generations, one per example.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per input: 0 or 1 when the candidate
        token is exactly one of ``'0'``, ``'0.0'``, ``'1'``, ``'1.0'``;
        otherwise 2, which marks an unparseable / missing answer.
    """
    invalid_label = 2
    token_to_label = {'0': 0, '0.0': 0, '1': 1, '1.0': 1}

    labels = []
    for raw in pred:
        text = raw.lower()
        # 'response' takes precedence over 'output', as in the original chain.
        for marker in ('response', 'output'):
            if marker in text:
                tokens = text.split(marker)[1].split()
                position = 1  # skip the token right after the marker (e.g. ':')
                break
        else:
            tokens = text.split()
            position = 0

        # A missing token is the only expected failure here; test for it
        # explicitly instead of hiding every exception behind a bare except.
        if len(tokens) > position:
            token = tokens[position].replace('</s>', '')
        else:
            token = None
        labels.append(token_to_label.get(token, invalid_label))

    # dtype is forced so that an empty input still yields an integer array.
    return np.array(labels, dtype=int)

In [3]:
# Load the dataset via `datasets.load_dataset` and keep its 'test' split.
ds = load_dataset("beanham/spatial_join_dataset")
test=ds['test']
# Ground-truth labels of the test split as a NumPy array; the metric cells
# below compare every model's parsed predictions against this array.
gt=np.array(test['label'])

In [4]:
# Score every (model, heuristic-selection setting) pair for the two few-shot
# prompt variants ("hint" and "value") and collect accuracy / macro-F1 into a
# single table.
metric_values = ['random','worst_single', 'best_single', 'worst_comb', 'best_comb', 'worst_all', 'best_all']
models = ['4o_mini', 'qwen_plus', '4o']

rows = []
# product() iterates models in the outer position and settings in the inner
# one, so rows are produced in the same order as two nested loops.
for model, value in product(models, metric_values):
    raw_hints = np.load(f'base/{model}_correction/{model}_{value}_few_shot_with_heur_hint_all_correction.npy')
    raw_values = np.load(f'base/{model}_correction/{model}_{value}_few_shot_with_heur_value_all_correction.npy')

    # Turn raw generations into 0/1/2 labels, then score them against gt.
    few_shot_hints = post_processing(raw_hints)
    few_shot_values = post_processing(raw_values)
    few_hints_metrics = metric_calculation(few_shot_hints, gt)
    few_values_metrics = metric_calculation(few_shot_values, gt)

    # Only accuracy and macro-F1 (the first two metrics) go into the table.
    hints_acc, hints_f1 = few_hints_metrics[0], few_hints_metrics[1]
    values_acc, values_f1 = few_values_metrics[0], few_values_metrics[1]
    rows.append([model, value, 'few_shot_hints', hints_acc, hints_f1])
    rows.append([model, value, 'few_shot_values', values_acc, values_f1])

results = pd.DataFrame(rows, columns=['model', 'value', 'prompt', 'acc', 'f1'])

In [5]:
# Display the accuracy / macro-F1 table (one row per model, setting and prompt variant).
results

Unnamed: 0,model,value,prompt,acc,f1
0,4o_mini,random,few_shot_hints,0.405,0.327744
1,4o_mini,random,few_shot_values,0.946,0.942522
2,4o_mini,worst_single,few_shot_hints,0.41,0.330811
3,4o_mini,worst_single,few_shot_values,0.945,0.941365
4,4o_mini,best_single,few_shot_hints,0.367,0.274541
5,4o_mini,best_single,few_shot_values,0.966,0.964271
6,4o_mini,worst_comb,few_shot_hints,0.386,0.310964
7,4o_mini,worst_comb,few_shot_values,0.957,0.954532
8,4o_mini,best_comb,few_shot_hints,0.378,0.281792
9,4o_mini,best_comb,few_shot_values,0.952,0.949269
