In [1]:
import numpy as np
from itertools import product
from collections import Counter
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
def metric_calculation(pred, gt):
    """Score binary predictions against ground-truth labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels. 0/1 (or booleans) are the real classes; any other
        value (e.g. the 2 emitted by ``post_processing`` for unparseable
        outputs) is treated as a wrong answer that is neither a false
        positive nor a false negative.
    gt : array-like
        Ground-truth labels (0 or 1), same length as ``pred``.

    Returns
    -------
    tuple
        ``(acc, f1, fpr, fnr)`` where ``acc`` is plain accuracy, ``f1`` is the
        macro-averaged F1 over every label seen in ``gt`` or ``pred``,
        ``fpr`` is FP / (number of actual negatives) and ``fnr`` is
        FN / (number of actual positives). A rate is 0.0 when its
        denominator is zero.
    """
    acc = accuracy_score(gt, pred)
    f1 = f1_score(gt, pred, average='macro')
    # Pin the label order so the matrix is always 2x2 with index 0 -> class 0
    # and index 1 -> class 1, even if a class is missing from this sample or
    # an extra label such as 2 shows up in the predictions.
    confusion = confusion_matrix(gt, pred, labels=[0, 1])
    gt_arr = np.asarray(gt)
    n_negative = int(np.sum(gt_arr == 0))
    n_positive = int(np.sum(gt_arr == 1))
    # Rates are normalised by the size of the *actual* class, not by the total
    # sample count (dividing by len(gt) yields a share of all samples instead).
    fpr = confusion[0, 1] / n_negative if n_negative else 0.0  # predicted 1, actual 0
    fnr = confusion[1, 0] / n_positive if n_positive else 0.0  # predicted 0, actual 1
    return acc, f1, fpr, fnr

In [3]:
def post_processing(pred):
    """Turn raw model generations into integer class labels.

    Each generation is lower-cased and one token is extracted from it:

    * if it contains ``'response'`` (checked first) or ``'output'``, the
      second whitespace-separated token after the first occurrence of that
      marker (the first token is typically the ``:`` that follows the marker);
    * otherwise the very first whitespace-separated token.

    Any ``'</s>'`` substring is removed from the token. The tokens ``'0'``,
    ``'0.0'``, ``'1'`` and ``'1.0'`` map to 0 or 1; anything else, including
    generations too short to contain the expected token, maps to 2.

    Parameters
    ----------
    pred : iterable of str
        Raw generated strings.

    Returns
    -------
    numpy.ndarray
        Integer array with one label in {0, 1, 2} per input string.
    """
    label_of = {'0': 0, '0.0': 0, '1': 1, '1.0': 1}
    unparseable = 2
    labels = []
    for raw in pred:
        text = raw.lower()
        # 'response' takes precedence over 'output' when both are present.
        marker = next((m for m in ('response', 'output') if m in text), None)
        try:
            if marker is None:
                token = text.split()[0]
            else:
                token = text.split(marker)[1].split()[1]
        except IndexError:
            # Not enough tokens to hold an answer; only this failure mode is
            # expected, so other errors are allowed to surface.
            labels.append(unparseable)
            continue
        labels.append(label_of.get(token.replace('</s>', ''), unparseable))
    # Explicit dtype keeps the result integer-typed even for empty input.
    return np.array(labels, dtype=int)

In [4]:
# Load the dataset and keep its 'test' split; `gt` holds that split's labels as an array.
ds = load_dataset("beanham/spatial_join_dataset")
test=ds['test']
gt=np.array(test['label'])
## Evaluate on a fixed subset of 1000 test positions (seeded so the draw is repeatable).
## NOTE(review): randint draws independently, so `index` can contain repeated positions;
## do not change the seed or this call unless the saved predictions are regenerated,
## since later cells pair predictions with gt[index].
np.random.seed(100)
index=np.random.randint(0, len(test), 1000)

### 4o_mini

In [5]:
model = '4o_mini'
# The heuristic baseline reads the same column for every threshold, so build it once.
min_angle_all = np.array(test['min_angle'])
for threshold in [1, 2, 5, 10, 20]:
    print('----------------------------------')
    print(f'Threshold: {threshold}...')
    # Load the saved generations and parse them into labels in one step.
    zero_shot_comb = post_processing(
        np.load(f'base/4o_mini_ec/4o_mini_degree_{threshold}_zero_shot_with_heur_value_comb_ec.npy')
    )
    few_shot_comb = post_processing(
        np.load(f'base/4o_mini_ec/4o_mini_degree_{threshold}_few_shot_with_heur_value_comb_ec.npy')
    )
    # NOTE(review): the heuristic is scored on the whole test split while the
    # two LLM variants are scored only on the `index` subset, so the numbers
    # are not computed on the same samples.
    heuristic_acc = metric_calculation(min_angle_all <= threshold, gt)[0]
    print('Heuristics: ', heuristic_acc)
    zero_shot_acc = metric_calculation(zero_shot_comb[index], gt[index])[0]
    print('Zero-Shot-Comb:   ', zero_shot_acc)
    few_shot_acc = metric_calculation(few_shot_comb[index], gt[index])[0]
    print('Few-Shot-Comb:    ', few_shot_acc)

----------------------------------
Threshold: 1...
Heuristics:  0.8572825024437928
Zero-Shot-Comb:    0.946
Few-Shot-Comb:     0.949
----------------------------------
Threshold: 2...
Heuristics:  0.90257412838058
Zero-Shot-Comb:    0.946
Few-Shot-Comb:     0.948
----------------------------------
Threshold: 5...
Heuristics:  0.9377647442163571
Zero-Shot-Comb:    0.948
Few-Shot-Comb:     0.948
----------------------------------
Threshold: 10...
Heuristics:  0.9433040078201369
Zero-Shot-Comb:    0.95
Few-Shot-Comb:     0.948
----------------------------------
Threshold: 20...
Heuristics:  0.9354838709677419
Zero-Shot-Comb:    0.941
Few-Shot-Comb:     0.94


### qwen

In [7]:
# Only the 1-degree threshold is evaluated for qwen here.
for threshold in [1]:
    print('----------------------------------')
    print(f'Threshold: {threshold}...')
    # Load the saved generations and parse them into labels in one step.
    zero_shot_comb = post_processing(
        np.load(f'base/qwen_ec/qwen_degree_{threshold}_zero_shot_with_heur_value_comb_ec.npy')
    )
    few_shot_comb = post_processing(
        np.load(f'base/qwen_ec/qwen_degree_{threshold}_few_shot_with_heur_value_comb_ec.npy')
    )
    heuristic_acc = metric_calculation(np.array(test['min_angle']) <= threshold, gt)[0]
    print('Heuristics: ', heuristic_acc)
    # NOTE(review): unlike the 4o_mini cell, the predictions are compared to
    # gt[index] without being indexed themselves; presumably these files
    # already contain only the sampled rows, in `index` order -- confirm.
    zero_shot_acc = metric_calculation(zero_shot_comb, gt[index])[0]
    print('Zero-Shot-Comb:   ', zero_shot_acc)
    few_shot_acc = metric_calculation(few_shot_comb, gt[index])[0]
    print('Few-Shot-Comb:    ', few_shot_acc)

----------------------------------
Threshold: 1...
Heuristics:  0.8572825024437928
Zero-Shot-Comb:    0.913
Few-Shot-Comb:     0.95
