In [1]:
import numpy as np
from collections import Counter
from datasets import load_dataset,concatenate_datasets
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the spatial-union dataset and keep a handle on its test split.
ds = load_dataset("beanham/spatial_union_dataset")
test = ds["test"]
# Ground-truth labels as a NumPy array; the metric cells compare predictions against it.
gt = np.array(test["label"])

In [3]:
def metric_calculation(pred, gt):
    """Score binary predictions against the ground truth.

    Labels are 0 (negative) and 1 (positive). Any other value in ``pred``
    (the notebook uses 2 for model outputs that could not be parsed) counts
    as wrong for accuracy but as neither a false positive nor a false negative.

    Parameters
    ----------
    pred : array-like
        Predicted labels, one per example.
    gt : array-like
        Ground-truth labels, same length as ``pred``.

    Returns
    -------
    tuple of float
        ``(acc, fpr, fnr)`` where ``acc`` is the fraction of exact matches,
        ``fpr`` is the fraction of ALL examples with actual 0 / predicted 1,
        and ``fnr`` is the fraction of ALL examples with actual 1 / predicted 0.
        Both error terms are divided by the total number of examples, not by
        the number of actual negatives / positives.

    Raises
    ------
    ValueError
        If the inputs are not 1-D, differ in length, or are empty.
    """
    pred = np.asarray(pred)
    gt = np.asarray(gt)
    if gt.ndim != 1 or pred.shape != gt.shape:
        raise ValueError(
            f"pred and gt must be 1-D and of equal length, got shapes {pred.shape} and {gt.shape}"
        )
    total = len(gt)
    if total == 0:
        raise ValueError("pred and gt must not be empty")

    # Count by label value rather than by position in a confusion matrix:
    # the matrix is ordered by whichever labels happen to occur, so [0, 1] / [1, 0]
    # point at the wrong cells (or do not exist) when a class is missing.
    acc = np.count_nonzero(pred == gt) / total
    fpr = np.count_nonzero((gt == 0) & (pred == 1)) / total  ## predict to be 1; actual 0
    fnr = np.count_nonzero((gt == 1) & (pred == 0)) / total  ## predict to be 0; actual 1
    return acc, fpr, fnr

### llama

In [4]:
# Load the saved Llama 3 generations for each of the four prompt settings.
zero_no_exp = np.load('base/llama3/llama3_zero_shot_no_exp.npy')
zero_with_exp = np.load('base/llama3/llama3_zero_shot_with_exp.npy')
few_no_exp = np.load('base/llama3/llama3_few_shot_no_exp.npy')
few_with_exp = np.load('base/llama3/llama3_few_shot_with_exp.npy')

## post-processing
def _llama_zero_no_exp_answer(text):
    """Return the answer text of a zero-shot / no-exp generation."""
    if 'Response' in text:
        # Second line of the text that follows the first 'Response' marker.
        return text.split('Response')[1].split('\n')[1]
    return text.split()[0]


def _llama_zero_with_exp_answer(text):
    """Return the answer text of a zero-shot / with-exp generation."""
    if 'Response' in text:
        # Second whitespace-separated token after the first 'Response' marker.
        return text.split('Response')[1].split()[1]
    return text.split()[0]


def _llama_few_shot_answer(text):
    """Return the answer text of a few-shot generation (both few-shot files)."""
    if '{' in text:
        # Everything before the first '}', with the opening brace(s) removed.
        return text.split('}')[0].replace('{', '')
    return text.split()[0]


def _llama_label(text, extract):
    """Map one raw generation to 0, 1, or 2 (2 = no usable answer).

    ``extract`` is one of the helpers above. When the generation lacks the
    structure the extractor indexes into (empty output, nothing after
    'Response', ...), the IndexError is folded into the "unparseable" label
    instead of aborting the whole cell.
    """
    try:
        answer = extract(text)
    except IndexError:
        return 2
    return int(float(answer)) if answer in ('0', '0.0', '1', '1.0') else 2


zero_no_exp = np.array([_llama_label(t, _llama_zero_no_exp_answer) for t in zero_no_exp])
zero_with_exp = np.array([_llama_label(t, _llama_zero_with_exp_answer) for t in zero_with_exp])
few_no_exp = np.array([_llama_label(t, _llama_few_shot_answer) for t in few_no_exp])
few_with_exp = np.array([_llama_label(t, _llama_few_shot_answer) for t in few_with_exp])

In [5]:
# Label distribution for each prompt setting; a 2 marks an output that could not be parsed.
tuple(Counter(labels) for labels in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp))

(Counter({0: 245, 1: 154}),
 Counter({1: 280, 0: 119}),
 Counter({0: 256, 1: 138, 2: 5}),
 Counter({1: 222, 0: 171, 2: 6}))

In [6]:
# Report (accuracy, fpr, fnr) for every prompt setting, in the same order as above.
for _setting, _preds in (
    ('Zero, No Heuristic', zero_no_exp),
    ('Zero, With Heuristic', zero_with_exp),
    ('Few, No Heuristic', few_no_exp),
    ('Few, With Heuristic', few_with_exp),
):
    print(_setting, metric_calculation(_preds, gt))

Zero, No Heuristic (0.5213032581453634, 0.11278195488721804, 0.3659147869674185)
Zero, With Heuristic (0.6265664160401002, 0.21804511278195488, 0.15538847117794485)
Few, No Heuristic (0.49122807017543857, 0.10776942355889724, 0.38847117794486213)
Few, With Heuristic (0.6917293233082706, 0.11278195488721804, 0.18045112781954886)


### mistral

In [7]:
# Load the saved Mistral generations for the four prompt settings, in a fixed order.
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = map(
    np.load,
    (
        'base/mistral/mistral_zero_shot_no_exp.npy',
        'base/mistral/mistral_zero_shot_with_exp.npy',
        'base/mistral/mistral_few_shot_no_exp.npy',
        'base/mistral/mistral_few_shot_with_exp.npy',
    ),
)

In [8]:
## post-processing
def _mistral_zero_shot_answer(text):
    """Return the answer text of a zero-shot generation, without the '</s>' token."""
    if 'Response' in text:
        # Second line of the text that follows the first 'Response' marker.
        answer = text.split('Response')[1].split('\n')[1]
    else:
        answer = text.split()[0]
    return answer.replace('</s>', '')


def _mistral_few_shot_answer(text):
    """Return the answer text of a few-shot generation."""
    if 'Response' in text:
        # Content after the first '{' and before the first '}' following the marker.
        return text.split('Response')[1].split('}')[0].split('{')[1]
    return text.split('}')[0].replace('{', '').replace('</s>', '')


def _mistral_label(raw, extract):
    """Map one raw generation to 0, 1, or 2 (2 = no usable answer).

    ``raw`` is passed through ``str`` first: this cell overwrites its own
    inputs with integer labels, so on a re-run the elements are ints; as
    strings they pass through the extractors unchanged and the cell becomes
    repeatable. A generation that lacks the structure the extractor indexes
    into raises IndexError, which is folded into the "unparseable" label
    instead of aborting the whole cell.
    """
    try:
        answer = extract(str(raw))
    except IndexError:
        return 2
    return int(float(answer)) if answer in ('0', '0.0', '1', '1.0') else 2


zero_no_exp = np.array([_mistral_label(t, _mistral_zero_shot_answer) for t in zero_no_exp])
zero_with_exp = np.array([_mistral_label(t, _mistral_zero_shot_answer) for t in zero_with_exp])
few_no_exp = np.array([_mistral_label(t, _mistral_few_shot_answer) for t in few_no_exp])
few_with_exp = np.array([_mistral_label(t, _mistral_few_shot_answer) for t in few_with_exp])

In [9]:
# Label distribution for each prompt setting; a 2 marks an output that could not be parsed.
tuple(Counter(labels) for labels in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp))

(Counter({1: 375, 0: 24}),
 Counter({1: 399}),
 Counter({1: 350, 0: 49}),
 Counter({1: 326, 0: 73}))

In [10]:
# Report (accuracy, fpr, fnr) for every prompt setting, in the same order as above.
for _setting, _preds in (
    ('Zero, No Heuristic', zero_no_exp),
    ('Zero, With Heuristic', zero_with_exp),
    ('Few, No Heuristic', few_no_exp),
    ('Few, With Heuristic', few_with_exp),
):
    print(_setting, metric_calculation(_preds, gt))

Zero, No Heuristic (0.6741854636591479, 0.3132832080200501, 0.012531328320802004)
Zero, With Heuristic (0.6390977443609023, 0.3609022556390977, 0.0)
Few, No Heuristic (0.706766917293233, 0.2656641604010025, 0.02756892230576441)
Few, With Heuristic (0.731829573934837, 0.22305764411027568, 0.045112781954887216)


### 4o-mini

In [11]:
# Load the saved 4o-mini generations for each of the four prompt settings.
zero_no_exp = np.load('base/4o_mini/4o_mini_zero_shot_no_exp.npy')
zero_with_exp = np.load('base/4o_mini/4o_mini_zero_shot_with_exp.npy')
few_no_exp = np.load('base/4o_mini/4o_mini_few_shot_no_exp.npy')
few_with_exp = np.load('base/4o_mini/4o_mini_few_shot_with_exp.npy')

## post-processing
def _4o_mini_label(text, braced=False):
    """Map one raw generation to 0, 1, or 2 (2 = no usable answer).

    With ``braced=True`` (few-shot files) the answer is the content after the
    first '{' within the part of ``text`` that precedes the first '}'. A reply
    without such a '{' is labelled unparseable rather than raising IndexError
    and aborting the cell. With ``braced=False`` (zero-shot files) the raw
    text itself is the answer.
    """
    if braced:
        pieces = text.split('}')[0].split('{')
        if len(pieces) < 2:
            return 2
        text = pieces[1]
    return int(float(text)) if text in ('0', '0.0', '1', '1.0') else 2


zero_no_exp = np.array([_4o_mini_label(t) for t in zero_no_exp])
zero_with_exp = np.array([_4o_mini_label(t) for t in zero_with_exp])
few_no_exp = np.array([_4o_mini_label(t, braced=True) for t in few_no_exp])
few_with_exp = np.array([_4o_mini_label(t, braced=True) for t in few_with_exp])

In [12]:
# Label distribution for each prompt setting; a 2 marks an output that could not be parsed.
tuple(Counter(labels) for labels in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp))

(Counter({0: 338, 1: 61}),
 Counter({1: 309, 0: 90}),
 Counter({0: 210, 1: 189}),
 Counter({1: 322, 0: 77}))

In [13]:
# Report (accuracy, fpr, fnr) for every prompt setting, in the same order as above.
for _setting, _preds in (
    ('Zero, No Heuristic', zero_no_exp),
    ('Zero, With Heuristic', zero_with_exp),
    ('Few, No Heuristic', few_no_exp),
    ('Few, With Heuristic', few_with_exp),
):
    print(_setting, metric_calculation(_preds, gt))

Zero, No Heuristic (0.49874686716791977, 0.007518796992481203, 0.49373433583959897)
Zero, With Heuristic (0.8245614035087719, 0.15538847117794485, 0.020050125313283207)
Few, No Heuristic (0.6942355889724311, 0.07017543859649122, 0.23558897243107768)
Few, With Heuristic (0.8220551378446115, 0.17293233082706766, 0.005012531328320802)


### 4o

In [14]:
# Load the saved 4o generations for each of the four prompt settings.
zero_no_exp = np.load('base/4o/4o_zero_shot_no_exp.npy')
zero_with_exp = np.load('base/4o/4o_zero_shot_with_exp.npy')
few_no_exp = np.load('base/4o/4o_few_shot_no_exp.npy')
few_with_exp = np.load('base/4o/4o_few_shot_with_exp.npy')

## post-processing
def _4o_label(text, braced=False):
    """Map one raw generation to 0, 1, or 2 (2 = no usable answer).

    With ``braced=True`` (few-shot files) the answer is the content after the
    first '{' within the part of ``text`` that precedes the first '}'. A reply
    without such a '{' is labelled unparseable rather than raising IndexError
    and aborting the cell. With ``braced=False`` (zero-shot files) the raw
    text itself is the answer.
    """
    if braced:
        pieces = text.split('}')[0].split('{')
        if len(pieces) < 2:
            return 2
        text = pieces[1]
    return int(float(text)) if text in ('0', '0.0', '1', '1.0') else 2


zero_no_exp = np.array([_4o_label(t) for t in zero_no_exp])
zero_with_exp = np.array([_4o_label(t) for t in zero_with_exp])
few_no_exp = np.array([_4o_label(t, braced=True) for t in few_no_exp])
few_with_exp = np.array([_4o_label(t, braced=True) for t in few_with_exp])

In [15]:
# Label distribution for each prompt setting; a 2 marks an output that could not be parsed.
tuple(Counter(labels) for labels in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp))

(Counter({0: 220, 1: 179}),
 Counter({1: 257, 0: 142}),
 Counter({1: 354, 0: 45}),
 Counter({1: 292, 0: 107}))

In [16]:
# Report (accuracy, fpr, fnr) for every prompt setting, in the same order as above.
for _setting, _preds in (
    ('Zero, No Heuristic', zero_no_exp),
    ('Zero, With Heuristic', zero_with_exp),
    ('Few, No Heuristic', few_no_exp),
    ('Few, With Heuristic', few_with_exp),
):
    print(_setting, metric_calculation(_preds, gt))

Zero, No Heuristic (0.7794486215538847, 0.015037593984962405, 0.20551378446115287)
Zero, With Heuristic (0.949874686716792, 0.02756892230576441, 0.022556390977443608)
Few, No Heuristic (0.7418546365914787, 0.2531328320802005, 0.005012531328320802)
Few, With Heuristic (0.8972431077694235, 0.09774436090225563, 0.005012531328320802)
