In [1]:
import numpy as np
from collections import Counter
from datasets import load_dataset,concatenate_datasets
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the dataset by its repository id; only its 'test' split is scored in this notebook.
ds = load_dataset("beanham/spatial_union_dataset")
test = ds["test"]
# Reference labels of the test split, as an array that predictions are compared against.
gt = np.array(test["label"])

In [3]:
def metric_calculation(pred, gt, per_class=False):
    """Score predicted labels against binary ground-truth labels.

    Errors are counted by label *value* (0 = negative, 1 = positive) instead of
    by position in a confusion matrix. A positional lookup silently points at
    the wrong classes when label 0 is absent from the data, and fails outright
    when only one label is present. Any other predicted label (for example
    ``2`` for an answer that could not be parsed) lowers accuracy but is
    counted neither as a false positive nor as a false negative.

    Args:
        pred: Sequence of predicted labels.
        gt: Sequence of ground-truth labels (0 or 1), same length as ``pred``.
        per_class: If False (default), both error counts are divided by the
            total number of samples, which is what this function has always
            reported. If True, false positives are divided by the number of
            actual negatives and false negatives by the number of actual
            positives, i.e. the conventional FPR / FNR.

    Returns:
        Tuple ``(acc, fpr, fnr)`` of Python floats. With ``per_class=True`` a
        rate whose denominator is zero is returned as ``nan``.

    Raises:
        ValueError: If ``pred`` and ``gt`` differ in length or are empty.
    """
    pred = np.asarray(pred).ravel()
    gt = np.asarray(gt).ravel()
    # Check sizes explicitly: `pred == gt` would otherwise broadcast silently.
    if pred.size != gt.size:
        raise ValueError(
            f"pred and gt must have the same length, got {pred.size} and {gt.size}"
        )
    if gt.size == 0:
        raise ValueError("pred and gt must not be empty")

    acc = float(np.mean(pred == gt))
    false_pos = int(np.sum((gt == 0) & (pred == 1)))  ## predict to be 1; actual 0
    false_neg = int(np.sum((gt == 1) & (pred == 0)))  ## predict to be 0; actual 1

    if per_class:
        n_neg = int(np.sum(gt == 0))
        n_pos = int(np.sum(gt == 1))
        fpr = false_pos / n_neg if n_neg else float("nan")
        fnr = false_neg / n_pos if n_pos else float("nan")
    else:
        # Original normalisation: share of *all* samples, not of the class.
        fpr = false_pos / gt.size
        fnr = false_neg / gt.size
    return acc, fpr, fnr

### llama

In [8]:
# Saved llama3 generations for the four prompt settings (zero/few shot, without/with exp).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = [
    np.load(path)
    for path in (
        'base/llama3/llama3_zero_shot_no_exp.npy',
        'base/llama3/llama3_zero_shot_with_exp.npy',
        'base/llama3/llama3_few_shot_no_exp.npy',
        'base/llama3/llama3_few_shot_with_exp.npy',
    )
]

## post-processing
# Step 1: cut the answer text out of every generation.
# Zero-shot, no exp: the line right after the first 'Response' marker,
# otherwise the first whitespace-separated token.
zero_no_exp = np.array([
    text.split('Response')[1].split('\n')[1] if 'Response' in text else text.split()[0]
    for text in zero_no_exp
])
# Few-shot: everything before the first '}' with '{' removed when braces are present,
# otherwise the first whitespace-separated token.
few_no_exp = np.array([
    text.split('}')[0].replace('{', '') if '{' in text else text.split()[0]
    for text in few_no_exp
])
# Zero-shot, with exp: the second whitespace-separated token after the first
# 'Response' marker (token-based here, line-based for the no-exp variant).
zero_with_exp = np.array([
    text.split('Response')[1].split()[1] if 'Response' in text else text.split()[0]
    for text in zero_with_exp
])
few_with_exp = np.array([
    text.split('}')[0].replace('{', '') if '{' in text else text.split()[0]
    for text in few_with_exp
])

# Step 2: map the answer text to an integer label; anything other than the four
# accepted spellings of 0/1 becomes 2.
zero_no_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in zero_no_exp
])
zero_with_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in zero_with_exp
])
few_no_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in few_no_exp
])
few_with_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in few_with_exp
])

In [9]:
# Label distribution per prompt setting, in the order:
# zero/no exp, zero/with exp, few/no exp, few/with exp.
tuple(
    Counter(labels)
    for labels in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)
)

(Counter({0: 257, 1: 142}),
 Counter({1: 230, 0: 169}),
 Counter({0: 369, 1: 25, 2: 5}),
 Counter({0: 252, 1: 141, 2: 6}))

In [10]:
# One line per prompt setting; the tuple is metric_calculation's (acc, fpr, fnr).
print(
    *(
        f"{title} {metric_calculation(labels, gt)}"
        for title, labels in (
            ('Zero, No Heuristic', zero_no_exp),
            ('Zero, With Heuristic', zero_with_exp),
            ('Few, No Heuristic', few_no_exp),
            ('Few, With Heuristic', few_with_exp),
        )
    ),
    sep='\n',
)

Zero, No Heuristic (0.5162907268170426, 0.10025062656641603, 0.38345864661654133)
Zero, With Heuristic (0.581453634085213, 0.17794486215538846, 0.24060150375939848)
Few, No Heuristic (0.38847117794486213, 0.017543859649122806, 0.581453634085213)
Few, With Heuristic (0.6240601503759399, 0.045112781954887216, 0.3157894736842105)


### mistral

In [16]:
# Saved mistral generations for the four prompt settings (zero/few shot, without/with exp).
# These names are reused for every model in this notebook, so this overwrites
# whatever the previous section left in them.
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = [
    np.load(path)
    for path in (
        'base/mistral/mistral_zero_shot_no_exp.npy',
        'base/mistral/mistral_zero_shot_with_exp.npy',
        'base/mistral/mistral_few_shot_no_exp.npy',
        'base/mistral/mistral_few_shot_with_exp.npy',
    )
]

In [17]:
## post-processing
# NOTE: this cell overwrites the raw string arrays loaded in the previous cell with
# integer labels, so it cannot be re-run without reloading them first.

# Step 1: cut the answer text out of every generation.
# Zero-shot: the line right after the first 'Response' marker, otherwise the first
# whitespace-separated token; the '</s>' end-of-sequence text is removed either way.
zero_no_exp = np.array([
    (text.split('Response')[1].split('\n')[1] if 'Response' in text else text.split()[0]).replace('</s>', '')
    for text in zero_no_exp
])
# Few-shot: the content of the first '{...}' after the first 'Response' marker,
# otherwise everything before the first '}' with '{' and '</s>' removed.
few_no_exp = np.array([
    text.split('Response')[1].split('}')[0].split('{')[1]
    if 'Response' in text
    else text.split('}')[0].replace('{', '').replace('</s>', '')
    for text in few_no_exp
])
zero_with_exp = np.array([
    (text.split('Response')[1].split('\n')[1] if 'Response' in text else text.split()[0]).replace('</s>', '')
    for text in zero_with_exp
])
few_with_exp = np.array([
    text.split('Response')[1].split('}')[0].split('{')[1]
    if 'Response' in text
    else text.split('}')[0].replace('{', '').replace('</s>', '')
    for text in few_with_exp
])

# Step 2: map the answer text to an integer label; anything other than the four
# accepted spellings of 0/1 becomes 2.
zero_no_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in zero_no_exp
])
zero_with_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in zero_with_exp
])
few_no_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in few_no_exp
])
few_with_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in few_with_exp
])

In [18]:
# Label distribution per prompt setting, in the order:
# zero/no exp, zero/with exp, few/no exp, few/with exp.
tuple(
    Counter(labels)
    for labels in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)
)

(Counter({0: 232, 1: 167}),
 Counter({1: 280, 0: 119}),
 Counter({0: 399}),
 Counter({0: 386, 1: 13}))

In [19]:
# One line per prompt setting; the tuple is metric_calculation's (acc, fpr, fnr).
print(
    *(
        f"{title} {metric_calculation(labels, gt)}"
        for title, labels in (
            ('Zero, No Heuristic', zero_no_exp),
            ('Zero, With Heuristic', zero_with_exp),
            ('Few, No Heuristic', few_no_exp),
            ('Few, With Heuristic', few_with_exp),
        )
    ),
    sep='\n',
)

Zero, No Heuristic (0.5939849624060151, 0.09273182957393483, 0.3132832080200501)
Zero, With Heuristic (0.6265664160401002, 0.21804511278195488, 0.15538847117794485)
Few, No Heuristic (0.3609022556390977, 0.0, 0.6390977443609023)
Few, With Heuristic (0.39348370927318294, 0.0, 0.606516290726817)


### 4o-mini

In [11]:
# Saved 4o-mini generations for the four prompt settings (zero/few shot, without/with exp).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = [
    np.load(path)
    for path in (
        'base/4o_mini/4o_mini_zero_shot_no_exp.npy',
        'base/4o_mini/4o_mini_zero_shot_with_exp.npy',
        'base/4o_mini/4o_mini_few_shot_no_exp.npy',
        'base/4o_mini/4o_mini_few_shot_with_exp.npy',
    )
]

## post-processing
# Few-shot answers are read from inside the first '{...}'; every entry must
# contain a '{' before its first '}' or the lookup raises IndexError.
# Zero-shot outputs are used as they are.
few_no_exp = np.array([
    text.split('}')[0].split('{')[1] for text in few_no_exp
])
few_with_exp = np.array([
    text.split('}')[0].split('{')[1] for text in few_with_exp
])

# Map the answer text to an integer label; anything other than the four
# accepted spellings of 0/1 becomes 2.
zero_no_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in zero_no_exp
])
zero_with_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in zero_with_exp
])
few_no_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in few_no_exp
])
few_with_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in few_with_exp
])

In [12]:
# Label distribution per prompt setting, in the order:
# zero/no exp, zero/with exp, few/no exp, few/with exp.
tuple(
    Counter(labels)
    for labels in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)
)

(Counter({0: 370, 1: 29}),
 Counter({1: 345, 0: 54}),
 Counter({0: 303, 1: 96}),
 Counter({1: 325, 0: 74}))

In [13]:
# One line per prompt setting; the tuple is metric_calculation's (acc, fpr, fnr).
print(
    *(
        f"{title} {metric_calculation(labels, gt)}"
        for title, labels in (
            ('Zero, No Heuristic', zero_no_exp),
            ('Zero, With Heuristic', zero_with_exp),
            ('Few, No Heuristic', few_no_exp),
            ('Few, With Heuristic', few_with_exp),
        )
    ),
    sep='\n',
)

Zero, No Heuristic (0.42857142857142855, 0.002506265664160401, 0.568922305764411)
Zero, With Heuristic (0.7744360902255639, 0.22556390977443608, 0.0)
Few, No Heuristic (0.5764411027568922, 0.012531328320802004, 0.41102756892230574)
Few, With Heuristic (0.8195488721804511, 0.17794486215538846, 0.002506265664160401)


### 4o

In [10]:
# Saved 4o generations for the four prompt settings (zero/few shot, without/with exp).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = [
    np.load(path)
    for path in (
        'base/4o/4o_zero_shot_no_exp.npy',
        'base/4o/4o_zero_shot_with_exp.npy',
        'base/4o/4o_few_shot_no_exp.npy',
        'base/4o/4o_few_shot_with_exp.npy',
    )
]

## post-processing
# Few-shot answers are read from inside the first '{...}'; every entry must
# contain a '{' before its first '}' or the lookup raises IndexError.
# Zero-shot outputs are used as they are.
few_no_exp = np.array([
    text.split('}')[0].split('{')[1] for text in few_no_exp
])
few_with_exp = np.array([
    text.split('}')[0].split('{')[1] for text in few_with_exp
])

# Map the answer text to an integer label; anything other than the four
# accepted spellings of 0/1 becomes 2.
zero_no_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in zero_no_exp
])
zero_with_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in zero_with_exp
])
few_no_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in few_no_exp
])
few_with_exp = np.array([
    int(float(tok)) if tok in ('0', '0.0', '1', '1.0') else 2 for tok in few_with_exp
])

In [11]:
# Label distribution per prompt setting, in the order:
# zero/no exp, zero/with exp, few/no exp, few/with exp.
tuple(
    Counter(labels)
    for labels in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)
)

(Counter({0: 307, 1: 92}),
 Counter({1: 255, 0: 144}),
 Counter({1: 334, 0: 65}),
 Counter({1: 288, 0: 111}))

In [12]:
# One line per prompt setting; the tuple is metric_calculation's (acc, fpr, fnr).
print(
    *(
        f"{title} {metric_calculation(labels, gt)}"
        for title, labels in (
            ('Zero, No Heuristic', zero_no_exp),
            ('Zero, With Heuristic', zero_with_exp),
            ('Few, No Heuristic', few_no_exp),
            ('Few, With Heuristic', few_with_exp),
        )
    ),
    sep='\n',
)

Zero, No Heuristic (0.581453634085213, 0.005012531328320802, 0.41353383458646614)
Zero, With Heuristic (0.9598997493734336, 0.020050125313283207, 0.020050125313283207)
Few, No Heuristic (0.7769423558897243, 0.21052631578947367, 0.012531328320802004)
Few, With Heuristic (0.9072681704260651, 0.08771929824561403, 0.005012531328320802)
