In [1]:
import numpy as np
from collections import Counter
from datasets import load_dataset,concatenate_datasets
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
ds = load_dataset("beanham/spatial_join_dataset")
test=ds['test']
gt=np.array(test['label'])

In [43]:
def metric_calculation(pred, gt):
    
    acc=accuracy_score(gt, pred)    
    confusion=confusion_matrix(gt, pred)
    fpr=confusion[0,1]/len(gt) ## predict to be 1; actual 0
    fnr=confusion[1,0]/len(gt) ## predict to be 0; actual 1
    return acc, fpr, fnr

### llama

In [53]:
zero_no_exp=np.load('base/llama3/llama3_zero_shot_no_exp.npy')
zero_with_exp=np.load('base/llama3/llama3_zero_shot_with_exp.npy')
few_no_exp=np.load('base/llama3/llama3_few_shot_no_exp.npy')
few_with_exp=np.load('base/llama3/llama3_few_shot_with_exp.npy')

## post-processing
zero_no_exp = np.array([i.split('Response')[1].split('\n')[1] if 'Response' in i else i.split()[0] for i in zero_no_exp])
few_no_exp = np.array([i.split('}')[0].replace('{','') if '{' in i else i.split()[0] for i in few_no_exp])
zero_with_exp = np.array([i.split('Response')[1].split()[1] if 'Response' in i else i.split()[0] for i in zero_with_exp])
few_with_exp = np.array([i.split('}')[0].replace('{','') if '{' in i else i.split()[0] for i in few_with_exp])
zero_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in zero_no_exp])
zero_with_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in zero_with_exp])
few_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in few_no_exp])
few_with_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in few_with_exp])

In [54]:
Counter(zero_no_exp), Counter(zero_with_exp), Counter(few_no_exp), Counter(few_with_exp)

(Counter({0: 1962, 1: 1062, 2: 45}),
 Counter({1: 1942, 0: 1125, 2: 2}),
 Counter({0: 2668, 1: 392, 2: 9}),
 Counter({1: 1722, 0: 1334, 2: 13}))

In [55]:
print('Zero, No Heuristic', metric_calculation(zero_no_exp, gt))
print('Zero, With Heuristic', metric_calculation(zero_with_exp, gt))
print('Few, No Heuristic', metric_calculation(few_no_exp, gt))
print('Few, With Heuristic', metric_calculation(few_with_exp, gt))

Zero, No Heuristic (0.47409579667644186, 0.1335940045617465, 0.3776474421635712)
Zero, With Heuristic (0.5294884327142392, 0.2512218963831867, 0.21863799283154123)
Few, No Heuristic (0.47768002606712284, 0.024437927663734114, 0.494949494949495)
Few, With Heuristic (0.6168132942326491, 0.17139133268165527, 0.20755946562398175)


### mistral

In [76]:
zero_no_exp=np.load('base/mistral/mistral_zero_shot_no_exp.npy')
zero_with_exp=np.load('base/mistral/mistral_zero_shot_with_exp.npy')
few_no_exp=np.load('base/mistral/mistral_few_shot_no_exp.npy')
few_with_exp=np.load('base/mistral/mistral_few_shot_with_exp.npy')

In [77]:
## post-processing
zero_no_exp = np.array([i.split('Response')[1].split('\n')[1].replace('</s>', '') if 'Response' in i else i.split()[0].replace('</s>', '') for i in zero_no_exp])
few_no_exp = np.array([i.split('Response')[1].split('}')[0].split('{')[1] if 'Response' in i else i.split('}')[0].replace('{','').replace('</s>', '') for i in few_no_exp])
zero_with_exp = np.array([i.split('Response')[1].split('\n')[1].replace('</s>', '') if 'Response' in i else i.split()[0].replace('</s>', '') for i in zero_with_exp])
few_with_exp = np.array([i.split('Response')[1].split('}')[0].split('{')[1] if 'Response' in i else i.split('}')[0].replace('{','').replace('</s>', '') for i in few_with_exp])
zero_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in zero_no_exp])
zero_with_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in zero_with_exp])
few_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in few_no_exp])
few_with_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in few_with_exp])

In [78]:
Counter(zero_no_exp), Counter(zero_with_exp), Counter(few_no_exp), Counter(few_with_exp)

(Counter({1: 3065, 0: 4}),
 Counter({1: 3069}),
 Counter({0: 2239, 1: 830}),
 Counter({0: 2429, 1: 637, 2: 3}))

In [79]:
print('Zero, No Heuristic', metric_calculation(zero_no_exp, gt))
print('Zero, With Heuristic', metric_calculation(zero_with_exp, gt))
print('Few, No Heuristic', metric_calculation(few_no_exp, gt))
print('Few, With Heuristic', metric_calculation(few_with_exp, gt))

Zero, No Heuristic (0.6008471814923427, 0.3985011404366243, 0.0006516780710329097)
Zero, With Heuristic (0.6008471814923427, 0.3991528185076572, 0.0)
Few, No Heuristic (0.5158031932225481, 0.07689801238188335, 0.4072987943955686)
Few, With Heuristic (0.5187357445421962, 0.04398826979472141, 0.4362984685565331)


### 4o-mini

In [16]:
zero_no_exp=np.load('base/4o_mini/4o_mini_zero_shot_no_exp.npy')
few_no_exp=np.load('base/4o_mini/4o_mini_few_shot_no_exp.npy')

## post-processing
few_no_exp = np.array([i.split('}')[0].split('{')[1] for i in few_no_exp])
zero_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in zero_no_exp])
few_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in few_no_exp])

# metric 
zero_acc=accuracy_score(gt, zero_no_exp)
few_acc=accuracy_score(gt, few_no_exp)
zero_confusion=confusion_matrix(gt, zero_no_exp)
few_confusion=confusion_matrix(gt, few_no_exp)
zero_fpr=(zero_confusion[1,0])/len(test)
zero_fnr=(zero_confusion[0,1])/len(test)
few_fpr=(few_confusion[1,0])/len(test)
few_fnr=(few_confusion[0,1])/len(test)

In [18]:
print('Zero Shot, Acc', zero_acc)
print('Zero Shot, FPR', zero_fpr)
print('Zero Shot, FNR', zero_fnr)
print('Few Shot, ACC', few_acc)
print('Few Shot, FPR', few_fpr)
print('Few Shot, FNR', few_fnr)

Zero Shot, Acc 0.4447702834799609
Zero Shot, FPR 0.3831867057673509
Zero Shot, FNR 0.17204301075268819
Few Shot, ACC 0.5135223199739328
Few Shot, FPR 0.27891821440208536
Few Shot, FNR 0.20755946562398175


### 4o

In [20]:
zero_no_exp=np.load('base/4o/4o_zero_shot_no_exp.npy')
few_no_exp=np.load('base/4o/4o_few_shot_no_exp.npy')

## post-processing
few_no_exp = np.array([i.split('}')[0].split('{')[1] for i in few_no_exp])
zero_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in zero_no_exp])
few_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in few_no_exp])

In [27]:
acc=accuracy_score(gt, zero_no_exp)
confusion=confusion_matrix(gt, zero_no_exp)

In [28]:
tn, fp, fn, tp=confusion.ravel()

In [30]:
fp

552

In [32]:
confusion[1,0]

996

In [31]:
fn

996

In [23]:
confusion

array([[673, 552],
       [996, 848]])

In [11]:
zero_no_exp=np.load('base/4o/4o_zero_shot_no_exp.npy')
few_no_exp=np.load('base/4o/4o_few_shot_no_exp.npy')

## post-processing
few_no_exp = np.array([i.split('}')[0].split('{')[1] for i in few_no_exp])
zero_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in zero_no_exp])
few_no_exp = np.array([int(float(i)) if i in ['0', '0.0', '1', '1.0'] else 2 for i in few_no_exp])

# metric 
zero_acc=accuracy_score(gt, zero_no_exp)
few_acc=accuracy_score(gt, few_no_exp)
zero_confusion=confusion_matrix(gt, zero_no_exp)
few_confusion=confusion_matrix(gt, few_no_exp)
zero_fpr=(zero_confusion[1,0])/len(test)
zero_fnr=(zero_confusion[0,1])/len(test)
few_fpr=(few_confusion[1,0])/len(test)
few_fnr=(few_confusion[0,1])/len(test)

print('Zero Shot, Acc', zero_acc)
print('Zero Shot, FPR', zero_fpr)
print('Zero Shot, FNR', zero_fnr)
print('Few Shot, ACC', few_acc)
print('Few Shot, FPR', few_fpr)
print('Few Shot, FNR', few_fnr)

Zero Shot, Acc 0.49560117302052786
Zero Shot, FPR 0.32453567937438904
Zero Shot, FNR 0.1798631476050831
Few Shot, ACC 0.6340827631150212
Few Shot, FPR 0.03519061583577713
Few Shot, FNR 0.3307266210492017
