In [1]:
import numpy as np
from collections import Counter
from datasets import load_dataset,concatenate_datasets
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the spatial-join dataset and keep its 'test' split; `gt` holds that
# split's 'label' column as a NumPy array so it can be compared element-wise
# with the prediction arrays scored further down.
ds = load_dataset("beanham/spatial_join_dataset")
test = ds["test"]
gt = np.array(test["label"])

In [3]:
def metric_calculation(pred, gt):
    """Score binary predictions against ground-truth labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels. Values other than 0 and 1 are allowed; they count
        as wrong for accuracy but are neither false positives nor false
        negatives.
    gt : array-like
        Ground-truth labels (0 or 1), same shape as ``pred``.

    Returns
    -------
    tuple of float
        ``(acc, fpr, fnr)`` where

        * ``acc`` is the fraction of samples with ``pred == gt``;
        * ``fpr`` is the fraction of ALL samples with ``gt == 0`` and ``pred == 1``;
        * ``fnr`` is the fraction of ALL samples with ``gt == 1`` and ``pred == 0``.

        Both error figures are normalised by the total sample count, not by
        the number of actual negatives/positives, so they are shares of the
        data set rather than textbook FPR/FNR.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    pred = np.asarray(pred)
    gt = np.asarray(gt)
    if pred.shape != gt.shape:
        raise ValueError(
            f"pred and gt must have the same shape, got {pred.shape} and {gt.shape}"
        )
    total = len(gt)
    if total == 0:
        raise ValueError("cannot compute metrics on empty input")

    # Boolean masks instead of indexing a confusion matrix: the matrix shape
    # depends on which labels happen to occur, so fixed [0, 1] / [1, 0]
    # indexing breaks when only one class is present.
    acc = float(np.mean(pred == gt))
    fpr = float(np.count_nonzero((gt == 0) & (pred == 1)) / total)  ## predict to be 1; actual 0
    fnr = float(np.count_nonzero((gt == 1) & (pred == 0)) / total)  ## predict to be 0; actual 1
    return acc, fpr, fnr

### llama

In [4]:
# Saved llama3 outputs, one array per prompt setting
# (zero/few shot, without/with the heuristic explanation).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = (
    np.load(path)
    for path in (
        'base/llama3/llama3_zero_shot_no_exp.npy',
        'base/llama3/llama3_zero_shot_with_exp.npy',
        'base/llama3/llama3_few_shot_no_exp.npy',
        'base/llama3/llama3_few_shot_with_exp.npy',
    )
)

## post-processing
# Zero-shot, no heuristic: if the text contains 'Response', keep what lies
# between the first and second newline after it; otherwise keep the first
# whitespace-separated token.
zero_no_exp = np.array([
    text.split('Response')[1].split('\n')[1] if 'Response' in text else text.split()[0]
    for text in zero_no_exp
])
# Zero-shot, with heuristic: keep the second whitespace-separated token after
# 'Response' instead; same fallback.
zero_with_exp = np.array([
    text.split('Response')[1].split()[1] if 'Response' in text else text.split()[0]
    for text in zero_with_exp
])
# Few-shot (both settings): keep the text before the first '}' with every '{'
# removed; texts without '{' fall back to their first token.
few_no_exp, few_with_exp = (
    np.array([
        text.split('}')[0].replace('{', '') if '{' in text else text.split()[0]
        for text in outputs
    ])
    for outputs in (few_no_exp, few_with_exp)
)

# Map the extracted strings to integer labels; anything that is not exactly
# one of the four accepted spellings becomes 2 (unparseable answer).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = (
    np.array([int(float(tok)) if tok in ['0', '0.0', '1', '1.0'] else 2 for tok in tokens])
    for tokens in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)
)

In [5]:
# Label distribution for each prompt setting, in the order
# zero/no-exp, zero/with-exp, few/no-exp, few/with-exp.
tuple(map(Counter, (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)))

(Counter({0: 1962, 1: 1062, 2: 45}),
 Counter({1: 1942, 0: 1125, 2: 2}),
 Counter({0: 2668, 1: 392, 2: 9}),
 Counter({1: 1722, 0: 1334, 2: 13}))

In [6]:
# One line per prompt setting: the label followed by the tuple returned by metric_calculation.
for setting, preds in (
    ('Zero, No Heuristic', zero_no_exp),
    ('Zero, With Heuristic', zero_with_exp),
    ('Few, No Heuristic', few_no_exp),
    ('Few, With Heuristic', few_with_exp),
):
    print(setting, metric_calculation(preds, gt))

Zero, No Heuristic (0.47409579667644186, 0.1335940045617465, 0.3776474421635712)
Zero, With Heuristic (0.5294884327142392, 0.2512218963831867, 0.21863799283154123)
Few, No Heuristic (0.47768002606712284, 0.024437927663734114, 0.494949494949495)
Few, With Heuristic (0.6168132942326491, 0.17139133268165527, 0.20755946562398175)


### mistral

In [7]:
# Saved mistral outputs, one array per prompt setting
# (zero/few shot, without/with the heuristic explanation).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = (
    np.load(path)
    for path in (
        'base/mistral/mistral_zero_shot_no_exp.npy',
        'base/mistral/mistral_zero_shot_with_exp.npy',
        'base/mistral/mistral_few_shot_no_exp.npy',
        'base/mistral/mistral_few_shot_with_exp.npy',
    )
)

In [8]:
## post-processing
# Zero-shot (both settings): if the text contains 'Response', keep what lies
# between the first and second newline after it; otherwise keep the first
# whitespace-separated token. Any '</s>' marker is stripped either way.
zero_no_exp, zero_with_exp = (
    np.array([
        text.split('Response')[1].split('\n')[1].replace('</s>', '')
        if 'Response' in text
        else text.split()[0].replace('</s>', '')
        for text in outputs
    ])
    for outputs in (zero_no_exp, zero_with_exp)
)
# Few-shot (both settings): if the text contains 'Response', keep what sits
# between the first '{' and the following '}' after it; otherwise keep the
# text before the first '}' with '{' and '</s>' removed.
few_no_exp, few_with_exp = (
    np.array([
        text.split('Response')[1].split('}')[0].split('{')[1]
        if 'Response' in text
        else text.split('}')[0].replace('{', '').replace('</s>', '')
        for text in outputs
    ])
    for outputs in (few_no_exp, few_with_exp)
)

# Map the extracted strings to integer labels; anything that is not exactly
# one of the four accepted spellings becomes 2 (unparseable answer).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = (
    np.array([int(float(tok)) if tok in ['0', '0.0', '1', '1.0'] else 2 for tok in tokens])
    for tokens in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)
)

In [9]:
# Label distribution for each prompt setting, in the order
# zero/no-exp, zero/with-exp, few/no-exp, few/with-exp.
tuple(map(Counter, (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)))

(Counter({1: 3065, 0: 4}),
 Counter({1: 3069}),
 Counter({0: 2239, 1: 830}),
 Counter({0: 2429, 1: 637, 2: 3}))

In [10]:
# One line per prompt setting: the label followed by the tuple returned by metric_calculation.
for setting, preds in (
    ('Zero, No Heuristic', zero_no_exp),
    ('Zero, With Heuristic', zero_with_exp),
    ('Few, No Heuristic', few_no_exp),
    ('Few, With Heuristic', few_with_exp),
):
    print(setting, metric_calculation(preds, gt))

Zero, No Heuristic (0.6008471814923427, 0.3985011404366243, 0.0006516780710329097)
Zero, With Heuristic (0.6008471814923427, 0.3991528185076572, 0.0)
Few, No Heuristic (0.5158031932225481, 0.07689801238188335, 0.4072987943955686)
Few, With Heuristic (0.5187357445421962, 0.04398826979472141, 0.4362984685565331)


### 4o-mini

In [4]:
# Saved 4o-mini outputs, one array per prompt setting
# (zero/few shot, without/with the heuristic explanation).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = (
    np.load(path)
    for path in (
        'base/4o_mini/4o_mini_zero_shot_no_exp.npy',
        'base/4o_mini/4o_mini_zero_shot_with_exp.npy',
        'base/4o_mini/4o_mini_few_shot_no_exp.npy',
        'base/4o_mini/4o_mini_few_shot_with_exp.npy',
    )
)

## post-processing
# Few-shot only: keep what sits between the first '{' and the following '}'.
# The zero-shot arrays get no extraction step and go straight to the mapping below.
few_no_exp, few_with_exp = (
    np.array([text.split('}')[0].split('{')[1] for text in outputs])
    for outputs in (few_no_exp, few_with_exp)
)

# Map the strings to integer labels; anything that is not exactly one of the
# four accepted spellings becomes 2 (unparseable answer).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = (
    np.array([int(float(tok)) if tok in ['0', '0.0', '1', '1.0'] else 2 for tok in tokens])
    for tokens in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)
)

In [5]:
# Label distribution for each prompt setting, in the order
# zero/no-exp, zero/with-exp, few/no-exp, few/with-exp.
tuple(map(Counter, (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)))

(Counter({0: 1873, 1: 1196}),
 Counter({1: 2321, 0: 748}),
 Counter({1: 1625, 0: 1444}),
 Counter({1: 2211, 0: 858}))

In [6]:
# One line per prompt setting: the label followed by the tuple returned by metric_calculation.
for setting, preds in (
    ('Zero, No Heuristic', zero_no_exp),
    ('Zero, With Heuristic', zero_with_exp),
    ('Few, No Heuristic', few_no_exp),
    ('Few, With Heuristic', few_with_exp),
):
    print(setting, metric_calculation(preds, gt))

Zero, No Heuristic (0.4447702834799609, 0.17204301075268819, 0.3831867057673509)
Zero, With Heuristic (0.8126425545780385, 0.17139133268165527, 0.01596611274030629)
Few, No Heuristic (0.5135223199739328, 0.20755946562398175, 0.27891821440208536)
Few, With Heuristic (0.8797653958944281, 0.11990876507005539, 0.00032583903551645487)


### 4o

In [7]:
# Saved 4o outputs, one array per prompt setting
# (zero/few shot, without/with the heuristic explanation).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = (
    np.load(path)
    for path in (
        'base/4o/4o_zero_shot_no_exp.npy',
        'base/4o/4o_zero_shot_with_exp.npy',
        'base/4o/4o_few_shot_no_exp.npy',
        'base/4o/4o_few_shot_with_exp.npy',
    )
)

## post-processing
# Few-shot only: keep what sits between the first '{' and the following '}'.
# The zero-shot arrays get no extraction step and go straight to the mapping below.
few_no_exp, few_with_exp = (
    np.array([text.split('}')[0].split('{')[1] for text in outputs])
    for outputs in (few_no_exp, few_with_exp)
)

# Map the strings to integer labels; anything that is not exactly one of the
# four accepted spellings becomes 2 (unparseable answer).
zero_no_exp, zero_with_exp, few_no_exp, few_with_exp = (
    np.array([int(float(tok)) if tok in ['0', '0.0', '1', '1.0'] else 2 for tok in tokens])
    for tokens in (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)
)

In [8]:
# Label distribution for each prompt setting, in the order
# zero/no-exp, zero/with-exp, few/no-exp, few/with-exp.
tuple(map(Counter, (zero_no_exp, zero_with_exp, few_no_exp, few_with_exp)))

(Counter({0: 1669, 1: 1400}),
 Counter({1: 1792, 0: 1277}),
 Counter({1: 2751, 0: 318}),
 Counter({1: 2059, 0: 1010}))

In [9]:
# One line per prompt setting: the label followed by the tuple returned by metric_calculation.
for setting, preds in (
    ('Zero, No Heuristic', zero_no_exp),
    ('Zero, With Heuristic', zero_with_exp),
    ('Few, No Heuristic', few_no_exp),
    ('Few, With Heuristic', few_with_exp),
):
    print(setting, metric_calculation(preds, gt))

Zero, No Heuristic (0.49560117302052786, 0.1798631476050831, 0.32453567937438904)
Zero, With Heuristic (0.9139784946236559, 0.03453893776474422, 0.05148256761159987)
Few, No Heuristic (0.6340827631150212, 0.3307266210492017, 0.03519061583577713)
Few, With Heuristic (0.9273378950798306, 0.07135874877810362, 0.0013033561420658195)


### o3-mini

In [49]:
# Saved o3-mini outputs. Only three settings are loaded here; there is no
# zero-shot-with-heuristic file in this section.
zero_no_exp, few_no_exp, few_with_exp = (
    np.load(path)
    for path in (
        'base/o3_mini/o3_mini_zero_shot_no_exp.npy',
        'base/o3_mini/o3_mini_few_shot_no_exp.npy',
        'base/o3_mini/o3_mini_few_shot_with_exp.npy',
    )
)

# Wrap bare '1' / '0' answers in braces so that every few_with_exp entry
# contains the '{...}' pair the brace-based extraction of this section expects.
few_with_exp[few_with_exp == '1'] = '{1}'
few_with_exp[few_with_exp == '0'] = '{0}'

In [50]:
# Few-shot (both settings): keep what sits between the first '{' and the
# following '}', looking only at the text after 'Response:' when that marker
# is present.
# The guard tests for 'Response:' (with the colon), i.e. the exact token that
# is split on. Testing for plain 'Response' would send a text that mentions
# "Response" without the colon down the split('Response:')[1] path, which
# raises IndexError; such a text now uses the plain brace extraction instead.
few_no_exp, few_with_exp = (
    np.array([
        text.split('Response:')[1].split('}')[0].split('{')[1]
        if 'Response:' in text
        else text.split('}')[0].split('{')[1]
        for text in outputs
    ])
    for outputs in (few_no_exp, few_with_exp)
)

# Map the strings to integer labels; anything that is not exactly one of the
# four accepted spellings becomes 2 (unparseable answer). zero_no_exp gets no
# extraction step and is mapped as loaded.
zero_no_exp, few_no_exp, few_with_exp = (
    np.array([int(float(tok)) if tok in ['0', '0.0', '1', '1.0'] else 2 for tok in tokens])
    for tokens in (zero_no_exp, few_no_exp, few_with_exp)
)

In [52]:
# One line per prompt setting: the label followed by the tuple returned by metric_calculation.
# There is no 'Zero, With Heuristic' row because the o3-mini section loads no
# zero_with_exp predictions.
for setting, preds in (
    ('Zero, No Heuristic', zero_no_exp),
    ('Few, No Heuristic', few_no_exp),
    ('Few, With Heuristic', few_with_exp),
):
    print(setting, metric_calculation(preds, gt))

Zero, No Heuristic (0.9231019876181167, 0.05311176278918214, 0.023786249592701206)
Few, No Heuristic (0.9198435972629521, 0.03779732811990877, 0.042359074617139135)
Few, With Heuristic (0.9364613880742912, 0.06158357771260997, 0.0019550342130987292)
