In [1]:
import numpy as np
from itertools import product
from collections import Counter
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
def metric_calculation(pred, gt):
    """Score binary predictions against ground-truth labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels. 1 (or True) is the positive class and 0 (or False)
        the negative class. Any other value (for example the placeholder 2
        that `post_processing` emits for unparseable generations) is treated
        as neither a positive nor a negative prediction.
    gt : array-like
        Ground-truth labels (0 or 1), same length as `pred`.

    Returns
    -------
    tuple
        (accuracy, macro-averaged F1, false-positive rate, false-negative rate).
        The false-positive rate is FP / (number of actual negatives) and the
        false-negative rate is FN / (number of actual positives); each is NaN
        when its denominator is zero.
    """
    acc = accuracy_score(gt, pred)
    f1 = f1_score(gt, pred, average='macro')

    pred_arr = np.asarray(pred)
    gt_arr = np.asarray(gt)
    actual_neg = gt_arr == 0
    actual_pos = gt_arr == 1

    # Count the error cells directly rather than indexing a confusion matrix:
    # the matrix layout depends on which labels happen to be present (a
    # single-class input or the placeholder label 2 changes its size).
    false_pos = np.count_nonzero(actual_neg & (pred_arr == 1))  # predicted 1, actual 0
    false_neg = np.count_nonzero(actual_pos & (pred_arr == 0))  # predicted 0, actual 1
    n_neg = np.count_nonzero(actual_neg)
    n_pos = np.count_nonzero(actual_pos)

    # Rates are normalised by the size of the relevant actual class, not by
    # the total number of samples.
    fpr = false_pos / n_neg if n_neg else float('nan')
    fnr = false_neg / n_pos if n_pos else float('nan')
    return acc, f1, fpr, fnr

In [3]:
def post_processing(pred):
    """Convert raw model generations into integer class labels.

    Each entry is converted to text, lower-cased, and the answer token is
    located as follows:

    * if the text contains ``'response'``: the second whitespace-separated
      token of the text that follows the first occurrence of that marker
      (the first token is whatever trails the marker, e.g. ``':'``);
    * otherwise, if it contains ``'output'``: the same rule with that marker;
    * otherwise: the first whitespace-separated token of the text.

    Any ``'</s>'`` inside the token is stripped. Tokens equal to ``'0'``,
    ``'0.0'``, ``'1'`` or ``'1.0'`` map to 0 / 1; everything else, including
    entries where the expected token is missing, maps to the placeholder 2.

    Parameters
    ----------
    pred : iterable
        Raw generations. Non-string entries are converted with ``str()``
        before parsing, so e.g. ``None`` becomes 2 and the number 1 becomes 1.

    Returns
    -------
    numpy.ndarray
        One label (0, 1 or 2) per input entry, in input order.
    """
    valid_tokens = ('0', '0.0', '1', '1.0')
    unparseable = 2

    labels = []
    for raw in pred:
        text = str(raw).lower()

        # Pick the candidate tokens and the position of the answer within them.
        # 'response' takes precedence over 'output' when both are present.
        if 'response' in text:
            tokens, position = text.split('response')[1].split(), 1
        elif 'output' in text:
            tokens, position = text.split('output')[1].split(), 1
        else:
            tokens, position = text.split(), 0

        # An explicit bounds check replaces the former bare `except:` so that
        # only a missing token is treated as unparseable.
        token = tokens[position].replace('</s>', '') if len(tokens) > position else None
        labels.append(int(float(token)) if token in valid_tokens else unparseable)

    return np.array(labels)

In [4]:
# Load the benchmark through `datasets.load_dataset` and keep its test split.
ds = load_dataset("beanham/spatial_join_dataset")
test=ds['test']
# Ground-truth labels of the test split as a NumPy array, so they can be
# compared element-wise with post-processed predictions.
gt=np.array(test['label'])
## evaluate on a subset
# The seed is fixed so the same subset is drawn on every run.
np.random.seed(100)
# 1000 indices drawn from [0, 3069). randint samples with replacement, so an
# index may appear more than once.
# NOTE(review): 3069 is presumably the size of the test split -- confirm
# against len(test) before changing it.
index=np.random.randint(0, 3069, 1000)

### 4o_mini

In [37]:
# 4o_mini correction run (degree 1, few-shot): load the saved answers and the
# matching review texts, then find the examples whose parsed answer disagrees
# with the ground truth.
outputs = np.load('base/4o_mini_correction/4o_mini_degree_1_few_shot_with_heur_value_all_correction.npy')
reviews = np.load('base/4o_mini_correction/4o_mini_degree_1_few_shot_with_heur_value_all_correction_reviews.npy')
new_outputs = post_processing(outputs)
wrong_index = np.nonzero(new_outputs != gt)[0]

In [38]:
# Show the saved review text for the second misclassified example
# (wrong_index is 0-based).
reviews[wrong_index[1]]

"The response provided is incorrect based on the conditions outlined for determining whether the sidewalk runs alongside the road. Let's evaluate the conditions:\n\n1. **Parallelism**: The min_angle is 0.045416253580953025 degrees, which indicates that the sidewalk and road are nearly parallel. This condition is satisfied.\n\n2. **Clearance**: The min_distance is 5.867581638586974 meters. This indicates that the sidewalk and road maintain a distance apart, which satisfies the clearance condition.\n\n3. **Overlap**: The max_area is 0.6352940789228652, which indicates a significant overlap in the 10-meter buffer area around both the sidewalk and the road. This condition is not satisfied, as the sidewalk and road should not have significant overlap in their buffers.\n\nSince the overlap condition is not satisfied, the correct response should be 0, indicating that the sidewalk does not run alongside the road. \n\nTherefore, the original response of 1 is incorrect"

In [32]:
# qwen correction run (degree 1, few-shot): load the saved answers and the
# matching review texts, then find the examples whose parsed answer disagrees
# with the ground truth.
outputs = np.load('base/qwen_correction/qwen_degree_1_few_shot_with_heur_value_all_correction.npy')
reviews = np.load('base/qwen_correction/qwen_degree_1_few_shot_with_heur_value_all_correction_reviews.npy')
new_outputs = post_processing(outputs)
wrong_index = np.nonzero(new_outputs != gt)[0]

In [35]:
# Show the saved review text for the third misclassified example
# (wrong_index is 0-based).
reviews[wrong_index[2]]

"Upon reviewing the given input and response, here's the breakdown:\n\n1. **Parallelism**:  \n   - The `min_angle` is 0.22°, which indicates that the sidewalk and road are almost perfectly parallel. This satisfies the condition for parallelism.\n\n2. **Clearance**:  \n   - The `min_distance` is approximately 8.37 meters. This value suggests there is sufficient clearance between the sidewalk and the road, satisfying the clearance condition.\n\n3. **Overlap**:  \n   - The `max_area` is about 4.18%, which is relatively low. However, it still indicates some overlap within the 10-meter buffer zone around the sidewalk and road. While the overlap percentage is not very large, it meets the criteria for minimal allowable overlap in this context.\n\nBased on these evaluations:\n- Parallelism is satisfied.\n- Clearance is satisfied.\n- Overlap, though minimal, is still present within acceptable limits.\n\nThus, the response of"

In [36]:
# Raw saved answer and ground-truth label for that same misclassified example.
outputs[wrong_index[2]], gt[wrong_index[2]]

('1', 0)

In [6]:
# The heuristic baseline only needs the angle column, which is the same for
# every threshold, so it is read once up front.
min_angle_values = np.array(test['min_angle'])

for threshold in [1, 2, 5, 10, 20]:
    print('----------------------------------')
    print(f'Threshold: {threshold}...')

    # Saved 4o_mini correction answers for this threshold, parsed into labels.
    zero_shot_all = post_processing(
        np.load(f'base/4o_mini_correction/4o_mini_degree_{threshold}_zero_shot_with_heur_value_all_correction.npy')
    )
    few_shot_all = post_processing(
        np.load(f'base/4o_mini_correction/4o_mini_degree_{threshold}_few_shot_with_heur_value_all_correction.npy')
    )

    # Baseline: predict positive whenever min_angle is within the threshold.
    heuristic_acc = metric_calculation(min_angle_values <= threshold, gt)[0]
    print(f'Heuristics: ', round(heuristic_acc, 3))

    zero_shot_acc = metric_calculation(zero_shot_all, gt)[0]
    print(f'Zero-Shot-All:   ', zero_shot_acc)

    few_shot_acc = metric_calculation(few_shot_all, gt)[0]
    print(f'Few-Shot-All:    ', few_shot_acc)

----------------------------------
Threshold: 1...
Heuristics:  0.867
Zero-Shot-All:    0.912
Few-Shot-All:     0.69
----------------------------------
Threshold: 2...
Heuristics:  0.908
Zero-Shot-All:    0.918
Few-Shot-All:     0.692
----------------------------------
Threshold: 5...
Heuristics:  0.946
Zero-Shot-All:    0.922
Few-Shot-All:     0.712
----------------------------------
Threshold: 10...
Heuristics:  0.948
Zero-Shot-All:    0.908
Few-Shot-All:     0.693
----------------------------------
Threshold: 20...
Heuristics:  0.934
Zero-Shot-All:    0.906
Few-Shot-All:     0.736


### qwen

In [7]:
# The heuristic baseline only needs the angle column, which is the same for
# every threshold, so it is read once up front.
min_angle_values = np.array(test['min_angle'])

# Only the zero-shot qwen correction runs are scored here. Few-shot scoring is
# disabled in this cell; to enable it, load
# base/qwen_correction/qwen_degree_{threshold}_few_shot_with_heur_value_all_correction.npy,
# pass it through post_processing and print its accuracy as 'Few-Shot-All:'.
for threshold in [1, 2, 5, 10, 20]:
    print('----------------------------------')
    print(f'Threshold: {threshold}...')

    # Saved qwen correction answers for this threshold, parsed into labels.
    zero_shot_all = post_processing(
        np.load(f'base/qwen_correction/qwen_degree_{threshold}_zero_shot_with_heur_value_all_correction.npy')
    )

    # Baseline: predict positive whenever min_angle is within the threshold.
    heuristic_acc = metric_calculation(min_angle_values <= threshold, gt)[0]
    print(f'Heuristics: ', round(heuristic_acc, 3))

    zero_shot_acc = metric_calculation(zero_shot_all, gt)[0]
    print(f'Zero-Shot-All:   ', zero_shot_acc)

----------------------------------
Threshold: 1...
Heuristics:  0.867
Zero-Shot-All:    0.928
----------------------------------
Threshold: 2...
Heuristics:  0.908
Zero-Shot-All:    0.937
----------------------------------
Threshold: 5...
Heuristics:  0.946
Zero-Shot-All:    0.94
----------------------------------
Threshold: 10...
Heuristics:  0.948
Zero-Shot-All:    0.939
----------------------------------
Threshold: 20...
Heuristics:  0.934
Zero-Shot-All:    0.929


In [6]:
# The combined qwen runs are scored against the sampled subset of the labels
# (gt[index]) rather than the full test split.
# NOTE(review): presumably these runs were generated for that subset only --
# confirm the saved arrays have len(index) entries.
gt_subset = gt[index]

zero_shot_comb = post_processing(
    np.load(f'base/qwen_ec/qwen_comb_zero_shot_with_heur_value_comb_ec.npy')
)
few_shot_comb = post_processing(
    np.load(f'base/qwen_ec/qwen_comb_few_shot_with_heur_value_comb_ec.npy')
)

zero_shot_comb_acc = metric_calculation(zero_shot_comb, gt_subset)[0]
print(f'Zero-Shot-Comb:   ', zero_shot_comb_acc)

few_shot_comb_acc = metric_calculation(few_shot_comb, gt_subset)[0]
print(f'Few-Shot-Comb:    ', few_shot_comb_acc)

Zero-Shot-Comb:    0.924
Few-Shot-Comb:     0.949
