In [1]:
import numpy as np
import pandas as pd

from itertools import product
from collections import Counter
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
def metric_calculation(pred, gt):
    """Compute accuracy, macro-F1, false-positive rate and false-negative rate.

    Parameters
    ----------
    pred : array-like
        Predicted labels. A value other than 0 or 1 (for example the sentinel
        2 that ``post_processing`` emits for unparseable generations) is
        counted as neither a 0-prediction nor a 1-prediction.
    gt : array-like
        Ground-truth labels (0 = negative, 1 = positive), aligned with ``pred``.

    Returns
    -------
    tuple
        ``(acc, f1, fpr, fnr)`` where

        * ``acc`` is ``accuracy_score(gt, pred)``;
        * ``f1`` is ``f1_score(gt, pred, average='macro')``;
        * ``fpr`` is (#actual 0 predicted 1) / (#actual 0);
        * ``fnr`` is (#actual 1 predicted 0) / (#actual 1).

        A rate whose denominator is zero (the class is absent from ``gt``)
        is returned as ``nan``.
    """
    acc = accuracy_score(gt, pred)
    f1 = f1_score(gt, pred, average='macro')

    gt_arr = np.asarray(gt)
    pred_arr = np.asarray(pred)

    # Count by label value rather than by position in a confusion matrix:
    # matrix positions shift when a label is missing from the data, and the
    # matrix collapses to 1x1 when only a single class is present.
    actual_neg = gt_arr == 0
    actual_pos = gt_arr == 1
    false_pos = np.count_nonzero(actual_neg & (pred_arr == 1))
    false_neg = np.count_nonzero(actual_pos & (pred_arr == 0))
    n_neg = np.count_nonzero(actual_neg)
    n_pos = np.count_nonzero(actual_pos)

    # Rates are normalised by the size of the relevant actual class,
    # not by the total number of samples.
    fpr = false_pos / n_neg if n_neg else float('nan')
    fnr = false_neg / n_pos if n_pos else float('nan')
    return acc, f1, fpr, fnr
    
def post_processing(pred):
    """Convert raw model generations into integer class labels.

    Each generation is lower-cased and a single candidate token is extracted:

    * if the text contains ``'response'``, the text following its first
      occurrence is split on whitespace and the *second* token is taken
      (the first token is whatever is attached to the marker, e.g. ``':'``);
    * otherwise, if the text contains ``'output'``, the same rule is applied
      with that marker;
    * otherwise the first whitespace-separated token of the text is taken.

    Any ``'</s>'`` inside the token is removed. The token maps to 0 for
    ``'0'``/``'0.0'`` and to 1 for ``'1'``/``'1.0'``. Anything else, including
    generations where the expected token does not exist, maps to the sentinel 2.

    Parameters
    ----------
    pred : iterable of str
        Raw generations.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry (0, 1 or 2) per generation. The dtype is
        integer even when ``pred`` is empty.
    """
    markers = ('response', 'output')  # checked in this priority order
    token_to_label = {'0': 0, '0.0': 0, '1': 1, '1.0': 1}
    unparseable = 2

    labels = []
    for raw in pred:
        text = raw.lower()
        marker = next((m for m in markers if m in text), None)
        try:
            if marker is None:
                token = text.split()[0]
            else:
                token = text.split(marker)[1].split()[1]
        except IndexError:
            # The expected token is missing (empty text, or nothing after
            # the marker): treat the generation as unparseable.
            labels.append(unparseable)
            continue
        labels.append(token_to_label.get(token.replace('</s>', ''), unparseable))

    # Explicit dtype: np.array([]) would otherwise default to float64.
    return np.array(labels, dtype=int)

In [3]:
# Load the spatial-join dataset through `datasets.load_dataset` and keep its 'test' split.
ds = load_dataset("beanham/spatial_join_dataset")
test=ds['test']
# Ground-truth labels as a NumPy array, so they can be compared element-wise with predictions.
gt=np.array(test['label'])

In [5]:
# Inspect a single correction run: pick the model and heuristic-value setting.
model = '4o_mini'
value = 'worst_single'

# The generations and their reviews are stored side by side under one path stem.
correction_stem = f'base/{model}_correction/{model}_{value}_few_shot_with_heur_value_all_correction'
outputs = np.load(f'{correction_stem}.npy')
reviews = np.load(f'{correction_stem}_reviews.npy')

# Parse the raw generations into 0/1 labels (2 = unparseable).
processed = post_processing(outputs)

# Predictions must line up one-to-one with the labels; otherwise the
# element-wise comparison below does not identify individual examples.
assert processed.shape == gt.shape, (
    f'prediction/label shape mismatch: {processed.shape} vs {gt.shape}'
)

# Indices of the test examples whose parsed prediction disagrees with the label.
wrong_index = np.where(processed != gt)[0]

In [16]:
# Show the positions where the parsed prediction differs from the ground-truth label.
wrong_index

array([  0,   2,   5,   7,  10,  16,  18,  23,  26,  28,  30,  31,  32,
        39,  46,  50,  51,  54,  59,  60,  71,  74,  77,  80,  91,  98,
       108, 112, 113, 126, 128, 132, 135, 136, 138, 143, 144, 146, 147,
       152, 155, 156, 159, 162, 163, 166, 167, 173, 180, 183, 184, 198,
       205, 207, 211, 215, 217, 222, 224, 225, 228, 229, 232, 233, 244,
       246, 248, 256, 257, 267, 269, 270, 271, 272, 277, 281, 283, 284,
       287, 288, 291, 294, 296, 297, 308, 310, 316, 317, 324, 325, 329,
       330, 331, 332, 333, 336, 346, 347, 355, 368, 369, 373, 376, 377,
       384, 385, 390, 391, 395, 402, 405, 408, 409, 410, 424, 429, 430,
       431, 438, 442, 445, 448, 452, 462, 465, 467, 468, 474, 478, 484,
       485, 486, 487, 491, 495, 496, 503, 505, 511, 516, 519, 520, 535,
       536, 537, 548, 549, 556, 561, 564, 566, 567, 570, 571, 572, 575,
       577, 581, 587, 591, 594, 600, 601, 604, 608, 610, 615, 617, 631,
       635, 641, 642, 646, 653, 654, 656, 657, 660, 663, 669, 67

In [13]:
# Print the review stored for the first mismatching example.
print(reviews[wrong_index[0]])

The response provided is incorrect. Let's evaluate the conditions based on the provided statistics:

1. **Parallelism**: The min_angle is 1.094690596035889 degrees, which indicates that the sidewalk is approximately parallel to the road. This condition is satisfied.

2. **Clearance**: The min_distance is 8.916691213846207 meters, which suggests that the sidewalk and road maintain a sufficient distance apart. This condition is also satisfied.

3. **Overlap**: The max_area is 0.538494352036627, which indicates that there is a significant amount of overlap within the 10-meter buffer around each geometry. However, the requirement states that the sidewalk and road must not directly overlap. Given that the max_area exceeds 0.5, this condition is not satisfied.

Since the overlap condition is not met, the correct response should be 0, indicating that the sidewalk does not run alongside the road. 

### Correct Response: 0


In [7]:
# Score every (model, heuristic-value setting) correction run against the ground truth.
# NOTE: this loop rebinds the notebook-level names `model` and `value`.
metric_values = ['worst_single', 'best_single', 'worst_comb', 'best_comb', 'worst_all', 'best_all']
results = []
for model, value in product(['4o_mini', 'qwen'], metric_values):
    few_shot = np.load(f'base/{model}_correction/{model}_{value}_few_shot_with_heur_value_all_correction.npy')
    few_shot = post_processing(few_shot)
    few_metrics = metric_calculation(few_shot, gt)
    # Only the first two returned metrics (accuracy and F1) go into the table.
    results.append([model, value, 'few_shot', *few_metrics[:2]])
# Turn the collected rows into a table with one row per run.
results = pd.DataFrame(results, columns=['model', 'value', 'prompt', 'acc', 'f1'])

In [8]:
# Display the accuracy / F1 table, one row per model and heuristic-value setting.
results

Unnamed: 0,model,value,prompt,acc,f1
0,4o_mini,worst_single,few_shot,0.715,0.713936
1,4o_mini,best_single,few_shot,0.687,0.686804
2,4o_mini,worst_comb,few_shot,0.73,0.72889
3,4o_mini,best_comb,few_shot,0.683,0.682769
4,4o_mini,worst_all,few_shot,0.719,0.717808
5,4o_mini,best_all,few_shot,0.702,0.701569
6,qwen,worst_single,few_shot,0.887,0.885486
7,qwen,best_single,few_shot,0.954,0.951287
8,qwen,worst_comb,few_shot,0.877,0.875521
9,qwen,best_comb,few_shot,0.981,0.980115
