In [1]:
import string
import numpy as np
import pandas as pd
from itertools import product
from collections import Counter
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
def metric_calculation(pred, gt):
    """Score predicted labels against ground-truth labels.

    Parameters
    ----------
    pred : array-like of int
        Predicted labels. The post-processing functions in this notebook emit
        0, 1, or 2, where 2 marks an output that could not be parsed.
    gt : array-like of int
        Ground-truth labels (0 or 1), same length as ``pred``.

    Returns
    -------
    tuple
        ``(acc, f1, fpr, fnr)`` where ``acc`` is plain accuracy, ``f1`` is the
        macro-averaged F1, ``fpr`` is the share of actual-0 samples predicted
        as 1, and ``fnr`` is the share of actual-1 samples predicted as 0.
        A rate whose denominator is zero is reported as 0.0.
    """
    acc = accuracy_score(gt, pred)
    f1 = f1_score(gt, pred, average='macro')
    # Pin the label order so row/column 0 and 1 always mean class 0 and 1,
    # even when a class is absent from the data or the sentinel 2 is present.
    confusion = confusion_matrix(gt, pred, labels=[0, 1, 2])
    actual_negatives = confusion[0].sum()
    actual_positives = confusion[1].sum()
    # Rates are normalised by the size of the true class, not by len(gt).
    fpr = confusion[0, 1] / actual_negatives if actual_negatives else 0.0
    fnr = confusion[1, 0] / actual_positives if actual_positives else 0.0
    return acc, f1, fpr, fnr

In [3]:
def post_processing_old(pred):
    """Convert raw model completions into integer labels.

    Each completion is lower-cased, then one token is picked:

    * if it contains ``'response'``: the second whitespace token after the
      first occurrence of that word;
    * else if it contains ``'output'``: the second whitespace token after the
      first occurrence of that word;
    * otherwise: the first whitespace token of the completion.

    Any ``'</s>'`` inside the picked token is removed. The token maps to 0 for
    ``'0'``/``'0.0'``, to 1 for ``'1'``/``'1.0'``, and to 2 for anything else,
    including the case where the expected token does not exist.

    Parameters
    ----------
    pred : iterable of str
        Raw completions.

    Returns
    -------
    numpy.ndarray
        Integer array with one label in {0, 1, 2} per completion.
    """
    label_of = {'0': 0, '0.0': 0, '1': 1, '1.0': 1}
    labels = []
    for raw in pred:
        text = raw.lower()
        # 'response' takes precedence over 'output' when both are present.
        if 'response' in text:
            tokens, position = text.split('response')[1].split(), 1
        elif 'output' in text:
            tokens, position = text.split('output')[1].split(), 1
        else:
            tokens, position = text.split(), 0
        # A missing token is the only failure mode here; test for it directly
        # instead of hiding it behind a bare ``except``.
        token = tokens[position].replace('</s>', '') if len(tokens) > position else None
        labels.append(label_of.get(token, 2))
    # Explicit dtype keeps an empty input from turning into a float array.
    return np.array(labels, dtype=int)

In [6]:
# Keywords that may introduce the final answer, in the order they are tried.
_ANSWER_KEYWORDS = ('response', 'output', 'return', 'result', 'plaintext', 'json')
# Tokens accepted as a label; every other token becomes the sentinel 2.
_LABEL_OF = {'0': 0, '0.0': 0, '1': 1, '1.0': 1}
# Translation table that deletes every ASCII punctuation character.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def _first_token(text):
    """Return the first whitespace-delimited token of ``text``, or None if there is none."""
    tokens = text.split()
    return tokens[0] if tokens else None


def _extract_answer_token(raw):
    """Return the candidate answer token of one free-form completion, or None."""
    head = _first_token(raw)
    if head in ('0', '1'):
        # The completion already starts with a bare label.
        return head
    text = raw.lower().replace('</s>', '').replace('boxed', '')
    non_empty_lines = [line for line in text.split('\n') if line != '']
    # Only the last three non-empty lines are searched, with punctuation removed.
    tail = ' '.join(non_empty_lines[-3:]).translate(_STRIP_PUNCTUATION)
    for keyword in _ANSWER_KEYWORDS:
        if keyword in tail:
            # First numeric token after the LAST occurrence of the keyword.
            numeric = [t for t in tail.split(keyword)[-1].split() if t.isnumeric()]
            return numeric[0] if numeric else None
    return _first_token(tail)


def post_processing(pred, model):
    """Convert raw model completions into integer labels.

    For ``model == 'mistral'`` the label token is the first whitespace token of
    the completion after removing ``'</s>'``. For any other model the token is
    the first whitespace token when it is exactly ``'0'`` or ``'1'``; otherwise
    the last three non-empty lines are lower-cased, stripped of ``'</s>'``,
    ``'boxed'`` and punctuation, and searched for the first matching keyword
    (response, output, return, result, plaintext, json); the first numeric
    token after that keyword is used. With no keyword, the first token of
    those lines is used.

    Tokens ``'0'``/``'0.0'`` map to 0 and ``'1'``/``'1.0'`` to 1. Everything
    else maps to 2, including empty or whitespace-only completions, which
    previously raised ``IndexError``.

    Parameters
    ----------
    pred : iterable of str
        Raw completions.
    model : str
        Model identifier; only ``'mistral'`` selects the simple parser.

    Returns
    -------
    numpy.ndarray
        Integer array with one label in {0, 1, 2} per completion.
    """
    if model == 'mistral':
        tokens = [_first_token(p.replace('</s>', '')) for p in pred]
    else:
        tokens = [_extract_answer_token(p) for p in pred]
    # Explicit dtype keeps an empty input from turning into a float array.
    return np.array([_LABEL_OF.get(token, 2) for token in tokens], dtype=int)

In [7]:
# Load the dataset and keep the labels of its test split as ground truth.
ds = load_dataset("beanham/spatial_union_dataset")
test = ds['test']
gt = np.array(test['label'])

# Prompt configurations to evaluate, grouped by prompting regime.
zero_shot_configs = [
    'zero_shot_with_heur_value_angle_cot',
    'zero_shot_with_heur_value_area_cot',
    'zero_shot_with_heur_value_angle_area_cot',
]
few_shot_configs = [
    'few_shot_with_heur_value_angle_cot',
    'few_shot_with_heur_value_area_cot',
    'few_shot_with_heur_value_angle_area_cot',
]
configs = zero_shot_configs + few_shot_configs

In [8]:
# Score every (model, config) pair and collect one row per pair.
models = ['mistral', '4o_mini', 'qwen_plus', '4o']
rows = []
for model in models:
    print(f'Model: {model}...')
    for config in configs:
        # Load the saved completions, then parse them into 0/1/2 labels.
        pred = np.load(f'base/{model}/{model}_{config}.npy')
        pred = post_processing(pred, model)
        metrics = metric_calculation(pred, gt)
        # Accuracy is rounded to three decimals; F1 is stored unrounded.
        rows.append([config, model, round(metrics[0], 3), metrics[1]])
results = pd.DataFrame(rows, columns=['config', 'model', 'acc', 'f1'])

Model: mistral...
Model: 4o_mini...
Model: qwen_plus...
Model: 4o...


In [9]:
# Show only the rows scored for the 4o_mini model.
results.loc[results['model'].eq('4o_mini')]

Unnamed: 0,config,model,acc,f1
6,zero_shot_with_heur_value_angle_cot,4o_mini,0.81,0.755908
7,zero_shot_with_heur_value_area_cot,4o_mini,0.637,0.259714
8,zero_shot_with_heur_value_angle_area_cot,4o_mini,0.794,0.731714
9,few_shot_with_heur_value_angle_cot,4o_mini,0.792,0.727578
10,few_shot_with_heur_value_area_cot,4o_mini,0.875,0.856218
11,few_shot_with_heur_value_angle_area_cot,4o_mini,0.797,0.73582


### Error Analysis

In [46]:
# Pick one model / prompt configuration to inspect in detail.
model = '4o'
config = 'few_shot_with_heur_value_angle_area'

# Plain run, parsed with the older post-processor.
pred = np.load(f'base/{model}/{model}_{config}.npy')
pred_proc = post_processing_old(pred)

# CoT run, parsed with the keyword-based post-processor.
cot_pred = np.load(f'base/{model}/{model}_{config}_cot.npy')
cot_pred_proc = post_processing(cot_pred, model)

# Correction run and its reviews array (loaded here, not scored below).
correction = np.load(f'base/{model}_correction/{model}_best_comb_{config}_correction.npy')
correction_proc = post_processing_old(correction)
reviews = np.load(f'base/{model}_correction/{model}_best_comb_{config}_correction_reviews.npy')

# Accuracy of the plain run followed by accuracy of the CoT run.
tuple(metric_calculation(labels, gt)[0] for labels in (pred_proc, cot_pred_proc))

(0.8822055137844611, 0.9097744360902256)

In [47]:
# Positions where the plain and CoT parsed labels disagree.
disagree = pred_proc != cot_pred_proc
index = np.flatnonzero(disagree)
index

array([  1,  22,  26,  41, 105, 120, 147, 148, 152, 154, 156, 164, 170,
       175, 191, 200, 211, 220, 223, 237, 260, 268, 269, 299, 307, 322,
       324])

In [54]:
# Choose one disagreement case and show its (plain, CoT) parsed labels.
i = 4
case = index[i]
(pred_proc[case], cot_pred_proc[case])

(0, 1)

In [55]:
# Raw, unparsed plain-run completion for the selected disagreement case.
print(pred[index[i]])

0


In [56]:
# Raw, unparsed CoT-run completion for the same disagreement case.
print(cot_pred[index[i]])

To determine whether the given sidewalk and road geometries represent the same sidewalk, either fully or partially, we need to evaluate the conditions of parallelism and overlap based on the provided statistics.

1. **Parallelism**: The min_angle value indicates the angular difference between the sidewalk and the road. A small min_angle suggests that the two geometries are approximately parallel. In this case, the min_angle is 3.315739160909672 degrees, which is relatively small and suggests that the sidewalk and road are approximately parallel.

2. **Overlap**: The max_area value represents the maximum percentage of overlapping area relative to the sidewalk and road, considering a 10-meter buffer. A higher max_area value indicates a significant overlap. Here, the max_area is 0.5441133330997189, which means there is a 54.41% overlap. This is a substantial overlap, indicating that the sidewalk and road geometries do intersect significantly.

Since both conditions of parallelism and over