In [14]:
import string
import numpy as np
import pandas as pd
from itertools import product
from collections import Counter
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [15]:
def metric_calculation(pred, gt):
    """Score predicted labels against binary ground-truth labels.

    Parameters
    ----------
    pred : array-like of int
        Predicted labels. Values are 0 or 1, or 2 for a response that could
        not be parsed (the sentinel emitted by ``post_processing``).
    gt : array-like of int
        Ground-truth labels (0 or 1), same length as ``pred``.

    Returns
    -------
    tuple
        ``(acc, f1, fpr, fnr)`` where

        * ``acc`` is plain accuracy,
        * ``f1`` is the macro-averaged F1 score,
        * ``fpr`` is the share of actual negatives (label 0) predicted as 1,
        * ``fnr`` is the share of actual positives (label 1) predicted as 0.

        A rate whose denominator is zero (no samples of that class) is
        reported as 0.0.
    """
    acc = accuracy_score(gt, pred)
    f1 = f1_score(gt, pred, average='macro')

    # Pin the label order so rows/columns 0 and 1 always mean "label 0" and
    # "label 1", even when one of them (or the sentinel 2) is absent from the
    # data. Without this the matrix can shrink to 1x1 and [0, 1] would raise.
    confusion = confusion_matrix(gt, pred, labels=[0, 1, 2])

    # Row i holds every sample whose true label is i (unparsed predictions
    # included), so the row sum is the proper denominator for a rate.
    negatives = confusion[0].sum()
    positives = confusion[1].sum()
    fpr = confusion[0, 1] / negatives if negatives else 0.0  ## predict to be 1; actual 0
    fnr = confusion[1, 0] / positives if positives else 0.0  ## predict to be 0; actual 1
    return acc, f1, fpr, fnr

In [16]:
# Tokens accepted as a valid answer once a candidate has been extracted.
_VALID_LABEL_TOKENS = ('0', '0.0', '1', '1.0')

# Marker words looked for in the tail of a verbose response. Order matters:
# the first keyword found (in this order) decides where the answer is read.
_ANSWER_KEYWORDS = ('response', 'output', 'return', 'result', 'plaintext', 'json')

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _first_token(text):
    """Return the first whitespace-delimited token of ``text``, or None if empty."""
    tokens = text.split()
    return tokens[0] if tokens else None


def _extract_label_token(response):
    """Return the candidate answer token from one free-form response, or None."""
    # Fast path: the response already starts with a bare 0 or 1.
    head = _first_token(response)
    if head in ('0', '1'):
        return head

    # Otherwise look only at the last three non-empty lines, lower-cased and
    # stripped of the end-of-sequence marker, the word "boxed" and punctuation.
    # NOTE: removing punctuation turns "1.0" into "10", which is later treated
    # as an invalid answer.
    cleaned = response.lower().replace('</s>', '').replace('boxed', '')
    lines = [line for line in cleaned.split('\n') if line != '']
    tail = ' '.join(lines[-3:]).translate(_PUNCTUATION_TABLE)

    for keyword in _ANSWER_KEYWORDS:
        if keyword in tail:
            # Take the first numeric token after the last occurrence of the
            # keyword; if there is none the response counts as unparsed.
            numeric = [t for t in tail.split(keyword)[-1].split() if t.isnumeric()]
            return numeric[0] if numeric else None

    # No marker word at all: fall back to the first token of the cleaned tail.
    return _first_token(tail)


def post_processing(pred, model):
    """Convert raw model responses into integer labels.

    Parameters
    ----------
    pred : iterable of str
        Raw text responses, one per example.
    model : str
        Model identifier. ``'mistral'`` uses a simple "first token" rule; any
        other value uses the keyword-based extraction for verbose responses.

    Returns
    -------
    numpy.ndarray
        One integer per response: 0 or 1 when an answer could be extracted,
        2 otherwise (including empty or whitespace-only responses, which
        previously raised IndexError).
    """
    if model == 'mistral':
        tokens = [_first_token(p.replace('</s>', '')) for p in pred]
    else:
        tokens = [_extract_label_token(p) for p in pred]
    # Anything that is not literally one of the accepted tokens maps to 2.
    return np.array([int(float(t)) if t in _VALID_LABEL_TOKENS else 2 for t in tokens])

In [17]:
# Load the spatial-join dataset and keep its test split.
ds = load_dataset("beanham/spatial_join_dataset")
test=ds['test']
# Ground-truth labels for the first 50 test examples only; the prediction
# files loaded in later cells are sliced with the same [:50].
gt=np.array(test['label'])[:50]
# Prompt configurations to evaluate. Each name is used as the suffix of a
# prediction file name (base/{model}_traveler/{model}_{config}.npy).
configs = [
    "zero_shot_no_heur_traveler",
    "zero_shot_with_heur_value_all_traveler",
    "few_shot_no_heur_traveler",
    "few_shot_with_heur_value_all_traveler",
]

In [18]:
# Score every model under every prompt configuration on the first 50 test
# examples and collect accuracy / macro-F1 into one summary table.
models = ['4o_mini', '4o']
results = []
for model in models:
    print(f'Model: {model}...')
    for config in configs:
        # One prediction file per (model, config) pair.
        pred = np.load(f'base/{model}_traveler/{model}_{config}.npy')[:50]
        if model == 'deepseek':
            # Only element 0 of each deepseek entry is scored.
            pred = [entry[0] for entry in pred]
        pred = post_processing(pred, model)
        metrics = metric_calculation(pred, gt)
        accuracy, macro_f1 = metrics[0], metrics[1]
        results.append({
            'config': config,
            'model': model,
            'acc': round(accuracy, 3),
            'f1': macro_f1,
        })
results = pd.DataFrame(results, columns=['config', 'model', 'acc', 'f1'])
results

Model: 4o_mini...
Model: 4o...


Unnamed: 0,config,model,acc,f1
0,zero_shot_no_heur_traveler,4o_mini,0.68,0.457995
1,zero_shot_with_heur_value_all_traveler,4o_mini,0.62,0.336209
2,few_shot_no_heur_traveler,4o_mini,0.64,0.475524
3,few_shot_with_heur_value_all_traveler,4o_mini,0.8,0.6875
4,zero_shot_no_heur_traveler,4o,0.72,0.479167
5,zero_shot_with_heur_value_all_traveler,4o,0.76,0.40647
6,few_shot_no_heur_traveler,4o,0.68,0.457995
7,few_shot_with_heur_value_all_traveler,4o,0.74,0.539334


In [19]:
# Same four prompt configurations as the `configs` list defined in the setup
# cell above; this cell re-declares them with identical values.
configs = [
    "zero_shot_no_heur_traveler",
    "zero_shot_with_heur_value_all_traveler",
    "few_shot_no_heur_traveler",
    "few_shot_with_heur_value_all_traveler",
]

### Analysis

In [20]:
# Pick one (model, config) run and split its first 50 examples into
# correctly and incorrectly predicted indices.
model = '4o_mini'
config = 'few_shot_no_heur_traveler'
pred = np.load(f'base/{model}_traveler/{model}_{config}.npy')[:50]
if model != 'deepseek':
    proc_pred = post_processing(pred, model)
else:
    # deepseek entries are indexed as [0] = answer text, [1] = reasoning text.
    output = [i[0] for i in pred]
    reasoning = [i[1] for i in pred]
    proc_pred = post_processing(output, model)
# Positions where the processed prediction matches / differs from the label.
right = np.where(proc_pred == gt)[0]
wrong = np.where(proc_pred != gt)[0]
len(right), len(wrong)

(32, 18)

In [22]:
# Print the raw model response for the index-th misclassified example
# (`wrong` holds the positions where the processed prediction != label).
index=0
print(pred[wrong[index]])

To determine whether the sidewalk runs alongside the road, we need to analyze the coordinates of both the sidewalk and the road.

1. **Extract Coordinates**:
   - Sidewalk coordinates: 
     ```
     [[-122.2017203, 47.6112977], 
      [-122.2017112, 47.61130719999999], 
      [-122.2017023, 47.611323799999994], 
      [-122.20169890000001, 47.61134139999999], 
      [-122.20169890000001, 47.6113726], 
      [-122.2017066, 47.6118441], 
      [-122.20170970000001, 47.6118763], 
      [-122.2017135, 47.611885999999984], 
      [-122.2017215, 47.61189660000001], 
      [-122.2017359, 47.6119085], 
      [-122.20174929999999, 47.6119183]]
     ```
   - Road coordinates: 
     ```
     [[-122.2016161, 47.6114907], 
      [-122.2016188, 47.6113494]]
     ```

2. **Visualize the Geometry**:
   - The sidewalk is a series of points that form a line, and the road is also a line formed by its coordinates.
   - The sidewalk appears to be a longer line, while the road is a shorter segment.

3. **D