In [1]:
import re
import string
from collections import Counter
from itertools import product

import numpy as np
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [2]:
def metric_calculation(pred, gt):
    """Score predictions against ground-truth labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels. Values 0 and 1 are the real classes; 2 is the
        sentinel `post_processing` emits for responses it could not parse.
    gt : array-like
        Ground-truth labels, same length as `pred`.

    Returns
    -------
    tuple
        ``(acc, f1, fpr, fnr)`` where ``acc`` is plain accuracy, ``f1`` is the
        macro-averaged F1, ``fpr`` is the count of (actual 0, predicted 1)
        divided by ``len(gt)`` and ``fnr`` is the count of (actual 1,
        predicted 0) divided by ``len(gt)``.

    Notes
    -----
    ``fpr``/``fnr`` are normalised by the total number of samples, not by the
    number of actual negatives/positives, so they are fractions of the whole
    evaluation set rather than conventional per-class rates.
    """
    acc = accuracy_score(gt, pred)
    f1 = f1_score(gt, pred, average='macro')
    # Pin the label order. Without `labels`, sklearn builds the matrix from the
    # sorted labels that happen to occur, so if class 0 (or 1) is missing from
    # this slice, index 0/1 would silently refer to a different class, or the
    # matrix would be too small to index at all.
    confusion = confusion_matrix(gt, pred, labels=[0, 1, 2])
    fpr = confusion[0, 1] / len(gt)  # predicted 1, actual 0
    fnr = confusion[1, 0] / len(gt)  # predicted 0, actual 1
    return acc, f1, fpr, fnr

In [3]:
# Tokens accepted as a valid class label; anything else maps to the sentinel.
_VALID_LABEL_TOKENS = ('0', '0.0', '1', '1.0')
# Marker words searched for (in this priority order) in the tail of a response.
_ANSWER_KEYWORDS = ('response', 'output', 'return', 'result', 'plaintext', 'json')
# Label assigned to responses from which no 0/1 answer could be extracted.
_UNPARSED_LABEL = 2


def _token_to_label(token):
    """Map a raw answer token to 0/1, or to the sentinel 2 if unrecognised."""
    if token in _VALID_LABEL_TOKENS:
        return int(float(token))
    return _UNPARSED_LABEL


def _first_token(text):
    """Return the first whitespace-delimited token of `text`, or None if empty."""
    tokens = text.split()
    return tokens[0] if tokens else None


def _extract_answer_token(response):
    """Pull the answer token out of one free-form (chain-of-thought) response.

    Returns the token as a string, or None when nothing usable is found.
    """
    # Fast path: the response already starts with a bare 0/1.
    leading = _first_token(response)
    if leading in ('0', '1'):
        return leading

    # Otherwise look only at the last three non-empty lines, lower-cased and
    # stripped of the end-of-sequence marker, the word "boxed" and punctuation.
    cleaned = response.lower().replace('</s>', '').replace('boxed', '')
    lines = [line for line in cleaned.split('\n') if line != '']
    tail = ' '.join(lines[-3:]).translate(str.maketrans('', '', string.punctuation))

    # The first marker word present decides where to look: take the first
    # purely numeric token after its last occurrence. If that marker has no
    # numeric token after it, the response is unparsed (later markers are
    # deliberately not tried, matching the original branch order).
    for keyword in _ANSWER_KEYWORDS:
        if keyword in tail:
            numeric = [t for t in tail.split(keyword)[-1].split() if t.isnumeric()]
            return numeric[0] if numeric else None

    # No marker word at all: fall back to the first token of the tail.
    return _first_token(tail)


def post_processing(pred, model):
    """Convert raw model responses into an array of class labels.

    Parameters
    ----------
    pred : iterable of str
        Raw text responses, one per example.
    model : str
        Model identifier. ``'mistral'`` uses a simple first-token rule; every
        other value uses the keyword-based extraction.

    Returns
    -------
    numpy.ndarray
        One entry per response: 0 or 1 when an answer was recognised, 2
        otherwise (including empty or whitespace-only responses, which
        previously raised IndexError).
    """
    if model == 'mistral':
        tokens = [_first_token(p.replace('</s>', '')) for p in pred]
    else:
        tokens = [_extract_answer_token(p) for p in pred]
    return np.array([_token_to_label(t) for t in tokens])

In [4]:
# Load the benchmark and keep the ground-truth labels of the first 20 test rows.
ds = load_dataset("beanham/spatial_join_dataset")
test = ds['test']
gt = np.array(test['label'])[:20]

# Prompt configurations whose saved predictions are evaluated below.
configs = ["few_shot_no_heur_cot", "few_shot_with_heur_value_all_cot"]

In [5]:
# Evaluate every (model, config) pair on the first 20 saved predictions and
# collect accuracy (rounded to 3 decimals) and macro-F1 into one table.
models = ['4o_mini', 'qwen_plus', '4o', 'o3_mini', 'deepseek']
results = []
for model in models:
    print(f'Model: {model}...')
    for config in configs:
        pred = np.load(f'base/{model}_cot/{model}_{config}.npy')[:20]
        if model == 'deepseek':
            # Each deepseek entry is indexable; element 0 is what gets scored.
            pred = [i[0] for i in pred]
        pred = post_processing(pred, model)
        metrics = metric_calculation(pred, gt)
        acc, f1 = metrics[0], metrics[1]
        results.append({'config': config, 'model': model, 'acc': round(acc, 3), 'f1': f1})
results = pd.DataFrame(results, columns=['config', 'model', 'acc', 'f1'])

Model: 4o_mini...
Model: qwen_plus...
Model: 4o...
Model: o3_mini...
Model: deepseek...


### Analysis

In [None]:
# Prompt configurations available for the per-model analysis below.
configs = ["few_shot_no_heur_cot", "few_shot_with_heur_value_all_cot"]

In [96]:
# Pick one model/config pair and split its first 20 examples into correctly
# and incorrectly classified indices.
model = '4o_mini'
config = 'few_shot_no_heur_cot'
pred = np.load(f'base/{model}_cot/{model}_{config}.npy')[:20]
if model != 'deepseek':
    proc_pred = post_processing(pred, model)
else:
    # deepseek entries are indexable: element 0 is scored, element 1 is kept
    # separately as `reasoning`.
    output = [i[0] for i in pred]
    reasoning = [i[1] for i in pred]
    proc_pred = post_processing(output, model)
right = np.nonzero(proc_pred == gt)[0]
wrong = np.nonzero(proc_pred != gt)[0]
len(right), len(wrong)

(13, 7)

In [97]:
# Count responses that mention distance-style reasoning and, separately,
# direction-style reasoning (case-insensitive substring match).
proximity, alignment = 0, 0
for p in pred:
    text = p.lower()
    if 'proximity' in text or 'distance' in text:
        proximity += 1
    if 'alignment' in text or 'angle' in text:
        alignment += 1
proximity, alignment

(19, 5)

In [98]:
def _highlight(text, keyword):
    """Upper-case every occurrence of `keyword` in `text`, ignoring case.

    The membership test below is case-insensitive, so the highlighting has to
    be as well; a plain `str.replace` would leave e.g. "Appears" unmarked even
    though the response was selected because of it.
    """
    return re.sub(re.escape(keyword), keyword.upper(), text, flags=re.IGNORECASE)


# Collect, per hedging keyword, the responses that contain it, with the
# keyword upper-cased so it stands out when the response is printed.
appear_examples = []
approximate_examples = []
assume_exampels = []  # (sic) spelling kept so existing references still work
for p in pred:
    lowered = p.lower()
    if 'appear' in lowered:
        appear_examples.append(_highlight(p, 'appear'))
    if 'approximate' in lowered:
        approximate_examples.append(_highlight(p, 'approximate'))
    if 'assume' in lowered:
        assume_exampels.append(_highlight(p, 'assume'))
len(appear_examples), len(approximate_examples), len(assume_exampels)

(13, 4, 0)

In [100]:
# Keep every response that contains at least one of the hedging keywords
# (case-insensitive); the responses themselves are stored unmodified.
examples = []
for p in pred:
    text = p.lower()
    if any(word in text for word in ('appear', 'approximate', 'assume')):
        examples.append(p)
len(examples)

14

In [91]:
# Print one full response from the keyword-filtered `examples` list.
# NOTE(review): this cell's execution count (91) is lower than that of the cell
# building `examples` (100), so the output shown may predate the current
# definition; confirm with Restart & Run All.
print(examples[7])

To determine whether the sidewalk runs alongside the road, we will analyze the provided coordinates of both geometries step by step.

1. **Extract Coordinates**:
   - Sidewalk coordinates: 
     ```
     [[-122.15343690000002, 47.61303290000001], 
      [-122.1534257, 47.613036799999996], 
      [-122.1534098, 47.613039199999996], 
      [-122.1531203, 47.613043699999984], 
      [-122.1530779, 47.6130432], 
      [-122.1530471, 47.61303959999999], 
      [-122.1530222, 47.6130356], 
      [-122.15300619999998, 47.6130315], 
      [-122.15299070000002, 47.613027], 
      [-122.15298000000001, 47.6130224], 
      [-122.152972, 47.61301749999999]]
     ```
   - Road coordinates: 
     ```
     [[-122.1534612, 47.6130981], 
      [-122.1533986, 47.6131006], 
      [-122.1533184, 47.6131003], 
      [-122.1528869, 47.6130989]]
     ```

2. **Visualize the Coordinates**:
   - The sidewalk appears to have a series of points that are relatively close together, indicating a continuous path.
  

In [71]:
# Print one response containing "approximate", with the keyword upper-cased.
# NOTE(review): this cell's execution count (71) is lower than that of the cell
# building `approximate_examples` (98), so the output shown may predate the
# current definition; confirm with Restart & Run All.
print(approximate_examples[3])

To determine whether the sidewalk runs alongside the road, we will analyze the coordinates of both geometries step by step.

1. **Extract Coordinates**:
   - Sidewalk coordinates: 
     ```
     [[-122.1909905, 47.54851559999999], 
      [-122.1909933, 47.54832630000001], 
      [-122.19099799999998, 47.54809680000001], 
      [-122.1909995, 47.5480882], 
      [-122.1910037, 47.548079499999986], 
      [-122.191008, 47.54807359999999], 
      [-122.19101569999998, 47.54806740000001], 
      [-122.19102939999999, 47.5480583]]
     ```
   - Road coordinates: 
     ```
     [[-122.1918109, 47.5480049], 
      [-122.1910218, 47.5479995]]
     ```

2. **Visualize the Coordinates**:
   - The sidewalk appears to be a continuous line with multiple points, while the road is represented by two points.
   - The sidewalk starts at APPROXIMATEly `(-122.1909905, 47.5485156)` and ends at `(-122.1910294, 47.5480583)`.
   - The road starts at `(-122.1918109, 47.5480049)` and ends at `(-122.1910218, 47

### deepseek

#### no heur
- Maybe I can compare the direction and proximity of the line segments.
- Looking at the road's coordinates, they are moving eastward (since longitude becomes more negative) and northward (latitude increases).

- wrong conclusion: The sidewalk's start is ~23 meters west of the road's start, beyond typical adjacency for sidewalks.
- wrong conclusion: The sidewalk moves north, and the road moves south,

### 4o

#### no heur
- check proximity by calculating distance (pairwise); check alignment
- no calculation, but with conclusion. (indicating; likely parallel; same general direction)
  
#### no heur hints
- follows the hint: calculate angle & distance;
- wrong calculation: the change in longitude & latitude
- unclear calculation: The closest approach between the sidewalk and the road appears to be around **2-3 meters**.
- no calculation, but with conclusion. (For this specific input, after performing the necessary calculations, the conditions are satisfied, so the response is:) (Given the complexity of these calculations and the need for precise geospatial analysis, let's assume the calculations have been performed, and based on the input provided, the conditions are not fully satisfied.)


#### with heur hints
- check each heuristic hint individually, and in combination.
- compare values (use typical values determined by their own knowledge 10; 5 meter distance; 20%)
- but inconsistent: overlap: 20% vs. 30~70%; distance 2-10 meters or 1-20

------------

### qwen-plus

#### no heur
- check proximity; calculate distance (pairwise); direction (northwest to southeast direction). (The sidewalk is close enough to the road to pass the proximity test.)
- Let’s assume the computed distance is approximately **1 meter**(??)
  
#### no heur hints
- follows the hint: calculate angle & distance;
- wrong calculation: Similarly, we can use the first and last points to determine the direction of the road (wrong); or pairwise point angle?? but could happen to arrive at the correct conclusion
- unclear calculation: The closest approach between the sidewalk and the road appears to be around **2-3 meters**.
- no calculation, but with conclusion.


#### with heur hints
- check each heuristic hint individually, and in combination.
- compare values (use typical values determined by their own knowledge 10; 5 meter distance; 20%)
- but inconsistent: overlap: 20% vs. 30~70%; distance 2-10 meters or 1-20

------------

### 4o-mini

#### no heur
- check proximity; calculate distance (pairwise). (We need to check if the sidewalk is close enough to the road to be considered "alongside".)
- threshold 1 meter??
- no results returned
  
#### no heur hints
- follows the hint: calculate angle & distance;
- Similarly, we can use the first and last points to determine the direction of the road (wrong)
- Given the complexity of the calculations and checks, I will assume that the conditions are not satisfied based on the provided examples and return:

#### with heur hints
- check each heuristic hint individually, and in combination.
- compare values (use typical values determined by their own knowledge 10; 1 meter distance; 10%)