In [24]:
import string
import numpy as np
import pandas as pd

from itertools import product
from collections import Counter
from datasets import load_dataset
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [14]:
def metric_calculation(pred, gt):
    """Score predicted labels against ground-truth labels.

    Parameters
    ----------
    pred : array-like
        Predicted labels. The error counts below only look at labels 0 and 1;
        any other value (e.g. a placeholder for an unparseable output) still
        affects ``acc`` and ``f1`` but is not counted in ``fpr``/``fnr``.
    gt : array-like
        Ground-truth labels, same length as ``pred``.

    Returns
    -------
    tuple
        ``(acc, f1, fpr, fnr)`` where ``acc`` is the accuracy, ``f1`` the
        macro-averaged F1 score, ``fpr`` the number of (actual 0, predicted 1)
        samples divided by ``len(gt)`` and ``fnr`` the number of
        (actual 1, predicted 0) samples divided by ``len(gt)``.
        Note that both are normalised by the total number of samples, not by
        the number of actual negatives/positives.
    """
    acc = accuracy_score(gt, pred)
    f1 = f1_score(gt, pred, average='macro')
    # Pin the label order so the matrix is always 2x2 with row/column 0 -> label 0
    # and row/column 1 -> label 1. Without this, inputs in which only one class
    # occurs yield a 1x1 matrix and the lookups below raise IndexError.
    confusion = confusion_matrix(gt, pred, labels=[0, 1])
    n_samples = len(gt)
    fpr = confusion[0, 1] / n_samples  # predicted 1, actual 0
    fnr = confusion[1, 0] / n_samples  # predicted 0, actual 1
    return acc, f1, fpr, fnr
    
def post_processing(pred):
    """Parse raw model outputs into integer labels.

    Each output is lower-cased and then handled as follows:

    * if it contains ``'response'`` (checked first) or ``'output'``, the text
      after the first occurrence of that marker is split on whitespace and the
      *second* token is taken (the first one is expected to be a separator
      such as ``':'``);
    * otherwise the first whitespace-separated token of the output is taken.

    ``'</s>'`` is stripped from the selected token. Tokens equal to ``'0'``,
    ``'0.0'``, ``'1'`` or ``'1.0'`` become 0 or 1; anything else, including
    outputs where the expected token does not exist, becomes 2.

    Parameters
    ----------
    pred : iterable of str
        Raw model outputs.

    Returns
    -------
    numpy.ndarray
        One entry per element of ``pred`` with values in {0, 1, 2}.
    """
    valid_tokens = ('0', '0.0', '1', '1.0')
    markers = ('response', 'output')  # order matters: 'response' wins over 'output'

    def _extract(text):
        # Return the candidate label token for one output, or 2 if it is missing.
        text = text.lower()
        for marker in markers:
            if marker in text:
                tokens = text.split(marker)[1].split()
                if len(tokens) > 1:
                    return tokens[1].replace('</s>', '')
                return 2
        tokens = text.split()
        if tokens:
            return tokens[0].replace('</s>', '')
        return 2

    candidates = [_extract(item) for item in pred]
    new_pred = np.array(
        [int(float(c)) if c in valid_tokens else 2 for c in candidates]
    )
    return new_pred

In [15]:
# Load the "beanham/spatial_join_dataset" dataset and keep its 'test' split.
ds = load_dataset("beanham/spatial_join_dataset")
test=ds['test']
# Labels of the test split as a NumPy array, so predictions can be compared element-wise.
gt=np.array(test['label'])

In [16]:
# Score the saved "hint" and "value" correction runs for every
# (model, selection strategy) pair and collect accuracy / macro-F1 in a table.
metric_values = ['random','worst_single', 'best_single', 'worst_comb', 'best_comb', 'worst_all', 'best_all']
results = []
for model, value in product(['4o_mini', 'qwen_plus', '4o'], metric_values):
    # Load the raw outputs of both prompt variants and parse them into labels.
    few_shot_hints = post_processing(
        np.load(f'base/{model}_correction/{model}_{value}_few_shot_with_heur_hint_all_correction.npy')
    )
    few_shot_values = post_processing(
        np.load(f'base/{model}_correction/{model}_{value}_few_shot_with_heur_value_all_correction.npy')
    )
    few_hints_metrics = metric_calculation(few_shot_hints, gt)
    few_values_metrics = metric_calculation(few_shot_values, gt)
    # Only the first two metrics (accuracy, F1) are kept in the table.
    results.extend([
        [model, value, 'few_shot_hints', few_hints_metrics[0], few_hints_metrics[1]],
        [model, value, 'few_shot_values', few_values_metrics[0], few_values_metrics[1]],
    ])
results = pd.DataFrame(results, columns=['model', 'value', 'prompt', 'acc', 'f1'])

### Analysis

In [25]:
def post_processing_cot(pred, model):
    """Parse chain-of-thought model outputs into integer labels.

    For ``model == 'mistral'`` the label is the first whitespace-separated
    token of the output after removing ``'</s>'``.

    For every other model:

    * if the first token of the raw output is ``'0'`` or ``'1'`` it is used
      directly;
    * otherwise the output is lower-cased, ``'</s>'`` and ``'boxed'`` are
      removed, the last three non-empty lines are joined with spaces and all
      punctuation is stripped. The first of the keywords ``response``,
      ``output``, ``return``, ``result``, ``plaintext``, ``json`` found in that
      tail selects the text after its last occurrence, and the first numeric
      token in that text is taken. If no keyword is present, the first token
      of the tail is taken.

    Tokens equal to ``'0'``, ``'0.0'``, ``'1'`` or ``'1.0'`` become 0 or 1.
    Everything else becomes 2, including empty outputs and outputs where no
    token could be extracted.

    Parameters
    ----------
    pred : iterable of str
        Raw model outputs.
    model : str
        Model identifier; only the value ``'mistral'`` changes the parsing.

    Returns
    -------
    numpy.ndarray
        One entry per element of ``pred`` with values in {0, 1, 2}.
    """
    valid_tokens = ('0', '0.0', '1', '1.0')
    # Checked in this order; the first keyword present in the tail decides.
    keywords = ('response', 'output', 'return', 'result', 'plaintext', 'json')
    # Built once instead of once per prediction.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def _first_token(text):
        # First whitespace-separated token, or 2 when the text has none.
        tokens = text.split()
        return tokens[0] if tokens else 2

    def _extract(p):
        # Candidate label token for one non-mistral output, or 2 if none is found.
        head = _first_token(p)
        if head in ('0', '1'):
            return head
        p = p.lower().replace('</s>', '').replace('boxed', '')
        lines = [s for s in p.split('\n') if s != '']
        p = ' '.join(lines[-3:]).translate(strip_punctuation)
        for keyword in keywords:
            if keyword in p:
                numeric = [t for t in p.split(keyword)[-1].split() if t.isnumeric()]
                return numeric[0] if numeric else 2
        return _first_token(p)

    if model == 'mistral':
        candidates = [_first_token(p.replace('</s>', '')) for p in pred]
    else:
        candidates = [_extract(p) for p in pred]
    new_pred = np.array(
        [int(float(c)) if c in valid_tokens else 2 for c in candidates]
    )
    return new_pred

In [143]:
# Model and prompt configuration whose saved predictions are inspected in the cells below.
model='qwen_plus'
config='few_shot_with_heur_value_all'
## plain: saved outputs for this model/config, parsed with post_processing
pred=np.load(f'base/{model}/{model}_{config}.npy')
pred_proc=post_processing(pred)

## cot: saved outputs of the "_cot" run, parsed with post_processing_cot
cot_pred=np.load(f'base/{model}_cot/{model}_{config}_cot.npy')
cot_pred_proc=post_processing_cot(cot_pred, model)

## correction: saved outputs of the "best_all" correction run plus the accompanying review texts
correction=np.load(f'base/{model}_correction/{model}_best_all_{config}_correction.npy')
reviews=np.load(f'base/{model}_correction/{model}_best_all_{config}_correction_reviews.npy')
correction_proc=post_processing(correction)

In [144]:
# Fraction of examples whose parsed CoT prediction / parsed correction prediction equals the ground-truth label.
(cot_pred_proc==gt).mean(), (correction_proc==gt).mean()

(0.981, 0.989)

In [145]:
# Positions where the corrected label differs from the CoT label.
unmatched_index = np.flatnonzero(correction_proc != cot_pred_proc)
# Positions where the corrected label agrees with the ground truth.
correction_index = np.flatnonzero(correction_proc == gt)
# Keep only the disagreements on which the correction is right.
index = unmatched_index[np.isin(unmatched_index, correction_index)]
index

array([152, 310, 333, 336, 346, 382, 402, 505, 549, 582, 587, 601, 726,
       753, 850, 888, 903])

In [171]:
#12, 13  -- NOTE(review): presumably the positions in `index` picked for manual inspection; confirm
# `i` selects an entry of `index`; print the raw CoT output stored for that example.
i=12
print(cot_pred[index[i]])

To determine whether the sidewalk runs alongside the road, we evaluate the three conditions: parallelism, clearance, and overlap. Let's go through each condition step by step.

---

### 1. **Parallelism**:
The `min_angle` value measures the smallest angle between the sidewalk and the road. For the sidewalk to be considered parallel to the road, this value should be relatively small (e.g., less than 10–15 degrees). 

- **Given**: `min_angle = 5.802232362551285`
- **Evaluation**: The angle is very small (less than 10 degrees), indicating that the sidewalk and road are approximately parallel. This condition is satisfied.

---

### 2. **Clearance**:
The `min_distance` value represents the minimum distance between the sidewalk and the road. A sidewalk running alongside a road should maintain a reasonable distance without intersecting or overlapping with the road. A typical threshold for this distance could be greater than 2–3 meters.

- **Given**: `min_distance = 7.587268867058914`
- **Eval

In [172]:
# Print the saved review entry for the example selected by `i`.
print(reviews[index[i]])

Upon reviewing the provided data and conditions, let's analyze whether the response is correct:

1. **Parallelism (min_angle):**  
   The `min_angle` value is 5.802°, which indicates that the sidewalk and road are fairly parallel since the angle difference is small. This condition appears to be satisfied.

2. **Clearance (min_distance):**  
   The `min_distance` value is 7.587 meters. This suggests there is a reasonable clearance between the sidewalk and the road, satisfying the clearance requirement.

3. **Overlap (max_area):**  
   The `max_area` value is 0.235, which represents the percentage of overlapping area within a 10-meter buffer. A value this low indicates minimal overlap between the sidewalk and road buffers. Typically, for a sidewalk to be considered alongside a road, we expect this value to be higher—close to or above 0.4 (as seen in the second example). 

Given these conditions:
- Parallelism: Satisfied.
- Clearance: Satisfied.
- Overlap: Not satisfied (value too low).



In [160]:
# Ground-truth label of the example selected by `i`.
gt[index[i]]

0