
> **Evaluation**
> ---

This notebook evaluates the saved boundary predictions on the train split and reports recall, precision, and F1 scores.

## Setup

Mount Google Drive, set the working directory to the project folder, and import the evaluation tools.

In [None]:
# Colab-only: expose Google Drive under /content/drive so the project folder
# used by the following cells is reachable.
from google.colab import drive
drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


In [None]:
import os

# Project folder on the mounted Drive. Point the CV_PROJECT_ROOT environment
# variable at your own copy; the fallback is the folder this notebook was last
# run from (see the recorded output of this cell).
root_dir = os.environ.get(
    "CV_PROJECT_ROOT",
    "/content/drive/My Drive/03 Ewha Exchange/05 computer vision/group project final",
)
if not os.path.isdir(root_dir):
    # Fail early with an actionable message instead of a bare chdir error.
    raise FileNotFoundError(
        f"Project folder not found: {root_dir!r}. "
        "Mount Drive first or set CV_PROJECT_ROOT to your project folder."
    )

# Folder holding the dataset that is evaluated below.
data_dir = os.path.join(root_dir, "project_data")

# The next cells use paths relative to the project folder, so make it the CWD.
os.chdir(root_dir)
print("Current working directory:", os.getcwd())

Current working directory: /content/drive/My Drive/03 Ewha Exchange/05 computer vision/group project final


In [None]:
# One-time setup: uncomment and run once to clone the evaluation tools into the
# current working directory (the project folder).
# !git clone https://github.com/ewha-pai/cv_project_tools.git

In [None]:
# Enter the cloned tools repo; presumably this is what lets the `core` package
# imported below resolve.
# NOTE(review): relative path, so this only works while the CWD is the project
# folder; re-run the setup cell above before re-running this one.
%cd cv_project_tools

/content/drive/My Drive/03 Ewha Exchange/05 computer vision/group project final/cv_project_tools


In [None]:
# One-time setup: uncomment and run once (from inside cv_project_tools) to build
# the package's extension modules in place.
# !python setup.py build_ext --inplace

In [None]:
import numpy as np
from tqdm import tqdm

from core.dataset import BoundaryDataset
from core import evaluate_boundaries

Loading train images dataset

In [None]:
# Dataset split to evaluate; the predictions loaded later are for this split.
split = 'train'

# Reuse data_dir from the setup cell instead of rebuilding the same path by
# hand with string concatenation.
dataset = BoundaryDataset(data_dir, split=split)

print(f"Data size: {len(dataset)}")

Data size: 200


Load the saved predictions file (`predictions_train.npy`), a dictionary keyed by sample name.

In [None]:
# Read the saved predictions back into a dictionary. The .npy file stores a
# pickled Python object, hence allow_pickle=True and .item() to unwrap it.
# NOTE(review): unpickling can execute arbitrary code - only load files you
# produced yourself.
predictions_path = os.path.join(root_dir, 'predictions_train.npy')
predictions = np.load(predictions_path, allow_pickle=True).item()

def load_pred(sample_name):
    """Return the stored prediction for ``sample_name``.

    Looks the name up in the module-level ``predictions`` dictionary.

    Raises:
        KeyError: if ``sample_name`` has no entry in ``predictions``.
    """
    # Guard first so the caller gets a descriptive message, not a bare key.
    if sample_name not in predictions:
        raise KeyError(f"Sample '{sample_name}' not found in the predictions.")
    return predictions[sample_name]

## Preliminary evaluation using the first 10 images

In [None]:
# Quick check on a small subset before committing to the full (slow) run.
thresholds = 20        # threshold setting handed to pr_evaluation
apply_thinning = True  # forwarded unchanged to pr_evaluation
sample_names = dataset.sample_names[:10]

evaluation_outputs = evaluate_boundaries.pr_evaluation(
    thresholds,
    sample_names,
    dataset.load_boundaries,
    load_pred,
    apply_thinning=apply_thinning,
    progress=tqdm,
)
# Unpack in the order the evaluator returns its results.
sample_results, threshold_results, best_result_single, best_result = evaluation_outputs

Processing samples: 100%|██████████| 10/10 [03:10<00:00, 19.07s/it]


To calculate your final score, the following two F1 scores will be averaged:
- F1 score using the best threshold for each individual image.
- F1 score using the single best threshold for all images.

In [None]:
# The two headline F1 scores: per-image best threshold vs. one shared threshold.
print(f"{'Best F1':<16}: {best_result.f1:<10.6f}")
print(f"{'Best F1 (Single)':<16}: {best_result_single.f1:<10.6f}")

Best F1         : 0.653369  
Best F1 (Single): 0.631434  


Overall Evaluation Metrics Using the Best Threshold for Each Image

In [None]:
# Aggregate metrics of best_result, laid out as one left-aligned table row.
print('[Overall Results]')
print(f"{'Recall':<10} {'Precision':<10} {'F1-Score':<10} {'Area PR':<10}")
print(
    f'{best_result.recall:<10.6f} {best_result.precision:<10.6f} '
    f'{best_result.f1:<10.6f} {best_result.area_pr:<10.6f}'
)

[Overall Results]
Recall     Precision  F1-Score   Area PR   
0.718139   0.599315   0.653369   0.598652  


Overall Evaluation Metrics Using a Single Best Threshold for All Images

In [None]:
# Metrics of best_result_single, including the threshold it was obtained at.
print('[Overall Results using Single Threshold]')
print(f"{'Threshold':<10} {'Recall':<10} {'Precision':<10} {'F1-Score':<10}")
print(
    f'{best_result_single.threshold:<10.6f} {best_result_single.recall:<10.6f} '
    f'{best_result_single.precision:<10.6f} {best_result_single.f1:<10.6f}'
)

[Overall Results using Single Threshold]
Threshold  Recall     Precision  F1-Score  
0.380952   0.675942   0.592425   0.631434  


Evaluation Results Per Image

In [None]:
# One row per evaluated image: its threshold and the resulting metrics.
print('[Results Per Image]')
print('{:<10} {:<10} {:<10} {:<10} {:<10}'.format('ID', 'Threshold', 'Recall', 'Precision', 'F1-Score'))
# Iterate directly; the enumerate() index was never used.
for res in sample_results:
    print('{:<10s} {:<10.6f} {:<10.6f} {:<10.6f} {:<10.6f}'.format(
        res.sample_name, res.threshold, res.recall, res.precision, res.f1))

[Results Per Image]
ID         Threshold  Recall     Precision  F1-Score  
37073      0.333333   0.803165   0.711030   0.754295  
140055     0.285714   0.643748   0.399757   0.493227  
118035     0.380952   0.769609   0.826945   0.797247  
159008     0.333333   0.615313   0.456428   0.524093  
78019      0.285714   0.776323   0.482823   0.595367  
314016     0.333333   0.764706   0.647781   0.701404  
254033     0.571429   0.362343   0.304243   0.330761  
189080     0.333333   0.700785   0.748637   0.723921  
8049       0.523810   0.805696   0.889782   0.845654  
12003      0.428571   0.753671   0.703286   0.727607  


Evaluation Results Per Threshold

In [None]:
# One row per threshold: metrics obtained when that threshold is used.
print('[Results Per Threshold]')
print('{:<10} {:<10} {:<10} {:<10}'.format('Threshold', 'Recall', 'Precision', 'F1-Score'))
# Iterate directly; the enumerate() index was never used.
for res in threshold_results:
    print('{:<10.6f} {:<10.6f} {:<10.6f} {:<10.6f}'.format(
        res.threshold, res.recall, res.precision, res.f1))

[Results Per Threshold]
Threshold  Recall     Precision  F1-Score  
0.047619   0.823757   0.215383   0.341481  
0.095238   0.879792   0.264079   0.406225  
0.142857   0.887145   0.324193   0.474857  
0.190476   0.871942   0.386209   0.535313  
0.238095   0.837121   0.444191   0.580408  
0.285714   0.789364   0.500357   0.612480  
0.333333   0.733723   0.548160   0.627510  
0.380952   0.675942   0.592425   0.631434  
0.428571   0.618139   0.634570   0.626247  
0.476190   0.559260   0.669744   0.609536  
0.523810   0.504555   0.704095   0.587853  
0.571429   0.451179   0.738735   0.560212  
0.619048   0.396451   0.766329   0.522561  
0.666667   0.348266   0.786245   0.482714  
0.714286   0.300983   0.803087   0.437862  
0.761905   0.251642   0.817083   0.384780  
0.809524   0.199850   0.831750   0.322266  
0.857143   0.142416   0.819709   0.242671  
0.904762   0.057445   0.833194   0.107480  
0.952381   0.013584   0.815254   0.026722  


## Evaluation using all images

In [None]:
# Full run over every sample in the split. This is slow: the recorded run of
# this cell took about 50 minutes for 200 images.
thresholds = 20        # threshold setting handed to pr_evaluation
apply_thinning = True  # forwarded unchanged to pr_evaluation
sample_names = dataset.sample_names

evaluation_outputs = evaluate_boundaries.pr_evaluation(
    thresholds,
    sample_names,
    dataset.load_boundaries,
    load_pred,
    apply_thinning=apply_thinning,
    progress=tqdm,
)
# Unpack in the order the evaluator returns its results.
sample_results, threshold_results, best_result_single, best_result = evaluation_outputs

Processing samples: 100%|██████████| 200/200 [50:14<00:00, 15.07s/it]


To calculate your final score, the following two F1 scores will be averaged:
- F1 score using the best threshold for each individual image.
- F1 score using the single best threshold for all images.

In [None]:
# The two headline F1 scores: per-image best threshold vs. one shared threshold.
print(f"{'Best F1':<16}: {best_result.f1:<10.6f}")
print(f"{'Best F1 (Single)':<16}: {best_result_single.f1:<10.6f}")

Best F1         : 0.585329  
Best F1 (Single): 0.558811  


Overall Evaluation Metrics Using the Best Threshold for Each Image

In [None]:
# Aggregate metrics of best_result, laid out as one left-aligned table row.
print('[Overall Results]')
print(f"{'Recall':<10} {'Precision':<10} {'F1-Score':<10} {'Area PR':<10}")
print(
    f'{best_result.recall:<10.6f} {best_result.precision:<10.6f} '
    f'{best_result.f1:<10.6f} {best_result.area_pr:<10.6f}'
)

[Overall Results]
Recall     Precision  F1-Score   Area PR   
0.673192   0.517754   0.585329   0.506729  


Overall Evaluation Metrics Using a Single Best Threshold for All Images

In [None]:
# Metrics of best_result_single, including the threshold it was obtained at.
print('[Overall Results using Single Threshold]')
print(f"{'Threshold':<10} {'Recall':<10} {'Precision':<10} {'F1-Score':<10}")
print(
    f'{best_result_single.threshold:<10.6f} {best_result_single.recall:<10.6f} '
    f'{best_result_single.precision:<10.6f} {best_result_single.f1:<10.6f}'
)

[Overall Results using Single Threshold]
Threshold  Recall     Precision  F1-Score  
0.380952   0.615178   0.511906   0.558811  


Evaluation Results Per Image

In [None]:
# One row per evaluated image: its threshold and the resulting metrics.
print('[Results Per Image]')
print('{:<10} {:<10} {:<10} {:<10} {:<10}'.format('ID', 'Threshold', 'Recall', 'Precision', 'F1-Score'))
# Iterate directly; the enumerate() index was never used.
for res in sample_results:
    print('{:<10s} {:<10.6f} {:<10.6f} {:<10.6f} {:<10.6f}'.format(
        res.sample_name, res.threshold, res.recall, res.precision, res.f1))

[Results Per Image]
ID         Threshold  Recall     Precision  F1-Score  
37073      0.333333   0.802942   0.710865   0.754104  
140055     0.285714   0.643748   0.399583   0.493095  
118035     0.380952   0.769363   0.826945   0.797115  
159008     0.333333   0.615313   0.456110   0.523883  
78019      0.285714   0.776393   0.483759   0.596098  
314016     0.333333   0.764961   0.647566   0.701385  
254033     0.571429   0.362343   0.304644   0.330997  
189080     0.285714   0.741412   0.706993   0.723794  
8049       0.523810   0.805794   0.890371   0.845974  
12003      0.428571   0.753557   0.703092   0.727450  
159091     0.476190   0.690040   0.628788   0.657991  
71046      0.380952   0.715615   0.636364   0.673667  
59078      0.428571   0.548759   0.623219   0.583623  
2092       0.333333   0.782979   0.680518   0.728161  
208001     0.190476   0.713926   0.422140   0.530562  
232038     0.285714   0.698475   0.588376   0.638716  
188063     0.285714   0.617951   0.391222   0

Evaluation Results Per Threshold

In [None]:
# One row per threshold: metrics obtained when that threshold is used.
print('[Results Per Threshold]')
print('{:<10} {:<10} {:<10} {:<10}'.format('Threshold', 'Recall', 'Precision', 'F1-Score'))
# Iterate directly; the enumerate() index was never used.
for res in threshold_results:
    print('{:<10.6f} {:<10.6f} {:<10.6f} {:<10.6f}'.format(
        res.threshold, res.recall, res.precision, res.f1))

[Results Per Threshold]
Threshold  Recall     Precision  F1-Score  
0.047619   0.814186   0.183281   0.299208  
0.095238   0.879992   0.213043   0.343038  
0.142857   0.872778   0.254836   0.394489  
0.190476   0.840265   0.304049   0.446524  
0.238095   0.792374   0.356133   0.491404  
0.285714   0.738133   0.409374   0.526659  
0.333333   0.677125   0.461103   0.548615  
0.380952   0.615178   0.511906   0.558811  
0.428571   0.552673   0.558042   0.555344  
0.476190   0.492318   0.600476   0.541045  
0.523810   0.433901   0.636298   0.515961  
0.571429   0.376027   0.666277   0.480739  
0.619048   0.320271   0.690708   0.437623  
0.666667   0.269389   0.711758   0.390848  
0.714286   0.221309   0.729918   0.339640  
0.761905   0.176778   0.744271   0.285698  
0.809524   0.134743   0.755742   0.228709  
0.857143   0.095561   0.762936   0.169847  
0.904762   0.050831   0.775582   0.095409  
0.952381   0.016034   0.809278   0.031444  
