# Evaluate inference against ground truth

After inference with Mask R-CNN or YOLOv7, convert the annotations into COCO JSON format with annotations.ipynb.

Import the inferred annotation into an annotation software (CVAT by default) to review results. Add/amend/delete segmented masks as necessary.

Output the reviewed results in COCO json format. The reviewed results can now be used as the "ground truth" to compare with the inferred annotation.

The evaluation utility code is adapted from https://github.com/cocodataset/cocoapi/issues/426 and relies on the pycocotools library.

In [1]:
# Utilities

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

__all__ = ['COCOEvaluator']

class COCOEvaluator(object):
    """Evaluate a detection annotation file against a ground-truth file.

    Both files are loaded with ``pycocotools.coco.COCO`` and compared with
    ``pycocotools.cocoeval.COCOeval``.
    """

    def __init__(self, anno_gt_file, anno_dt_file):
        """Load both annotation files and make sure every annotation has a score.

        Args:
            anno_gt_file: Path to the ground-truth COCO JSON file.
            anno_dt_file: Path to the detection (inferred) COCO JSON file.
        """
        self.coco_gt = COCO(anno_gt_file)
        # The detection file is a complete COCO dataset (images, categories,
        # annotations), so it is loaded directly instead of going through
        # ``self.coco_gt.loadRes(anno_dt_file)``.
        self.coco_dt = COCO(anno_dt_file)
        self._hack_coco_dt()

    def _hack_coco_dt(self):
        """Fill in the ``score`` field that the evaluation expects on annotations.

        Detections exported by Mask R-CNN carry a score and keep it.
        YOLOv7 does not export the score to the annotation file (although it is
        available in the prediction tensor ``det[:, 4]`` in predict.py), so
        detections without one get a score of 1.0.
        """
        # Decide per annotation: testing ``'score' in <list of dicts>`` is
        # always False and would overwrite genuine scores with 1.0.
        for ann in self.coco_dt.dataset['annotations']:
            ann.setdefault('score', 1.0)

        # the ground truths (after editing in CVAT) don't have scores
        for anno in self.coco_gt.dataset['annotations']:
            anno['score'] = 1.0

    def evaluate(self, iou_type='segm', iou_Thrs=None):
        """Run the COCO evaluation and print the summaries.

        Args:
            iou_type: Evaluation type handed to ``COCOeval`` (``'segm'`` by default).
            iou_Thrs: Optional IoU threshold(s). When omitted or empty, the
                ``COCOeval`` default thresholds are used.

        Returns:
            The ``COCOeval`` object after evaluate/accumulate/summarize.
        """
        import numpy as np

        coco_eval = COCOeval(self.coco_gt, self.coco_dt, iou_type)
        if iou_Thrs is not None:
            # COCOeval keeps its default thresholds in a numpy array; store
            # custom ones the same way so array comparisons against the
            # thresholds behave like they do for the defaults.
            thresholds = np.atleast_1d(np.asarray(iou_Thrs, dtype=float))
            if thresholds.size:
                coco_eval.params.iouThrs = thresholds
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        # NOTE(review): summarize_per_category is not defined in this file;
        # presumably it comes from the patched COCOeval referenced in
        # cocoapi issue 426 - confirm the installed pycocotools provides it.
        coco_eval.summarize_per_category()
        return coco_eval

In [2]:
# utilities for annotation file corrections
import numpy as np
import pandas as pd
import json

def findCategory(data):
    """Return the categories of a COCO dict as a DataFrame.

    The ``id`` column is renamed to ``category_id`` and the ``supercategory``
    column is removed when present (not every exporter writes it).

    Args:
        data: Parsed COCO JSON dict with a ``"categories"`` list.

    Returns:
        DataFrame with one row per category.
    """
    category = pd.DataFrame(data["categories"])
    # errors='ignore' keeps files without a 'supercategory' field loadable
    category = category.drop(columns=['supercategory'], errors='ignore')
    return category.rename(columns={'id': 'category_id'})

def findImages(data):
    """Return the images of a COCO dict as a DataFrame.

    Args:
        data: Parsed COCO JSON dict with an ``"images"`` list.

    Returns:
        DataFrame with one row per image, without the CVAT-only columns
        ``license``, ``flickr_url``, ``coco_url`` and ``date_captured``.
    """
    images = pd.DataFrame(data["images"])

    # unwanted columns exist if exported from CVAT. Not if generated by my code.
    # Drop whichever of them are present, so a file carrying only some of
    # them is cleaned as well.
    unwanted = ['license', 'flickr_url', 'coco_url', 'date_captured']
    return images.drop(columns=unwanted, errors='ignore')

def findAnnotations(data):
    """Return the ``"annotations"`` list of a COCO dict as a DataFrame (one row per annotation)."""
    return pd.DataFrame(data["annotations"])

# convert np.integer, np.floating, np.bool_ and np.ndarray into JSON-serialisable int, float, bool and lists
class NpEncoder(json.JSONEncoder):
    """JSON encoder that accepts numpy scalars and arrays.

    Pass it as ``cls=NpEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert numpy values to builtin types; defer everything else to the base class.

        Raises:
            TypeError: From the base encoder, for objects that are still not serialisable.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        # np.bool_ is not a subclass of bool, so json cannot encode it natively
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Load a COCO JSON file into category / image / annotation dataframes
def createDF(filename):
    """Read a COCO JSON file and split it into three DataFrames.

    Args:
        filename: Path to the COCO JSON file.

    Returns:
        Tuple ``(category, images, df)``: the categories (see ``findCategory``),
        the images (see ``findImages``) and the annotations. Each annotation
        row carries the ``file_name`` of its image; the ``iscrowd`` and
        ``attributes`` columns are removed when present.
    """
    # Only the parsing needs the file handle; close it before processing.
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)

    category = findCategory(data)
    images = findImages(data)
    df = findAnnotations(data)
    # Attach the image file name to every annotation. Both frames have an
    # 'id' column, so the merge yields 'id_x' (annotation id) and 'id_y'
    # (image id, a duplicate of 'image_id').
    df = df.merge(images[['id','file_name']], left_on='image_id', right_on='id')
    df = df.rename(columns={'id_x': 'id'})
    df = drop_columns_if_exist(df,columns=['iscrowd','attributes','id_y'])
    return category, images, df

def drop_columns_if_exist(df, columns):
    """Return a copy of ``df`` without the listed columns.

    Names in ``columns`` that are not columns of ``df`` are skipped silently;
    the input frame is left untouched.
    """
    return df.drop(columns=list(columns), errors="ignore")

def mergeDF(df, image, image_new):
    """Re-point the ``image_id`` of annotations at the ids used in ``image_new``.

    Old and new image ids are matched through ``file_name``.

    Args:
        df: Annotation DataFrame with ``image_id`` and ``file_name`` columns.
        image: Image DataFrame (``id``, ``file_name``) that ``df`` currently refers to.
        image_new: Image DataFrame (``id``, ``file_name``) holding the new ids.

    Returns:
        New annotation DataFrame with the remapped ``image_id`` (placed after
        the other original columns), ``iscrowd`` set to 0, a fresh
        ``attributes`` dict per row, and ``file_name`` removed.
    """
    # map image_id to the image df: 'id_x' is the old image id, 'id_y' the new one
    image = image.merge(image_new[['id', 'file_name']], on='file_name', how='left')

    df = df.merge(image[['id_x', 'id_y']], left_on='image_id', right_on='id_x', how='left')
    df = df.drop(columns=['image_id', 'id_x']).rename(columns={'id_y': 'image_id'})

    # Make good the dfs
    df['iscrowd'] = 0
    # One dict per row: ``[{...}] * n`` would put the *same* dict object in
    # every row, so editing one row's attributes would change all of them.
    df['attributes'] = [{'occluded': False} for _ in range(len(df))]
    df = df.drop(columns=['file_name'])
    return df

def fix_image_id(image1, image2, df, df2):
    """Give two annotation sets one shared image numbering.

    The images of both files are combined, sorted by ``file_name`` and
    de-duplicated on it; the surviving rows are numbered 1..N. Both
    annotation frames are then remapped onto these ids with ``mergeDF``.

    Returns:
        Tuple ``(image_new, df, df2)``: the unified image table and the two
        remapped annotation frames.
    """
    image_new = (
        pd.concat([image1, image2], ignore_index=True)   # stack both image tables
        .sort_values(by=['file_name'])                   # order by image name
        .reset_index(drop=True)
        .drop_duplicates(subset=['file_name'])           # one row per image name
        .reset_index(drop=True)
    )
    # new image ids are the 1-based positions in the sorted, unique table
    image_new['id'] = image_new.index + 1

    remapped_first = mergeDF(df, image1, image_new)
    remapped_second = mergeDF(df2, image2, image_new)
    return image_new, remapped_first, remapped_second

In [8]:
# Files to be evaluated
gt_file = './input/A12AL_val4_SS4.json'
dt_file = './input/Corr3_00.json'

# "image_id" won't match because CVAT (gt_file) outputs all images. My inference (dt_file) only outputs images with positive annotations
# Evaluations are based on the number of images reviewed - hence the images of gt_file
# Task here: give both files one common image numbering so the image_id values agree

# Store categories, images and annotations in separate dataframes
category, images, df = createDF(gt_file)
category2, images2, df2 = createDF(dt_file)

# Check categories
# A different number of categories is reported instead of raising a KeyError
# (file 2 shorter) or being silently ignored (file 2 longer).
if len(category) != len(category2):
    print('file 1 has {} categories but file 2 has {}. Please check'.format(len(category), len(category2)))
# Compare the categories pairwise, in file order
for (cat_id, cat_name), (cat_id2, cat_name2) in zip(
        zip(category['category_id'], category['name']),
        zip(category2['category_id'], category2['name'])):
    if cat_name != cat_name2:
        print('category id: {} , {} in file 1 different from category id: {} , {} in file 2. Please check'.format(cat_id, cat_name, cat_id2, cat_name2))
# clean category for json dump
category = category.rename(columns={'category_id': 'id'})
category['supercategory'] = ""

# Renumber the images and remap image_id in both annotation sets
images_new, df, df2 = fix_image_id(images, images2, df, df2)

# JSON with revised image_id exported for evaluation
# Both outputs share the categories of gt_file and the unified image list.
dict_to_gt = {
    "categories": category.to_dict('records'),
    "images": images_new.to_dict('records'),
    "annotations": df.to_dict('records')
    }
dict_to_dt = {
    "categories": category.to_dict('records'),
    "images": images_new.to_dict('records'),
    "annotations": df2.to_dict('records')
    }
# NpEncoder converts the numpy scalars left in the records by pandas
with open("./input/gt_corrected.json", "w") as outfile:
    json.dump(dict_to_gt, outfile, cls=NpEncoder)
with open("./input/dt_corrected.json", "w") as outfile:
    json.dump(dict_to_dt, outfile, cls=NpEncoder)



## Traditional IOU evaluation

In [7]:
gt_file = './input/gt_corrected.json'
dt_file = './input/dt_corrected.json'

# Evaluate the corrected detection file against the corrected ground truth.
# No thresholds are passed, so COCOeval's own defaults apply; for a custom set
# pass e.g. iou_Thrs=np.arange(0.05, 1.0, 0.05).tolist() to evaluate().
eval = COCOEvaluator(anno_gt_file=gt_file, anno_dt_file=dt_file)
result = eval.evaluate()

# Column titles for the per-category table
# (alternatively, read them from the first line of './original/metrics.csv',
#  opened with encoding='utf-8-sig')
metrics = [
    "AP", "AP@.50IOU", "AP@.75IOU",
    "AP (small)", "AP (medium)", "AP (large)",
    "AR@1", "AR@10", "AR@100",
    "AR@100 (small)", "AR@100 (medium)", "AR@100 (large)",
]

# category_stats is transposed so that every metric becomes a column
stats = list(map(list, zip(*result.category_stats)))
assessed = pd.DataFrame(stats, columns=metrics)
# Put the category names in front; `metrics` gets the same leading entry so it
# keeps mirroring the column order of `assessed`
assessed.insert(0, 'category', category['name'])
metrics.insert(0, 'category')
# Export Dataframe
assessed.to_csv('evaluated.csv', index=False)

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *segm*
DONE (t=0.25s).
Accumulating evaluation results...
DONE (t=0.03s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.034
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.075
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.031
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.054
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.037
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.067
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.080
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxD

## Try a range of IoU

In [9]:
import contextlib
import os
import sys

import pandas as pd

gt_file = './input/gt_corrected.json'
dt_file = './input/dt_corrected.json'

eval = COCOEvaluator(anno_gt_file=gt_file, anno_dt_file=dt_file)
eval_iou = np.arange(0.05, 1.0, 0.05).tolist()

# Results per IoU threshold: overall (ap, ar) and per category (ap_cat, ar_cat)
ap, ar = {}, {}
ap_cat, ar_cat = {}, {}

for iou in eval_iou:
    # Evaluate at this single threshold. The summaries printed by the
    # evaluator are sent to os.devnull; redirect_stdout restores sys.stdout
    # afterwards, also when the evaluation raises.
    with open(os.devnull, 'w') as devnull, contextlib.redirect_stdout(devnull):
        result = eval.evaluate(iou_type='segm', iou_Thrs=[iou])
    label = f'{iou:.2f}'
    ap[label] = result.stats[0]  # Store the AP in the results dictionary
    ar[label] = result.stats[8]  # Store the AR
    ap_cat[f'AP@{label}IOU'] = result.category_stats[0]  # per-category row matching stats[0]
    ar_cat[f'AR@{label}IOU'] = result.category_stats[8]  # per-category row matching stats[8]

# Convert the dictionaries to DataFrames
df_result = pd.DataFrame.from_dict(ap, orient='index')
df_result = df_result.merge(right=pd.DataFrame.from_dict(ar, orient='index'), left_index=True, right_index=True)
df_result.columns = ['AP', 'AR']
df_result.insert(0, 'IOU', df_result.index)
# One row per category, one column per threshold (AP columns first, then AR)
df_cat = pd.DataFrame.from_dict(ap_cat, orient='index').T
df_cat = df_cat.merge(right=pd.DataFrame.from_dict(ar_cat, orient='index').T, left_index=True, right_index=True)
df_cat.insert(0, 'Category', category['name'])

# Output the DataFrames as an excel file
with pd.ExcelWriter('output.xlsx') as writer:
    df_result.to_excel(writer, sheet_name='overall', index=False)
    df_cat.to_excel(writer, sheet_name='category', index=False)



loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!


In [5]:
df_result

Unnamed: 0,IOU,AP,AR
0.05,0.05,0.265705,0.504141
0.1,0.1,0.263901,0.501492
0.15,0.15,0.260749,0.498686
0.2,0.2,0.258005,0.496037
0.25,0.25,0.244169,0.484117
0.3,0.3,0.23631,0.475856
0.35,0.35,0.22836,0.467752
0.4,0.4,0.206202,0.438643
0.45,0.45,0.180385,0.410741
0.5,0.5,0.156995,0.382331


# Read Evaluation Files

This utility reads the evaluation files exported from the CRAAC pipeline. Each evaluation .xlsx file contains three sheets:
* conventional: conventional *segm* metrics, by category
* overall_by_iou: mAP and AR at IoU=[0.05:0.95]
* category_by_iou: AP and AR at IoU=[0.05:0.95] by category

In [None]:
import pandas as pd
import os, glob

# NOTE: `dir` shadows the builtin of the same name; the name is kept so that
# anything else relying on this variable keeps working.
dir = '../output/240212_craac/eval'
eval_header = 'eval_infer'
export_file = 'infer_agg.xlsx'

# read excel files into pd dataframes
# glob returns files in arbitrary order; sorting makes the 'iter' numbering reproducible
xlsx_files = sorted(glob.glob(os.path.join(dir,f'{eval_header}*.xlsx')))
num_xlsx_files = len(xlsx_files)
agg_columns = ['iter','IOU', 'AP', 'AR', 'AR_50:95']
rows = []                                                   # one list of values per file
for i, xlsx_file in enumerate(xlsx_files):
    print(f'({i+1}/{num_xlsx_files}) Reading {os.path.basename(xlsx_file)}...')
    df = pd.read_excel(xlsx_file, sheet_name='overall_by_iou')
    # Compare thresholds with a tolerance: values produced by float
    # arithmetic (or stored as text) may not equal 0.5 exactly.
    iou_values = pd.to_numeric(df['IOU'], errors='coerce')
    at_50 = df[(iou_values - 0.5).abs() < 1e-9]
    if at_50.empty:
        raise ValueError(f'No IOU=0.5 row in sheet "overall_by_iou" of {xlsx_file}')
    metrics = at_50.values.tolist()[0]                      # every column of the IOU=0.5 row
    metrics.insert(0, i+1)                                  # insert iteration number
    ar_5095 = df[iou_values >= 0.5 - 1e-9]['AR'].mean()
    metrics.append(ar_5095)                                 # append AR_50:95 value
    rows.append(metrics)

# Build the table once; concatenating onto an empty frame inside the loop is
# quadratic and relies on deprecated pandas behaviour for empty frames.
agg_df = pd.DataFrame(rows, columns=agg_columns)
agg_df.to_excel(os.path.join(dir, export_file), index=False)