# Edit Annotations

Thanks for using this Jupyter notebook. It covers several useful functions for working with our annotations. The list below summarises where to find the code for each function.

### **[Almost always needed]** Importing: 
Import JSON into DataFrame - Function (1)

### Queries and counting:
* Count the category (and/or clicks) distribution - Function (2)
* Count the extra effort after inference - Function (9)

### Visualisation *[in visualisation.ipynb]*
* Export detected instances on images - Function (V1)

# 1) Import JSON into DataFrame

Export annotations from CVAT as COCO JSON. 

Put the JSON file in the working directory and set the variable `filename` to its path.

In [1]:
import pandas as pd
import numpy as np
import json

# utilities

def findCategory(data):
    """Return the categories of a COCO-style dict as a DataFrame.

    Parameters
    ----------
    data : dict
        Parsed JSON; must contain a "categories" list.

    Returns
    -------
    pandas.DataFrame
        One row per category, with 'id' renamed to 'category_id' and the
        'supercategory' column removed when it exists.
    """
    category = pd.DataFrame(data["categories"])
    # 'supercategory' is optional: ignore it when the file does not carry it
    # instead of failing with a KeyError.
    category = category.drop(columns=['supercategory'], errors='ignore')
    return category.rename(columns={'id': 'category_id'})

def findImages(data):
    """Return the image records of a COCO-style dict as a DataFrame.

    Parameters
    ----------
    data : dict
        Parsed JSON; must contain an "images" list.

    Returns
    -------
    pandas.DataFrame
        One row per image, without the 'license', 'flickr_url', 'coco_url'
        and 'date_captured' columns.
    """
    images = pd.DataFrame(data["images"])

    # These columns exist if exported from CVAT, not if generated by my code.
    # Drop whichever of them are present, so a file carrying only some of
    # them is cleaned as well.
    unwanted = ['license', 'flickr_url', 'coco_url', 'date_captured']
    return images.drop(columns=unwanted, errors='ignore')

def findAnnotations(data):
    """Return the "annotations" list of a COCO-style dict as a DataFrame."""
    return pd.DataFrame(data["annotations"])

def cleanForJson(category=None, df=None):
    """Prepare the category and annotation frames for a JSON dump.

    Parameters
    ----------
    category : pandas.DataFrame or None
        Category table with a 'category_id' column. Skipped when None.
    df : pandas.DataFrame or None
        Annotation table. Skipped when None. Note that the 'iscrowd' and
        'attributes' columns are written into the frame that is passed in.

    Returns
    -------
    tuple
        (category, df). `category` has 'category_id' renamed back to 'id'
        and an empty 'supercategory' column. `df` has 'iscrowd' and
        'attributes' added and the standard columns moved to the front.
    """
    # clean category for json dump
    if category is not None:
        category = category.rename(columns={'category_id': 'id'})
        category['supercategory'] = ""

    # add columns in df for json dump
    if df is not None:
        df['iscrowd'] = 0
        # Build one dict per row. `[d] * n` would store the very same dict
        # object in every row, so editing one annotation's attributes would
        # silently edit all of them.
        df['attributes'] = [{'occluded': False} for _ in range(len(df['id']))]
        cols = ['id', 'image_id', 'category_id', 'segmentation', 'area', 'bbox', 'iscrowd', 'attributes']
        df = df[cols + [c for c in df.columns if c not in cols]]

    return category, df

# convert all np.integer, np.floating and np.ndarray into json recognisable int, float and lists
class NpEncoder(json.JSONEncoder):
    """JSON encoder that turns numpy scalars and arrays into plain Python
    ints, floats and lists; anything else is left to the base encoder."""

    # (numpy type, converter) pairs, tried in this order
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, np.ndarray.tolist),
    )

    def default(self, obj):
        for np_type, convert in self._CONVERTERS:
            if isinstance(obj, np_type):
                return convert(obj)
        # not a numpy object: the base class raises TypeError
        return super().default(obj)

In [4]:
import pandas as pd
import numpy as np
import re
import json

# Start from an empty frame so `df` exists even when loading fails below
df = pd.DataFrame()

# filename = './A12AL/A12AL_train_11.json'
# filename = './input/230725limR101_SS4_p1_2.json'
filename = './input/A12_v3.json'

with open(filename, 'r') as file:
    data = json.load(file)

# `category`, `images` and `nos_image` are only (re)defined when the
# matching section is present in the file
if "categories" in data:
    category = findCategory(data)

if "images" in data:
    images = findImages(data)
    # number of distinct image ids listed in the file
    nos_image = images['id'].nunique()

df = findAnnotations(data)


# 2) Count numbers of defects

In [None]:
# ---- Inputs ----
findTotal = False           # Count total number of images
area = "hello"              # for finding the total number of images
qall = True                 # Query all images in df?
qclick = False              # Query the number of clicks in segmentation?
qarea = False               # Query the area of segmentation masks?

# Total number of images, looked up by area name (only when findTotal is on)
if findTotal:
    known_totals = {'A14 Tothill': 2343, 'A12 Mountnessing': 6580}
    if area in known_totals:
        nos_image = known_totals[area]
# if not imported from json, manually input nos. images
# nos_image = 858

# Range of image ids to query: everything up to the largest id, or a fixed window
start_id, end_id = (1, max(df['image_id'])) if qall else (4356, 6580)

# keep the annotations whose image id lies inside [start_id, end_id]
df2 = df[df['image_id'].between(start_id, end_id)]

# report the number of instances of defects
nos_image_d = df2['image_id'].nunique()     # images with at least one annotation
nos_defects = len(df2['id'])                # annotations in the selection
print(f'{nos_image_d} out of {nos_image} images have defects.\n{nos_defects} defects in total')

# count clicks in segmentations
def statsQuick(i, seg_pt, mult=1):
    """Summarise a list of numbers after scaling every value by `mult`.

    Returns the tuple
    (i, mean, standard deviation, min, 25th percentile, median,
    75th percentile, max); mean and standard deviation are rounded to two
    decimals. `i` is passed through unchanged as a label.
    """
    lowest = min(seg_pt) * mult
    highest = max(seg_pt) * mult
    scaled = np.array(seg_pt) * mult
    first_q, median, third_q = np.percentile(scaled, [25, 50, 75])
    mean = round(sum(seg_pt) * mult / len(seg_pt), 2)
    spread = round(np.std(scaled), 2)
    return i, mean, spread, lowest, first_q, median, third_q, highest

# Per-category statistics; each table is only built when its switch is on.
stats_col = ["category_id","mean", "SD", "min.", "25th percentile", "median", "75th percentile","max."]

# count clicks in segmentations
if qclick:
    catidx = np.sort(df2['category_id'].unique()).tolist()
    click_count = []
    for i in catidx:
        seg_list = df2.loc[df2['category_id'] == i, 'segmentation'].tolist()
        # len(x[0]) is the length of the annotation's first polygon and
        # mult=0.5 halves it (presumably a flat x,y list, so half the length
        # is the number of vertices - TODO confirm). Annotations with an
        # empty segmentation are counted as length 8.
        seg_pt = [len(x[0]) if x else 8 for x in seg_list]
        click_count.append(statsQuick(i, seg_pt, mult=0.5))

# count areas in segmentations
if qarea:
    catidx = np.sort(df2['category_id'].unique()).tolist()
    area_count = []
    for i in catidx:
        area_list = df2.loc[df2['category_id'] == i, 'area'].tolist()
        area_count.append(statsQuick(i, area_list))


# -----------------------
# Create a pivot table that counts the number of instance
# -----------------------
defect_cat = pd.pivot_table(df2, values='id', index='category_id', aggfunc='count')

defect_cat = defect_cat.merge(category[['category_id','name']], left_on='category_id', right_on='category_id')
defect_cat = defect_cat.rename(columns={'id': 'counts'})
if qclick:
    click_cat = pd.DataFrame(np.array(click_count), columns=stats_col)
    defect_cat = defect_cat.merge(click_cat, left_on='category_id', right_on='category_id')
if qarea:
    area_cat = pd.DataFrame(np.array(area_count), columns=stats_col)
    # When the click statistics are already in the table the column names
    # collide; label them explicitly instead of the default '_x'/'_y'.
    defect_cat = defect_cat.merge(area_cat, left_on='category_id', right_on='category_id', suffixes=('_clicks', '_area'))

defect_cat

# 9) Count extra effort after inference

I use two metrics to evaluate the annotation process. One is the quality of the annotations, which is evaluated with precision and recall measures.

The other is human effort measured by the number of clicks.

Concretely, given the inference annotation file and the GT file, I need to measure:
* how many masks are deleted (1 click per box)
* how many categories are changed (2 clicks per box)
* how many masks are added
    * treat as no change if bbox hasn't moved (just being picky to refine the masks)
    * for genuinely new masks, use the median number of clicks for the category

In [2]:
# Functions to format dataFrames
def extractLists(df):
    """Return the 'bbox' and 'category_id' columns of `df` as two lists."""
    return df['bbox'].to_list(), df['category_id'].to_list()

def _imgReindex(df, images_new):
    for i in range(len(df['id'])):
        df.loc[i, 'image_id'] = images_new.loc[(images_new['file_name']==df['file_name'][i]), 'id'].values
    df = df.sort_values(by=['image_id'], ignore_index=True)
    df['id'] = df.index + 1
    df = df.drop(columns=['file_name'])
    return df

def imageReindex(df, df2, images, images2):
    """Give two annotation tables a common image numbering.

    The file names found in `df` and `df2` are collected, sorted, and
    numbered from 1. Width and height of each image are copied from
    `images` when it lists the file, otherwise from `images2`. Both
    annotation tables are then passed through `_imgReindex`.

    Returns
    -------
    tuple
        (re-indexed df, re-indexed df2, combined image table with columns
        'id', 'width', 'height', 'file_name').
    """
    # Stack both annotation tables to collect every file name in use
    combined = pd.concat([df, df2], ignore_index = True)

    # Combined image table: unique file names, sorted, ids starting at 1
    images_new = pd.DataFrame(columns=['id','width','height','file_name'])
    images_new['file_name'] = combined['file_name'].unique()
    images_new = images_new.sort_values(by=['file_name'], ignore_index=True)
    images_new['id'] = images_new.index + 1

    # Fill in width and height of each image
    for row, fname in enumerate(images_new['file_name'].tolist()):
        # find out which json records the concerned image (first table wins)
        source = None
        for candidate in (images, images2):
            if fname in candidate['file_name'].tolist():
                source = candidate
                break
        if source is None:
            # width/height stay empty for this image
            print('image not included')
            continue
        dim = source.loc[(source['file_name']==fname), ['width','height']]
        images_new.loc[row,'width'] = dim['width'].values[0]
        images_new.loc[row,'height'] = dim['height'].values[0]

    # Assign new image id and defect id in df and df2
    reindexed_first = _imgReindex(df, images_new)
    reindexed_second = _imgReindex(df2, images_new)

    return reindexed_first, reindexed_second, images_new


# Functions to check bbox movements/additions/deletions
def compare_bboxes(image_id, pseudolabels, ground_truths, ps_cat, gt_cat, centroid_threshold=10, areadiff_threshold=0.1):
    """Classify the boxes of one image as deleted, added, changed or same.

    Boxes are read as corner coordinates: elements 0..3 are used as
    x1, y1, x2, y2. NOTE(review): COCO JSON stores [x, y, width, height];
    such boxes need converting to corners before they are passed in.

    Each pseudolabel is compared with the ground truths in list order and
    takes the first one whose relative area difference is at most
    `areadiff_threshold` and whose centroid distance is at most
    `centroid_threshold`. A ground truth is not reserved by a match, so
    several pseudolabels may match the same ground truth.

    Parameters
    ----------
    image_id : any
        Copied into every returned tuple.
    pseudolabels, ground_truths : list of boxes
        Predicted and ground-truth boxes of the image.
    ps_cat, gt_cat : list
        Category of each pseudolabel / ground truth, in the same order.
    centroid_threshold : float
        Largest centroid distance accepted for a match.
    areadiff_threshold : float
        Largest accepted |area_p - area_gt| / max(area_p, area_gt).

    Returns
    -------
    tuple of four lists
        (deleted, added, changed_size, same). Entries are
        (index, bbox, image_id, category). `added` holds unmatched ground
        truths; the other lists hold pseudolabels. A matched pseudolabel is
        'same' when the area difference is at most 0.1 and the centroid
        distance is at most a tenth of `centroid_threshold`, otherwise
        'changed_size'.
    """
    deleted_bboxes = []
    added_bboxes = []
    changed_size_bboxes = []
    same_bboxes = []

    ground_truth_matched = [False] * len(ground_truths)

    # Check for deleted and changed size bounding boxes
    for idx, p_bbox in enumerate(pseudolabels):
        # geometry of the pseudolabel does not depend on the ground truth
        p_area = (p_bbox[2] - p_bbox[0]) * (p_bbox[3] - p_bbox[1])
        p_centroid = ((p_bbox[0] + p_bbox[2]) / 2, (p_bbox[1] + p_bbox[3]) / 2)

        found_match = False
        for gt_idx, gt_bbox in enumerate(ground_truths):

            # find area difference
            gt_area = (gt_bbox[2] - gt_bbox[0]) * (gt_bbox[3] - gt_bbox[1])
            larger_area = max(p_area, gt_area)
            if larger_area == 0:
                # the ratio is undefined for this pair: treat as no match
                continue
            area_diff = abs(p_area - gt_area) / larger_area

            # find centroid distance
            gt_centroid = ((gt_bbox[0] + gt_bbox[2]) / 2, (gt_bbox[1] + gt_bbox[3]) / 2)
            centroid_dist = ((p_centroid[0] - gt_centroid[0]) ** 2 + (p_centroid[1] - gt_centroid[1]) ** 2) ** 0.5

            if area_diff <= areadiff_threshold and centroid_dist <= centroid_threshold:
                found_match = True
                ground_truth_matched[gt_idx] = True
                if area_diff <= 0.1 and centroid_dist <= centroid_threshold * 0.1:
                    same_bboxes.append((idx, p_bbox, image_id, ps_cat[idx]))
                else:
                    changed_size_bboxes.append((idx, p_bbox, image_id, ps_cat[idx]))
                break

        if not found_match:
            deleted_bboxes.append((idx, p_bbox, image_id, ps_cat[idx]))

    # Check for added bounding boxes
    for idx, gt_bbox in enumerate(ground_truths):
        if not ground_truth_matched[idx]:
            added_bboxes.append((idx, gt_bbox, image_id, gt_cat[idx]))

    return deleted_bboxes, added_bboxes, changed_size_bboxes, same_bboxes

def compare_cats(image_id, pseudolabels, ground_truths, ps_cat, gt_cat, iou_threshold=0.5):
    """List the pseudolabels whose category differs from an overlapping GT.

    Every (pseudolabel, ground truth) pair with an IoU of at least
    `iou_threshold` and different categories adds one entry
    (pseudolabel index, pseudolabel bbox, image_id, ground-truth category).
    All pairs are examined, so a pseudolabel overlapping several ground
    truths can appear more than once.
    """
    return [
        (idx, p_bbox, image_id, gt_cat[gt_idx])
        for idx, p_bbox in enumerate(pseudolabels)
        for gt_idx, gt_bbox in enumerate(ground_truths)
        # overlap first, then the category test - only mismatches are kept
        if iou(p_bbox, gt_bbox) >= iou_threshold and gt_cat[gt_idx] != ps_cat[idx]
    ]

def iou(bbox1, bbox2):
    """Return the Intersection over Union (IoU) of two bounding boxes.

    Both boxes are read as corner coordinates: elements 0..3 are used as
    x1, y1, x2, y2. Returns 0 when the union is not positive (for example
    two zero-area boxes, or malformed boxes whose areas cancel out), so the
    function never divides by zero.
    """
    # corners of the overlap rectangle
    x1 = max(bbox1[0], bbox2[0])
    y1 = max(bbox1[1], bbox2[1])
    x2 = min(bbox1[2], bbox2[2])
    y2 = min(bbox1[3], bbox2[3])

    # clamp at 0 so disjoint boxes give an empty intersection
    intersection = max(0, x2 - x1) * max(0, y2 - y1)
    area_bbox1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    area_bbox2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])

    # Guard on the divisor itself rather than on the individual areas
    union = area_bbox1 + area_bbox2 - intersection
    if union <= 0:
        return 0

    return intersection / union

### Single File

In [None]:
import pandas as pd
import numpy as np
import re
import json

# Files to be compared
# gt_file = './input/SS4_GT.json'
# ps_file = './input/230810_SS4_coninferred_c3_3.json'
gt_file = '../data/A12AL/A12AL_val4_SS4.json'
ps_file = '../output/240218_al/labels/inferred_14.json'
# Clicks charged for every added mask, keyed by category id
# click_stat = {1: 8.5, 2: 6, 3: 5, 4: 5, 5: 8, 6: 7, 7: 6.5, 10: 6, 11: 7, 12: 5}
click_stat = {1: 5, 2: 5, 3: 6, 4: 7, 5: 5}
del_click = 1               # clicks per deleted box
chsize_click = 4            # clicks per box whose size/position changed
chcat_click = 2             # clicks per category change
centroid_tolerance = 0.2    # centroid threshold as a fraction of the longer image side
areadiff_deladd = 0.4       # relative area difference accepted for a match
# iou_deladd = 0.1
iou_chcat = 0.1             # IoU needed to compare the categories of two boxes
# Layout of the 'bbox' entries in both files. COCO JSON stores
# [x, y, width, height], while compare_bboxes / compare_cats read corners
# [x1, y1, x2, y2]. Set to 'xyxy' only if the files already hold corners.
bbox_format = 'xywh'


def _toCorners(bboxes):
    # Convert boxes to [x1, y1, x2, y2] according to `bbox_format`
    if bbox_format == 'xyxy':
        return bboxes
    return [[x, y, x + w, y + h] for x, y, w, h in bboxes]


# Store categories, images and annotations in separate dataframes
with open(gt_file, 'r') as file:
    data = json.load(file)
    category = findCategory(data)
    images = findImages(data)
    nos_image = images['id'].max()
    df = findAnnotations(data)
    # attach the file name to every annotation; it is the key shared by both files
    df = df.merge(images[['id','file_name']], left_on='image_id', right_on='id').rename(columns={'id_x': 'id'}).drop(columns=['id_y'])

with open(ps_file, 'r') as file:
    data2 = json.load(file)
    category2 = findCategory(data2)
    images2 = findImages(data2)
    nos_image2 = images2['id'].max()
    df2 = findAnnotations(data2)
    df2 = df2.merge(images2[['id','file_name']], left_on='image_id', right_on='id').rename(columns={'id_x': 'id'}).drop(columns=['id_y'])

# -------------------------------------
# Safety checks
# -------------------------------------
# Check if categories are the same
# check length
if len(category) != len(category2):
    print('categories not the same. Check before proceeding')
# check each category (pairwise, so tables of different length cannot raise)
for cat_id, cat_name, cat_id2, cat_name2 in zip(category['category_id'], category['name'], category2['category_id'], category2['name']):
    if cat_name != cat_name2:
        print('category id: {} , {} in file 1 different from category id: {} , {} in file 2. Please check'.format(cat_id, cat_name, cat_id2, cat_name2))
# clean category for json dump
category, df = cleanForJson(category, df)
category2, df2 = cleanForJson(category2, df2)

# Update image_id
df, df2, images_new = imageReindex(df, df2, images, images2)
img_id_list = images_new['id'].to_list()

# ---------------------------------------
# Compare Annotations
# ---------------------------------------
effort_accum = []

for image_id in img_id_list:
    # Extract inferred bboxes and GT bboxes
    gt_df = df[(df['image_id']==image_id)]
    ps_df = df2[(df2['image_id']==image_id)]
    gt_bboxes, gt_catids = extractLists(gt_df)
    pred_bboxes, pred_catids = extractLists(ps_df)
    # the comparison helpers expect corner coordinates
    gt_bboxes = _toCorners(gt_bboxes)
    pred_bboxes = _toCorners(pred_bboxes)

    # Extract image dimensions
    dim = images_new.loc[(images_new['id']==image_id), ['width', 'height']].values
    image_width = int(dim[0][0])
    image_height = int(dim[0][1])
    centroid_threshold = max(image_width, image_height) * centroid_tolerance

    # Compare bboxes
    deleted, added, changed_size, same = compare_bboxes(image_id, pred_bboxes, gt_bboxes, pred_catids, gt_catids, centroid_threshold, areadiff_deladd)
    changed_cats = compare_cats(image_id, pred_bboxes, gt_bboxes, pred_catids, gt_catids, iou_threshold=iou_chcat)

    # Calculate efforts for additions (every added category must be a key of click_stat)
    add_cat = [x[3] for x in added]
    add_click = [click_stat[x] for x in add_cat]
    effort_accum.append([image_id, len(deleted), len(added), len(changed_size), len(same), len(changed_cats), add_cat, sum(add_click)])

human_effort = pd.DataFrame(effort_accum, columns=['image_id','deleted','added', 'chsize', 'same', 'chcat', 'added_cat', 'clicks_add'])

# ----------------------------------------
# Calculate the number of clicks needed
# ----------------------------------------
human_effort['clicks_del'] = human_effort['deleted'] * del_click
human_effort['clicks_chsize'] = human_effort['chsize'] * chsize_click
human_effort['clicks_chcat'] = human_effort['chcat'] * chcat_click
human_effort['subtotal'] = human_effort['clicks_del'] + human_effort['clicks_add'] + human_effort['clicks_chsize'] + human_effort['clicks_chcat']

print("File:", ps_file)
print("Total clicks:",sum(human_effort['subtotal']),'\n'
      "Added:",sum(human_effort['added']),'\n'
      "Deleted:",sum(human_effort['deleted']),'\n'
      "Changed Size:", sum(human_effort['chsize']),'\n'
      "Same", sum(human_effort['same']),'\n'
      "Changed Category:",sum(human_effort['chcat'])
)

# human_effort


### Multiple annotation files

In [9]:
import pandas as pd
import numpy as np
import re
import json, os

# Files to be compared
# gt_file = './input/SS4_GT.json'
# ps_file = './input/230810_SS4_coninferred_c3_3.json'
gt_file = '../data/A12AL/A12AL_val4_SS4.json'
ps_dir = "../output/240218_al/labels"
ps_header = "Corr"     # no need to include the underscore
eval_dir = "/mnt/c/Users/phl25/Downloads/transit/240202_casac"

# Clicks charged for every added mask, keyed by category id
# click_stat = {1: 8.5, 2: 6, 3: 5, 4: 5, 5: 8, 6: 7, 7: 6.5, 10: 6, 11: 7, 12: 5}
click_stat = {1: 5, 2: 5, 3: 6, 4: 7, 5: 5}
del_click = 1               # clicks per deleted box
chsize_click = 4            # clicks per box whose size/position changed
chcat_click = 2             # clicks per category change
centroid_tolerance = 0.2    # centroid threshold as a fraction of the longer image side
areadiff_deladd = 0.4       # relative area difference accepted for a match
# iou_deladd = 0.1
iou_chcat = 0.1             # IoU needed to compare the categories of two boxes
# Layout of the 'bbox' entries in the files. COCO JSON stores
# [x, y, width, height], while compare_bboxes / compare_cats read corners
# [x1, y1, x2, y2]. Set to 'xyxy' only if the files already hold corners.
bbox_format = 'xywh'
result_accum = []


def _toCorners(bboxes):
    # Convert boxes to [x1, y1, x2, y2] according to `bbox_format`
    if bbox_format == 'xyxy':
        return bboxes
    return [[x, y, x + w, y + h] for x, y, w, h in bboxes]


# The ground truth is identical for every iteration, so it is read and
# parsed once instead of once per pseudolabel file.
with open(gt_file, 'r') as file:
    data = json.load(file)
gt_category = findCategory(data)
images = findImages(data)
nos_image = images['id'].max()
gt_annotations = findAnnotations(data)
# attach the file name to every annotation; it is the key shared by both files
gt_annotations = gt_annotations.merge(images[['id','file_name']], left_on='image_id', right_on='id').rename(columns={'id_x': 'id'}).drop(columns=['id_y'])

for it in range(0,28):
    ps_file = os.path.join(ps_dir,f'{ps_header}_{str(it).zfill(2)}.json')

    # Work on a fresh copy: the helpers below add columns and re-number ids
    df = gt_annotations.copy()

    # Store categories, images and annotations in separate dataframes
    with open(ps_file, 'r') as file:
        data2 = json.load(file)
        category2 = findCategory(data2)
        images2 = findImages(data2)
        nos_image2 = images2['id'].max()
        df2 = findAnnotations(data2)
        df2 = df2.merge(images2[['id','file_name']], left_on='image_id', right_on='id').rename(columns={'id_x': 'id'}).drop(columns=['id_y'])

    # -------------------------------------
    # Safety checks
    # -------------------------------------
    # Check if categories are the same
    # check length
    if len(gt_category) != len(category2):
        print('categories not the same. Check before proceeding')
    # check each category (pairwise, so tables of different length cannot raise)
    for cat_id, cat_name, cat_id2, cat_name2 in zip(gt_category['category_id'], gt_category['name'], category2['category_id'], category2['name']):
        if cat_name != cat_name2:
            print('category id: {} , {} in file 1 different from category id: {} , {} in file 2. Please check'.format(cat_id, cat_name, cat_id2, cat_name2))
    # clean category for json dump
    category, df = cleanForJson(gt_category, df)
    category2, df2 = cleanForJson(category2, df2)

    # Update image_id
    df, df2, images_new = imageReindex(df, df2, images, images2)
    img_id_list = images_new['id'].to_list()

    # ---------------------------------------
    # Compare Annotations
    # ---------------------------------------
    effort_accum = []

    for image_id in img_id_list:
        # Extract inferred bboxes and GT bboxes
        gt_df = df[(df['image_id']==image_id)]
        ps_df = df2[(df2['image_id']==image_id)]
        gt_bboxes, gt_catids = extractLists(gt_df)
        pred_bboxes, pred_catids = extractLists(ps_df)
        # the comparison helpers expect corner coordinates
        gt_bboxes = _toCorners(gt_bboxes)
        pred_bboxes = _toCorners(pred_bboxes)

        # Extract image dimensions
        dim = images_new.loc[(images_new['id']==image_id), ['width', 'height']].values
        image_width = int(dim[0][0])
        image_height = int(dim[0][1])
        centroid_threshold = max(image_width, image_height) * centroid_tolerance

        # Compare bboxes
        deleted, added, changed_size, same = compare_bboxes(image_id, pred_bboxes, gt_bboxes, pred_catids, gt_catids, centroid_threshold, areadiff_deladd)
        changed_cats = compare_cats(image_id, pred_bboxes, gt_bboxes, pred_catids, gt_catids, iou_threshold=iou_chcat)

        # Calculate efforts for additions (every added category must be a key of click_stat)
        add_cat = [x[3] for x in added]
        add_click = [click_stat[x] for x in add_cat]
        effort_accum.append([image_id, len(deleted), len(added), len(changed_size), len(same), len(changed_cats), add_cat, sum(add_click)])

    human_effort = pd.DataFrame(effort_accum, columns=['image_id','deleted','added', 'chsize', 'same', 'chcat', 'added_cat', 'clicks_add'])

    # ----------------------------------------
    # Calculate the number of clicks needed
    # ----------------------------------------
    human_effort['clicks_del'] = human_effort['deleted'] * del_click
    human_effort['clicks_chsize'] = human_effort['chsize'] * chsize_click
    human_effort['clicks_chcat'] = human_effort['chcat'] * chcat_click
    human_effort['subtotal'] = human_effort['clicks_del'] + human_effort['clicks_add'] + human_effort['clicks_chsize'] + human_effort['clicks_chcat']
    # one summary row per pseudolabel file
    result_accum.append([os.path.basename(ps_file), sum(human_effort['subtotal']), sum(human_effort['added']), sum(human_effort['deleted']), sum(human_effort['chsize']), sum(human_effort['same']), sum(human_effort['chcat'])])

result_df = pd.DataFrame(result_accum, columns=['file','total_clicks','added','deleted','changed_size','same','changed_cat'])
file_name = f'{ps_header}_clicks.xlsx'
result_df.to_excel(os.path.join(eval_dir, file_name),index=False)