Mount Google Drive to access the required folders

In [None]:
# Colab only: mount the user's Google Drive at /content/drive so that the
# following cells can read the archives stored under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Selects which set of archives this cell unpacks from the mounted Drive:
#   'dataset'     -> prediction archives, extracted under ./dataset
#   'performance' -> timing archives, extracted under ./performance
# The same flag is read again by the performance cell at the end of the notebook.
bench_mode = 'performance' # 'dataset' or 'performance'

if bench_mode == 'dataset':
    # start from a clean output folder so stale files from a previous run cannot leak in
    !rm -rf dataset

    # the Skinny archives unpack into a timestamped folder whose content is then
    # moved to a stable name
    # NOTE(review): `mv <dir>/* dataset/skinny` only works if the glob matches a single
    # entry or the target directory already exists - confirm against the zip layout
    !unzip drive/MyDrive/testing/skinny/20210521-204405_p.zip -d dataset # skinny full
    !mv dataset/20210521-204405/* dataset/skinny
    !unzip drive/MyDrive/testing/skinny/20210523-192002_p.zip -d dataset # skinny skintones full
    !mv dataset/20210523-192002/* dataset/skinny_st

    !unzip drive/MyDrive/testing/bayes/bayes.zip -d dataset # bayes full
    !unzip drive/MyDrive/testing/bayes/bayes_st.zip -d dataset # bayes skintones full

    !unzip drive/MyDrive/testing/dyc/dyc.zip -d dataset # dyc
    !unzip drive/MyDrive/testing/dyc/dyc_st.zip -d dataset # dyc
elif bench_mode == 'performance':
    # start from a clean output folder
    !rm -rf performance

    # one archive per detector; the performance cell later reads performance/bench_<detector>
    !unzip drive/MyDrive/testing/benchmark/bench_skinny.zip -d performance
    !unzip drive/MyDrive/testing/benchmark/bench_bayes.zip -d performance
    !unzip drive/MyDrive/testing/benchmark/bench_dyc.zip -d performance


Main functions

In [None]:
import os
import numpy as np
from tqdm import tqdm # progress bar
from PIL import Image # used as image loader
import math
import cv2 # image processing
from statistics import mean, stdev, pstdev


# Metric Utils



# load images as numpy BOOL arrays (0-1)
def load_images(gt_path: str, pred_path: str, threshold: int = 128):
    """Load a ground-truth / prediction image pair as boolean numpy masks.

    Each file is opened with PIL, converted to 8-bit grayscale (mode 'L') and
    binarized: a pixel becomes True when its gray value is strictly greater
    than `threshold`.

    Args:
        gt_path: path of the ground-truth image.
        pred_path: path of the prediction image.
        threshold: gray level above which a pixel counts as positive.

    Returns:
        Tuple `(gt_bool, pred_bool)` of boolean arrays, one per image.
    """
    def _as_mask(path):
        # grayscale uint8 first, then the comparison yields a bool array
        gray = np.array(Image.open(path).convert('L'))
        return gray > threshold

    return _as_mask(gt_path), _as_mask(pred_path)


# OLD AVERAGE: calculate average between all experiments of EVERY metric
# y and p files must have the same filename
def pd_metrics_old(gt_dir: str, pred_dir: str, metric_fns: list, threshold: int = 128) -> list:
    """Evaluate every metric on every ground-truth / prediction pair.

    For each entry of `gt_dir` the prediction with the same base name and a
    '.png' extension is looked up in `pred_dir`. Both images are binarized
    with `threshold` and each function of `metric_fns` is evaluated on them.

    A metric declared with exactly one positional argument receives the
    confusion-matrix score dict; any other metric receives the two masks.

    Args:
        gt_dir: directory holding the ground-truth images.
        pred_dir: directory holding the predictions (always PNG).
        metric_fns: metric functions; results are keyed by `__name__`.
        threshold: binarization threshold forwarded to `load_images`.

    Returns:
        One dict per image pair with the keys 'y' (ground-truth path),
        'p' (prediction path) and one entry per metric name.
    """
    results = []

    for gt_name in tqdm(os.listdir(gt_dir)):
        gt_file = os.path.join(gt_dir, gt_name)
        # predictions share the base name of the ground truth but are always PNG
        pred_file = os.path.join(pred_dir, os.path.splitext(gt_name)[0] + '.png')

        record = {'y': gt_file, 'p': pred_file}

        mask_true, mask_pred = load_images(gt_file, pred_file, threshold)
        scores = confmat_scores(mask_true, mask_pred)

        for fn in metric_fns:
            name = fn.__name__
            n_args = fn.__code__.co_argcount  # number of positional args in the definition

            # single-argument metrics only need the confusion-matrix scores,
            # the others work on the raw masks
            record[name] = fn(scores) if n_args == 1 else fn(mask_true, mask_pred)

        results.append(record)

    print(f'  Found {len(results)} matches')

    return results

# print human-readable metrics results data
# 
# !!NOTE!!
# the mean F1 obtained by summing the F1 of every experiment and dividing by the N elements
# differs from the F1 obtained by applying the F1 formula to the average REcall and PRecision
def print_pd_mean_old(total: list, metric_fns: list, desc: str, method: str) -> None:
    # prepare the JSON table
    # JSON table item example: {"name":"ecu", "F1":".9123 +- 0.25", "IOU":".8744 +- 0.11"}
    #{"method":"Skinny", "train":"ecu", "test":"hgr", "F1":".9123 +- 0.25", "IOU":".8744 +- 0.11"},
    #{"method":"Skinny", "train":"ecu", "test":"schmugge", "F1":".5123 +- 0.15", "IOU":".1744 +- 0.13"},
    #{"method":"Bayes", "train":"ecu", "test":"hgr", "F1":".9123 +- 0.25", "IOU":".8744 +- 0.11"}
    res = {}
    res['method'] = method
    desc = os.path.basename(desc)
    desc = desc.lower().replace('_small', '')
    dss = desc.split('_')
    ds_tr = dss[0]
    ds_te = dss[2]
    res['train'] = ds_tr
    res['test'] = ds_te

    print(f'{method}: {desc}') # output info

    for metric_fn in metric_fns:
        f_name = metric_fn.__name__
        f_data = [ d[f_name] for d in total ]
        #f_mean = sum(f_data) / len(total)
        f_mean = mean(f_data)
        #f_mean = round(f_mean, 4) # round to 4 decimals
        f_mean = '{:.4f}'.format(f_mean) # round and zerofill
        f_std = pstdev(f_data)
        #f_std = round(f_std, 2) # round to 2 decimals
        f_std = '{:.2f}'.format(f_std) # round and zerofill
        print(f'{f_name}: {f_mean} ± {f_std}')

        # add each metric data to the JSON table
        res[f_name] = f'{f_mean} ± {f_std}'
    
    return res # return JSON table


# NEW AVERAGE: calculate average only of medium-scores (PRecision, REcall, SPecificity)
# y and p files must have the same filename
def pd_metrics(gt_dir: str, pred_dir: str, metric_fns: list, threshold: int = 128) -> list:
    """Evaluate the per-image metrics used by the "new average" scheme.

    Works like `pd_metrics_old`, except that every metric whose function name
    contains an underscore (e.g. `f1_n`) is skipped here: those are computed
    later from the averaged scores by `print_pd_mean`.

    Args:
        gt_dir: directory holding the ground-truth images.
        pred_dir: directory holding the predictions (always PNG, same base name).
        metric_fns: metric functions; results are keyed by `__name__`.
        threshold: binarization threshold forwarded to `load_images`.

    Returns:
        One dict per image pair with the keys 'y', 'p' and one entry per
        evaluated metric.
    """
    results = []

    for gt_name in tqdm(os.listdir(gt_dir)):
        gt_file = os.path.join(gt_dir, gt_name)
        # predictions share the base name of the ground truth but are always PNG
        pred_file = os.path.join(pred_dir, os.path.splitext(gt_name)[0] + '.png')

        record = {'y': gt_file, 'p': pred_file}

        mask_true, mask_pred = load_images(gt_file, pred_file, threshold)
        scores = confmat_scores(mask_true, mask_pred)

        for fn in metric_fns:
            name = fn.__name__
            n_args = fn.__code__.co_argcount  # number of positional args in the definition

            if '_' in name:
                # new-average metric: derived from the averages, not per image
                continue

            # single-argument metrics only need the confusion-matrix scores,
            # the others work on the raw masks
            record[name] = fn(scores) if n_args == 1 else fn(mask_true, mask_pred)

        results.append(record)

    print(f'  Found {len(results)} matches')

    return results

# print human-readable metrics results data
# 
# !!NOTE!!
# the mean F1 obtained by summing the F1 of every experiment and dividing by the N elements
# differs from the F1 obtained by applying the F1 formula to the average REcall and PRecision
def print_pd_mean(total: list, metric_fns: list, desc: str) -> None:
    """Print the "new average" scores of an experiment.

    Metrics whose name has no underscore are averaged over the dicts of
    `total`. Metrics whose name contains an underscore (e.g. `f1_n`) are then
    evaluated once on the averaged 'precision', 'recall' and 'specificity'
    and reported under the part of their name before the first underscore.

    Args:
        total: per-image metric dicts, as produced by `pd_metrics`.
        metric_fns: metric functions; 'precision', 'recall' and 'specificity'
            must be among the averaged ones when an underscore metric is given.
        desc: label printed before the scores.
    """
    print(f'{desc}')
    averages = {}
    deferred = []  # metrics computed from the averages, after the first pass

    for fn in metric_fns:
        name = fn.__name__
        if '_' in name:
            deferred.append(fn)
        else:
            averages[name] = sum(item[name] for item in total) / len(total)

    for fn in deferred:
        short_name = fn.__name__.split('_')[0]
        averages[short_name] = fn(averages['precision'], averages['recall'], averages['specificity'])

    for name, score in averages.items():
        rounded = round(score, 4) # round to 4 decimals
        print(f'{name}: {rounded}')

# ALT AVERAGE: calculate average only of primitive confmat scores!
# y and p files must have the same filename
def pd_metrics_a(gt_dir: str, pred_dir: str, threshold: int = 128) -> list:
    """Collect the raw confusion-matrix scores of every image pair.

    For each entry of `gt_dir` the prediction with the same base name and a
    '.png' extension is looked up in `pred_dir`; both are binarized with
    `threshold` and reduced to their confusion-matrix scores.

    Args:
        gt_dir: directory holding the ground-truth images.
        pred_dir: directory holding the predictions (always PNG).
        threshold: binarization threshold forwarded to `load_images`.

    Returns:
        One `confmat_scores` dict per pair, extended with the metadata keys
        '!y' (ground-truth path) and '!p' (prediction path).
    """
    collected = []

    for gt_name in tqdm(os.listdir(gt_dir)):
        gt_file = os.path.join(gt_dir, gt_name)
        # predictions share the base name of the ground truth but are always PNG
        pred_file = os.path.join(pred_dir, os.path.splitext(gt_name)[0] + '.png')

        mask_true, mask_pred = load_images(gt_file, pred_file, threshold)
        scores = confmat_scores(mask_true, mask_pred)

        # keys starting with '!' identify the images and are not scores
        scores['!y'] = gt_file
        scores['!p'] = pred_file

        collected.append(scores)

    print(f'  Found {len(collected)} matches')

    return collected

# print human-readable metrics results data
# 
# !!NOTE!!
# the mean F1 obtained by summing the F1 of every experiment and dividing by the N elements
# differs from the F1 obtained by applying the F1 formula to the average REcall and PRecision
def print_pd_mean_aa(total: list, metric_fns: list, desc: str) -> None:
    """Print metrics computed on the AVERAGED confusion-matrix scores.

    Every key of the first dict of `total` that does not start with '!' is
    averaged over all dicts; each metric function is then called once with
    the dict of averages and its result is printed.

    Args:
        total: confusion-matrix dicts, as produced by `pd_metrics_a`.
        metric_fns: single-argument metric functions taking a score dict.
        desc: label printed before the scores.
    """
    print(f'{desc}')

    n_items = len(total)
    averaged = {
        key: sum(item[key] for item in total) / n_items
        for key in total[0].keys()
        if not key.startswith('!')  # '!'-prefixed keys are metadata, not scores
    }

    for fn in metric_fns:
        print(f'{fn.__name__}: {fn(averaged)}')

# print human-readable metrics results data
# 
# !!NOTE!!
# the mean F1 obtained by summing the F1 of every experiment and dividing by the N elements
# differs from the F1 obtained by applying the F1 formula to the average REcall and PRecision
def print_pd_mean_a(total: list, metric_fns: list, desc: str) -> None:
    """Print metrics computed on the SUMMED confusion-matrix scores.

    Every key of the first dict of `total` that does not start with '!' is
    summed (not averaged) over all dicts; each metric function is then called
    once with the dict of totals and its result is printed.

    Args:
        total: confusion-matrix dicts, as produced by `pd_metrics_a`.
        metric_fns: single-argument metric functions taking a score dict.
        desc: label printed before the scores.
    """
    print(f'{desc}')

    summed = {
        key: sum(item[key] for item in total)
        for key in total[0].keys()
        if not key.startswith('!')  # '!'-prefixed keys are metadata, not scores
    }

    for fn in metric_fns:
        print(f'{fn.__name__}: {fn(summed)}')



# Dataset converting Utils

# convert binary like GT in the VDM GT format (red where there is skin overlayed the ORI images)
# (and updates CSV file with the new gt path)
def bin2vdm(csv_file, out_dir):
    """Convert binary ground truths to the VDM format and update the CSV.

    For every row of `csv_file` the original image and its ground-truth mask
    are loaded; the original pixels are kept where the inverted mask is
    non-zero and the mask itself is OR-ed into the red channel, so masked
    pixels come out red. The result is saved inside `out_dir` under the
    ground truth's file name and the row's ground-truth column is pointed at
    the new file.

    Fixes over the previous version:
      * each converted image gets its own file inside `out_dir`; before, every
        image was written to the path `out_dir` itself;
      * the CSV is rewritten only after all images were converted, so a
        failure half-way no longer leaves a truncated CSV behind.

    Rows are 'ori_path<sep>gt_path<sep>note[<sep>skintone]'. The separator is
    the module-level `csv_sep`, which is not defined in this section of the
    notebook and must exist before calling this function.
    NOTE(review): the "everything but skin" step assumes masks are strictly
    0/255 - confirm against the datasets.

    Args:
        csv_file: path of the CSV listing the images; rewritten in place.
        out_dir: directory receiving the converted ground truths (created if
            missing).

    Raises:
        OSError: if a converted image cannot be written.
        IndexError: if a row has fewer than three fields.
    """
    # read the images CSV
    with open(csv_file) as src:
        entries = src.read().splitlines()

    os.makedirs(out_dir, exist_ok=True)

    new_rows = []
    for entry in entries:
        fields = entry.split(csv_sep)
        ori_path = fields[0]
        gt_path = fields[1]
        note = fields[2]
        # the skintone column is optional
        skintone = csv_sep + fields[3] if len(fields) == 4 else ''

        # load images
        ori_im = cv2.imread(ori_path)
        gt_im = cv2.imread(gt_path, cv2.IMREAD_GRAYSCALE)
        # everything but skin
        butsk = cv2.copyTo(ori_im, cv2.bitwise_not(gt_im))
        # split the result into its channels
        b, g, r = cv2.split(butsk)
        # use the mask as red channel (skin becomes red instead of white)
        r = cv2.bitwise_or(r, gt_im)
        # merge the channels back together with the new red channel
        res = cv2.merge([b, g, r])

        # one output file per image, named after the original ground truth
        vdm_path = os.path.join(out_dir, os.path.basename(gt_path))
        if not cv2.imwrite(vdm_path, res):
            raise OSError(f'could not write {vdm_path}')

        # the CSV row now points at the converted ground truth
        new_rows.append(f"{ori_path}{csv_sep}{vdm_path}{csv_sep}{note}{skintone}\n")

    # rewrite the CSV only once every image was converted
    with open(csv_file, 'w') as out:
        out.writelines(new_rows)

# convert SDDMA output masks (green where skin is detected overlayed to the ORI image)
# to binary GT
def sddmaout2bin(in_dir, out_dir):
    """Convert SDDMA output masks to binary ground truths.

    Every readable image of `in_dir` is turned into a mask that is 255 where
    the pixel is exactly pure green (B=0, G=255, R=0) and 0 elsewhere; the
    mask is saved in `out_dir` under the same file name.

    Fixes over the previous version:
      * each mask gets its own file inside `out_dir`; before, every mask was
        written to the path `out_dir` itself;
      * the image check no longer relies on `imghdr`, which was never imported
        (and is removed in Python 3.13): entries that OpenCV cannot decode,
        such as thumbs.db, make `cv2.imread` return None and are skipped.

    Args:
        in_dir: directory holding the SDDMA output images.
        out_dir: directory receiving the binary masks (created if missing).

    Raises:
        OSError: if a mask cannot be written.
    """
    os.makedirs(out_dir, exist_ok=True)

    # BGR colour that marks detected skin in the SDDMA output
    skin_bgr = (0, 255, 0)

    # loop mask files
    for im_basename in os.listdir(in_dir):
        im_path = os.path.join(in_dir, im_basename)
        if os.path.isdir(im_path):
            continue

        im = cv2.imread(im_path)
        if im is None:
            # not an image OpenCV can decode
            continue

        # keep only the pixels of exactly the selected colour
        mask = cv2.inRange(im, skin_bgr, skin_bgr)

        out_path = os.path.join(out_dir, im_basename)
        if not cv2.imwrite(out_path, mask):
            raise OSError(f'could not write {out_path}')

def is_better(value1, value2, mode: str):
    """Tell whether `value1` strictly beats `value2`.

    With mode 'upper' higher is better (`value1 > value2`); with any other
    mode lower is better (`value1 < value2`). Equal values never win.
    """
    return value1 > value2 if mode == 'upper' else value1 < value2

def bold_best(data: list, datas: list, base = False):
    maxv = {}
    # save the best values between the METHODS (skinny/bayes) for each metric and dataset combination
    for obj in data:
        o_m = obj['method']
        o_train = obj['train']
        o_test = obj['test']
        o_f1 = obj['f1']
        o_iou = obj['iou']
        o_dprs = obj['dprs']
        f1iou = float(o_f1.split(' ')[0]) - float(o_iou.split(' ')[0])
        obj['f1iou'] = round(f1iou, 4)

        print(obj)

        # they are cross predictions hence test != train
        if base == False and o_train == o_test:
            continue
        
        # for each table metric multirow group
        for f_name in ['f1', 'iou', 'dprs', 'f1iou']:
            fn_val = obj[f_name] # metric value of the current iteration
            fn_idformat = f'{f_name}_{o_train}_{o_test}' # ID format
            fnv = f'{fn_idformat}_v' # value of the max measurement
            fni = f'{fn_idformat}_i' # ID of the max measurement

            bmode = 'upper'
            if f_name == 'dprs' or f_name == 'f1iou':
                bmode = 'lower'

            # for each table column
            for trdata in datas:
                for tedata in datas:
                    if o_train == trdata and o_test == tedata:
                        # save best between methods

                        # if max does not exist, add its value and ID
                        if fnv not in maxv:
                            maxv[fnv] =  fn_val
                            maxv[fni] =  o_m
                        # if new max, save the measurement and its ID
                        elif f_name != 'f1iou' and is_better(float(fn_val.split(' ')[0]), float(maxv[fnv].split(' ')[0]), bmode):
                            maxv[fnv] =  fn_val
                            maxv[fni] =  o_m
                        elif f_name == 'f1iou':
                            if is_better(float(fn_val), float(maxv[fnv]), bmode):
                                maxv[fnv] =  fn_val
                                maxv[fni] =  o_m
    newdata = []
    # and now make them bold
    for obj in data:
        o_m = obj['method']
        o_train = obj['train']
        o_test = obj['test']
        o_f1 = obj['f1']
        o_iou = obj['iou']
        o_dprs = obj['dprs']
        f1iou = float(o_f1.split(' ')[0]) - float(o_iou.split(' ')[0])
        obj['f1iou'] = '{:.4f}'.format(f1iou) # round and zerofill

        for f_name in ['f1', 'iou', 'dprs', 'f1iou']:
            fn_val = obj[f_name] # metric value of the current iteration
            fn_idformat = f'{f_name}_{o_train}_{o_test}' # ID format
            fnv = f'{fn_idformat}_v' # value of the max measurement
            fni = f'{fn_idformat}_i' # ID of the max measurement

            #if maxv[fni] == fn_idformat:
            if fni in maxv and maxv[fni] == o_m:
                obj_formatted = '\\texttt{' + '\\textbf{' + str(obj[f_name]) + '}' + '}'
                obj[f_name] = obj_formatted # set bold and monospace
                newdata.append(obj)
            else:
                obj_formatted = '\\texttt{' + str(obj[f_name]) + '}'
                obj[f_name] = obj_formatted # set monospace
                newdata.append(obj)
    data = newdata
    print('newdata:')
    print(data)

    # change JSON format into a standalone data structure containing all table variables
    ff = {}
    for obj in data:
        o_m = obj['method']
        o_train = obj['train']
        o_test = obj['test']
        o_f1 = obj['f1']
        o_iou = obj['iou']
        o_dprs = obj['dprs']
        f1iou = obj['f1iou']

        ff[f'f1_{o_m}_{o_train}_{o_test}'] = o_f1
        ff[f'iou_{o_m}_{o_train}_{o_test}'] = o_iou
        ff[f'dprs_{o_m}_{o_train}_{o_test}'] = o_dprs
        ff[f'f1iou_{o_m}_{o_train}_{o_test}'] = f1iou
    
    print(ff)
    return ff

# data is a list of JSON items
# JSON item example: {"name":"ecu", "F1":".9123 +- 0.25", "IOU":".8744 +- 0.11"}
def get_latex_cross(data: list, datas = None):
    """Build the LaTeX tabular of the cross-dataset results.

    The table has one group of two rows (SKINNY, BAYES) for each of F1, IOU,
    Dprs and F1 - IOU, and one column for every ordered (train, test) pair of
    different datasets. Cells come from `bold_best`, so the best method of
    each column is in bold.

    When `datas` is not the default dataset list, results are looked up under
    the method names 'skinny_st' / 'bayes_st' instead of 'skinny' / 'bayes'.

    Changes over the previous version (output is identical for the default
    three datasets):
      * cells are looked up by full dataset name; before, they were keyed by
        the first letter of the dataset, so two datasets sharing an initial
        overwrote each other;
      * the column specification and the width of the training header follow
        the number of datasets instead of being fixed for three.

    Args:
        data: list of row dicts, see `bold_best`.
        datas: dataset names, in column order; defaults to
            ['ecu', 'hgr', 'schmugge'].

    Returns:
        The LaTeX source of the tabular, as a string.

    Raises:
        KeyError: if `data` lacks a (method, train, test) combination needed
            by the table.
    """
    default_datas = ['ecu', 'hgr', 'schmugge']
    if datas is None:
        datas = default_datas
    # any non-default dataset list selects the '<method>_st' results
    method_suffix = '' if datas == default_datas else '_st'

    ff = bold_best(data, datas)

    # number of test columns below each training dataset
    n_test = len(datas) - 1

    # (key used in `ff`, row label)
    metric_rows = [
        ('f1', 'F1 $\\uparrow$'),
        ('iou', 'IOU $\\uparrow$'),
        ('dprs', 'Dprs $\\downarrow$'),
        ('f1iou', 'F1 - IOU $\\downarrow$'),
    ]
    methods = ['skinny', 'bayes']

    # start building the body string
    tex_body = ''
    for metric_key, metric_label in metric_rows:
        # for each metric there are 2 lines (methods): Skinny and Bayes
        for row_idx, method in enumerate(methods):
            if row_idx == 0:
                # the first line of a group carries the metric label
                prefix = '\\multirow{2}{*}{{' + metric_label + '}}'
                # spacing between multirows (metrics)
                method_label = method.upper() + '\\rule{0pt}{14pt}'
            else:
                prefix = ''
                method_label = method.upper()

            tex_body += f'{prefix}& {method_label}'
            for ds_tr in datas:
                for ds_te in datas:
                    if ds_tr == ds_te:
                        continue
                    table_item = ff[f'{metric_key}_{method}{method_suffix}_{ds_tr}_{ds_te}']
                    tex_body += f' & {table_item}'
            tex_body += '\\\\'

    # string header: two label columns plus one column per (train, test) pair
    col_spec = 'cl' + 'c' * (len(datas) * n_test)
    tex_header = (
        '\n    \\begin{tabular}{' + col_spec + '}'
        + '\n    \\toprule'
        + '\n    \\multicolumn{1}{c}{} & \\multicolumn{1}{c}{\\head{Training}} '
        + '\n    '
    )

    # add first row: training datasets, each spanning its test columns
    for ds_tr in datas:
        tex_header += '& \\multicolumn{' + str(n_test) + '}{c}{\\head{' + ds_tr.upper() + '}} '
    tex_header += '\\\\'

    tex_header += '\\multicolumn{1}{c}{} & \\multicolumn{1}{c}{\\head{Testing}} '
    # add second row: test datasets, in the same order as the body cells
    for ds_tr in datas:
        for ds_te in datas:
            if ds_te != ds_tr:
                tex_header += ' & \\multicolumn{1}{c}{\\head{' + ds_te.upper() + '}} '
    tex_header += '\\\\'
    tex_header += '\\midrule'

    # string end
    tex_end = '\n    \\bottomrule\n    \\end{tabular}\n    '

    return tex_header + tex_body + tex_end

# data is a list of JSON items
# JSON item example: {"name":"ecu", "F1":".9123 +- 0.25", "IOU":".8744 +- 0.11"}
def get_latex_base(data: list, datas = None):
    """Build the LaTeX tabular of the single-dataset (train == test) results.

    The table has one row per method (SKINNY, BAYES, DYC) and, for every
    dataset, the three columns F1, IOU and Dprs. Cells come from `bold_best`
    (called in base mode), so the best method of each column is in bold.

    When `datas` is not the default dataset list, results are looked up under
    the method names 'skinny_st' / 'bayes_st' / 'dyc_st'.

    Changes over the previous version (output is identical for the default
    three datasets):
      * cells are looked up by full dataset name; before, they were keyed by
        the first letter of the dataset, so two datasets sharing an initial
        overwrote each other;
      * the column specification and the metric header row follow the number
        of datasets instead of being fixed for three.

    Args:
        data: list of row dicts, see `bold_best`.
        datas: dataset names, in column order; defaults to
            ['ecu', 'hgr', 'schmugge'].

    Returns:
        The LaTeX source of the tabular, as a string.

    Raises:
        KeyError: if `data` lacks a (method, dataset) combination needed by
            the table.
    """
    default_datas = ['ecu', 'hgr', 'schmugge']
    if datas is None:
        datas = default_datas
    # any non-default dataset list selects the '<method>_st' results
    method_suffix = '' if datas == default_datas else '_st'

    ff = bold_best(data, datas, True)

    metrics = ['f1', 'iou', 'dprs']
    methods = ['skinny', 'bayes', 'dyc']

    # start building the body string: one ROW per method...
    tex_body = ''
    for method in methods:
        tex_body += method.upper()
        # ...and one group of COLUMNS (one per metric) for each dataset
        for ds in datas:
            for metric in metrics:
                table_item = ff[f'{metric}_{method}{method_suffix}_{ds}_{ds}']
                tex_body += f' & {table_item}'
        tex_body += '\\\\'

    # string header: the method column plus one column per (dataset, metric)
    col_spec = 'l' + 'c' * (len(datas) * len(metrics))
    tex_header = (
        '\n    \\begin{tabular}{' + col_spec + '}'
        + '\n    \\toprule'
        + '\n    '
    )

    # add first row: dataset names, each spanning its metric columns
    for ds in datas:
        tex_header += '& \\multicolumn{' + str(len(metrics)) + '}{c}{\\head{' + ds.upper() + '}} '
    tex_header += '\\\\'

    # add second row: the metric labels, repeated once per dataset
    metric_heads = (
        '& \\multicolumn{1}{c}{\\head{F1 $\\uparrow$}}'
        ' & \\multicolumn{1}{c}{\\head{IOU $\\uparrow$}}'
        ' & \\multicolumn{1}{c}{\\head{Dprs $\\downarrow$}}'
    )
    tex_header += ''.join('\n    ' + metric_heads for _ in datas)
    tex_header += '\\\\\n    \\midrule\n    '

    # string end
    tex_end = '\n    \\bottomrule\n    \\end{tabular}\n    '

    return tex_header + tex_body + tex_end

def read_performance(perf_dir: str, n_observations: int = 5):
    """Summarize the execution times of a benchmark directory.

    `perf_dir` must contain the files 'bench0.txt' ... 'bench<n-1>.txt', one
    per observation. Every non-blank line is 'ori_path,execution_time' with
    the time in seconds. The mean and population standard deviation are
    printed for each observation, followed by the mean and population
    standard deviation of the observation means.

    Changes over the previous version:
      * the final statistics are computed from the exact observation means;
        before, the means were parsed back from their 6-decimal strings;
      * files are read inside a `with` block and blank lines are skipped;
      * the number of observations is a parameter (default 5, as before).

    Args:
        perf_dir: directory holding the benchmark files.
        n_observations: number of 'bench<i>.txt' files to read.

    Returns:
        The final summary as a string, '<mean> ± <std>' (mean with 6 decimals,
        std with 3).

    Raises:
        FileNotFoundError: if one of the benchmark files is missing.
        statistics.StatisticsError: if a file holds no timings or
            `n_observations` is 0.
    """
    csv_sep = ','

    # exact mean of every observation, used for the final statistics
    observation_means = []

    for i in range(n_observations):
        perf_file = os.path.join(perf_dir, f'bench{i}.txt')

        # read txt lines (as csv)
        with open(perf_file) as fh:
            entries = fh.read().splitlines()

        # second column is the execution time; blank lines carry no data
        timings = [float(entry.split(csv_sep)[1]) for entry in entries if entry.strip()]

        obs_mean = mean(timings)
        obs_std = pstdev(timings)
        obs_string = f'{obs_mean:.6f} ± {obs_std:.3f}' # round and zerofill

        observation_means.append(obs_mean)
        print(f'{perf_dir} at {i}: {obs_string}')

    # final mean of the observation means
    fin_mean = mean(observation_means)
    fin_std = pstdev(observation_means)
    fin_string = f'{fin_mean:.6f} ± {fin_std:.3f}' # round and zerofill

    print(f'{perf_dir} at FIN: {fin_string}\n')
    return fin_string


In [None]:
# Metrics

# false negatives weight more (cannot be fixed by post-processing whereas false positives can to a degree),
# choose metrics accordingly (maybe False negative rate?)
#
# The recall close to 1.0 effectively means false_negatives close to 0.0, which is what to want
# (precision_recall_curve)
#
# Fb measure (F1 generalized that can be weighted more on recall or precision)
# https://machinelearningmastery.com/fbeta-measure-for-machine-learning/


# Added to the denominators of the metrics below to prevent division by zero.
# It is far below float64 resolution for any denominator >= 1, so it only
# changes the result when the denominator is exactly 0 (the metric is then 0).
# The trailing value is a previously used setting.
smooth = 1e-20 #1e-07


# returns a dict that can be used as a LUT-table of the confusion matrix scores
# for scores info, see quick graph https://en.wikipedia.org/wiki/Precision_and_recall
# dtype casting is used to prevent overflow long_scalars
def confmat_scores(y_true, y_pred) -> dict:
    """Compute the confusion-matrix scores of a mask pair.

    All sums are accumulated as double to avoid integer overflow.

    Args:
        y_true: ground-truth mask (numpy array of 0/1 or bool values).
        y_pred: predicted mask, same shape as `y_true`.

    Returns:
        Dict usable as a look-up table by the metrics, with the keys
        'ap' (actual positives, TP + FN), 'an' (actual negatives, TN + FP),
        'se' (selected elements, TP + FP), 'tp', 'fp', 'tn' and 'fn'.
    """
    acc_type = 'double'

    # complements of the two masks
    inv_true = 1 - y_true
    inv_pred = 1 - y_pred

    actual_pos = np.sum(y_true, dtype=acc_type)
    actual_neg = np.sum(inv_true, dtype=acc_type)
    selected = np.sum(y_pred, dtype=acc_type)
    true_pos = np.sum(y_true * y_pred, dtype=acc_type)
    true_neg = np.sum(inv_true * inv_pred, dtype=acc_type)

    return {
        'ap': actual_pos,
        'an': actual_neg,
        'se': selected,
        'tp': true_pos,
        'fp': selected - true_pos,   # selected but not actually positive
        'tn': true_neg,
        'fn': actual_pos - true_pos, # actually positive but not selected
    }

def iou_old(y_true, y_pred) -> float:
    """Intersection over Union computed directly on the two masks.

    On boolean arrays `*` acts as a logical AND and `+` as a logical OR, and
    summing counts the True values. `smooth` keeps the denominator non-zero.
    """
    intersection = y_true * y_pred  # pixels positive in both masks
    either = y_true + y_pred        # pixels positive in at least one mask
    return intersection.sum() / (either.sum() + smooth)

# Intersection over Union
# can be re-expressed in terms of precision and recall
# https://tomkwok.com/posts/iou-vs-f1/
def iou(cs):
    """Intersection over Union from the confusion-matrix scores: TP / (TP + FP + FN).

    `cs` is a dict as returned by `confmat_scores`; `smooth` keeps the
    denominator non-zero.
    """
    true_pos = cs['tp']
    union = true_pos + cs['fp'] + cs['fn']
    return true_pos / (union + smooth)

# Recall (aliases: TruePositiveRate, Sensitivity)
# how many relevant items are selected?
def recall(cs):
    """Recall: true positives over actual positives, TP / (TP + FN).

    `cs` is a dict as returned by `confmat_scores`.
    """
    true_pos = cs['tp']
    actual_pos = cs['ap']
    return true_pos / (actual_pos + smooth)

# Specificity (aliases: TrueNegativeRate, Selectivity; equals 1 - FalsePositiveRate)
# how many negative elements are truly negative?
def specificity(cs):
    """Specificity (true negative rate): TN / (TN + FP).

    `cs` is a dict as returned by `confmat_scores`.
    """
    true_neg = cs['tn']
    actual_neg = cs['an']
    return true_neg / (actual_neg + smooth)

# how many selected items are relevant?
def precision(cs):
    """Precision: true positives over selected elements, TP / (TP + FP).

    `cs` is a dict as returned by `confmat_scores`.
    """
    true_pos = cs['tp']
    selected = cs['se']
    return true_pos / (selected + smooth)

# Fb-measure: recall is considered Beta(b) times important as precision
# F2 weights recall higher than precision, F.5 weights precision higher than recall
# Beta(b) is a positive real factor
def fb(cs, b = 1):
    """F-beta measure: recall is considered `b` times as important as precision.

    F2 weights recall higher than precision, F0.5 weights precision higher
    than recall.

    Args:
        cs: dict as returned by `confmat_scores`.
        b: positive real weighting factor (beta).
    """
    beta_sq = b**2
    pr = precision(cs)
    re = recall(cs)
    return (1 + beta_sq) * ((pr * re) / ((beta_sq * pr) + re + smooth))

# F1-score (aliases: F1-measure, F-score with Beta=1)
def f1(cs):
    """F1-score: the F-beta measure with beta = 1 (see `fb`)."""
    return fb(cs, b=1)

# F2-score
def f2(cs):
    """F2-score: the F-beta measure with beta = 2 (see `fb`)."""
    return fb(cs, b=2)

def dprs(cs):
    """Euclidean distance of (precision, recall, specificity) from the ideal (1, 1, 1).

    `cs` is a dict as returned by `confmat_scores`; lower is better.
    """
    shortfalls = ((1 - score_fn(cs))**2 for score_fn in (precision, recall, specificity))
    return math.sqrt(sum(shortfalls))

def f1_n(pr, re, sp):
    """F1-score from already computed precision `pr` and recall `re`.

    `sp` (specificity) is accepted only so that all "new average" metrics
    share the same signature; it is not used.
    """
    ratio = (pr * re) / (pr + re + smooth)
    return 2 * ratio

def dprs_n(pr, re, sp):
    """Euclidean distance of (pr, re, sp) from the ideal point (1, 1, 1).

    Takes already computed precision, recall and specificity; lower is better.
    """
    return math.sqrt(sum((1 - score)**2 for score in (pr, re, sp)))

# range is [-1 1]
def mcc(cs):
    """Matthews correlation coefficient from the confusion counts in `cs`.

    Reads cs['tp'], cs['fn'], cs['fp'] and cs['tn']. The degenerate confusion
    matrices for which the plain formula is undefined are handled as
    explained in https://doi.org/10.1186/s12864-019-6413-7

    Returns 1 or -1 when a single cell of the confusion matrix is non-zero,
    0.0 when a whole row or column is zero, and otherwise
    (tp * tn - fp * fn) / (sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)) + smooth),
    where `smooth` is a module-level constant added to the denominator.
    """
    # Work on Python floats: if the counts arrive as fixed-width NumPy
    # integers (not visible from here), the four-factor product below could
    # overflow for large images.
    tp, fn, fp, tn = (float(cs[key]) for key in ('tp', 'fn', 'fp', 'tn'))
    nz = sum(1 for count in (tp, fn, fp, tn) if count != 0)  # non-zero cells

    # fix 1: three cells are 0, all samples belong to one class
    if nz == 1:
        # they are either all correctly classified (tp or tn) ...
        if tp != 0 or tn != 0:
            return 1
        # ... or all incorrectly classified (fp or fn)
        return -1

    # fix 2: a whole row or column is zero while the two remaining cells
    # (one on each diagonal) are non-zero. MCC takes the indefinite form 0/0;
    # substituting the zeros with an arbitrarily small value gives the limit
    # 0, which is returned directly instead of relying on `smooth`.
    if nz == 2 and (tp != 0 or tn != 0) and (fp != 0 or fn != 0):
        return 0.0

    # calculate MCC
    num = tp * tn - fp * fn
    den = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return num / (den + smooth)

In [None]:
# Report the timing benchmark of every detector (only in 'performance' mode).
if bench_mode == 'performance':
    detectors = ['skinny', 'bayes', 'dyc']

    for det in detectors:
        # each detector's benchmark was unzipped into performance/bench_<detector>
        perf_dir = 'performance/bench_' + det
        read_performance(perf_dir)

performance/bench_skinny at 0: 0.807026 ± 0.064
performance/bench_skinny at 1: 0.797650 ± 0.008
performance/bench_skinny at 2: 0.811103 ± 0.010
performance/bench_skinny at 3: 0.911956 ± 0.242
performance/bench_skinny at 4: 0.805169 ± 0.008
performance/bench_skinny at FIN: 0.826581 ± 0.043

performance/bench_bayes at 0: 0.459174 ± 0.001
performance/bench_bayes at 1: 0.457149 ± 0.003
performance/bench_bayes at 2: 0.458998 ± 0.002
performance/bench_bayes at 3: 0.458094 ± 0.001
performance/bench_bayes at 4: 0.454253 ± 0.002
performance/bench_bayes at FIN: 0.457534 ± 0.002

performance/bench_dyc at 0: 0.007665 ± 0.000
performance/bench_dyc at 1: 0.007677 ± 0.000
performance/bench_dyc at 2: 0.007730 ± 0.000
performance/bench_dyc at 3: 0.007763 ± 0.000
performance/bench_dyc at 4: 0.007752 ± 0.000
performance/bench_dyc at FIN: 0.007717 ± 0.000



In [None]:
#  method can be: base, cross
method = 'cross'
# mode can be: normal, skintones
mode = 'skintones'

if mode == 'normal':
    db_list = ['ecu', 'hgr', 'schmugge']
elif mode == 'skintones':
    db_list = ['dark', 'medium', 'light']
else:
    # fail fast instead of continuing with an undefined (or stale) db_list
    raise ValueError(f"unknown mode {mode!r}: expected 'normal' or 'skintones'")

if method not in ('base', 'cross'):
    # fail fast instead of continuing with an undefined (or stale) db_paths
    raise ValueError(f"unknown method {method!r}: expected 'base' or 'cross'")

# in skintones mode every detector folder carries the '_st' suffix
dir_suffix = '_st' if mode == 'skintones' else ''

# paths resolving: db_paths collects one 'dataset/<detector>/<method>/<name>'
# folder per (database, detector) combination
db_paths = []
if method == 'base':
    detectors = ['skinny', 'bayes', 'dyc']
    for db in db_list:
        for sd in detectors:
            # for dyc the hgr folder is named 'hgr_small'; a separate name is
            # used so the loop variable is not overwritten, which would leak
            # into later detectors if `detectors` were ever reordered
            db_dir = 'hgr_small' if (db == 'hgr' and sd == 'dyc') else db
            db_paths.append(f'dataset/{sd}{dir_suffix}/base/{db_dir}')
elif method == 'cross':
    detectors = ['skinny', 'bayes']
    # not used in this cell; kept in case another cell reads them
    db_skinny = []
    db_bayes = []
    # every ordered (train, test) pair of distinct databases
    for db_tr in db_list:
        for db_te in db_list:
            if db_te == db_tr:
                continue
            for sd in detectors:
                db_paths.append(f'dataset/{sd}{dir_suffix}/cross/{db_tr}_on_{db_te}')



# metric functions evaluated for every dataset folder
metrics = [f1, iou, dprs]
# collects one table item (the value returned by print_pd_mean_old) per folder
json_table = []

# compute metrics
for ds in db_paths:
    y_path = os.path.join(ds, 'y') # '{dataset}/y'
    p_path = os.path.join(ds, 'p') # '{dataset}/p'
    
    # presumably 'y' holds the ground truths and 'p' the predictions -- confirm
    # against pd_metrics_old, which is defined elsewhere in the notebook
    rpd = pd_metrics_old(y_path, p_path, metrics)
    # second path component is the detector folder: 'dataset/skinny/...' -> 'skinny'
    skin_detector = ds.split('/')[1]

    if method == 'base':
        # base folders are named after a single database: append '_on_<db>' so
        # the description has the same '<train>_on_<test>' form as cross ones.
        # NOTE: must stay after y_path/p_path, which need the unmodified ds
        ds = ds + '_on_' + os.path.basename(ds)

    table_item = print_pd_mean_old(rpd, metrics, desc=ds, method=skin_detector)

    json_table.append(table_item)

# print the collected results through the LaTeX helper matching the method
# (any method other than 'base' falls through to the cross table)
if method == 'base':
    print(get_latex_base(json_table, db_list))
else:
    print(get_latex_cross(json_table, db_list))

# save JSON table
# out_table = open("metrics.json", "w")
# json.dump(json_table, out_table)
# out_table.close()


100%|██████████| 101/101 [00:00<00:00, 476.37it/s]
 56%|█████▋    | 57/101 [00:00<00:00, 567.02it/s]

  Found 101 matches
skinny_st: dark_on_medium
f1: 0.7300 ± 0.25
iou: 0.6279 ± 0.27
dprs: 0.3805 ± 0.33


100%|██████████| 101/101 [00:00<00:00, 412.20it/s]
 13%|█▎        | 53/409 [00:00<00:00, 529.10it/s]

  Found 101 matches
bayes_st: dark_on_medium
f1: 0.7928 ± 0.11
iou: 0.6668 ± 0.11
dprs: 0.3481 ± 0.16


100%|██████████| 409/409 [00:00<00:00, 514.25it/s]
  8%|▊         | 31/409 [00:00<00:01, 303.30it/s]

  Found 409 matches
skinny_st: dark_on_light
f1: 0.7262 ± 0.26
iou: 0.6276 ± 0.28
dprs: 0.3934 ± 0.34


100%|██████████| 409/409 [00:00<00:00, 438.01it/s]
100%|██████████| 27/27 [00:00<00:00, 506.75it/s]
100%|██████████| 27/27 [00:00<00:00, 231.80it/s]


  Found 409 matches
bayes_st: dark_on_light
f1: 0.7577 ± 0.12
iou: 0.6229 ± 0.13
dprs: 0.4679 ± 0.18
  Found 27 matches
skinny_st: medium_on_dark
f1: 0.8447 ± 0.13
iou: 0.7486 ± 0.15
dprs: 0.2326 ± 0.17
  Found 27 matches
bayes_st: medium_on_dark
f1: 0.5628 ± 0.14
iou: 0.4042 ± 0.13
dprs: 0.6802 ± 0.20


100%|██████████| 409/409 [00:00<00:00, 519.37it/s]
  8%|▊         | 31/409 [00:00<00:01, 306.79it/s]

  Found 409 matches
skinny_st: medium_on_light
f1: 0.8904 ± 0.14
iou: 0.8214 ± 0.16
dprs: 0.1692 ± 0.18


100%|██████████| 409/409 [00:00<00:00, 453.80it/s]
100%|██████████| 27/27 [00:00<00:00, 555.22it/s]
100%|██████████| 27/27 [00:00<00:00, 251.33it/s]
  0%|          | 0/101 [00:00<?, ?it/s]

  Found 409 matches
bayes_st: medium_on_light
f1: 0.7032 ± 0.14
iou: 0.5571 ± 0.14
dprs: 0.5376 ± 0.23
  Found 27 matches
skinny_st: light_on_dark
f1: 0.7660 ± 0.17
iou: 0.6496 ± 0.21
dprs: 0.3402 ± 0.21
  Found 27 matches
bayes_st: light_on_dark
f1: 0.5293 ± 0.20
iou: 0.3852 ± 0.19
dprs: 0.6361 ± 0.22


100%|██████████| 101/101 [00:00<00:00, 511.69it/s]
 58%|█████▊    | 59/101 [00:00<00:00, 584.84it/s]

  Found 101 matches
skinny_st: light_on_medium
f1: 0.9229 ± 0.11
iou: 0.8705 ± 0.13
dprs: 0.1192 ± 0.16


100%|██████████| 101/101 [00:00<00:00, 434.41it/s]

  Found 101 matches
bayes_st: light_on_medium
f1: 0.7853 ± 0.11
iou: 0.6574 ± 0.12
dprs: 0.3199 ± 0.16
{'method': 'skinny_st', 'train': 'dark', 'test': 'medium', 'f1': '0.7300 ± 0.25', 'iou': '0.6279 ± 0.27', 'dprs': '0.3805 ± 0.33', 'f1iou': 0.1021}
{'method': 'bayes_st', 'train': 'dark', 'test': 'medium', 'f1': '0.7928 ± 0.11', 'iou': '0.6668 ± 0.11', 'dprs': '0.3481 ± 0.16', 'f1iou': 0.126}
{'method': 'skinny_st', 'train': 'dark', 'test': 'light', 'f1': '0.7262 ± 0.26', 'iou': '0.6276 ± 0.28', 'dprs': '0.3934 ± 0.34', 'f1iou': 0.0986}
{'method': 'bayes_st', 'train': 'dark', 'test': 'light', 'f1': '0.7577 ± 0.12', 'iou': '0.6229 ± 0.13', 'dprs': '0.4679 ± 0.18', 'f1iou': 0.1348}
{'method': 'skinny_st', 'train': 'medium', 'test': 'dark', 'f1': '0.8447 ± 0.13', 'iou': '0.7486 ± 0.15', 'dprs': '0.2326 ± 0.17', 'f1iou': 0.0961}
{'method': 'bayes_st', 'train': 'medium', 'test': 'dark', 'f1': '0.5628 ± 0.14', 'iou': '0.4042 ± 0.13', 'dprs': '0.6802 ± 0.20', 'f1iou': 0.1586}
{'method': 'ski




In [None]:
# imported here so json.dump below does not depend on a later cell being run
import json

# paths resolving
# method can be: skinny_ecu, skinny_hgr, skinny_sch, skinny, skinny_st,
#                dyc, bayes, bayes_st
# NOTE: `method` is not assigned in this cell; set it before running the cell
if method == 'skinny_ecu':
#    data_dir = 'dataset/20210428-225749'
#    datasets = ['ecu_on_abd-skin', 'ecu_on_ecu', 'ecu_on_hgr_small', 'ecu_on_pratheepan',
#                'ecu_on_uchile', 'ecu_on_vdm']
    data_dir = 'dataset/20210513-203631'
    datasets = ['ecu_on_ecu', 'ecu_on_hgr_small', 'ecu_on_schmugge']
elif method == 'skinny_hgr':
    data_dir = 'dataset/20210513-190432'
    datasets = ['hgr_small_on_ecu', 'hgr_small_on_hgr_small', 'hgr_small_on_schmugge']
elif method == 'skinny_sch':
    data_dir = 'dataset/20210512-205005'
    datasets = ['Schmugge_on_ecu', 'Schmugge_on_hgr_small', 'Schmugge_on_schmugge']
elif method == 'skinny':
    data_dir = 'dataset/skinny'
    datasets = ['ecu_on_ecu', 'ecu_on_hgr_small', 'ecu_on_schmugge',
                'hgr_small_on_ecu', 'hgr_small_on_hgr_small', 'hgr_small_on_schmugge',
                'Schmugge_on_ecu', 'Schmugge_on_hgr_small', 'Schmugge_on_schmugge']
elif method == 'skinny_st':
    data_dir = 'dataset/skinnyst'
    datasets = ['dark_on_dark', 'dark_on_medium', 'dark_on_light',
                'medium_on_dark', 'medium_on_medium', 'medium_on_light',
                'light_on_dark', 'light_on_medium', 'light_on_light']
elif method == 'dyc':
    data_dir = 'dataset/predicted'
    #datasets = ['abd-skin', 'ECU', 'HGR_small', 'HGR_big',
    #            'Pratheepan', 'Uchile', 'Schmugge', 'VDM', 'VDM_test']
    datasets = ['ecu', 'hgr_small', 'schmugge']
# elif method == 'bayes':
#     data_dir = 'dataset'
#     datasets = ['ECU_on_ECU', 'ECU_on_HGR_small', 'ECU_on_Schmugge',
#                 'HGR_small_on_ECU', 'HGR_small_on_HGR_small', 'HGR_small_on_Schmugge',
#                 'Schmugge_on_ECU', 'Schmugge_on_HGR_small', 'Schmugge_on_Schmugge']
elif method == 'bayes':
    # NOTE(review): same directory as 'bayes_st' below although the dataset
    # names differ -- looks like a copy-paste leftover, confirm the folder
    data_dir = 'dataset/bayesst'
    datasets = ['ECU_on_ECU', 'ECU_on_HGR_small', 'ECU_on_Schmugge',
                'HGR_small_on_ECU', 'HGR_small_on_HGR_small', 'HGR_small_on_Schmugge',
                'Schmugge_on_ECU', 'Schmugge_on_HGR_small', 'Schmugge_on_Schmugge']
elif method == 'bayes_st':
    data_dir = 'dataset/bayesst'
    datasets = ['dark_on_dark', 'dark_on_medium', 'dark_on_light',
                'medium_on_dark', 'medium_on_medium', 'medium_on_light',
                'light_on_dark', 'light_on_medium', 'light_on_light']
else:
    # fail fast: without this, data_dir/datasets would be undefined or, worse,
    # silently reused from a previous run of the cell
    raise ValueError(
        f"unknown method {method!r}: expected one of 'skinny_ecu', 'skinny_hgr', "
        "'skinny_sch', 'skinny', 'skinny_st', 'dyc', 'bayes', 'bayes_st'"
    )


#metrics = [mcc, iou, f1, f2, dprs, recall, precision, specificity]
#metrics = [f1_n, dprs_n, recall, precision, specificity]
metrics = [f1, iou, dprs]
json_table = []

# compute metrics
for ds in datasets:
    # these three methods use mixed-case folder names, which are kept as
    # listed; for every other method the name is lowercased
    if method not in ('skinny_sch', 'bayes', 'skinny'):
        ds = ds.lower() # in case it wasn't lowercase

    #y_path = 'dataset/20210428-225749/ecu_on_vdm/y' # 'dataset/y'
    #p_path = 'dataset/20210428-225749/ecu_on_vdm/p' # 'dataset/pred'

    y_path = os.path.join(data_dir, ds, 'y') # 'dataset/y'
    p_path = os.path.join(data_dir, ds, 'p') # 'dataset/pred'

    rpd = pd_metrics_old(y_path, p_path, metrics)
    table_item = print_pd_mean_old(rpd, metrics, desc=ds, method=method)
    #rpd = pd_metrics(y_path, p_path, metrics)
    #print_pd_mean(rpd, metrics, desc=ds)

    json_table.append(table_item)

# save JSON table; the context manager closes the file even if json.dump raises
with open("metrics.json", "w") as out_table:
    json.dump(json_table, out_table)


100%|██████████| 2000/2000 [00:15<00:00, 131.46it/s]
  7%|▋         | 16/234 [00:00<00:01, 158.85it/s]

  Found 2000 matches
ecu_on_ecu
f1: 0.9133 ± 0.08
iou: 0.8489 ± 0.12
dprs: 0.1333 ± 0.12


100%|██████████| 234/234 [00:01<00:00, 146.55it/s]
 47%|████▋     | 59/126 [00:00<00:00, 589.52it/s]

  Found 234 matches
ecu_on_hgr_small
f1: 0.9284 ± 0.11
iou: 0.8818 ± 0.15
dprs: 0.1134 ± 0.15


100%|██████████| 126/126 [00:00<00:00, 619.15it/s]
  1%|          | 13/2000 [00:00<00:16, 124.02it/s]

  Found 126 matches
ecu_on_schmugge
f1: 0.4862 ± 0.41
iou: 0.4192 ± 0.37
dprs: 0.7238 ± 0.56


100%|██████████| 2000/2000 [00:14<00:00, 133.60it/s]
  7%|▋         | 16/234 [00:00<00:01, 150.44it/s]

  Found 2000 matches
hgr_small_on_ecu
f1: 0.7513 ± 0.19
iou: 0.6339 ± 0.21
dprs: 0.3588 ± 0.25


100%|██████████| 234/234 [00:01<00:00, 150.99it/s]
 44%|████▎     | 55/126 [00:00<00:00, 547.68it/s]

  Found 234 matches
hgr_small_on_hgr_small
f1: 0.9848 ± 0.02
iou: 0.9705 ± 0.03
dprs: 0.0251 ± 0.03


100%|██████████| 126/126 [00:00<00:00, 604.09it/s]
  1%|          | 13/2000 [00:00<00:16, 123.63it/s]

  Found 126 matches
hgr_small_on_schmugge
f1: 0.2671 ± 0.31
iou: 0.1969 ± 0.24
dprs: 1.0009 ± 0.43


100%|██████████| 2000/2000 [00:16<00:00, 122.16it/s]
  7%|▋         | 16/234 [00:00<00:01, 150.12it/s]

  Found 2000 matches
Schmugge_on_ecu
f1: 0.6337 ± 0.21
iou: 0.4963 ± 0.22
dprs: 0.5336 ± 0.27


100%|██████████| 234/234 [00:01<00:00, 138.14it/s]
 44%|████▎     | 55/126 [00:00<00:00, 543.25it/s]

  Found 234 matches
Schmugge_on_hgr_small
f1: 0.7879 ± 0.21
iou: 0.6933 ± 0.25
dprs: 0.3185 ± 0.29


100%|██████████| 126/126 [00:00<00:00, 588.31it/s]

  Found 126 matches
Schmugge_on_schmugge
f1: 0.6121 ± 0.45
iou: 0.585 ± 0.44
dprs: 0.552 ± 0.64





ADDITIONAL INFO

In [None]:
# Check f1() and mcc() against the worked use cases of the reference paper.
#
# Summary: F1 doesn't care much about TN and can therefore report an
#          over-optimistic score where MCC does not.
#
# Every entry is (title to print, confusion counts). In all cases below
# ap = tp + fn, an = fp + tn and se = tp + fp.
use_cases = [
    # 91 sick patients, 9 healthy individuals; positives are predicted well
    # (tp=90) but no negative is recognised (tn=0).
    # F1 measures an almost perfect score, MCC instead measures a bad score
    # expected: F1 0.95    MCC -0.03
    ('Use Case A1: Positively imbalanced dataset',
     {'ap': 91, 'an': 9, 'se': 99, 'tp': 90, 'fp': 9, 'tn': 0, 'fn': 1}),

    # 75 positives, 25 negatives; the classifier misses the positives (tp=5)
    # but recognises the negatives (tn=19).
    # Both metrics measure a bad score
    # expected: F1 0.12    MCC -0.24
    ('\nUse Case A2: Positively imbalanced dataset',
     {'ap': 75, 'an': 25, 'se': 11, 'tp': 5, 'fp': 6, 'tn': 19, 'fn': 70}),

    # 50 positives, 50 negatives; positives are predicted (tp=47), negatives
    # are not (tn=5).
    # F1 measures a good score, MCC doesn't
    # expected: F1 0.66    MCC 0.07
    ('\nUse Case B1: Balanced dataset',
     {'ap': 50, 'an': 50, 'se': 92, 'tp': 47, 'fp': 45, 'tn': 5, 'fn': 3}),

    # 50 positives, 50 negatives; positives are missed (tp=10), negatives are
    # predicted (tn=46).
    # Both scores are low
    # expected: F1 0.31    MCC 0.17
    ('\nUse Case B2: Balanced dataset',
     {'ap': 50, 'an': 50, 'se': 14, 'tp': 10, 'fp': 4, 'tn': 46, 'fn': 40}),

    # 10 positives, 90 negatives; nearly everything is predicted positive
    # (fp=89, tn=1).
    # Both scores give a bad measure
    # expected: F1 0.17    MCC -0.19
    ('\nUse Case C1: Negatively imbalanced dataset',
     {'ap': 10, 'an': 90, 'se': 98, 'tp': 9, 'fp': 89, 'tn': 1, 'fn': 1}),

    # 11 positives, 89 negatives; positives are missed (tp=2), negatives are
    # predicted (tn=88).
    # Both scores give a bad measure
    # expected: F1 0.29    MCC 0.31
    ('\nUse Case C2: Negatively imbalanced dataset',
     {'ap': 11, 'an': 89, 'se': 3, 'tp': 2, 'fp': 1, 'tn': 88, 'fn': 9}),
]

for title, data in use_cases:
    print(title)
    print(f'f1: {round(f1(data), 2)}\nmcc: {round(mcc(data), 2)}')

Use Case A1: Positively imbalanced dataset
f1: 0.95
mcc: -0.03

Use Case A2: Positively imbalanced dataset
f1: 0.12
mcc: -0.24

Use Case B1: Balanced dataset
f1: 0.66
mcc: 0.07

Use Case B2: Balanced dataset
f1: 0.31
mcc: 0.17

Use Case C1: Negatively imbalanced dataset
f1: 0.17
mcc: -0.19

Use Case C2: Negatively imbalanced dataset
f1: 0.29
mcc: 0.31


In [None]:
import json

# Serialize `reso` to data.json (`reso` must already exist in the kernel; it
# is not defined in this cell). The context manager closes the file even if
# json.dump raises.
with open("data.json", "w") as a_file:
    json.dump(reso, a_file)