# Mount Google Drive and get predictions

Modify the code to use it locally.  
The code was aimed at producing performance tables featured in the thesis, hence it is very specific and rigid

---
The paths of the predictions must respect the following:  
- Folder structure must be `detector_name/pred_type/pred_folder` and be placed in the `dataset` folder  
Where `detector_name` refers to the old names of the approaches and may assume: `skinny`, `bayes` (probabilistic), `dyc` (nbrancati)  
Where `pred_type` is either: `base` or `cross`  
Where `pred_folder` is the folder containing the actual predictions (within `p`, `x`, `y` subfolders); eg. `ecu` (representing a base prediction), or `hgr_on_schmugge` (representing a cross prediction)  
- If evaluating skin tones, append `_st` to `detector_name`
- `dyc` must use `hgr_small` instead of simply `hgr` in the base predictions: `dataset/dyc/base/hgr_small`
- All path names must be in lowercase

Some complete examples are the following:  
```
dataset/skinny/base/ecu
dataset/skinny/base/hgr
dataset/skinny/base/schmugge

dataset/bayes/cross/ecu_on_hgr
dataset/bayes/cross/ecu_on_schmugge
dataset/bayes/cross/hgr_on_ecu
dataset/bayes/cross/hgr_on_schmugge
dataset/bayes/cross/schmugge_on_ecu
dataset/bayes/cross/schmugge_on_hgr

dataset/dyc_st/base/dark
dataset/dyc_st/base/light
dataset/dyc_st/base/medium
```

---
The paths of performance dumps must respect the following:
- Folder structure must be `bench_detector_name` and be placed in the `performance` folder  
Where `detector_name` refers to the old names of the approaches and may assume: `skinny`, `bayes` (probabilistic), `dyc` (nbrancati)  
- Dumps must be named `benchN.txt` where N is an integer starting from `0`
- All path names must be in lowercase

Some complete examples are the following:  
```
bench_bayes/bench0.txt
bench_bayes/bench1.txt
..
bench_bayes/bench4.txt

bench_skinny/bench0.txt
bench_dyc/bench0.txt
```




In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
bench_mode = 'dataset' # 'dataset' or 'performance'

if bench_mode == 'dataset':
    !rm -rf dataset

    !unzip drive/MyDrive/testing/skinny/20210521-204405_p.zip -d dataset # skinny full
    !mv dataset/20210521-204405/* dataset/skinny
    !unzip drive/MyDrive/testing/skinny/20210523-192002_p.zip -d dataset # skinny skintones full
    !mv dataset/20210523-192002/* dataset/skinny_st

    !unzip drive/MyDrive/testing/bayes/bayes.zip -d dataset # probabilistic full
    !unzip drive/MyDrive/testing/bayes/bayes_st.zip -d dataset # probabilistic skintones full

    !unzip drive/MyDrive/testing/dyc/dyc.zip -d dataset # nbrancati
    !unzip drive/MyDrive/testing/dyc/dyc_st.zip -d dataset # nbrancati
elif bench_mode == 'performance':
    !rm -rf performance

    !unzip drive/MyDrive/testing/benchmark/bench_skinny.zip -d performance
    !unzip drive/MyDrive/testing/benchmark/bench_bayes.zip -d performance
    !unzip drive/MyDrive/testing/benchmark/bench_dyc.zip -d performance


# Define Metrics

In [8]:
import math

import numpy as np

#  Measure the goodness of the classifier by comparing predictions with groundtruths

#  In skin detection, false negatives may weight more as they cannot be fixed by post-processing
#  whereas false positives can to a degree


# Prevent zero division
smooth = 1e-20


def confmat_scores(y_true, y_pred) -> dict:
    '''
    Return a dict that can be used as a LUT-table of the confusion matrix scores
    For info on each score, see https://en.wikipedia.org/wiki/Precision_and_recall
    '''
    data = {}
    cast_type = 'double'

    neg_y_true = 1 - y_true
    neg_y_pred = 1 - y_pred

    # dtype casting is used to prevent overflow long_scalars
    AP = np.sum(y_true, dtype=cast_type) # TP + FN
    AN = np.sum(neg_y_true, dtype='double') # TN + FP
    SE = np.sum(y_pred, dtype='double') #TP + FP
    TP = np.sum(y_true * y_pred, dtype='double')
    FP = SE - TP
    TN = np.sum(neg_y_true * neg_y_pred, dtype='double')
    FN = AP - TP

    data['ap'] = AP
    data['an'] = AN
    data['se'] = SE
    data['tp'] = TP
    data['fp'] = FP
    data['tn'] = TN
    data['fn'] = FN

    return data

def iou_logical(y_true, y_pred) -> float:
    '''Intersection over Union'''
    overlap = y_true * y_pred # Logical AND
    union =   y_true + y_pred # Logical OR
    # Note that matrices are bool due to '> threshold' in load_images(),
    # it they were not, for union must to use bitwise OR '|'
    
    # Treats "True" as 1, sums number of Trues
    # in overlap and union and divides
    IOU = overlap.sum() / (union.sum() + smooth) 
    return IOU

def iou(cs):
    '''
    Intersection over Union can be re-expressed in terms of precision and recall
    Credit to https://tomkwok.com/posts/iou-vs-f1/
    '''
    return cs['tp'] / (cs['tp'] + cs['fp'] + cs['fn'] + smooth)

def recall(cs):
    '''
    Recall (aliases: TruePositiveRate, Sensitivity)

    How many relevant items are selected?
    '''
    return cs['tp'] / (cs['ap'] + smooth)

def specificity(cs):
    '''
    Specificity (aliases: FalsePositiveRate)

    How many negative elements are truly negative?
    '''
    return cs['tn'] / (cs['an'] + smooth)

def precision(cs):
    '''How many selected items are relevant?'''
    return cs['tp'] / (cs['se'] + smooth)

def fb(cs, b = 1):
    '''
    Fb-measure: recall is considered Beta(b) times important as precision.
    For example, F2 weights recall higher than precision, while
    F0.5 weights precision higher than recall.
    
    Beta(b) is a positive real factor
    '''
    precision_score = precision(cs)
    recall_score = recall(cs)
    return (1 + b**2) * ((precision_score * recall_score) / ((b**2 * precision_score) + recall_score + smooth))

def f1(cs):
    '''F1-score (aliases: F1-measure, F-score with Beta=1)'''
    return fb(cs)

def f2(cs):
    '''F2-score'''
    return fb(cs, 2)

def f1_medium(pr, re, sp):
    '''
    F1-score (aliases: F1-measure, F-score with Beta=1)
    ---
    Implementation suited for medium averaging
    '''
    return 2 * ((pr * re) / (pr + re + smooth))

def dprs(cs):
    '''
    Measures the Euclidean distance between the segmentation,
    represented by the point (PR, RE, SP), and the ground truth, the ideal point(1, 1, 1),
    hence lower values are better.
    Note: it considers all three of Precision, Recall, and Specificity.

    Can be higher than 1 in extremely bad cases

    ---
    Intawong, K., Scuturici, M., & Miguet, S. (2013). A New Pixel-Based Quality Measure
    for Segmentation Algorithms Integrating Precision, Recall and Specificity.
    Computer Analysis of Images and Patterns, 188-195.
    https://doi.org/10.1007/978-3-642-40261-6_22
    '''
    a = (1 - precision(cs))**2
    b = (1 - recall(cs))**2
    c = (1 - specificity(cs))**2
    
    return math.sqrt(a + b + c)

def dprs_medium(pr, re, sp):
    '''
    Measures the Euclidean distance between the segmentation,
    represented by the point (PR, RE, SP), and the ground truth, the ideal point(1, 1, 1),
    hence lower values are better.
    Note: it considers all three of Precision, Recall, and Specificity.

    Can be higher than 1 in extremely bad cases

    ---
    Intawong, K., Scuturici, M., & Miguet, S. (2013). A New Pixel-Based Quality Measure
    for Segmentation Algorithms Integrating Precision, Recall and Specificity.
    Computer Analysis of Images and Patterns, 188-195.
    https://doi.org/10.1007/978-3-642-40261-6_22

    ---
    Implementation suited for medium averaging
    '''
    a = (1 - pr)**2
    b = (1 - re)**2
    c = (1 - sp)**2
    return math.sqrt(a + b + c)

# Note: the function has not been tested thoroughly and needs to be verified
# range is [-1 1]
def mcc(cs):
    '''
    Common statistical measures can dangerously show overoptimistic inflated results,
    especially on imbalanced datasets.

    The Matthews correlation coefficient (MCC), instead, is a more reliable statistical
    rate which produces a high score only if the prediction obtained good results
    in all of the four confusion matrix categories (true positives, false negatives,
    true negatives, and false positives), proportionally both to the size of positive
    elements and the size of negative elements in the dataset.

    Range of values is [-1 1]

    ---
    Chicco, D., & Jurman, G. (2020). The advantages of the Matthews correlation
    coefficient (MCC) over F1 score and accuracy in binary classification evaluation.
    BMC Genomics, 21(1).
    https://doi.org/10.1186/s12864-019-6413-7

    ---
    Info on F1 vs MCC from the paper analysis is simulated in
    `tests/test_metrics.py -> mcc_unittest()`
    '''

    # The following fixes prevent where MCC could not be calculated normally
    M = np.matrix([[cs['tp'], cs['fn']], [cs['fp'], cs['tn']]]) # define confusion matrix
    nz = np.count_nonzero(M) # get non-zero elements of the matrix
    
    # Fix 1
    if nz == 1: # 3 elements of M are 0
        # all samples of the dataset belong to 1 class
        if cs['tp'] != 0 or cs['tn'] != 0: # they either are all correctly classified
            return 1
        else:
            return -1 # or all uncorrectly classified
    
    # Fix 2
    # Where a row or a column of M are zero while the other true entries
    # are non zero, MCC takes the indefinite form 0/0
    if nz == 2 and np.sum(np.abs(M.diagonal())) != 0 and np.sum(np.abs(np.diag(np.fliplr(M)))) != 0:
        # replace the zero elements with an arbitrary small value 
        M[M == 0] = smooth
    
    # Calculate MCC
    num = cs['tp'] * cs['tn'] - cs['fp'] * cs['fn']
    den = math.sqrt((cs['tp'] + cs['fp']) * (cs['tp'] + cs['fn']) * (cs['tn'] + cs['fp']) * (cs['tn'] + cs['fn']))

    return num / (den + smooth)


# Define Metric Utils

In [9]:
import os
from statistics import mean, pstdev

import numpy as np
from PIL import Image
from tqdm import tqdm


def load_images(gt_path: str, pred_path: str, threshold: int = 128):
    '''Load images as numpy boolean arrays'''
    # Load as grayscale uint8
    gt_gray = np.array(Image.open(gt_path).convert('L'))
    pred_gray = np.array(Image.open(pred_path).convert('L'))
    # Binarize and convert to bool
    gt_bool = gt_gray > threshold
    pred_bool = pred_gray > threshold
    return gt_bool, pred_bool

# MEDIUM AVERAGE: calculate average only of medium-scores (PRecision, REcall, SPecificity)
# Note: y and p files must have the same filename
def calc_metrics(gt_dir: str, pred_dir: str, metric_fns: list, threshold: int = 128) -> list:
    '''
    Compute all the given metric functions over all images in a folder
    by considering a single image at a time and comparing
    its groundtruth to its prediction map

    Return a list of dicts.
    Each dict represents the metrics measurement on a single image

    Medium-averaging metric functions get skipped as they cannot be computed on a single image
    '''
    out = []

    # Loop images
    for y_filename in tqdm(os.listdir(gt_dir)):
        y_path = os.path.join(gt_dir, y_filename)
        p_filename = os.path.splitext(y_filename)[0] + '.png'
        p_path = os.path.join(pred_dir, p_filename) # pred are always PNG

        # Start adding current image data into a dict structure
        idata = {}
        idata['y'] = y_path
        idata['p'] = p_path

        # Load images from paths and apply threshold to binarize
        # the skin probability maps obtained from predictions
        y_true, y_pred = load_images(y_path, p_path, threshold)
        # Calculate confusion matrix scores for current image
        confmat = confmat_scores(y_true, y_pred)

        # Calculate metrics for current image and add them to the dict structure
        for metric_fn in metric_fns:
            f_name = metric_fn.__name__
            f_argcount = metric_fn.__code__.co_argcount # amount of argument in function definition

            if f_name.endswith('_medium'): # is a medium-average metric, must not compute now
                continue

            # only one args: the metric only uses confusion matrix scores and is LUT-optimized
            if f_argcount == 1:
                idata[f_name] = metric_fn(confmat)
            # two args: confusion matrix scores aren't enough
            else:
                idata[f_name] = metric_fn(y_true, y_pred)
        
        # Update the final list with current image data
        out.append(idata)
    
    print(f'  Found {len(out)} matches')
    return out


def calc_mean_metrics(measurements_list: list, metric_fns: list, desc: str, method: str) -> None:
    '''
    Print human-readable metrics results data
    
    Process a list of single measurements and
    return a dict containing each metric mean and stdev values
    
    PLEASE NOTE
    ---
    The mean F1 value calculated by summing all experiments F1 and dividing by N elements
    is different than the mean calculated by applying the F1 formula on average REcall and PRecision!

    #### Medium Averaging
    In the code I call 'medium average' the metrics in which I average only the
    medium-scores (PRecision, REcall, SPecificity)
    and in the end I calculate the functions of the 'final' metrics (F1, dprs) using these averages

    'Medium' as in their formulas they use the basic metrics (the ones in a confusion matrix:
    True Positives, False Negatives, ..), while 'final' metrics
    use the medium metrics themselves in their formulas.

    By following this logic, 'final' averaging means calculating the final metrics
    at the first step, along with the medium metrics, for each image and averaging
    these values on the batch of images.

    In a mathematical way:
    f1: 2 * precision * recall / (precision + recall)
    f1_finavg: avg(f1)
    f1_medavg: 2 * avg(precision) * avg(recall) / (avg(precision) + avg(recall))

    '''
    print(desc)
    res = {}
    # Insert datasets and method data into the resulting dict
    res['method'] = method
    try:
        desc = desc.split(' ')[0] # remove hash string
        desc = os.path.normpath(desc) # remove trailing slash
        desc = os.path.basename(desc) # get prediction folder name
        desc = desc.lower().replace('_small', '') # lower case and rename HGR_small to HGR
        dss = desc.split('_on_') # split training and predicting datasets
        ds_tr = dss[0]
        ds_te = dss[1]
        res['train'] = ds_tr
        res['test'] = ds_te
    except: # eg. for testing
        pass

    medium_avg = []

    for metric_fn in metric_fns:
        f_name = metric_fn.__name__
        f_score = -99

        if f_name.endswith('_medium'): # is a medium-average metric
            medium_avg.append(metric_fn)
            continue
        else:
            f_data = [ d[f_name] for d in measurements_list ]
            f_mean = mean(f_data)
            f_mean = '{:.4f}'.format(f_mean) # round to 4 decimals and zerofill
            f_std = pstdev(f_data)
            f_std = '{:.2f}'.format(f_std) # round to 2 decimals and zerofill
            #print(f'{f_name}: {f_mean} ± {f_std}')
            # add each metric data to the dict
            res[f_name] = f'{f_mean} ± {f_std}'
    
    # The 'medium average' metrics average only the intermediate-scores (PRecision, REcall, SPecificity)
    # and then calculate the functions of the final metrics
    for metric_fn in medium_avg:
        f_name = metric_fn.__name__
        # Calculate the medium-average score using medium-scores averages
        pr = float(res['precision'].split(' ')[0])
        re = float(res['recall'].split(' ')[0])
        sp = float(res['specificity'].split(' ')[0])
        f_score = metric_fn(pr, re, sp)
        f_score = '{:.4f}'.format(f_score)
        res[f_name] = f_score
    
    for key, value in sorted(res.items()):
        print(f'{key}: {value}')
    
    return res

def read_performance(perf_dir: str):
    '''Read inference time from performance benchmark files, and print it'''
    csv_sep = ','

    # will contain the final mean between each observation's mean
    observations_means = []

    # do the mean of each observation
    for i in range(5000):
        perf_filename = f'bench{i}.txt'
        perf_file = os.path.join(perf_dir, perf_filename)

        if not os.path.isfile(perf_file):
            break

        # read txt lines (as csv)
        file2c = open(perf_file)
        doubles = file2c.read().splitlines()
        file2c.close()

        intra_obs_timelist = []
        for entry in doubles: # ori_path, execution_time(s)
            ori_path = entry.split(csv_sep)[0]
            execution_time = entry.split(csv_sep)[1]
            intra_obs_timelist.append(float(execution_time))
        
        obs_mean = mean(intra_obs_timelist)
        obs_mean = '{:.6f}'.format(obs_mean) # round and zerofill
        obs_std = pstdev(intra_obs_timelist)
        obs_std = '{:.3f}'.format(obs_std) # round and zerofill

        obs_string = f'{obs_mean} ± {obs_std}'

        observations_means.append(obs_string)
        print(f'{perf_dir} at {i}: {obs_string}')
    
    # get the means from observation means, without the std
    obs_mean_values = []
    for entry in observations_means:
        obs_mean_values.append(float(entry.split(' ')[0]))
    
    # do the final mean of the observation means
    fin_mean = mean(obs_mean_values)
    fin_mean = '{:.6f}'.format(fin_mean) # round and zerofill
    fin_std = pstdev(obs_mean_values)
    fin_std = '{:.3f}'.format(fin_std) # round and zerofill

    fin_string = f'{fin_mean} ± {fin_std}'

    print(f'{perf_dir} at FIN: {fin_string}\n')
    return fin_string


# Define Latex Utils

In [10]:
def print_latex(cross_preds: bool, skintones: bool, db_paths: list):
    '''
    Function used to print latex table featured in thesis

    Requires predictions to follow a rigid folder tree structure
    '''
    if not skintones:
        db_list = ['ecu', 'hgr', 'schmugge']
    else:
        db_list = ['dark', 'medium', 'light']

    # paths resolving
    if not cross_preds:
        detectors = ['skinny', 'probabilistic', 'dyc']
        db_paths = []
        for db in db_list:
            for sd in detectors:
                if db == 'hgr' and sd == 'dyc':
                    db = 'hgr_small'
                if skintones:
                    sd += '_st'
                db_paths.append(f'dataset/{sd}/base/{db}')
    else:
        detectors = ['skinny', 'bayes']
        db_paths = []
        for db_tr in db_list:
            for db_te in db_list:
                if db_te != db_tr:
                    for sd in detectors:
                        if skintones:
                            sd += '_st'
                        db_paths.append(f'dataset/{sd}/cross/{db_tr}_on_{db_te}')

    metrics = [f1, iou, dprs]
    json_table = []

    # compute metrics
    for ds in db_paths:
        y_path = os.path.join(ds, 'y') # '{dataset}/y'
        p_path = os.path.join(ds, 'p') # '{dataset}/p'
        
        singles = calc_metrics(y_path, p_path, metrics)
        # 'dataset/skinny/...'
        skin_detector = ds.split('/')[1]

        if not cross_preds:
            ds = ds + '_on_' + os.path.basename(ds)

        table_item = calc_mean_metrics(singles, metrics, desc=ds, method=skin_detector)

        json_table.append(table_item)

    if not cross_preds:
        print(get_latex_base(json_table, db_list))
    else:
        print(get_latex_cross(json_table, db_list))


def is_better(value1, value2, mode: str):
    '''Return whether a value is better than another value'''
    if mode == 'upper':
        return value1 > value2
    else:
        return value1 < value2

def bold_best(data: list, datas: list, base = False):
    '''Make the best values bold'''
    maxv = {}
    # Save the best values between the METHODS (skinny/probabilistic)
    # for each metric and dataset combination
    for obj in data:
        o_m = obj['method']
        o_train = obj['train']
        o_test = obj['test']
        o_f1 = obj['f1']
        o_iou = obj['iou']
        o_dprs = obj['dprs']
        f1iou = float(o_f1.split(' ')[0]) - float(o_iou.split(' ')[0])
        obj['f1iou'] = round(f1iou, 4)

        print(obj)

        # They are cross predictions hence test != train
        if base == False and o_train == o_test:
            continue
        
        # For each table metric multirow group
        for f_name in ['f1', 'iou', 'dprs', 'f1iou']:
            fn_val = obj[f_name] # metric value of the current iteration
            fn_idformat = f'{f_name}_{o_train}_{o_test}' # ID format
            fnv = f'{fn_idformat}_v' # value of the max measurement
            fni = f'{fn_idformat}_i' # ID of the max measurement

            bmode = 'upper'
            if f_name == 'dprs' or f_name == 'f1iou':
                bmode = 'lower'

            # For each table column
            for trdata in datas:
                for tedata in datas:
                    if o_train == trdata and o_test == tedata:
                        # Save best between methods

                        # if max does not exist, add its value and ID
                        if fnv not in maxv:
                            maxv[fnv] =  fn_val
                            maxv[fni] =  o_m
                        # if new max, save the measurement and its ID
                        elif f_name != 'f1iou' and is_better(float(fn_val.split(' ')[0]), float(maxv[fnv].split(' ')[0]), bmode):
                            maxv[fnv] =  fn_val
                            maxv[fni] =  o_m
                        elif f_name == 'f1iou':
                            if is_better(float(fn_val), float(maxv[fnv]), bmode):
                                maxv[fnv] =  fn_val
                                maxv[fni] =  o_m
    newdata = []
    # And now make them bold
    for obj in data:
        o_m = obj['method']
        o_train = obj['train']
        o_test = obj['test']
        o_f1 = obj['f1']
        o_iou = obj['iou']
        o_dprs = obj['dprs']
        f1iou = float(o_f1.split(' ')[0]) - float(o_iou.split(' ')[0])
        obj['f1iou'] = '{:.4f}'.format(f1iou) # round and zerofill

        for f_name in ['f1', 'iou', 'dprs', 'f1iou']:
            fn_val = obj[f_name] # metric value of the current iteration
            fn_idformat = f'{f_name}_{o_train}_{o_test}' # ID format
            fnv = f'{fn_idformat}_v' # value of the max measurement
            fni = f'{fn_idformat}_i' # ID of the max measurement

            if fni in maxv and maxv[fni] == o_m:
                obj_formatted = '\\texttt{' + '\\textbf{' + str(obj[f_name]) + '}' + '}'
                obj[f_name] = obj_formatted # set bold and monospace
                newdata.append(obj)
            else:
                obj_formatted = '\\texttt{' + str(obj[f_name]) + '}'
                obj[f_name] = obj_formatted # set monospace
                newdata.append(obj)
    data = newdata
    print('newdata:')
    print(data)

    # Change JSON format into a standalone data structure containing all table variables
    ff = {}
    for obj in data:
        o_m = obj['method']
        o_train = obj['train']
        o_test = obj['test']
        o_f1 = obj['f1']
        o_iou = obj['iou']
        o_dprs = obj['dprs']
        f1iou = obj['f1iou']

        ff[f'f1_{o_m}_{o_train}_{o_test}'] = o_f1
        ff[f'iou_{o_m}_{o_train}_{o_test}'] = o_iou
        ff[f'dprs_{o_m}_{o_train}_{o_test}'] = o_dprs
        ff[f'f1iou_{o_m}_{o_train}_{o_test}'] = f1iou
    
    print(ff)
    return ff

# Data is a list of JSON items
# JSON item example: {"name":"ecu", "F1":".9123 +- 0.25", "IOU":".8744 +- 0.11"}
def get_latex_cross(data: list, datas = None):
    '''Return latex table containing cross-datasets metrics measurements'''
    tex_body = ''

    if datas == None:
        datas = ['ecu', 'hgr', 'schmugge']

    ff = bold_best(data, datas)

    # Start building the body string
    tex_ms = ['F1 $\\uparrow$', 'IOU $\\uparrow$', 'Dprs $\\downarrow$', 'F1 - IOU $\\downarrow$']
    i = 2
    for tm in tex_ms:
        mns = tm.split(' ')

        if len(mns) > 2: # f1 - iou
            mn = 'f1iou'
        else:
            mn = mns[0].lower()
        
        # For each metrics there are 2 lines(methods): Skinny and Probabilistic
        for j in range(2):
            pfix = ''

            if j == 0:
                met = 'skinny'
                mf = met[0].lower()
                metf = met.upper() + '\\rule{0pt}{14pt}' # spacing between multirows (metrics)
                metric_w_arrow = tm
                pfix = f'''\\multirow{{2}}{{*}}{{{{{metric_w_arrow}}}}}'''
            elif j == 1:
                met = 'bayes'
                mf = met[0].lower()
                metf = met.upper()
            
            if datas != ['ecu', 'hgr', 'schmugge']:
                met = f'{met}_st'

            # Another data struct to gather all items necessary for writing a table line
            tmp = {}
            datas_startletter = []
            for ds_tr in datas:
                datas_startletter.append(ds_tr[0].lower())
                for ds_te in datas:
                    if ds_tr == ds_te:
                        continue
                    tmp[f'{mf}_{ds_tr[0].lower()}{ds_te[0].lower()}'] = ff[f'{mn}_{met}_{ds_tr}_{ds_te}']

            tex_body += f'{pfix}& {metf}'
            for letter_tr in datas_startletter:
                for letter_te in datas_startletter:
                    if letter_tr != letter_te:
                        table_item = tmp[f'{mf}_{letter_tr}{letter_te}']
                        tex_body += f' & {table_item}'
            tex_body += '\\\\'
    
    # String header
    tex_header = r'''
    \begin{tabular}{clcccccc}
    \toprule
    \multicolumn{1}{c}{} & \multicolumn{1}{c}{\head{Training}} 
    '''
    
    # Add first row
    for dss in datas:
        tex_header += r'& \multicolumn{2}{c}{\head{' + dss.upper() + '}} '
    tex_header += r'\\'

    tex_header += r'\multicolumn{1}{c}{} & \multicolumn{1}{c}{\head{Testing}} '
    # Add second row
    for dss in datas:
        for dssd in datas:
            if dssd != dss:
                tex_header += r' & \multicolumn{1}{c}{\head{' + dssd.upper() + '}} '
    tex_header += r'\\'
    tex_header += r'\midrule'

    # String end
    tex_end = r'''
    \bottomrule
    \end{tabular}
    '''

    tex = tex_header + tex_body + tex_end
    return tex

# data is a list of JSON items
# JSON item example: {"name":"ecu", "F1":".9123 +- 0.25", "IOU":".8744 +- 0.11"}
def get_latex_base(data: list, datas = None):
    '''Return latex table containing base-datasets metrics measurements'''
    tex_body = ''

    if datas == None:
        datas = ['ecu', 'hgr', 'schmugge']

    ff = bold_best(data, datas, True)

    # Start building the body string
    metrics = ['f1', 'iou', 'dprs']
    
    # At first loop ROWS
    # for each metrics there are 2 lines(methods): Skinny and Bayes, DYC
    for j in range(3):
        if j == 0:
            met = 'skinny'
        elif j == 1:
            met = 'bayes'
        else:
            met = 'dyc'
        metf = met.upper()
        mf = met[0].lower()

        if datas != ['ecu', 'hgr', 'schmugge']:
            met = f'{met}_st'

        # Another data struct to gather all items necessary for writing a table line
        tmp = {}
        # Then loop COLUMNS
        datas_startletter = []
        for ds in datas:
            datas_startletter.append(ds[0].lower())
            for tm in metrics:
                tmp[f'{mf}_{ds[0].lower()}{tm}'] = ff[f'{tm}_{met}_{ds}_{ds}']

        # m is method
        # eh = ecu_on_hgr, es = ecu_on_schmugge, ...
        tex_body += f'{metf}'
        for letter in datas_startletter:
            for tmm in metrics:
                table_item = tmp[f'{mf}_{letter}{tmm}']
                tex_body += f' & {table_item}'
        tex_body += '\\\\'
    
    # String header
    tex_header = r'''
    \begin{tabular}{lccccccccc}
    \toprule
    '''
    
    # Add first row
    for dss in datas:
        tex_header += r'& \multicolumn{3}{c}{\head{' + dss.upper() + '}} '
    tex_header += r'\\'
    
    tex_header += r'''
    & \multicolumn{1}{c}{\head{F1 $\uparrow$}} & \multicolumn{1}{c}{\head{IOU $\uparrow$}} & \multicolumn{1}{c}{\head{Dprs $\downarrow$}}
    & \multicolumn{1}{c}{\head{F1 $\uparrow$}} & \multicolumn{1}{c}{\head{IOU $\uparrow$}} & \multicolumn{1}{c}{\head{Dprs $\downarrow$}}
    & \multicolumn{1}{c}{\head{F1 $\uparrow$}} & \multicolumn{1}{c}{\head{IOU $\uparrow$}} & \multicolumn{1}{c}{\head{Dprs $\downarrow$}}\\
    \midrule
    '''

    # String end
    tex_end = r'''
    \bottomrule
    \end{tabular}
    '''

    tex = tex_header + tex_body + tex_end
    return tex

# Run Measurements

In [17]:
#  Print skin detectors' inference times
if bench_mode == 'performance':
    detectors = ['skinny', 'bayes', 'dyc']

    for det in detectors:
        perf_dir = f'performance/bench_{det}'
        read_performance(perf_dir)

performance/bench_skinny at 0: 0.807026 ± 0.064
performance/bench_skinny at 1: 0.797650 ± 0.008
performance/bench_skinny at 2: 0.811103 ± 0.010
performance/bench_skinny at 3: 0.911956 ± 0.242
performance/bench_skinny at 4: 0.805169 ± 0.008
performance/bench_skinny at FIN: 0.826581 ± 0.043

performance/bench_bayes at 0: 0.459174 ± 0.001
performance/bench_bayes at 1: 0.457149 ± 0.003
performance/bench_bayes at 2: 0.458998 ± 0.002
performance/bench_bayes at 3: 0.458094 ± 0.001
performance/bench_bayes at 4: 0.454253 ± 0.002
performance/bench_bayes at FIN: 0.457534 ± 0.002

performance/bench_dyc at 0: 0.007665 ± 0.000
performance/bench_dyc at 1: 0.007677 ± 0.000
performance/bench_dyc at 2: 0.007730 ± 0.000
performance/bench_dyc at 3: 0.007763 ± 0.000
performance/bench_dyc at 4: 0.007752 ± 0.000
performance/bench_dyc at FIN: 0.007717 ± 0.000



In [14]:
import json


#  Settings

#  Method can be: base, cross
method = 'cross'
#  Mode can be: normal, skintones
mode = 'normal'
#  Whether to dump results to a json file
dump_to_file = False



#  Print skin detectors' performance on datasets
if bench_mode == 'dataset':
    #  Resolve datasets
    if mode == 'normal':
        db_list = ['ecu', 'hgr', 'schmugge']
    elif mode == 'skintones':
        db_list = ['dark', 'medium', 'light']


    #  Resolve paths
    if method == 'base':
        detectors = ['skinny', 'bayes', 'dyc']
        db_paths = []
        for db in db_list:
            for sd in detectors:
                if db == 'hgr' and sd == 'dyc':
                    db = 'hgr_small'
                if mode == 'skintones':
                    sd += '_st'
                db_paths.append(f'dataset/{sd}/base/{db}')
    elif method == 'cross':
        detectors = ['skinny', 'bayes']
        db_skinny = []
        db_bayes = []
        db_paths = []
        for db_tr in db_list:
            for db_te in db_list:
                if db_te != db_tr:
                    for sd in detectors:
                        if mode == 'skintones':
                            sd += '_st'
                        db_paths.append(f'dataset/{sd}/cross/{db_tr}_on_{db_te}')


    #  Compute metrics
    metrics = [f1, iou, dprs] #  metrics featured in the thesis
    json_table = []
    for ds in db_paths:
        y_path = os.path.join(ds, 'y') # '{dataset}/y'
        p_path = os.path.join(ds, 'p') # '{dataset}/p'
        
        singles = calc_metrics(y_path, p_path, metrics)
        # 'dataset/skinny/...'
        skin_detector = ds.split('/')[1]

        if method == 'base':
            ds = ds + '_on_' + os.path.basename(ds)

        avg = calc_mean_metrics(singles, metrics, desc=ds, method=skin_detector)
        json_table.append(avg)

    if method == 'base':
        print(get_latex_base(json_table, db_list))
    else:
        print(get_latex_cross(json_table, db_list))


    #  Save JSON table
    if dump_to_file == True:
        out_table = open("metrics.json", "w")
        json.dump(json_table, out_table)
        out_table.close()


100%|██████████| 1558/1558 [00:11<00:00, 138.69it/s]


  Found 1558 matches
dataset/skinny/cross/ecu_on_hgr
dprs: 0.1098 ± 0.15
f1: 0.9308 ± 0.11
iou: 0.8851 ± 0.15
method: skinny
test: hgr
train: ecu


100%|██████████| 1558/1558 [00:11<00:00, 134.27it/s]


  Found 1558 matches
dataset/bayes/cross/ecu_on_hgr
dprs: 0.5701 ± 0.29
f1: 0.5577 ± 0.29
iou: 0.4393 ± 0.27
method: bayes
test: hgr
train: ecu


100%|██████████| 840/840 [00:01<00:00, 444.59it/s]


  Found 840 matches
dataset/skinny/cross/ecu_on_schmugge
dprs: 0.7570 ± 0.56
f1: 0.4625 ± 0.41
iou: 0.3986 ± 0.37
method: skinny
test: schmugge
train: ecu


100%|██████████| 840/840 [00:01<00:00, 553.16it/s]


  Found 840 matches
dataset/bayes/cross/ecu_on_schmugge
dprs: 1.0477 ± 0.35
f1: 0.3319 ± 0.28
iou: 0.2346 ± 0.21
method: bayes
test: schmugge
train: ecu


100%|██████████| 3998/3998 [00:32<00:00, 123.01it/s]


  Found 3998 matches
dataset/skinny/cross/hgr_on_ecu
dprs: 0.3913 ± 0.26
f1: 0.7252 ± 0.20
iou: 0.6038 ± 0.22
method: skinny
test: ecu
train: hgr


100%|██████████| 3998/3998 [00:56<00:00, 70.51it/s]


  Found 3998 matches
dataset/bayes/cross/hgr_on_ecu
dprs: 0.8830 ± 0.23
f1: 0.4279 ± 0.19
iou: 0.2929 ± 0.17
method: bayes
test: ecu
train: hgr


100%|██████████| 840/840 [00:02<00:00, 408.11it/s]


  Found 840 matches
dataset/skinny/cross/hgr_on_schmugge
dprs: 0.9695 ± 0.44
f1: 0.2918 ± 0.31
iou: 0.2168 ± 0.25
method: skinny
test: schmugge
train: hgr


100%|██████████| 840/840 [00:01<00:00, 520.08it/s]


  Found 840 matches
dataset/bayes/cross/hgr_on_schmugge
dprs: 1.0219 ± 0.42
f1: 0.4000 ± 0.32
iou: 0.2981 ± 0.24
method: bayes
test: schmugge
train: hgr


100%|██████████| 3998/3998 [00:35<00:00, 111.94it/s]


  Found 3998 matches
dataset/skinny/cross/schmugge_on_ecu
dprs: 0.5537 ± 0.27
f1: 0.6133 ± 0.21
iou: 0.4754 ± 0.22
method: skinny
test: ecu
train: schmugge


100%|██████████| 3998/3998 [00:52<00:00, 75.98it/s]


  Found 3998 matches
dataset/bayes/cross/schmugge_on_ecu
dprs: 0.7542 ± 0.30
f1: 0.4638 ± 0.23
iou: 0.3318 ± 0.20
method: bayes
test: ecu
train: schmugge


100%|██████████| 1558/1558 [00:12<00:00, 124.88it/s]


  Found 1558 matches
dataset/skinny/cross/schmugge_on_hgr
dprs: 0.2846 ± 0.27
f1: 0.8106 ± 0.19
iou: 0.7191 ± 0.23
method: skinny
test: hgr
train: schmugge


100%|██████████| 1558/1558 [00:12<00:00, 124.06it/s]


  Found 1558 matches
dataset/bayes/cross/schmugge_on_hgr
dprs: 0.6523 ± 0.27
f1: 0.5060 ± 0.25
iou: 0.3752 ± 0.22
method: bayes
test: hgr
train: schmugge
{'method': 'skinny', 'train': 'ecu', 'test': 'hgr', 'f1': '0.9308 ± 0.11', 'iou': '0.8851 ± 0.15', 'dprs': '0.1098 ± 0.15', 'f1iou': 0.0457}
{'method': 'bayes', 'train': 'ecu', 'test': 'hgr', 'f1': '0.5577 ± 0.29', 'iou': '0.4393 ± 0.27', 'dprs': '0.5701 ± 0.29', 'f1iou': 0.1184}
{'method': 'skinny', 'train': 'ecu', 'test': 'schmugge', 'f1': '0.4625 ± 0.41', 'iou': '0.3986 ± 0.37', 'dprs': '0.7570 ± 0.56', 'f1iou': 0.0639}
{'method': 'bayes', 'train': 'ecu', 'test': 'schmugge', 'f1': '0.3319 ± 0.28', 'iou': '0.2346 ± 0.21', 'dprs': '1.0477 ± 0.35', 'f1iou': 0.0973}
{'method': 'skinny', 'train': 'hgr', 'test': 'ecu', 'f1': '0.7252 ± 0.20', 'iou': '0.6038 ± 0.22', 'dprs': '0.3913 ± 0.26', 'f1iou': 0.1214}
{'method': 'bayes', 'train': 'hgr', 'test': 'ecu', 'f1': '0.4279 ± 0.19', 'iou': '0.2929 ± 0.17', 'dprs': '0.8830 ± 0.23', 'f1iou': 0

# Unittest Metrics

In [15]:
import unittest

import numpy as np


class TestMetrics(unittest.TestCase):
    '''Unit testing for metrics measurements'''


    def metrics_unittesting(self):
        # singles
        y_true = np.array([[1,0,0],[0,1,0],[0,0,1]]) > 0 # > 0 to cast as bool
        y_pred = np.array([[1,0,0],[1,0,0],[0,0,0]]) > 0
        tp = 1
        tn = 5
        fp = 1
        fn = 2
        cs = confmat_scores(y_true, y_pred)
        pr = tp / (tp+fp) # 0.5
        re = tp / (tp+fn) # 0.33
        sp = tn / (tn+fp) # 0.83
        f1_ = 2*re*pr / (re+pr)
        overlap = y_true * y_pred
        union =   y_true | y_pred # with bitwise or it would work even without casting matrix as bool
        iou_ = overlap.sum() / (union.sum())
        a = (1 - pr)**2 # 0.25
        b = (1 - re)**2 # 0.4489
        c = (1 - sp)**2 # 0.0289
        dprs_ = math.sqrt(a + b + c)
        #           3               /  V        2       *    3      *     6     *    7
        mcc_ = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        print(cs)
        self.assertEqual(round(pr, 2), 0.5)
        self.assertEqual(round(pr, 2), round(precision(cs), 2), 'pr not equal to its function')
        self.assertEqual(round(re, 2), 0.33)
        self.assertEqual(round(re, 2), round(recall(cs), 2), 're not equal to its function')
        self.assertEqual(round(sp, 2), 0.83)
        self.assertEqual(round(sp, 2), round(specificity(cs), 2), 'sp not equal to its function')
        self.assertEqual(round(f1_, 2), 0.40)
        self.assertEqual(round(f1_, 2), round(f1(cs), 2), 'f1 not equal to its function')
        self.assertEqual(round(iou_, 2), 0.25)
        self.assertEqual(round(iou_, 2), round(iou(cs), 2), 'iou not equal to its function')
        self.assertEqual(round(iou_, 2), round(iou_logical(y_true, y_pred), 2), 'iou not equal to its function (alt)')
        self.assertEqual(round(dprs_, 2), 0.85)
        self.assertEqual(round(dprs_, 2), round(dprs(cs), 2), 'dprs not equal to its function')
        self.assertEqual(round(mcc_, 2), 0.19)
        self.assertEqual(round(mcc_, 2), round(mcc(cs), 2), 'mcc not equal to its function')
        # medium avg
        #metrics = [f1_medium, f1, f2, iou, iou_logical, dprs_medium, dprs, mcc, recall, precision, specificity]
        #rpd = pd_metrics(docs_y_path, docs_p_path, metrics)
        #res = print_pd_mean(rpd, metrics, desc='unit testing')
        pr_1 = 0.51
        pr_2 = 0.82
        pr_avg = (pr_1 + pr_2) /2   # 0.67
        re_1 = 0.61
        re_2 = 0.45
        re_avg = (re_1 + re_2) /2   # 0.53
        sp_1 = 0.14
        sp_2 = 0.62
        sp_avg = (sp_1 + sp_2) /2   # 0.38
        f1_med_avg = pr_avg * re_avg * 2 / (pr_avg + re_avg)
        a_ = (1 - pr_avg)**2 # 0.10
        b_ = (1 - re_avg)**2 # 0.22
        c_ = (1 - sp_avg)**2 # 0.38
        dprs_med_avg = math.sqrt(a_ + b_ + c_)
        self.assertEqual(round(f1_med_avg, 2), 0.59)
        self.assertEqual(round(f1_med_avg, 2), round(f1_medium(pr_avg, re_avg, sp_avg), 2), 'f1-medium not equal to its function')
        self.assertEqual(round(dprs_med_avg, 2), 0.85)
        self.assertEqual(round(dprs_med_avg, 2), round(dprs_medium(pr_avg, re_avg, sp_avg), 2), 'dprs-medium not equal to its function')

    def mcc_unittest(self):
        '''
        Unit testing based on MCC's paper analysis
        
        The analysis shows how F1 doesn't care much about TN and could signal
        over-optimistic data to the classifier
        '''
        # Use Case A1: Positively imbalanced dataset
        data = {}
        data['ap'] = 91   # 91 sick patients
        data['an'] = 9    # 9 healthy individuals
        data['se'] = 99
        data['tp'] = 90   # algorithm is good at predicting positive data
        data['fp'] = 9
        data['tn'] = 0
        data['fn'] = 1    # algorithm is bad at predicting negative data
        # F1 measures an almost perfect score, MCC instead measures a bad score
        # F1 0.95    MCC -0.03
        f1_ = round(f1(data), 2)
        mcc_ = round(mcc(data), 2)
        self.assertEqual(f1_, 0.95)
        self.assertEqual(mcc_, -0.03)

        # Use Case A2: Positively imbalanced dataset
        data = {}
        data['ap'] = 75   # 75 positives
        data['an'] = 25   # 25 negatives
        data['se'] = 11
        data['tp'] = 5    # classifier unable to predict positives
        data['fp'] = 6
        data['tn'] = 19   # classifier was able to predict negatives
        data['fn'] = 70
        # In this case both the metrics measure a bad score
        # F1 0.12    MCC -0.24
        f1_ = round(f1(data), 2)
        mcc_ = round(mcc(data), 2)
        self.assertEqual(f1_, 0.12)
        self.assertEqual(mcc_, -0.24)

        # Use Case B1: Balanced dataset
        data = {}
        data['ap'] = 50   # 50 positives
        data['an'] = 50   # 50 negatives
        data['se'] = 92
        data['tp'] = 47   # classifier able to predict positives
        data['fp'] = 45
        data['tn'] = 5    # classifier was unable to predict negatives
        data['fn'] = 3
        # F1 measures a good score, MCC doesn't
        # F1 0.66    MCC 0.07
        f1_ = round(f1(data), 2)
        mcc_ = round(mcc(data), 2)
        self.assertEqual(f1_, 0.66)
        self.assertEqual(mcc_, 0.07)

        # Use Case B2: Balanced dataset
        data = {}
        data['ap'] = 50   # 50 positives
        data['an'] = 50   # 50 negatives
        data['se'] = 14
        data['tp'] = 10   # classifier was unable to predict positives
        data['fp'] = 4
        data['tn'] = 46    # classifier able to predict negatives
        data['fn'] = 40
        # F1 measures a good score, MCC doesn't
        # F1 0.31    MCC 0.17
        f1_ = round(f1(data), 2)
        mcc_ = round(mcc(data), 2)
        self.assertEqual(f1_, 0.31)
        self.assertEqual(mcc_, 0.17)

        # Use Case C1: Negatively imbalanced dataset
        data = {}
        data['ap'] = 10   # 10 positives
        data['an'] = 90   # 90 negatives
        data['se'] = 98
        data['tp'] = 9    # classifier was unable to predict positives
        data['fp'] = 89
        data['tn'] = 1    # classifier able to predict negatives
        data['fn'] = 1
        # Both the scores gives bad measure
        # F1 0.17    MCC -0.19
        f1_ = round(f1(data), 2)
        mcc_ = round(mcc(data), 2)
        self.assertEqual(f1_, 0.17)
        self.assertEqual(mcc_, -0.19)

        # Use Case C2: Negatively imbalanced dataset
        data = {}
        data['ap'] = 11   # 10 positives
        data['an'] = 89   # 89 negatives
        data['se'] = 3
        data['tp'] = 2   # classifier was unable to predict positives
        data['fp'] = 1
        data['tn'] = 88    # classifier able to predict negatives
        data['fn'] = 9
        # Both the scores gives bad measure
        # F1 0.29    MCC 0.31
        f1_ = round(f1(data), 2)
        mcc_ = round(mcc(data), 2)
        self.assertEqual(f1_, 0.29)
        self.assertEqual(mcc_, 0.31)


unittest.main(argv=[''], verbosity=2, exit=False)


----------------------------------------------------------------------
Ran 0 tests in 0.000s

OK


<unittest.main.TestProgram at 0x7fc77c0e4110>