In [336]:
import pandas as pd
import numpy as np
import logging
logging.getLogger('mlscorecheck').setLevel(logging.ERROR)
from mlscorecheck.check import check_1_testset_no_kfold_scores

In [337]:
# Accumulator for one summary dict per reviewed paper; turned into a DataFrame at the end.
results = []

In [338]:
# Test-set class counts handed to the consistency checks ('p' and 'n' entries).
isic2016 = {'p': 75, 'n': 304}
# presumably the melanoma-vs-rest task, with the two other classes (393 + 90) as negatives — TODO confirm
isic2017m = {'p': 117, 'n': 393 + 90}
# presumably the seborrheic-keratosis-vs-rest task, with 393 + 117 negatives — TODO confirm
isic2017sk = {'p': 90, 'n': 393 + 117}

# Automated melanoma recognition in dermoscopy images via very deep residual networks 

doi: 10.1109/TMI.2016.2642839

dataset: ISBI 2016

In [339]:
# Scores copied from the paper as a space-separated table; the first line is the header.
table = """Method AC AUC AP SE SP
CUMED 0.855 0.804 0.637 0.507 0.941
GTDL 0.813 0.802 0.619 0.573 0.872
BF-TB 0.834 0.826 0.598 0.320 0.961
ThrunLab 0.786 0.796 0.563 0.667 0.816
JordanYap 0.844 0.775 0.559 0.240 0.993
HaebeomLee 0.821 0.793 0.555 0.200 0.974
GT-DL1 0.815 0.813 0.552 0.467 0.901
GT-DL2 0.681 0.800 0.545 0.787 0.655
SebastienPARIS 0.731 0.793 0.542 0.773 0.720
USYD-BMIT 0.599 0.780 0.537 0.853 0.536"""

# Split off the header so it becomes the column labels directly, instead of being
# stored as a data row and dropped afterwards (which left a stray columns-axis name).
header, *rows = (line.split(' ') for line in table.split('\n'))

# Keep only the three scores used by the consistency check, indexed by method name.
# AUC and AP are not used downstream, so they are never converted.
df = (
    pd.DataFrame(rows, columns=header)
    .set_index('Method')
    .rename(columns={'AC': 'acc', 'SE': 'sens', 'SP': 'spec'})
    [['acc', 'sens', 'spec']]
    .astype(float)
)

In [340]:
# Display the parsed score table as the cell output.
df

Unnamed: 0_level_0,acc,sens,spec
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CUMED,0.855,0.507,0.941
GTDL,0.813,0.573,0.872
BF-TB,0.834,0.32,0.961
ThrunLab,0.786,0.667,0.816
JordanYap,0.844,0.24,0.993
HaebeomLee,0.821,0.2,0.974
GT-DL1,0.815,0.467,0.901
GT-DL2,0.681,0.787,0.655
SebastienPARIS,0.731,0.773,0.72
USYD-BMIT,0.599,0.853,0.536


In [341]:
# Check every method's (acc, sens, spec) triplet against the isic2016 test set
# and count how many are flagged as inconsistent.
n_inconsistent = 0
for idx, row in df.iterrows():
    res = check_1_testset_no_kfold_scores(testset=isic2016, scores=row, eps=1e-3)
    flagged = res['inconsistency']
    print(idx, flagged) if not flagged else (n_inconsistent := n_inconsistent + 1, print(idx, flagged))

CUMED False
GTDL False
BF-TB False
ThrunLab False
JordanYap False
HaebeomLee False
GT-DL1 False
GT-DL2 False
SebastienPARIS False
USYD-BMIT False


In [342]:
# Summary entry for the first paper: counts come from the check loop over `df`.
results.append(dict(
    key='skin0',
    citations=991,
    dataset='isic2016',
    epsilon=1e-3,
    scores=['acc', 'sens', 'spec'],
    n_scores=len(df),
    n_inconsistent=n_inconsistent,
    remark=None,
))

# Automatic Skin Lesion Segmentation Using Deep Fully Convolutional Networks with Jaccard Distance

doi: 10.1109/TMI.2017.2695227

The paper does not carry out classification, only segmentation.


In [343]:
# Summary entry without counts: nothing was checked for this paper (see remark).
results.append(dict(
    key='skin1',
    citations=603,
    dataset='isic2016',
    epsilon=1e-3,
    scores=['acc', 'sens', 'spec'],
    n_scores=None,
    n_inconsistent=None,
    remark='The paper is about image segmentation performance.',
))

# Deep learning ensembles for melanoma recognition in dermoscopy images

doi: 10.1147/JRD.2017.2708299

dataset: ISBI 2016

In [344]:
# First group of reported (acc, sens, spec) triplets, one row per reported result (13 rows).
df0 = pd.DataFrame({'acc': [0.755, 0.81, 0.781, 0.834, 0.834, 0.807, 0.805, 0.855, 0.855, 0.836, 0.77, 0.726, 0.752],
                    'sens': [0.627, 0.72, 0.707, 0.533, 0.52, 0.693, 0.693, 0.547, 0.507, 0.253, 0.72, 0.693, 0.64],
                    'spec': [0.796, 0.832, 0.799, 0.9079, 0.9112, 0.836, 0.832, 0.931, 0.941, 0.98, 0.723, 0.734, 0.78]})

In [345]:
# Second group of reported (acc, sens, spec) triplets, one row per reported result (14 rows).
df1 = pd.DataFrame({'acc': [0.789, 0.734, 0.694, 0.702, 0.655, 0.694, 0.776, 0.723, 0.678, 0.699, 0.726, 0.686, 0.694, 0.702],
                    'sens': [0.213, 0.707, 0.707, 0.64, 0.4, 0.6, 0.267, 0.693, 0.627, 0.6, 0.507, 0.48, 0.587, 0.573],
                    'spec': [0.9309, 0.74, 0.691, 0.717, 0.73, 0.717, 0.9013, 0.73, 0.691, 0.724, 0.78, 0.737, 0.72, 0.734]})

In [346]:
# Reset the counter, then test each df0 row against the isic2016 test set.
n_inconsistent = 0
for idx, row in df0.iterrows():
    res = check_1_testset_no_kfold_scores(testset=isic2016, scores=row, eps=1e-3)
    n_inconsistent += 1 if res['inconsistency'] else 0
    print(idx, res['inconsistency'])

0 True
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 True
11 False
12 False


In [347]:
# Test each df1 row against the isic2016 test set.
# The counter is NOT reset here: it keeps adding to the value left by the df0 loop,
# so re-running this cell on its own counts the df1 inconsistencies again.
for idx, row in df1.iterrows():
    res = check_1_testset_no_kfold_scores(testset=isic2016, scores=row, eps=1e-3)
    n_inconsistent += 1 if res['inconsistency'] else 0
    print(idx, res['inconsistency'])

0 False
1 False
2 False
3 False
4 True
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False


In [348]:
# Summary entry covering both score tables (df0 and df1) of this paper.
results.append(dict(
    key='skin2',
    citations=574,
    dataset='isic2016',
    epsilon=1e-3,
    scores=['acc', 'sens', 'spec'],
    n_scores=len(df0) + len(df1),
    n_inconsistent=n_inconsistent,
    remark=None,
))

# Attention Residual Learning for Skin Lesion Classification

doi: 10.1109/TMI.2019.2893944

dataset: isic2017

In [349]:
# Reported (acc, sens, spec) triplets, eight rows per table.
# The *_m tables are later checked against isic2017m and the *_sk tables against isic2017sk.
# presumably the *_comp tables hold the comparison-with-other-methods results — TODO confirm
scores_m = pd.DataFrame({'acc': [0.748, 0.762, 0.757, 0.778, 0.838, 0.850, 0.848, 0.850],
                            'sens': [0.538, 0.615, 0.598, 0.615, 0.632, 0.624, 0.624, 0.658],
                            'spec': [0.799, 0.797, 0.795, 0.818, 0.888, 0.906, 0.903, 0.896]})
scores_sk = pd.DataFrame({'acc': [0.711, 0.758, 0.727, 0.763, 0.842, 0.862, 0.863, 0.868],
                            'sens': [0.8, 0.833, 0.811, 0.822, 0.867, 0.878, 0.856, 0.878],
                            'spec': [0.696, 0.745, 0.712, 0.753, 0.837, 0.859, 0.865, 0.867]})
scores_m_comp = pd.DataFrame({'acc': [0.850, 0.828, 0.823, 0.872, 0.858, 0.845, 0.837, 0.830],
                                'sens': [0.658, 0.735, 0.103, 0.547, 0.427, 0.350, 0.590, 0.436],
                                'spec': [0.896, 0.851, 0.998, 0.950, 0.963, 0.965, 0.896, 0.925]})
scores_sk_comp = pd.DataFrame({'acc': [0.868, 0.803, 0.875, 0.895, 0.918, 0.913, 0.908, 0.917],
                                'sens': [0.878, 0.978, 0.178, 0.356, 0.589, 0.556, 0.778, 0.7],
                                'spec': [0.867, 0.773, 0.998, 0.99, 0.976, 0.976, 0.931, 0.995]})

In [350]:
# Reset the counter, then check scores_m against isic2017m followed by
# scores_sk against isic2017sk, printing one line per row.
n_inconsistent = 0

for frame, counts in ((scores_m, isic2017m), (scores_sk, isic2017sk)):
    for idx, row in frame.iterrows():
        res = check_1_testset_no_kfold_scores(testset=counts, scores=row, eps=1e-3)
        if res['inconsistency']:
            n_inconsistent += 1

        print(idx, res['inconsistency'])

0 False
1 False
2 False
3 False
4 False
5 True
6 False
7 False
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False


In [351]:
# Check the two *_comp tables against their test sets; the counter carries on
# from the preceding cell rather than being reset.
for frame, counts in ((scores_m_comp, isic2017m), (scores_sk_comp, isic2017sk)):
    for idx, row in frame.iterrows():
        res = check_1_testset_no_kfold_scores(testset=counts, scores=row, eps=1e-3)
        if res['inconsistency']:
            n_inconsistent += 1

        print(idx, res['inconsistency'])

0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 True


In [352]:
# Summary entry covering all four score tables of this paper.
results.append(dict(
    key='skin3',
    citations=389,
    dataset='isic2017 m/sk',
    epsilon=1e-3,
    scores=['acc', 'sens', 'spec'],
    n_scores=len(scores_m) + len(scores_sk) + len(scores_m_comp) + len(scores_sk_comp),
    n_inconsistent=n_inconsistent,
    remark=None,
))

# Deep learning outperformed 136 of 157 dermatologists in a head-to-head dermoscopic melanoma image classification task

doi: 10.1016/j.ejca.2019.04.001

In [353]:
# Summary entry without counts: nothing was checked for this paper (see remark).
# NOTE(review): the citation count equals the one recorded for skin3 (389) —
# confirm this is the real figure and not a copy-paste leftover.
results.append(dict(
    key='skin4',
    citations=389,
    dataset='custom',
    epsilon=1e-3,
    scores=['sens', 'spec'],
    n_scores=None,
    n_inconsistent=None,
    remark='Unclear evaluation methodology',
))

# Seven-Point Checklist and Skin Lesion Classification Using Multitask Multimodal Neural Nets

doi: 10.1109/JBHI.2018.2824327

dataset: unknown (395 test images)

In [354]:
# Reported (sens, spec, ppv) triplets.
# scores_bwv_abs reuses the scores_bwv_prs values with sens and spec swapped (and its own ppv),
# i.e. the same results seen from the opposite class.
scores_bwv_prs = pd.DataFrame({'sens': [0.493, 0.653, 0.773, 0.653],
                                'spec': [0.966, 0.925, 0.894, 0.919],
                                'ppv': [0.771, 0.671, 0.63, 0.653]})
scores_bwv_abs = pd.DataFrame({'sens': [0.966, 0.925, 0.894, 0.919],
                                'spec': [0.493, 0.653, 0.773, 0.653],
                                'ppv': [0.89, 0.919, 0.944, 0.919]})

# Eight (sens, spec, ppv) triplets, later checked against testset_mel.
scores_mel = pd.DataFrame({'sens': [0.901, 0.475, 0.96, 0.693, 0.96, 0.693, 0.941, 0.733],
                            'spec': [0.401, 0.874, 0.33, 0.789, 0.361, 0.776, 0.361, 0.786],
                            'ppv': [0.341, 0.565, 0.33, 0.53, 0.34, 0.515, 0.336, 0.54]})

In [355]:
# Initial guess for the class counts among the 395 test images: scale the ratio
# 816/1011 to the test-set size and round up; the remainder becomes the positives.
# presumably 816 of the 1011 cases in the full dataset belong to the negative class — TODO confirm
n_estimate = int(np.ceil(395/1011 * 816))
testset_bwv = {'n': n_estimate, 'p': 395 - n_estimate}

In [356]:
# Scan the 40 candidate positive counts starting 20 below the initial guess and print
# every (p, n) split of the 395 images with which the first scores_bwv_prs row is consistent.
p_guess = testset_bwv['p']
for p in range(p_guess - 20, p_guess + 20):
    n = 395 - p
    tmp = check_1_testset_no_kfold_scores(testset={'p': p, 'n': n},
                                          scores=scores_bwv_prs.iloc[0],
                                          eps=1e-3)
    if tmp['inconsistency']:
        continue
    print(p, n)

75 320


In [357]:
# Class counts fixed to the single split (75 / 320) printed by the search above;
# the "abs" variant is the same test set with the positive and negative roles swapped.
testset_bwv_prs = {'p': 75, 'n': 320}
testset_bwv_abs = {'p': 320, 'n': 75}

In [358]:
# Reset the counter, then check scores_bwv_prs and scores_bwv_abs, each against
# its own version of the test set.
n_inconsistent = 0

for frame, counts in ((scores_bwv_prs, testset_bwv_prs), (scores_bwv_abs, testset_bwv_abs)):
    for idx, row in frame.iterrows():
        res = check_1_testset_no_kfold_scores(testset=counts, scores=row, eps=1e-3)
        if res['inconsistency']:
            n_inconsistent += 1

        print(idx, res['inconsistency'])

0 False
1 False
2 False
3 False
0 False
1 False
2 False
3 False


In [359]:
# Initial guess for the class counts among the 395 test images: scale the summed
# positive counts by 395/1011 and round up; the remainder becomes the negatives.
# presumably the six addends are per-subtype case counts of the positive class in the full dataset — TODO confirm
positives_full = 1 + 64 + 102 + 53 + 28 + 4
p_estimate = int(np.ceil(positives_full*395/1011))
testset_mel = {'p': p_estimate, 'n': 395 - p_estimate}

In [360]:
# Display the estimated class counts as the cell output.
testset_mel

{'p': 99, 'n': 296}

In [361]:
# Scan the 40 candidate positive counts starting 20 below the initial guess and print
# every (p, n) split of the 395 images with which the first scores_mel row is consistent.
p_guess = testset_mel['p']
for p in range(p_guess - 20, p_guess + 20):
    n = 395 - p
    tmp = check_1_testset_no_kfold_scores(testset={'p': p, 'n': n},
                                          scores=scores_mel.iloc[0],
                                          eps=1e-3)
    if tmp['inconsistency']:
        continue
    print(p, n)

101 294


In [362]:
# Replace the proportional estimate with the single split (101 / 294) printed by the search above.
testset_mel = {'p': 101, 'n': 294}

In [363]:
# Check each scores_mel row against testset_mel; the counter carries on from the
# bwv checks rather than being reset.
for idx, row in scores_mel.iterrows():
    res = check_1_testset_no_kfold_scores(testset=testset_mel, scores=row, eps=1e-3)
    n_inconsistent += 1 if res['inconsistency'] else 0
    print(idx, res['inconsistency'])

0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False


In [364]:
# Summary entry covering the three score tables of this paper.
results.append(dict(
    key='skin5',
    citations=322,
    dataset=None,
    epsilon=1e-3,
    scores=['ppv', 'sens', 'spec'],
    n_scores=len(scores_bwv_abs) + len(scores_bwv_prs) + len(scores_mel),
    n_inconsistent=n_inconsistent,
    remark=None,
))

# The skin cancer classification using deep convolutional neural network

doi: 10.1007/s11042-018-5714-1

In [365]:
# Summary entry without counts: nothing was checked for this paper (see remark).
results.append(dict(
    key='skin6',
    citations=313,
    dataset=None,
    epsilon=1e-3,
    scores=['acc', 'sens', 'spec'],
    n_scores=None,
    n_inconsistent=None,
    remark='Unclear evaluation',
))

# Skin lesion classification with ensembles of deep convolutional neural networks

In [366]:
# Reported table as a list of rows; the first row is the header.
# Column prefixes: AVG_, M_ and SK_; only the M_* and SK_* ACC/SE/SP columns are used by the checks below.
# NOTE: `scores` is rebound to a DataFrame in the next cell, so that cell cannot be re-run
# without re-running this one first.
scores = [['method', 'AVG_ACC', 'M_ACC', 'SK_ACC', 'AVG_AUC', 'M_AUC', 'SK_AUC', 'AVG_SE', 'M_SE', 'SK_SE', 'AVG_SP', 'M_SP', 'SK_SP'],
['GoogLeNet',    0.842, 0.818, 0.865, 0.848, 0.794, 0.902, 0.592, 0.496, 0.689, 0.722, 0.613, 0.831],
['AlexNet',      0.848, 0.823, 0.872, 0.830, 0.802, 0.859, 0.518, 0.470, 0.567, 0.645, 0.561, 0.728],
['ResNet',       0.828, 0.822, 0.833, 0.809, 0.757, 0.861, 0.520, 0.385, 0.656, 0.598, 0.437, 0.759],
['VGGNet',       0.813, 0.802, 0.825, 0.808, 0.766, 0.849, 0.434, 0.256, 0.611, 0.636, 0.585, 0.686],
['SP',           0.867, 0.845, 0.888, 0.875, 0.832, 0.918, 0.516, 0.376, 0.656, 0.746, 0.654, 0.838],
['PP',           0.861, 0.845, 0.877, 0.875, 0.836, 0.913, 0.524, 0.393, 0.656, 0.738, 0.650, 0.826],
['SMV',          0.903, 0.875, 0.930, 0.839, 0.789, 0.888, 0.632, 0.487, 0.778, 0.822, 0.766, 0.878],
['SMP',          0.868, 0.848, 0.887, 0.884, 0.843, 0.925, 0.551, 0.402, 0.700, 0.763, 0.689, 0.836],
['ω(1)SMP',      0.869, 0.850, 0.888, 0.886, 0.844, 0.928, 0.580, 0.427, 0.733, 0.772, 0.689, 0.855],
['ω(2)SMP',      0.866, 0.852, 0.880, 0.891, 0.851, 0.930, 0.556, 0.402, 0.711, 0.785, 0.719, 0.851]]

In [367]:
# Turn the list of rows into a frame labelled by its own first row (columns)
# and by its 'method' column (index), then discard the header row and that column.
scores = pd.DataFrame(scores)
header_row = scores.iloc[0]
scores.columns = header_row
scores.index = scores['method']
scores = scores.drop(index='method').drop(columns='method')

In [368]:
# Alternative test-set hypothesis: same positives as isic2017m / isic2017sk, but only 393
# negatives (the third class is left out of the negative count rather than added to it).
isic2017m_reduced = {'p': 117, 'n': 393}
isic2017sk_reduced = {'p': 90, 'n': 393}

In [369]:
# First hypothesis: check the M_* columns against isic2017m and the SK_* columns
# against isic2017sk, counting the inconsistent triplets.
n_inconsistent = 0

for counts, (acc_col, sens_col, spec_col) in ((isic2017m, ('M_ACC', 'M_SE', 'M_SP')),
                                              (isic2017sk, ('SK_ACC', 'SK_SE', 'SK_SP'))):
    for idx, row in scores.iterrows():
        triplet = {'acc': row[acc_col],
                   'sens': row[sens_col],
                   'spec': row[spec_col]}
        res = check_1_testset_no_kfold_scores(testset=counts, scores=triplet, eps=1e-3)
        if res['inconsistency']:
            n_inconsistent += 1

        print(idx, res['inconsistency'])

GoogLeNet True
AlexNet True
ResNet True
VGGNet True
SP True
PP True
SMV True
SMP True
ω(1)SMP True
ω(2)SMP True
GoogLeNet True
AlexNet True
ResNet True
VGGNet True
SP True
PP True
SMV True
SMP True
ω(1)SMP True
ω(2)SMP True


In [370]:
# Second hypothesis: repeat the checks against the reduced test sets.
# Only the verdicts are printed; n_inconsistent is deliberately left untouched here.
for counts, (acc_col, sens_col, spec_col) in ((isic2017m_reduced, ('M_ACC', 'M_SE', 'M_SP')),
                                              (isic2017sk_reduced, ('SK_ACC', 'SK_SE', 'SK_SP'))):
    for idx, row in scores.iterrows():
        triplet = {'acc': row[acc_col],
                   'sens': row[sens_col],
                   'spec': row[spec_col]}
        res = check_1_testset_no_kfold_scores(testset=counts, scores=triplet, eps=1e-3)

        print(idx, res['inconsistency'])

GoogLeNet True
AlexNet True
ResNet True
VGGNet True
SP True
PP True
SMV True
SMP True
ω(1)SMP True
ω(2)SMP True
GoogLeNet True
AlexNet True
ResNet True
VGGNet True
SP True
PP True
SMV True
SMP True
ω(1)SMP True
ω(2)SMP True


In [371]:
# Summary entry: each method contributes two triplets (M_* and SK_*), hence len(scores)*2.
results.append(dict(
    key='skin7',
    citations=312,
    dataset='isic2017?',
    epsilon=1e-3,
    scores=['acc', 'sens', 'spec'],
    n_scores=len(scores)*2,
    n_inconsistent=n_inconsistent,
    remark='Two hypothesis tested',
))

# Melanoma classification on dermoscopy images using a neural network ensemble model

doi: 10.1109/TMI.2016.2633551

In [372]:
# Reported (acc, sens, spec) triplets given to four decimals, nine rows per table;
# scores_x is checked against testset_x and scores_c against testset_c.
scores_x = pd.DataFrame({'acc': [0.9083, 0.9083, 0.9, 0.8833, 0.9167, 0.8917, 0.85, 0.8750, 0.9417],
                            'sens': [0.8250, 0.8, 0.775, 0.8, 0.875, 0.85, 0.65, 0.7, 0.95],
                            'spec': [0.95, 0.9625, 0.9625, 0.925, 0.9375, 0.9125, 0.95, 0.9625, 0.9375]})
scores_c = pd.DataFrame({'acc': [0.8722, 0.8833, 0.8222, 0.8778, 0.8778, 0.8722, 0.7722, 0.8389, 0.9111],
                            'sens': [0.7, 0.7333, 0.5167, 0.7167, 0.75, 0.75, 0.6333, 0.7333, 0.8333],
                            'spec': [0.9583, 0.9583, 0.975, 0.9583, 0.9417, 0.9333, 0.8417, 0.8917, 0.95]})

# Class counts of the two test sets.
testset_x = {'p': 80, 'n': 160}
testset_c = {'p': 120, 'n': 240}

In [373]:
# Reset the counter, then check both tables against their test sets.
# eps is 1e-4 here because these scores are reported to four decimals.
n_inconsistent = 0

for frame, counts in ((scores_x, testset_x), (scores_c, testset_c)):
    for idx, row in frame.iterrows():
        res = check_1_testset_no_kfold_scores(testset=counts, scores=row, eps=1e-4)
        if res['inconsistency']:
            n_inconsistent += 1

        print(idx, res['inconsistency'])

0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False


In [374]:
# Summary entry covering both score tables of this paper.
results.append(dict(
    key='skin8',
    citations=259,
    dataset='custom',
    epsilon=1e-4,
    scores=['acc', 'sens', 'spec'],
    n_scores=len(scores_x) + len(scores_c),
    n_inconsistent=n_inconsistent,
    remark=None,
))

# Multiple Skin Lesions Diagnostics via Integrated Deep Convolutional Networks for Segmentation and Classification

doi: 10.1016/j.cmpb.2020.105351


In [375]:
# First reported score table (four rows). The 'acc' and 'bacc' columns hold identical values.
# Only the 'acc' and 'f1' columns are passed to the consistency check below.
scores_raw = pd.DataFrame({'acc': [0.7230, 0.7625, 0.7995, 0.8179],
                        'bacc': [0.7230, 0.7625, 0.7995, 0.8179],
                        'sens': [0.7229, 0.7626, 0.7994, 0.8180],
                        'spec': [0.6605, 0.6702, 0.6894, 0.7140],
                        'f1': [0.4776, 0.5161, 0.5632, 0.5965]})

In [376]:
# Second reported score table (four rows).
# NOTE(review): this frame is only used through len(scores) in the summary entry;
# no consistency check in this notebook iterates over it — confirm that is intended.
scores = pd.DataFrame({'acc': [0.7704, 0.7995, 0.8179, 0.8127],
                        'sens': [0.7704, 0.7995, 0.8180, 0.8126],
                        'spec': [0.6622, 0.6793, 0.7140, 0.6525],
                        'f1': [0.7839, 0.8085, 0.8259, 0.8173]})

In [377]:
# Reset the counter, then check only the (acc, f1) pair of each scores_raw row
# against the isic2016 test set, printing the verdict together with the evidence.
n_inconsistent = 0

for idx, row in scores_raw[['acc', 'f1']].iterrows():
    res = check_1_testset_no_kfold_scores(testset=isic2016, scores=row, eps=1e-4)
    n_inconsistent += 1 if res['inconsistency'] else 0
    print(res['inconsistency'], res['evidence'])

False (48, [(226, 226)])
False (48, [(241, 241)])
False (49, [(254, 254)])
False (51, [(259, 259)])


In [378]:
# Summary entry for skin9.
# The remark is a plain string, matching every other entry in `results` (str or None),
# so the 'remark' column of the summary frame does not mix lists with strings.
# NOTE(review): n_scores counts the rows of both `scores_raw` and `scores`, while the
# consistency loop only iterates over scores_raw[['acc', 'f1']] — confirm that counting
# the unchecked `scores` rows (and listing 'sens'/'spec') is intended.
results.append({'key': 'skin9',
                'citations': 238,
                'dataset': 'isic2016',
                'epsilon': 1e-4,
                'scores': ['acc', 'sens', 'spec', 'f1'],
                'n_scores': len(scores_raw) + len(scores),
                'n_inconsistent': n_inconsistent,
                'remark': 'Incorrect weighting'})

In [379]:
# Build the summary table from the collected per-paper dicts and display it as the cell output.
pd.DataFrame(results)

Unnamed: 0,key,citations,dataset,epsilon,scores,n_scores,n_inconsistent,remark
0,skin0,991,isic2016,0.001,"[acc, sens, spec]",10.0,0.0,
1,skin1,603,isic2016,0.001,"[acc, sens, spec]",,,The paper is about image segmentation performa...
2,skin2,574,isic2016,0.001,"[acc, sens, spec]",27.0,3.0,
3,skin3,389,isic2017 m/sk,0.001,"[acc, sens, spec]",32.0,2.0,
4,skin4,389,custom,0.001,"[sens, spec]",,,Unclear evaluation methodology
5,skin5,322,,0.001,"[ppv, sens, spec]",16.0,0.0,
6,skin6,313,,0.001,"[acc, sens, spec]",,,Unclear evaluation
7,skin7,312,isic2017?,0.001,"[acc, sens, spec]",20.0,20.0,Two hypothesis tested
8,skin8,259,custom,0.0001,"[acc, sens, spec]",18.0,0.0,
9,skin9,238,isic2016,0.0001,"[acc, sens, spec, f1]",8.0,0.0,[Incorrect weighting]
