In [1]:
import pandas as pd
import numpy as np
import logging
from mlscorecheck.check import check_1_testset_no_kfold_scores

# Raise the threshold of the 'mlscorecheck' logger so that only messages of
# level ERROR and above are emitted by it.
logging.getLogger('mlscorecheck').setLevel(logging.ERROR)

In [2]:
# one summary record (a dict) per reviewed paper is appended to this list

results = list()

In [3]:
# test set compositions of the datasets appearing in the papers
# ('p': number of positive samples, 'n': number of negative samples)

isic2016 = dict(p=75, n=304)
# for both ISIC2017 tasks the negatives are 393 samples plus the positives of the other task
isic2017m = dict(p=117, n=393 + 90)
isic2017sk = dict(p=90, n=393 + 117)

In [4]:
def evaluate(scores, testset, eps, verbose=True):
    """
    Counts the score configurations of a paper that fail the consistency check

    Every row of ``scores`` is passed, together with the test set and the
    numerical uncertainty, to ``check_1_testset_no_kfold_scores``; a row is
    counted when the returned result has a truthy ``'inconsistency'`` entry.

    Args:
        scores (pd.DataFrame): the dataframe of scores, one configuration per row
        testset (dict): specification of the testset
        eps (float): the numerical uncertainty
        verbose (bool): whether to print each inconsistent row (index label,
                        scores, testset and eps)

    Returns:
        int: the number of inconsistent score configurations
    """
    flagged = []
    for label, record in scores.iterrows():
        outcome = check_1_testset_no_kfold_scores(testset=testset,
                                                  scores=record,
                                                  eps=eps)

        # consistent rows need no further handling
        if not outcome['inconsistency']:
            continue

        flagged.append(label)
        if verbose:
            print('inconsistency:', label, record.to_dict(), testset, eps)
    return len(flagged)

# Title: Automated melanoma recognition in dermoscopy images via very deep residual networks 

doi: 10.1109/TMI.2016.2642839

In [5]:
# Reported scores, one DataFrame per table (the names suggest Tables IV-VII of
# the paper -- verify against the source). Each row is one reported
# (acc, sens, spec) configuration, given to three decimal digits.
df_table_iv = pd.DataFrame({'acc': [0.855, 0.813, 0.834, 0.786, 0.844, 0.821, 0.815, 0.681, 0.731, 0.599],
                            'sens': [0.507, 0.573, 0.32 , 0.667, 0.24 , 0.2  , 0.467, 0.787, 0.773, 0.853],
                            'spec': [0.941, 0.872, 0.961, 0.816, 0.993, 0.974, 0.901, 0.655, 0.72, 0.536]})

df_table_v = pd.DataFrame({'acc': [0.828, 0.855],
                            'sens': [0.427, 0.547],
                            'spec': [0.927, 0.931]})

df_table_vi = pd.DataFrame({'acc': [0.826, 0.847, 0.855],
                            'sens': [0.413, 0.507, 0.547],
                            'spec': [0.928, 0.931, 0.931]})

df_table_vii = pd.DataFrame({'acc': [0.850, 0.844, 0.855],
                                'sens': [0.507, 0.520, 0.547],
                                'spec': [0.934, 0.824, 0.931]})

In [6]:
# all four tables are checked against the ISIC2016 test set with eps = 1e-3
n_inconsistent = sum(evaluate(table, isic2016, 1e-3)
                     for table in (df_table_iv, df_table_v, df_table_vi, df_table_vii))

print('n_inconsistent:', n_inconsistent)

inconsistency: 1 {'acc': 0.844, 'sens': 0.52, 'spec': 0.824} {'p': 75, 'n': 304} 0.001
n_inconsistent: 1


In [7]:
# Summary record of this paper; all records share the same keys, and the list
# is turned into a DataFrame when the final table is assembled.
results.append({'key': 'skin0',
                'citations': 991,
                'dataset': 'isic2016',
                'uncertainty': 3,  # shown as 'digits' in the final table; the checks above use eps=1e-3
                'scores': ['acc', 'sens', 'spec'],
                'suitable': 'y',
                # number of score rows (configurations) checked, not of individual score values
                'n_scores': len(df_table_iv) + len(df_table_v) + len(df_table_vi) + len(df_table_vii),
                'n_inconsistent': n_inconsistent,
                'remark': None,
                'conclusion': 'Potentially typos present.',
                'survey': 'Only the accuracy score of segmentation is reported.',
                'survey_ref': 155,
                'year': 2017})

# Title: Automatic Skin Lesion Segmentation Using Deep Fully Convolutional Networks with Jaccard Distance

doi: 10.1109/TMI.2017.2695227

In [8]:
# Summary record of a paper that was not checked ('suitable': 'n'); the fields
# that do not apply are filled with '-' and the reason is given in 'remark'.
results.append({'key': 'skin1',
                'citations': 603,
                'dataset': 'isic2016',
                'uncertainty': '-',
                'scores': ['-'],
                'suitable': 'n',
                'n_scores': '-',
                'n_inconsistent': '-',
                'remark': 'The paper is about image segmentation performance.',
                'conclusion': None,
                'survey': 'Reports segmentation scores',
                'survey_ref': 158,
                'year': 2017})

# Title: Deep learning ensembles for melanoma recognition in dermoscopy images

doi: 10.1147/JRD.2017.2708299

In [9]:
# Reported (acc, sens, spec) configurations, one per row. Most values carry
# three decimal digits; two of the spec values carry four.
df0 = pd.DataFrame({'acc': [0.755, 0.81, 0.781, 0.834, 0.834, 0.807, 0.805, 0.855, 0.855, 0.836, 0.77, 0.726, 0.752],
                    'sens': [0.627, 0.72, 0.707, 0.533, 0.52, 0.693, 0.693, 0.547, 0.507, 0.253, 0.72, 0.693, 0.64],
                    'spec': [0.796, 0.832, 0.799, 0.9079, 0.9112, 0.836, 0.832, 0.931, 0.941, 0.98, 0.723, 0.734, 0.78]})

In [10]:
# Second set of reported (acc, sens, spec) configurations of the same paper,
# one per row. Two of the spec values carry four decimal digits.
df1 = pd.DataFrame({'acc': [0.789, 0.734, 0.694, 0.702, 0.655, 0.694, 0.776, 0.723, 0.678, 0.699, 0.726, 0.686, 0.694, 0.702],
                    'sens': [0.213, 0.707, 0.707, 0.64, 0.4, 0.6, 0.267, 0.693, 0.627, 0.6, 0.507, 0.48, 0.587, 0.573],
                    'spec': [0.9309, 0.74, 0.691, 0.717, 0.73, 0.717, 0.9013, 0.73, 0.691, 0.724, 0.78, 0.737, 0.72, 0.734]})

In [11]:
# both score tables are checked against the ISIC2016 test set with eps = 1e-3
n_inconsistent = sum(evaluate(frame, isic2016, 1e-3) for frame in (df0, df1))

print('n_inconsistent:', n_inconsistent)

inconsistency: 0 {'acc': 0.755, 'sens': 0.627, 'spec': 0.796} {'p': 75, 'n': 304} 0.001
inconsistency: 10 {'acc': 0.77, 'sens': 0.72, 'spec': 0.723} {'p': 75, 'n': 304} 0.001
inconsistency: 4 {'acc': 0.655, 'sens': 0.4, 'spec': 0.73} {'p': 75, 'n': 304} 0.001
n_inconsistent: 3


In [12]:
# Summary record of this paper (same keys as the other records).
results.append({'key': 'skin2',
                'citations': 574,
                'dataset': 'isic2016',
                'uncertainty': 3,  # shown as 'digits' in the final table; the checks above use eps=1e-3
                'scores': ['acc', 'sens', 'spec'],
                'suitable': 'y',
                # number of score rows (configurations) checked
                'n_scores': len(df0) + len(df1),
                'n_inconsistent': n_inconsistent,
                'remark': None,
                'conclusion': 'Potentially typos present.',
                'survey': 'Results for the evaluation of 100 images (50-50).',
                'survey_ref': 156,
                'year': 2017})

# Title: Attention Residual Learning for Skin Lesion Classification

doi: 10.1109/TMI.2019.2893944

In [13]:
# Reported (acc, sens, spec) configurations, one per row. The '_m' frames are
# later checked against isic2017m and the '_sk' frames against isic2017sk.
# The '_comp' suffix presumably marks a comparison table -- verify against the paper.
scores_m = pd.DataFrame({'acc': [0.748, 0.762, 0.757, 0.778, 0.838, 0.850, 0.848, 0.850],
                            'sens': [0.538, 0.615, 0.598, 0.615, 0.632, 0.624, 0.624, 0.658],
                            'spec': [0.799, 0.797, 0.795, 0.818, 0.888, 0.906, 0.903, 0.896]})
scores_sk = pd.DataFrame({'acc': [0.711, 0.758, 0.727, 0.763, 0.842, 0.862, 0.863, 0.868],
                            'sens': [0.8, 0.833, 0.811, 0.822, 0.867, 0.878, 0.856, 0.878],
                            'spec': [0.696, 0.745, 0.712, 0.753, 0.837, 0.859, 0.865, 0.867]})
scores_m_comp = pd.DataFrame({'acc': [0.850, 0.828, 0.823, 0.872, 0.858, 0.845, 0.837, 0.830],
                                'sens': [0.658, 0.735, 0.103, 0.547, 0.427, 0.350, 0.590, 0.436],
                                'spec': [0.896, 0.851, 0.998, 0.950, 0.963, 0.965, 0.896, 0.925]})
scores_sk_comp = pd.DataFrame({'acc': [0.868, 0.803, 0.875, 0.895, 0.918, 0.913, 0.908, 0.917],
                                'sens': [0.878, 0.978, 0.178, 0.356, 0.589, 0.556, 0.778, 0.7],
                                'spec': [0.867, 0.773, 0.998, 0.99, 0.976, 0.976, 0.931, 0.995]})

In [14]:
# each score table is paired with the test set of its task; eps = 1e-3 throughout
n_inconsistent = sum(
    evaluate(frame, testset, 1e-3)
    for frame, testset in ((scores_m, isic2017m),
                           (scores_sk, isic2017sk),
                           (scores_m_comp, isic2017m),
                           (scores_sk_comp, isic2017sk))
)

print('n_inconsistent:', n_inconsistent)

inconsistency: 5 {'acc': 0.85, 'sens': 0.624, 'spec': 0.906} {'p': 117, 'n': 483} 0.001
inconsistency: 7 {'acc': 0.917, 'sens': 0.7, 'spec': 0.995} {'p': 90, 'n': 510} 0.001
n_inconsistent: 2


In [15]:
# Summary record of this paper (same keys as the other records).
results.append({'key': 'skin3',
                'citations': 389,
                'dataset': 'isic2017 m/sk',
                'uncertainty': 3,  # shown as 'digits' in the final table; the checks above use eps=1e-3
                'scores': ['acc', 'sens', 'spec'],
                'suitable': 'y',
                # number of score rows (configurations) checked
                'n_scores': len(scores_m) + len(scores_sk) + len(scores_m_comp) + len(scores_sk_comp),
                'n_inconsistent': n_inconsistent,
                'remark': None,
                'conclusion': 'Potentially typos present.',
                'survey': 'No scores in the survey',
                'survey_ref': 169,
                'year': 2019})

# Title: Deep learning outperformed 136 of 157 dermatologists in a head-to-head dermoscopic melanoma image classification task

doi: 10.1016/j.ejca.2019.04.001

In [16]:
# Summary record of a paper that was not checked ('suitable': 'n'); the fields
# that do not apply are filled with '-' and the reason is given in 'remark'.
# NOTE(review): the citation count equals that of the 'skin3' record --
# confirm it is not a copy-paste slip.
results.append({'key': 'skin4',
                'citations': 389,
                'dataset': 'isic2016 (custom selection)',
                'uncertainty': '-',
                'scores': ['-'],
                'suitable': 'n',
                'n_scores': '-',
                'n_inconsistent': '-',
                'remark': 'Not enough details shared.',
                'conclusion': None,
                'survey': None,
                'survey_ref': 178,
                'year': 2019})

# Title: Seven-Point Checklist and Skin Lesion Classification Using Multitask Multimodal Neural Nets

doi: 10.1109/JBHI.2018.2824327

In [17]:
# the BWV classification is a binary classification problem

In [18]:
# Reported (sens, spec, ppv) configurations, one per row.
# scores_bwv_prs / scores_bwv_abs: BWV "present" / "absent" as the positive class.
# The sens and spec lists of the two frames are swapped copies of each other,
# while the ppv values differ.
scores_bwv_prs = pd.DataFrame({'sens': [0.493, 0.653, 0.773, 0.653],
                                'spec': [0.966, 0.925, 0.894, 0.919],
                                'ppv': [0.771, 0.671, 0.63, 0.653]})
scores_bwv_abs = pd.DataFrame({'sens': [0.966, 0.925, 0.894, 0.919],
                                'spec': [0.493, 0.653, 0.773, 0.653],
                                'ppv': [0.89, 0.919, 0.944, 0.919]})

# scores of the melanoma task, later checked against testset_mel
scores_mel = pd.DataFrame({'sens': [0.901, 0.475, 0.96, 0.693, 0.96, 0.693, 0.941, 0.733],
                            'spec': [0.401, 0.874, 0.33, 0.789, 0.361, 0.776, 0.361, 0.786],
                            'ppv': [0.341, 0.565, 0.33, 0.53, 0.34, 0.515, 0.336, 0.54]})

In [19]:
# A-priori estimate of the BWV test set composition: the 395 test images are
# split in the proportion 816/1011 (presumably the share of negative samples in
# the whole dataset -- verify against the paper), rounded up; the rest is positive.
testset_bwv = dict(n=int(np.ceil(395/1011 * 816)))
testset_bwv.update(p=395 - testset_bwv['n'])

In [20]:
# display the a-priori estimate of the BWV test set composition
testset_bwv

{'n': 319, 'p': 76}

In [21]:
# According to the paper, the test set consists of 395 images and has statistics similar to the entire dataset.
# A priori, the test set should therefore contain p = 76 positive and n = 319 negative samples.
# Several factors have to be distributed equally in the test set, so its real composition may deviate slightly.
# The a-priori configuration and its close neighbours are checked, hoping that exactly one is compatible with the scores.
# NOTE(review): the range end is exclusive, so the scanned offsets are -20..+19 -- confirm the asymmetry is intended.

for p in range(testset_bwv['p'] - 20, testset_bwv['p'] + 20):
    n = 395 - p
    # only the first reported configuration is used to identify the test set
    outcome = check_1_testset_no_kfold_scores(testset={'p': p, 'n': n},
                                              scores=scores_bwv_prs.iloc[0],
                                              eps=1e-3)
    if outcome['inconsistency']:
        continue
    print(p, n)

75 320


In [22]:
# The class distribution of the test set for the presence and absence of BWV has been found.

In [23]:
# test set with BWV "present" as the positive class (the configuration found by the scan)
testset_bwv_prs = dict(p=75, n=320)
# BWV "absent" as the positive class: the same samples with the two roles exchanged
testset_bwv_abs = dict(p=testset_bwv_prs['n'], n=testset_bwv_prs['p'])

In [24]:
# both BWV score tables are checked against their respective test sets
n_inconsistent = sum(
    evaluate(frame, testset, 1e-3)
    for frame, testset in ((scores_bwv_prs, testset_bwv_prs),
                           (scores_bwv_abs, testset_bwv_abs))
)

print('n_inconsistent:', n_inconsistent)

n_inconsistent: 0


In [25]:
# As for BWV, the number of melanoma samples in the test set is estimated first:
# the summed counts (presumably the melanoma subtypes of the whole dataset --
# verify against the paper) are scaled by 395/1011 and rounded up.

testset_mel = dict(p=int(np.ceil(sum((1, 64, 102, 53, 28, 4))*395/1011)))
testset_mel.update(n=395 - testset_mel['p'])

In [26]:
# display the a-priori estimate of the melanoma test set composition
testset_mel

{'p': 99, 'n': 296}

In [27]:
# Scan the test set configurations around the a-priori estimate and print those
# compatible with the first reported melanoma configuration.
# NOTE(review): the range end is exclusive, so the scanned offsets are -20..+19 -- confirm the asymmetry is intended.
for p in range(testset_mel['p'] - 20, testset_mel['p'] + 20):
    n = 395 - p
    outcome = check_1_testset_no_kfold_scores(testset={'p': p, 'n': n},
                                              scores=scores_mel.iloc[0],
                                              eps=1e-3)
    if outcome['inconsistency']:
        continue
    print(p, n)

101 294


In [28]:
# the configuration found by the scan replaces the a-priori estimate
testset_mel = dict(p=101, n=294)

In [29]:
# the melanoma result is added to the running count left by the BWV evaluation,
# so this cell relies on that cell having been run exactly once before it
n_inconsistent = n_inconsistent + evaluate(scores_mel, testset_mel, 1e-3)

print('n_inconsistent:', n_inconsistent)

n_inconsistent: 0


In [30]:
# Summary record of this paper (same keys as the other records).
results.append({'key': 'skin5',
                'citations': 322,
                # raw string: '\c' is not a valid escape sequence, and the backslash
                # must reach the LaTeX output literally
                'dataset': r"Argenziano's \cite{argenziano}",
                'uncertainty': 3,  # shown as 'digits' in the final table; the checks above use eps=1e-3
                'scores': ['ppv', 'sens', 'spec'],
                'suitable': 'y',
                # number of score rows (configurations) checked
                'n_scores': len(scores_bwv_abs) + len(scores_bwv_prs) + len(scores_mel),
                'n_inconsistent': n_inconsistent,
                'remark': None,
                'conclusion': 'No inconsistency identified.',
                'survey': 'Reports the BWV results instead of the diagnosis results',
                'survey_ref': 172,
                'year': 2019})

# Title: The skin cancer classification using deep convolutional neural network

doi: 10.1007/s11042-018-5714-1

In [31]:
# Summary record of a paper that was not checked ('suitable': 'n'); the fields
# that do not apply are filled with '-' and the reason is given in 'remark'.
results.append({'key': 'skin6',
                'citations': 313,
                'dataset': 'custom',
                'uncertainty': '-',
                'scores': ['-'],
                'suitable': 'n',
                'n_scores': '-',
                'n_inconsistent': '-',
                'remark': 'Not enough details shared.',
                'conclusion': None,
                'survey': None,
                'survey_ref': 174,
                'year': 2018})

# Title: Skin lesion classification with ensembles of deep convolutional neural networks

doi: 10.1016/j.jbi.2018.08.006

In [32]:
# Reported scores as a list of rows: the first inner list holds the column
# names, every further list holds a method name followed by its twelve scores.
# Only the M_* and SK_* columns of ACC, SE and SP are checked further below.
scores = [['method', 'AVG_ACC', 'M_ACC', 'SK_ACC', 'AVG_AUC', 'M_AUC', 'SK_AUC', 'AVG_SE', 'M_SE', 'SK_SE', 'AVG_SP', 'M_SP', 'SK_SP'],
['GoogLeNet',    0.842, 0.818, 0.865, 0.848, 0.794, 0.902, 0.592, 0.496, 0.689, 0.722, 0.613, 0.831],
['AlexNet',      0.848, 0.823, 0.872, 0.830, 0.802, 0.859, 0.518, 0.470, 0.567, 0.645, 0.561, 0.728],
['ResNet',       0.828, 0.822, 0.833, 0.809, 0.757, 0.861, 0.520, 0.385, 0.656, 0.598, 0.437, 0.759],
['VGGNet',       0.813, 0.802, 0.825, 0.808, 0.766, 0.849, 0.434, 0.256, 0.611, 0.636, 0.585, 0.686],
['SP',           0.867, 0.845, 0.888, 0.875, 0.832, 0.918, 0.516, 0.376, 0.656, 0.746, 0.654, 0.838],
['PP',           0.861, 0.845, 0.877, 0.875, 0.836, 0.913, 0.524, 0.393, 0.656, 0.738, 0.650, 0.826],
['SMV',          0.903, 0.875, 0.930, 0.839, 0.789, 0.888, 0.632, 0.487, 0.778, 0.822, 0.766, 0.878],
['SMP',          0.868, 0.848, 0.887, 0.884, 0.843, 0.925, 0.551, 0.402, 0.700, 0.763, 0.689, 0.836],
['ω(1)SMP',      0.869, 0.850, 0.888, 0.886, 0.844, 0.928, 0.580, 0.427, 0.733, 0.772, 0.689, 0.855],
['ω(2)SMP',      0.866, 0.852, 0.880, 0.891, 0.851, 0.930, 0.556, 0.402, 0.711, 0.785, 0.719, 0.851]]

In [33]:
# Build the frame from the header row (first inner list) and the data rows
# (all remaining lists), then index it by method name. Loading the header as
# an ordinary data row and dropping it afterwards would leave every column
# with object dtype; built this way the score columns are numeric.
# Note: this rebinds `scores` from the raw list to the DataFrame, so the cell
# must be run exactly once after the cell that defines the raw list.
scores = pd.DataFrame(scores[1:], columns=scores[0]).set_index('method')

In [34]:
# alternative ISIC2017 test sets whose negative class consists of the 393 samples only
isic2017m_reduced = dict(p=117, n=393)
isic2017sk_reduced = dict(p=90, n=393)

In [35]:
# The M_* and SK_* columns are renamed to the score names used by the check
# ('acc', 'sens', 'spec') and checked against the test set of the respective task.
n_inconsistent = evaluate(
    scores[['M_ACC', 'M_SE', 'M_SP']].rename(columns={'M_ACC': 'acc', 'M_SE': 'sens', 'M_SP': 'spec'}),
    isic2017m,
    1e-3,
)
n_inconsistent += evaluate(
    scores[['SK_ACC', 'SK_SE', 'SK_SP']].rename(columns={'SK_ACC': 'acc', 'SK_SE': 'sens', 'SK_SP': 'spec'}),
    isic2017sk,
    1e-3,
)

print('n_inconsistent:', n_inconsistent)

inconsistency: GoogLeNet {'acc': 0.818, 'sens': 0.496, 'spec': 0.613} {'p': 117, 'n': 483} 0.001
inconsistency: AlexNet {'acc': 0.823, 'sens': 0.47, 'spec': 0.561} {'p': 117, 'n': 483} 0.001
inconsistency: ResNet {'acc': 0.822, 'sens': 0.385, 'spec': 0.437} {'p': 117, 'n': 483} 0.001
inconsistency: VGGNet {'acc': 0.802, 'sens': 0.256, 'spec': 0.585} {'p': 117, 'n': 483} 0.001
inconsistency: SP {'acc': 0.845, 'sens': 0.376, 'spec': 0.654} {'p': 117, 'n': 483} 0.001
inconsistency: PP {'acc': 0.845, 'sens': 0.393, 'spec': 0.65} {'p': 117, 'n': 483} 0.001
inconsistency: SMV {'acc': 0.875, 'sens': 0.487, 'spec': 0.766} {'p': 117, 'n': 483} 0.001
inconsistency: SMP {'acc': 0.848, 'sens': 0.402, 'spec': 0.689} {'p': 117, 'n': 483} 0.001
inconsistency: ω(1)SMP {'acc': 0.85, 'sens': 0.427, 'spec': 0.689} {'p': 117, 'n': 483} 0.001
inconsistency: ω(2)SMP {'acc': 0.852, 'sens': 0.402, 'spec': 0.719} {'p': 117, 'n': 483} 0.001
inconsistency: GoogLeNet {'acc': 0.865, 'sens': 0.689, 'spec': 0.831} {

In [36]:
# display the total number of inconsistent configurations found above
n_inconsistent

20

In [37]:
# testing the hypothesis that the negative class is only the nevus
# NOTE(review): this rebinds n_inconsistent, so later cells see the count of
# this alternative hypothesis rather than that of the original test sets -- confirm this is intended.
n_inconsistent = evaluate(
    scores[['M_ACC', 'M_SE', 'M_SP']].rename(columns={'M_ACC': 'acc', 'M_SE': 'sens', 'M_SP': 'spec'}),
    isic2017m_reduced,
    1e-3,
)
n_inconsistent += evaluate(
    scores[['SK_ACC', 'SK_SE', 'SK_SP']].rename(columns={'SK_ACC': 'acc', 'SK_SE': 'sens', 'SK_SP': 'spec'}),
    isic2017sk_reduced,
    1e-3,
)

print('n_inconsistent:', n_inconsistent)

inconsistency: GoogLeNet {'acc': 0.818, 'sens': 0.496, 'spec': 0.613} {'p': 117, 'n': 393} 0.001
inconsistency: AlexNet {'acc': 0.823, 'sens': 0.47, 'spec': 0.561} {'p': 117, 'n': 393} 0.001
inconsistency: ResNet {'acc': 0.822, 'sens': 0.385, 'spec': 0.437} {'p': 117, 'n': 393} 0.001
inconsistency: VGGNet {'acc': 0.802, 'sens': 0.256, 'spec': 0.585} {'p': 117, 'n': 393} 0.001
inconsistency: SP {'acc': 0.845, 'sens': 0.376, 'spec': 0.654} {'p': 117, 'n': 393} 0.001
inconsistency: PP {'acc': 0.845, 'sens': 0.393, 'spec': 0.65} {'p': 117, 'n': 393} 0.001
inconsistency: SMV {'acc': 0.875, 'sens': 0.487, 'spec': 0.766} {'p': 117, 'n': 393} 0.001
inconsistency: SMP {'acc': 0.848, 'sens': 0.402, 'spec': 0.689} {'p': 117, 'n': 393} 0.001
inconsistency: ω(1)SMP {'acc': 0.85, 'sens': 0.427, 'spec': 0.689} {'p': 117, 'n': 393} 0.001
inconsistency: ω(2)SMP {'acc': 0.852, 'sens': 0.402, 'spec': 0.719} {'p': 117, 'n': 393} 0.001
inconsistency: GoogLeNet {'acc': 0.865, 'sens': 0.689, 'spec': 0.831} {

In [38]:
# Summary record of this paper (same keys as the other records).
# NOTE(review): n_inconsistent holds the value left by the most recently run
# evaluation cell; in top-to-bottom order that is the check against the
# "reduced" test sets -- confirm this is the count meant to be recorded.
results.append({'key': 'skin7',
                'citations': 312,
                'dataset': 'isic2017 m/sk',
                'uncertainty': 3,  # shown as 'digits' in the final table; the checks above use eps=1e-3
                'scores': ['acc', 'sens', 'spec'],
                'suitable': 'y',
                # every method row is checked twice: once for the M and once for the SK task
                'n_scores': len(scores)*2,
                'n_inconsistent': n_inconsistent,
                'remark': None,
                'conclusion': 'All accuracy, sensitivity and specificity scores reported for the two binary classification tasks M and SK are inconsistent.',
                'survey': 'Reports incorrect scores',
                'survey_ref': 162,
                'year': 2018})

# Title: Melanoma classification on dermoscopy images using a neural network ensemble model

doi: 10.1109/TMI.2016.2633551

In [39]:
# Reported (acc, sens, spec) configurations, one per row, given to four decimal digits.
# '_x' presumably refers to the xanthous dataset named in the summary record;
# the meaning of '_c' is not stated here -- verify against the paper.
scores_x = pd.DataFrame({'acc': [0.9083, 0.9083, 0.9, 0.8833, 0.9167, 0.8917, 0.85, 0.8750, 0.9417],
                            'sens': [0.8250, 0.8, 0.775, 0.8, 0.875, 0.85, 0.65, 0.7, 0.95],
                            'spec': [0.95, 0.9625, 0.9625, 0.925, 0.9375, 0.9125, 0.95, 0.9625, 0.9375]})
scores_c = pd.DataFrame({'acc': [0.8722, 0.8833, 0.8222, 0.8778, 0.8778, 0.8722, 0.7722, 0.8389, 0.9111],
                            'sens': [0.7, 0.7333, 0.5167, 0.7167, 0.75, 0.75, 0.6333, 0.7333, 0.8333],
                            'spec': [0.9583, 0.9583, 0.975, 0.9583, 0.9417, 0.9333, 0.8417, 0.8917, 0.95]})

# test set compositions ('p': positives, 'n': negatives) of the two datasets
testset_x = {'p': 80, 'n': 160}
testset_c = {'p': 120, 'n': 240}

In [40]:
# the scores carry four decimal digits, hence the tighter eps = 1e-4
n_inconsistent = sum(
    evaluate(frame, testset, 1e-4)
    for frame, testset in ((scores_x, testset_x), (scores_c, testset_c))
)

print('n_inconsistent:', n_inconsistent)

n_inconsistent: 0


In [41]:
# Summary record of this paper (same keys as the other records).
results.append({'key': 'skin8',
                'citations': 259,
                'dataset': 'custom',
                'uncertainty': 4,  # shown as 'digits' in the final table; the checks above use eps=1e-4
                'scores': ['acc', 'sens', 'spec'],
                'suitable': 'y',
                # number of score rows (configurations) checked
                'n_scores': len(scores_x) + len(scores_c),
                'n_inconsistent': n_inconsistent,
                'remark': None,
                'conclusion': 'No inconsistency identified.',
                'survey': 'Reports the results only for the xanthous dataset',
                'survey_ref': 113,
                'year': 2017})

# Title: Multiple Skin Lesions Diagnostics via Integrated Deep Convolutional Networks for Segmentation and Classification

doi: 10.1016/j.cmpb.2020.105351


In [42]:
# Reported (acc, sens, spec, f1) configurations, one per row, given to four
# decimal digits (the name suggests Table 2 of the paper -- verify).
scores_table2 = pd.DataFrame({'acc': [0.7230, 0.7625, 0.7995, 0.8179],
                        'sens': [0.7229, 0.7626, 0.7994, 0.8180],
                        'spec': [0.6605, 0.6702, 0.6894, 0.7140],
                        'f1': [0.4776, 0.5161, 0.5632, 0.5965]})

In [43]:
# Reported (acc, sens, spec, f1) configurations, one per row, given to four
# decimal digits (the name suggests Table 3 of the paper -- verify).
scores_table3 = pd.DataFrame({'acc': [0.7704, 0.7995, 0.8179, 0.8127],
                        'sens': [0.7704, 0.7995, 0.8180, 0.8126],
                        'spec': [0.6622, 0.6793, 0.7140, 0.6525],
                        'f1': [0.7839, 0.8085, 0.8259, 0.8173]})

In [44]:
# Only acc and f1 are checked for this table: on their own they are not
# inconsistent, but the "weighted" sens and spec break the consistency.
n_inconsistent = evaluate(scores_table2.loc[:, ['acc', 'f1']], isic2016, 1e-4)

print(n_inconsistent)

0


In [45]:
# The paper does not claim the use of weighted measures for this table, so all four scores are checked.
# NOTE(review): this assignment replaces the count obtained for the previous
# table instead of adding to it -- confirm that only this table should be counted.
n_inconsistent = evaluate(scores=scores_table3, testset=isic2016, eps=1e-4)

print(n_inconsistent)

inconsistency: 0 {'acc': 0.7704, 'sens': 0.7704, 'spec': 0.6622, 'f1': 0.7839} {'p': 75, 'n': 304} 0.0001
inconsistency: 1 {'acc': 0.7995, 'sens': 0.7995, 'spec': 0.6793, 'f1': 0.8085} {'p': 75, 'n': 304} 0.0001
inconsistency: 2 {'acc': 0.8179, 'sens': 0.818, 'spec': 0.714, 'f1': 0.8259} {'p': 75, 'n': 304} 0.0001
inconsistency: 3 {'acc': 0.8127, 'sens': 0.8126, 'spec': 0.6525, 'f1': 0.8173} {'p': 75, 'n': 304} 0.0001
4


In [46]:
# Summary record of this paper (same keys as the other records).
results.append({'key': 'skin9',
                'citations': 238,
                'dataset': 'isic2016',
                'uncertainty': 4,  # shown as 'digits' in the final table; the checks above use eps=1e-4
                'scores': ['acc', 'sens', 'spec', 'f1'],
                'suitable': 'y',
                # number of score rows (configurations) in the two tables
                'n_scores': len(scores_table2) + len(scores_table3),
                'n_inconsistent': n_inconsistent,
                'remark': None,
                'conclusion': 'Unorthodox weighting of the scores.',
                'survey': 'isic2018 reported',
                'survey_ref': 201,
                'year': 2020})

# Assembling the final table

In [47]:
# One row per summary record collected in `results`.
data = pd.DataFrame(results)

# The LaTeX fragments are raw strings: '\c' is not a valid escape sequence, and
# the backslash has to reach the generated table literally.
data['key'] = data['key'].apply(lambda x: r'\cite{' + x + '}')
data = data.rename({'uncertainty': 'digits'}, axis='columns')
# the list of score names becomes a comma separated string
data['scores'] = data['scores'].apply(lambda x: ', '.join(x))
# dataset identifiers are replaced by their display name followed by a citation
data['dataset'] = data['dataset'].apply(lambda x: x.replace('isic2016', r'ISIC2016 \cite{isic2016}').replace('isic2017', r'ISIC2017 \cite{isic2017}'))
# records with a remark (papers that were not checked) show the remark as the conclusion
data['conclusion'] = data.apply(lambda row: row['conclusion'] if row['remark'] is None else row['remark'], axis=1)

# columns of the published table, in display order
final = data[['key', 'citations', 'dataset', 'digits', 'scores', 'n_scores', 'n_inconsistent', 'conclusion']]

In [48]:
# short column headers for the LaTeX table
final = final.rename(columns={'key': 'ref.',
                              'citations': 'cit.',
                              'n_scores': '$N_{sc.}$',
                              'n_inconsistent': '$N_{inc.}$'})

In [49]:
# Column specification of the LaTeX table: one entry per column of `final`,
# each followed by a 5pt gap. Raw strings are used because '\h' is not a valid
# escape sequence and the backslash must appear literally in the output.
column_format = (r'l@{\hspace{5pt}}'
                 r'r@{\hspace{5pt}}'
                 r'p{75pt}@{\hspace{5pt}}'
                 r'r@{\hspace{5pt}}'
                 r'p{75pt}@{\hspace{5pt}}'
                 r'l@{\hspace{5pt}}'
                 r'l@{\hspace{5pt}}'
                 r'p{240pt}@{\hspace{5pt}}')

# The specification is handed to pandas directly rather than patched into the
# output by replacing the default spec string, which would silently do nothing
# if the default spec ever differed. Cells rendered as 'NaN' are blanked.
print(final.to_latex(index=False, column_format=column_format).replace('NaN', ''))

\begin{tabular}{l@{\hspace{5pt}}r@{\hspace{5pt}}p{75pt}@{\hspace{5pt}}r@{\hspace{5pt}}p{75pt}@{\hspace{5pt}}l@{\hspace{5pt}}l@{\hspace{5pt}}p{240pt}@{\hspace{5pt}}}
\toprule
ref. & cit. & dataset & digits & scores & $N_{sc.}$ & $N_{inc.}$ & conclusion \\
\midrule
\cite{skin0} & 991 & ISIC2016 \cite{isic2016} & 3 & acc, sens, spec & 18 & 1 & Potentially typos present. \\
\cite{skin1} & 603 & ISIC2016 \cite{isic2016} & - & - & - & - & The paper is about image segmentation performance. \\
\cite{skin2} & 574 & ISIC2016 \cite{isic2016} & 3 & acc, sens, spec & 27 & 3 & Potentially typos present. \\
\cite{skin3} & 389 & ISIC2017 \cite{isic2017} m/sk & 3 & acc, sens, spec & 32 & 2 & Potentially typos present. \\
\cite{skin4} & 389 & ISIC2016 \cite{isic2016} (custom selection) & - & - & - & - & Not enough details shared. \\
\cite{skin5} & 322 & Argenziano's \cite{argenziano} & 3 & ppv, sens, spec & 16 & 0 & No inconsistency identified. \\
\cite{skin6} & 313 & custom & - & - & - & - & Not enough

In [50]:
# display the full summary frame, including the columns left out of the LaTeX table
data

Unnamed: 0,key,citations,dataset,digits,scores,suitable,n_scores,n_inconsistent,remark,conclusion,survey,survey_ref,year
0,\cite{skin0},991,ISIC2016 \cite{isic2016},3,"acc, sens, spec",y,18,1,,Potentially typos present.,Only the accuracy score of segmentation is rep...,155,2017
1,\cite{skin1},603,ISIC2016 \cite{isic2016},-,-,n,-,-,The paper is about image segmentation performa...,The paper is about image segmentation performa...,Reports segmentation scores,158,2017
2,\cite{skin2},574,ISIC2016 \cite{isic2016},3,"acc, sens, spec",y,27,3,,Potentially typos present.,Results for the evaluation of 100 images (50-50).,156,2017
3,\cite{skin3},389,ISIC2017 \cite{isic2017} m/sk,3,"acc, sens, spec",y,32,2,,Potentially typos present.,No scores in the survey,169,2019
4,\cite{skin4},389,ISIC2016 \cite{isic2016} (custom selection),-,-,n,-,-,Not enough details shared.,Not enough details shared.,,178,2019
5,\cite{skin5},322,Argenziano's \cite{argenziano},3,"ppv, sens, spec",y,16,0,,No inconsistency identified.,Reports the BWV results instead of the diagnos...,172,2019
6,\cite{skin6},313,custom,-,-,n,-,-,Not enough details shared.,Not enough details shared.,,174,2018
7,\cite{skin7},312,ISIC2017 \cite{isic2017} m/sk,3,"acc, sens, spec",y,20,20,,"All accuracy, sensitivity and specificity scor...",Reports incorrect scores,162,2018
8,\cite{skin8},259,custom,4,"acc, sens, spec",y,18,0,,No inconsistency identified.,Reports the results only for the xanthous dataset,113,2017
9,\cite{skin9},238,ISIC2016 \cite{isic2016},4,"acc, sens, spec, f1",y,8,4,,Unorthodox weighting of the scores.,isic2018 reported,201,2020
