## Reproduce Analysis for Task 2 corresponding to Figure 3 in the paper.

This notebook contains code and analysis for reproducing results for Figure 3

In [1]:
from scipy.stats import bootstrap, permutation_test
import pandas as pd
from pathlib import Path
from functools import partial
from tqdm import tqdm
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns


pio.templates["custom"] = go.layout.Template(
    layout=go.Layout(
        colorway=px.colors.qualitative.D3,
    )
)

from utils import get_model_stats, get_model_comparison_stats

### Load CSV files for each of the implementation approach predictions

We load saved prediction files for each of the implementation approaches. The CSV contains the predicted probabilities for malignancy and the true label from the dataset. 

In [2]:
path = Path("../outputs/predictions/task2")

In [3]:
implementation_dict = {
    "Foundation (Features)": [csv_path for csv_path in path.glob("foundation_features*.csv")],
    "Foundation (Finetuned)": [csv_path for csv_path in path.glob("foundation_finetuned*.csv")],
    "Supervised": [csv_path for csv_path in path.glob("supervised_random*.csv")],
    "Supervised (Finetuned)": [csv_path for csv_path in path.glob("supervised_finetuned*.csv")],
    "Med3D (Features)": [csv_path for csv_path in path.glob("med3d_features*.csv")],
    "Med3D (Finetuned)": [csv_path for csv_path in path.glob("med3d_finetuned*.csv")],
    "Models Genesis (Features)": [csv_path for csv_path in path.glob("modelsgen_features*.csv")],
    "Models Genesis (Finetuned)": [csv_path for csv_path in path.glob("modelsgen_finetuned*.csv")],
}

implementation_rank = {key: i for i, key in enumerate(implementation_dict.keys())}

### Analysis for computing metrics for each of the implementation approaches

Here we compute the metrics for each of the implementation approaches along with the 95% confidence intervals. Each implementation approach is also compared to all other implementation 
approaches with a difference confidence interval and p-value. We bootstrap to compute the confidence intervals and run a permutation test for the p-values.

In [4]:
pbar = tqdm(total=len(implementation_dict) * len(implementation_dict["Supervised"]))
results = []

# We use 1000 resamples in the study, but for the sake of time we use reproduce results with 10 here
N_RESAMPLES = 1000

for implementation_name, implementation_list in implementation_dict.items():
    for model_prediction_csv in implementation_list:
        data_percentage = (
            float(model_prediction_csv.stem.split("_")[-2]) / 100 if len(model_prediction_csv.stem.split("_")) > 2 else 1.0
        )
        df = pd.read_csv(model_prediction_csv)
        pred_set = (
            df["malignancy"].values,
            df["conf_scores_class_1"].values if "conf_scores_class_1" in df.columns else df["conf_scores_class"].values,
        )

        map_values = get_model_stats(
            *pred_set,
            fn="average_precision_score",
            nsamples=N_RESAMPLES,
        )

        auc_values = get_model_stats(
            *pred_set,
            fn="roc_auc_score",
            nsamples=N_RESAMPLES,
        )
        row = {
            "Implementation": implementation_name,
            "Data Percentage": data_percentage,
            "mAP": np.mean(map_values),
            "mAP_values": map_values,
            "mAP_low_CI": np.percentile(map_values, 2.5),
            "mAP_high_CI": np.percentile(map_values, 97.5),
            "AUC": np.mean(auc_values),
            "AUC_low_CI": np.percentile(auc_values, 2.5),
            "AUC_high_CI": np.percentile(auc_values, 97.5),
            "AUC_values": auc_values,
        }

        # Compute statistics for comparison between this implementation and all other ones (difference CI and p-value)
        compare_impementations = {k: v for k, v in implementation_dict.items() if k != implementation_name}
        for _implementation_name, _implementations_list in compare_impementations.items():
            for _model_prediction_csv in _implementations_list:
                _data_percentage = (
                    float(_model_prediction_csv.stem.split("_")[-2]) / 100
                    if len(_model_prediction_csv.stem.split("_")) > 2
                    else 1.0
                )
                if data_percentage == _data_percentage:
                    _df = pd.read_csv(_model_prediction_csv)
                    _pred = (
                        _df["conf_scores_class_1"].values
                        if "conf_scores_class_1" in _df.columns
                        else _df["conf_scores_class"].values
                    )
                    _pred_set = (*pred_set, _pred)

                    perm_test = get_model_comparison_stats(
                        *_pred_set,
                        fn="roc_auc_score",
                        nsamples=N_RESAMPLES,
                    )

                    row[f"AUC_diff_CI_low_{_implementation_name}"] = perm_test[0][0]
                    row[f"AUC_diff_CI_high_{_implementation_name}"] = perm_test[0][1]
                    row[f"AUC_pval_{_implementation_name}"] = perm_test[1]

                    perm_test = get_model_comparison_stats(
                        *_pred_set,
                        fn="average_precision_score",
                        nsamples=N_RESAMPLES,
                    )

                    row[f"mAP_diff_CI_low_{_implementation_name}"] = perm_test[0][0]
                    row[f"mAP_diff_CI_high_{_implementation_name}"] = perm_test[0][1]
                    row[f"mAP_pval_{_implementation_name}"] = perm_test[1]

        results.append(row)
        pbar.update(1)

  3%|▎         | 1/32 [00:41<21:20, 41.31s/it]

KeyboardInterrupt: 

In [4]:
results_df = pd.read_pickle("task2_results.pkl")
results_df["Implementation_Rank"] = results_df["Implementation"].map(implementation_rank)
results_df.sort_values(by=["Data Percentage", "Implementation_Rank"], inplace=True, ascending=True)
results_df.drop("Implementation_Rank", axis=1, inplace=True)

In [5]:
gray_palette = sns.color_palette("gray", 6).as_hex()

### Generate the figures
The figures are reproduced using plotly 

In [7]:
results_df_ = results_df[results_df["Data Percentage"] == 1]
for metric in ["mAP", "AUC"]:
    results_df_[f"e_plus_{metric}"] = results_df_[f"{metric}_high_CI"] - results_df_[metric]
    results_df_[f"e_minus_{metric}"] = results_df_[metric] - results_df_[f"{metric}_low_CI"]
    colors = ["#3182BD", "#00A3D5", *gray_palette]
    metric_values = results_df_[f"{metric}_values"].values

    fig = px.bar(
        results_df_,
        x="Implementation",
        y=metric,
        error_y=f"e_plus_{metric}",
        error_y_minus=f"e_minus_{metric}",
        color="Implementation",
        template="simple_white",
        labels={"Model": "", metric: metric, "Implementation": ""},
        color_discrete_sequence=colors,  # Use the blues color palette
        range_y=[0.4, 1],
    )

    random_val = np.random.rand(len(metric_values[0]))
    for sample_idx in range(len(metric_values[0])):
        fig.add_trace(
            go.Box(
                x=results_df_["Implementation"],
                y=[metric_values[i][sample_idx] for i in range(len(metric_values))],
                marker=dict(color="black", symbol="circle", size=1, line=dict(width=0)),
                fillcolor="rgba(255, 255, 255,0)",
                line=dict(
                    width=0,
                ),
                boxpoints="all",  # display the original data points
                opacity=0.2,
                pointpos=-1.6 + (random_val[sample_idx] * 3.2),
                showlegend=False,
            )
        )

    title = "Full training set"
    fig.update_layout(
        title=title,
        width=600,
        height=500,
        autosize=False,
        legend=dict(
            orientation="v",
        ),
        bargap=0,  # Reduce gap between bars
        template="simple_white",
        title_x=0.5,
        xaxis=dict(showticklabels=False),
        yaxis=dict(showgrid=True),
        xaxis_title=None,
        showlegend=True,
    ),
    fig.update_traces(marker_pattern_shape="\\", selector={"legendgroup": "Models Genesis (Finetuned)"})
    fig.update_traces(marker_pattern_shape="\\", selector={"legendgroup": "Models Genesis (Features)"})
    for trace in fig.data:
        if isinstance(trace, go.Bar):
            trace.error_y.thickness = 1
    fig.show()
    fig.write_image(f"figures_vector_pdf/task2_{metric}_dot.pdf")
    fig.data = []



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [10]:
for metric in ["mAP", "AUC"]:
    results_df[f"e_plus_{metric}"] = results_df[f"{metric}_high_CI"] - results_df[metric]
    results_df[f"e_minus_{metric}"] = results_df[metric] - results_df[f"{metric}_low_CI"]

    colors = ["#3182BD", "#00A3D5", *gray_palette]
    fig = px.line(
        results_df,
        x="Data Percentage",
        y=metric,
        # error_y=f"e_plus_{metric}",
        # error_y_minus=f"e_minus_{metric}",
        color="Implementation",
        markers=True,
        template="simple_white",
        labels={"Data Percentage": "Percentage", metric: metric},
        color_discrete_sequence=colors,  # Use the blues color palette
        range_y=[0.4, 1],
    )

    fig.update_traces(marker=dict(size=10))

    title = "Percentages of training data"
    fig.update_traces(
        error_y=dict(
            thickness=1,
        ),
    )
    fig.update_layout(
        title=title,
        width=600,
        height=600,
        autosize=True,
        showlegend=False,
        legend=dict(yanchor="bottom", y=-0.45, orientation="h", xanchor="right", x=1.2),
        template="simple_white",
        title_x=0.5,
        yaxis=dict(showgrid=True),
        xaxis=dict(
            tickmode="array",  # change 1
            tickvals=[0.1, 0.2, 0.5, 1],  # change 2
            ticktext=["10%", "20%", "50%", "100%"],  # change 3
            autorange="reversed",
        ),
    )
    fig.update_traces(patch={"line": {"dash": "dot"}}, selector={"legendgroup": "Models Genesis (Features)"})
    fig.update_traces(patch={"line": {"dash": "dot"}}, selector={"legendgroup": "Models Genesis (Finetuned)"})

    fig.show()
    fig.write_image(f"figures_vector_pdf/task2_{metric}_percentages.pdf")
    fig.data = []