## Reproduce Analysis for Task 1 corresponding to Figure 2 in the paper.

This notebook contains code and analysis for reproducing results for Figure 2

In [1]:
import pandas as pd
from pathlib import Path
from functools import partial
from tqdm import tqdm
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import plotly.io as pio
import numpy as np

from utils import get_model_stats, get_model_comparison_stats

pio.templates["custom"] = go.layout.Template(
    layout=go.Layout(
        colorway=px.colors.qualitative.D3,
    )
)

### Load CSV files for each of the implementation approach predictions

We load saved prediction files for each of the implementation approaches. The CSV contains the predicted probabilities for each of the 8 anatomical sites and the true label from the dataset. It also contains information about several other parameters such as lesion type, etc. 

In [2]:
path = Path("../outputs/predictions/task1")

In [4]:
implementation_dict = {
    "Foundation (Features)": [csv_path for csv_path in path.glob("foundation_features*.csv")],
    "Foundation (Finetuned)": [csv_path for csv_path in path.glob("foundation_finetuned*.csv")],
    "Supervised": [csv_path for csv_path in path.glob("supervised*.csv")],
    "Med3D (Features)": [csv_path for csv_path in path.glob("med3d_features*.csv")],
    "Med3D (Finetuned)": [csv_path for csv_path in path.glob("med3d_finetuned*.csv")],
    "Models Genesis (Features)": [csv_path for csv_path in path.glob("modelsgen_features*.csv")],
    "Models Genesis (Finetuned)": [csv_path for csv_path in path.glob("modelsgen_finetuned*.csv")],
}

implementation_rank = {key: i for i, key in enumerate(implementation_dict.keys())}

### Analysis for computing metrics for each of the implementation approaches

Here we compute the metrics for each of the implementation approaches along with the 95% confidence intervals. Each implementation approach is also compared to all other implementation 
approaches with a difference confidence interval and p-value. We bootstrap to compute the confidence intervals and run a permutation test for the p-values.

In [5]:
pbar = tqdm(total=len(implementation_dict) * len(implementation_dict["Supervised"]))
results = []

# We use 1000 resamples in the study, but for the sake of time we use reproduce results with 10 here
N_RESAMPLES = 10

for implementation_name, implementation_list in implementation_dict.items():
    for model_prediction_csv in implementation_list:
        data_percentage = (
            float(model_prediction_csv.stem.split("_")[-2]) / 100 if len(model_prediction_csv.stem.split("_")) > 2 else 1.0
        )
        df = pd.read_csv(model_prediction_csv)

        for i in range(8):
            if f"conf_scores_class_{i}" not in df.columns:
                df[f"conf_scores_class_{i}"] = 0

        pred_set = (df["Coarse_lesion_type"].values, df.filter(like="conf_scores").values)
        map_values = get_model_stats(
            *pred_set,
            fn="mean_average_precision",
            nsamples=N_RESAMPLES,
        )

        ba_values = get_model_stats(
            *pred_set,
            fn="balanced_accuracy",
            nsamples=N_RESAMPLES,
        )

        row = {
            "Implementation": implementation_name,
            "Data Percentage": data_percentage,
            "mAP": np.mean(map_values),
            "mAP_values": map_values,
            "mAP_low_CI": np.percentile(map_values, 2.5),
            "mAP_high_CI": np.percentile(map_values, 97.5),
            "BA": np.mean(ba_values),
            "BA_low_CI": np.percentile(ba_values, 2.5),
            "BA_high_CI": np.percentile(ba_values, 97.5),
            "BA_values": ba_values,
        }

        # Compute statistics for comparison between this implementation and all other ones (difference CI and p-value)
        compare_impementations = {k: v for k, v in implementation_dict.items() if k != implementation_name}
        for _implementation_name, _implementations_list in compare_impementations.items():
            for _model_prediction_csv in _implementations_list:
                _data_percentage = (
                    float(_model_prediction_csv.stem.split("_")[-2]) / 100
                    if len(_model_prediction_csv.stem.split("_")) > 2
                    else 1.0
                )
                if data_percentage == _data_percentage:
                    _df = pd.read_csv(_model_prediction_csv)
                    # Check if 8 columns with conf_scores_class_{idx} exist, if not add a column with zeros for missing
                    for i in range(8):
                        if f"conf_scores_class_{i}" not in _df.columns:
                            _df[f"conf_scores_class_{i}"] = 0

                    _pred = _df.filter(like="conf_scores").values
                    _pred_set = (*pred_set, _pred)

                    perm_test = get_model_comparison_stats(
                        *_pred_set,
                        fn="balanced_accuracy",
                        nsamples=N_RESAMPLES,
                    )

                    row[f"BA_diff_CI_low_{_implementation_name}"] = perm_test[0][0]
                    row[f"BA_diff_CI_high_{_implementation_name}"] = perm_test[0][1]
                    row[f"BA_pval_{_implementation_name}"] = perm_test[1]

                    perm_test = get_model_comparison_stats(
                        *_pred_set,
                        fn="mean_average_precision",
                        nsamples=N_RESAMPLES,
                    )

                    row[f"mAP_diff_CI_low_{_implementation_name}"] = perm_test[0][0]
                    row[f"mAP_diff_CI_high_{_implementation_name}"] = perm_test[0][1]
                    row[f"mAP_pval_{_implementation_name}"] = perm_test[1]

        results.append(row)
        pbar.update(1)

0it [00:00, ?it/s]

In [6]:
# results_df = pd.DataFrame(results)

results_df = pd.read_pickle("task1.pkl")
# results_df = pd.read_csv("result_csvs/task1_fixed.csv")
results_df["Implementation_Rank"] = results_df["Implementation"].map(implementation_rank)
results_df.sort_values(by=["Data Percentage", "Implementation_Rank"], inplace=True, ascending=True)
results_df.drop("Implementation_Rank", axis=1, inplace=True)

In [8]:
results_df[results_df["Data Percentage"] == 1]

Unnamed: 0,Implementation,Data Percentage,mAP,mAP_values,mAP_low_CI,mAP_high_CI,BA,BA_low_CI,BA_high_CI,BA_values,...,BA_pval_Models Genesis (Finetuned),mAP_diff_CI_low_Models Genesis (Finetuned),mAP_diff_CI_high_Models Genesis (Finetuned),mAP_pval_Models Genesis (Finetuned),BA_diff_CI_low_Foundation (Features),BA_diff_CI_high_Foundation (Features),BA_pval_Foundation (Features),mAP_diff_CI_low_Foundation (Features),mAP_diff_CI_high_Foundation (Features),mAP_pval_Foundation (Features)
3,Foundation (Features),1.0,0.847815,"[0.8432566196960686, 0.8619535147862756, 0.848...",0.816616,0.878474,0.77943,0.746945,0.809909,"[0.7945897019722147, 0.7890314542662229, 0.769...",...,0.252,-0.002126,0.056877,0.034,,,,,,
4,Foundation (Finetuned),1.0,0.855878,"[0.8513567507818766, 0.8454574271013343, 0.855...",0.826309,0.882107,0.804614,0.775275,0.834946,"[0.8368961112148825, 0.7995212924206898, 0.790...",...,0.021,0.011554,0.065391,0.006,-0.006785,0.058291,0.07,-0.016243,0.036086,0.244
9,Supervised,1.0,0.821244,"[0.8298386861958249, 0.8194270500611527, 0.820...",0.788203,0.85322,0.720204,0.689028,0.751029,"[0.7155627357973212, 0.7505271681199173, 0.745...",...,0.994,-0.030026,0.037112,0.449,-0.093282,-0.025713,1.0,-0.051811,0.001941,0.949
12,Med3D (Features),1.0,0.659406,"[0.6646172568418498, 0.6424729169054022, 0.633...",0.62501,0.692738,0.614128,0.581554,0.646572,"[0.6417301485834737, 0.620391726417544, 0.6314...",...,1.0,-0.197424,-0.120428,1.0,-0.204459,-0.126445,1.0,-0.224831,-0.153194,1.0
17,Med3D (Finetuned),1.0,0.813114,"[0.8075177935852775, 0.8391844055603279, 0.792...",0.779779,0.846698,0.783501,0.74973,0.816687,"[0.8023502379761918, 0.7716968512557327, 0.794...",...,0.165,-0.033516,0.023737,0.625,-0.03269,0.036007,0.44,-0.059902,-0.006134,0.992
23,Models Genesis (Features),1.0,0.791329,"[0.7842390625693547, 0.7718743295516455, 0.787...",0.760146,0.821855,0.698173,0.666347,0.729977,"[0.6763925218525426, 0.7079235605071561, 0.703...",...,1.0,-0.061754,0.00603,0.92,-0.119392,-0.04176,1.0,-0.084257,-0.024382,0.999
26,Models Genesis (Finetuned),1.0,0.819371,"[0.8309974656706345, 0.8160368114561494, 0.819...",0.786681,0.850924,0.768038,0.734784,0.800054,"[0.7554206850752618, 0.7641704225114363, 0.785...",...,,,,,-0.044366,0.021408,0.754,-0.061244,0.000282,0.962


In [9]:
df = pd.read_csv("/home/suraj/Repositories/foundation-cancer-image-biomarker/preds.csv")

In [14]:
pred_set = (df["Coarse_lesion_type"].values, df.filter(like="pred").values)

map_values = get_model_stats(
    *pred_set,
    fn="mean_average_precision",
    nsamples=N_RESAMPLES,
)

ba_values = get_model_stats(
    *pred_set,
    fn="balanced_accuracy",
    nsamples=N_RESAMPLES,
)

print(np.mean(map_values), np.mean(ba_values))

0.821850765753539 0.8034750643549391


### Generate the figures
The figures are reproduced using plotly 

In [13]:
gray_palette = sns.color_palette("gray", 6).as_hex()

In [48]:
results_df_ = results_df[results_df["Data Percentage"] == 1]
for metric in ["mAP", "BA"]:
    results_df_[f"e_plus_{metric}"] = results_df_[f"{metric}_high_CI"] - results_df_[metric]
    results_df_[f"e_minus_{metric}"] = results_df_[metric] - results_df_[f"{metric}_low_CI"]
    metric_values = results_df_[f"{metric}_values"].values

    colors = ["#3182BD", "#00A3D5", gray_palette[0], *gray_palette[2:]]

    fig = px.bar(
        results_df_,
        x="Implementation",
        y=metric,
        error_y=f"e_plus_{metric}",
        error_y_minus=f"e_minus_{metric}",
        color="Implementation",
        template="simple_white",
        labels={"Model": "", metric: metric, "Implementation": "Implementation approaches"},
        color_discrete_sequence=colors,  # Use the blues color palette
        range_y=[0.4, 1],
    )

    random_val = np.random.rand(len(metric_values[0]))
    for sample_idx in range(len(metric_values[0])):
        fig.add_trace(
            go.Box(
                x=results_df_["Implementation"],
                y=[metric_values[i][sample_idx] for i in range(len(metric_values))],
                marker=dict(color="black", symbol="circle", size=1, line=dict(width=0)),
                fillcolor="rgba(255, 255, 255,0)",
                line=dict(
                    width=0,
                ),
                boxpoints="all",  # display the original data points
                opacity=0.2,
                pointpos=-1.6 + (random_val[sample_idx] * 3.2),
                showlegend=False,
            )
        )

    title = "Full training set"
    fig.update_layout(
        title=title,
        width=600,
        height=500,
        autosize=False,
        legend=dict(
            orientation="v",
        ),
        template="simple_white",
        bargap=0,
        title_x=0.5,
        xaxis=dict(showticklabels=False),
        yaxis=dict(showgrid=True),
        xaxis_title=None,
        showlegend=True,
    ),
    for trace in fig.data:
        if isinstance(trace, go.Bar):
            trace.error_y.thickness = 1

    fig.show()
    fig.write_image(f"figures_vector_pdf/task1_{metric}_dot.pdf")
    fig.data = []



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [17]:
for metric in ["mAP", "BA"]:
    results_df[f"e_plus_{metric}"] = results_df[f"{metric}_high_CI"] - results_df[metric]
    results_df[f"e_minus_{metric}"] = results_df[metric] - results_df[f"{metric}_low_CI"]

    fig = px.line(
        results_df,
        x="Data Percentage",
        y=metric,
        # error_y=f"e_plus_{metric}",
        # error_y_minus=f"e_minus_{metric}",
        color="Implementation",
        markers=True,
        template="simple_white",
        labels={"Data Percentage": "Percentage", metric: metric},
        color_discrete_sequence=colors,  # Use the blues color palette
        range_y=[0.4, 1],
    )

    fig.update_traces(marker=dict(size=10))
    # fig.update_traces(marker=dict(symbol="star"))

    title = "Percentages of training data"
    fig.update_traces(
        error_y=dict(
            thickness=1,
        ),
    )
    fig.update_layout(
        title=title,
        width=600,
        height=600,
        autosize=True,
        showlegend=False,
        legend=dict(yanchor="bottom", y=-0.4, orientation="h", xanchor="right", x=1),
        template="simple_white",
        title_x=0.5,
        yaxis=dict(showgrid=True),
        xaxis=dict(
            tickmode="array",  # change 1
            tickvals=[0.1, 0.2, 0.5, 1],  # change 2
            ticktext=["10%", "20%", "50%", "100%"],  # change 3
            autorange="reversed",
        ),
    )

    fig.show()
    fig.write_image(f"figures_vector_pdf/task1_{metric}_percentages.pdf")
    fig.data = []