In [1]:
import pandas as pd
import os
import numpy as np

import seaborn as sns
sns.set_style("white")
from evalutils.roc import get_bootstrapped_roc_ci_curves
import matplotlib.pyplot as plt

%matplotlib inline
import sklearn.metrics as skl_metrics

## Directories where results are stored
LOCAL_PC = False
# The storage root is selected by the LOCAL_PC switch.
if LOCAL_PC:
    root_dir = "/mnt/w"
else:
    root_dir = "/data/bodyct"
EXPERIMENT_DIR = f"{root_dir}/experiments/lung-malignancy-fairness-shaurya"
NLST_PREDS = f"{EXPERIMENT_DIR}/nlst-preds"

# NOTE(review): the final assignment below unconditionally replaces the
# NLST_PREDS value derived above with the OneDrive path, so LOCAL_PC currently
# has no influence on where the predictions are read from.
NLST_PREDS_LOCAL = "/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst"
NLST_PREDS = NLST_PREDS_LOCAL

In [2]:
# Name of the model run whose per-fold results are aggregated in this notebook.
# NOTE(review): "validtion" is presumably the spelling of the directory on
# disk -- confirm before correcting it, since it is part of the path.
MODEL_NAME = "final_layer_nlst_validtion_20240626"
# Directory that holds this model's fold sub-directories.
BASE_PATH = "/".join([NLST_PREDS, "NLST_Tijmen_results", MODEL_NAME])

In [3]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) applied element-wise to x.

    x may be a scalar, a NumPy array or a pandas Series; the result has the
    same shape as the input.
    """
    return 1 / (1 + np.exp(-x))

In [32]:
def load_external_csv(model_name, local=True, valid=True, base_path=None, n_folds=10):
    """Load the per-fold result CSVs of one model and aggregate them per image.

    For every fold ``i`` in ``range(n_folds)`` the file
    ``{base_path}/valid_fold{i}/results.csv`` (``valid=True``) or
    ``{base_path}/fold{i}/results.csv`` (``valid=False``) is read, ``sigmoid``
    is applied to the prediction column, and the folds are outer-joined on
    ``img_names``.

    Parameters
    ----------
    model_name : str
        Name of the model's results directory. Only used to build the default
        location when ``base_path`` is None.
    local : bool, default True
        Selects the prediction column: ``'y_local'`` if True, ``'y_pred'``
        otherwise.
    valid : bool, default True
        Selects the fold directory prefix: ``valid_fold`` if True, ``fold``
        otherwise.
    base_path : str or None, default None
        Directory containing the fold sub-directories. When None, a hard-coded
        OneDrive location combined with ``model_name`` is used.
    n_folds : int, default 10
        Number of fold directories to read (folds ``0 .. n_folds - 1``).

    Returns
    -------
    pandas.DataFrame
        One row per ``img_names`` value with, for every fold ``i``, the columns
        ``{prediction column}_fold{i}`` and ``y_fold{i}``, plus:

        * ``entropy``    - mean binary entropy (in bits) of the per-fold
          predictions, averaged over the folds that contain the image;
        * ``model_mean`` - mean prediction over the folds that contain the image;
        * ``label_mean`` - mean of ``y`` over the folds that contain the image.

        The unsuffixed ``y`` column (and any other extra column of the fold-0
        file) is kept as in earlier versions of this function.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        raise ValueError(f"n_folds must be at least 1, got {n_folds}")

    if base_path is None:
        # Adjusted the base path to reflect the path structure seen in the screenshot
        base_path = rf"/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst/NLST_Tijmen_results/{model_name}"

    fold_prefix = "valid_fold" if valid else "fold"
    column_name = 'y_local' if local else 'y_pred'

    df = None
    for fold in range(n_folds):
        file_path = rf"{base_path}/{fold_prefix}{fold}/results.csv"
        fold_df = pd.read_csv(file_path)

        # The prediction column is passed through the sigmoid before aggregation.
        fold_df[column_name] = sigmoid(fold_df[column_name])

        if df is None:
            df = fold_df.copy()
        else:
            # Outer join keeps images that occur in only some folds; the
            # incoming prediction/label columns receive a `_fold{fold}` suffix.
            df = pd.merge(
                df,
                fold_df[['img_names', column_name, 'y']],
                on='img_names',
                how='outer',
                suffixes=('', f'_fold{fold}'),
            )

    # The first fold's columns were merged without a suffix; give them one too.
    df[f'{column_name}_fold0'] = df[column_name]
    df['y_fold0'] = df['y']
    df = df.drop(columns=[column_name])

    pred_cols = [f'{column_name}_fold{i}' for i in range(n_folds)]
    label_cols = [f'y_fold{i}' for i in range(n_folds)]

    # Binary entropy per fold, computed in one vectorised pass.
    probs = df[pred_cols].to_numpy(dtype=float)
    with np.errstate(divide='ignore', invalid='ignore'):
        fold_entropy = -(probs * np.log2(probs)) - (1 - probs) * np.log2(1 - probs)
    # p * log2(p) -> 0 as p -> 0, so a fully confident prediction has zero
    # entropy; the direct formula would produce NaN (0 * -inf) instead.
    fold_entropy[(probs == 0) | (probs == 1)] = 0.0
    # Folds that do not contain an image are NaN after the outer join. Average
    # only over the folds that are present, exactly like model_mean/label_mean
    # below (pandas' mean skips NaN); a plain np.mean would make the entropy NaN
    # for every image that is missing from at least one fold.
    df['entropy'] = pd.DataFrame(fold_entropy, index=df.index).mean(axis=1)

    # Calculate mean prediction and mean label across all folds for each image
    df['model_mean'] = df[pred_cols].mean(axis=1)
    df['label_mean'] = df[label_cols].mean(axis=1)

    print(df.keys())

    return df

In [33]:
# Aggregate the `fold{i}/results.csv` files (valid=False) using the `y_pred`
# column (local=False), then summarise the resulting frame.
df_combined = load_external_csv(
    model_name=MODEL_NAME,
    local=False,
    valid=False,
    base_path=BASE_PATH,
)
df_combined.info()

Index(['y', 'img_names', 'y_pred_fold1', 'y_fold1', 'y_pred_fold2', 'y_fold2',
       'y_pred_fold3', 'y_fold3', 'y_pred_fold4', 'y_fold4', 'y_pred_fold5',
       'y_fold5', 'y_pred_fold6', 'y_fold6', 'y_pred_fold7', 'y_fold7',
       'y_pred_fold8', 'y_fold8', 'y_pred_fold9', 'y_fold9', 'y_pred_fold0',
       'y_fold0', 'entropy', 'model_mean', 'label_mean'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3240 entries, 0 to 3239
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   y             306 non-null    float64
 1   img_names     3240 non-null   object 
 2   y_pred_fold1  306 non-null    float64
 3   y_fold1       306 non-null    float64
 4   y_pred_fold2  333 non-null    float64
 5   y_fold2       333 non-null    float64
 6   y_pred_fold3  359 non-null    float64
 7   y_fold3       359 non-null    float64
 8   y_pred_fold4  359 non-null    float64
 9   y_fold4       359 non-null    

In [34]:
# Keep the image identifier and the two cross-fold means, ordered by image name.
df_thijmen = (
    df_combined
    .loc[:, ['img_names', 'label_mean', 'model_mean']]
    .sort_values('img_names')
)
df_thijmen

Unnamed: 0,img_names,label_mean,model_mean
36,100012_1_19990102,1.0,8.066968e-01
157,100012_1_20000102,1.0,8.966099e-01
135,100069_1_20000102,0.0,7.221026e-11
225,100069_1_20010102,0.0,7.296459e-13
236,100069_2_20000102,0.0,1.071313e-10
...,...,...,...
2995,218857_1_20000102,0.0,2.112545e-02
2911,218857_1_20010102,0.0,3.985039e-02
2905,218862_1_19990102,0.0,3.870139e-01
3002,218862_1_20000102,0.0,2.125442e-02


In [37]:
# Old column name -> column name used in the exported file.
mapper = dict(
    img_names="AnnotationID",
    label_mean="label",
    model_mean="Thijmen_mean",
)
df_final = df_thijmen.rename(columns=mapper)
df_final

Unnamed: 0,AnnotationID,label,Thijmen_mean
36,100012_1_19990102,1.0,8.066968e-01
157,100012_1_20000102,1.0,8.966099e-01
135,100069_1_20000102,0.0,7.221026e-11
225,100069_1_20010102,0.0,7.296459e-13
236,100069_2_20000102,0.0,1.071313e-10
...,...,...,...
2995,218857_1_20000102,0.0,2.112545e-02
2911,218857_1_20010102,0.0,3.985039e-02
2905,218862_1_19990102,0.0,3.870139e-01
3002,218862_1_20000102,0.0,2.125442e-02


In [38]:
# Write the renamed per-image summary next to the per-model result folders.
output_path = NLST_PREDS + "/NLST_Tijmen_results/merged_model_output.csv"
df_final.to_csv(output_path, index=False)

In [35]:
# Read each fold{i}/results.csv on its own, show its size and contents, and
# collect the frames so the total row count can be reported afterwards.
fold_dfs = []
for fold in range(10):
    file_path = rf"{BASE_PATH}/fold{fold}/results.csv"
    # Store the sigmoid of y_pred in a new y_sigmoid column.
    fold_df = pd.read_csv(file_path).assign(
        y_sigmoid=lambda frame: sigmoid(frame['y_pred'])
    )
    print(fold, len(fold_df))
    display(fold_df)
    fold_dfs.append(fold_df)

total_len = sum(len(frame) for frame in fold_dfs)
print("total length:", total_len)

0 306


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-5.576508,104669_1_19990102,3.771486e-03
1,0.0,-22.025143,104493_1_19990102,2.720207e-10
2,0.0,-1.043664,104065_1_19990102,2.604436e-01
3,0.0,-6.377769,102500_1_20010102,1.696028e-03
4,0.0,-14.241450,102815_2_19990102,6.531555e-07
...,...,...,...,...
301,0.0,-23.582792,100113_2_19990102,5.729590e-11
302,0.0,-21.100842,102789_2_20010102,6.855210e-10
303,0.0,-5.070404,101189_6_20010102,6.240692e-03
304,0.0,-25.776226,100113_1_20010102,6.390389e-12


1 306


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-5.302941,108714_1_19990102,0.004952
1,0.0,-4.219375,108600_3_19990102,0.014495
2,0.0,-4.507777,108527_1_19990102,0.010903
3,0.0,-1.304109,106862_5_20010102,0.213474
4,0.0,1.054446,106837_1_20010102,0.741628
...,...,...,...,...
301,0.0,-1.085771,105358_1_19990102,0.252416
302,0.0,-2.911454,105934_1_20010102,0.051590
303,0.0,-3.538017,106015_1_19990102,0.028250
304,0.0,-1.344413,105360_1_20010102,0.206785


2 333


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-2.831512,110846_1_19990102,0.055645
1,0.0,1.104000,112197_3_19990102,0.751009
2,0.0,-5.571085,113578_11_20010102,0.003792
3,1.0,4.445492,110432_1_20010102,0.988405
4,0.0,-5.985686,111595_1_19990102,0.002508
...,...,...,...,...
328,0.0,-6.002881,113693_1_19990102,0.002466
329,0.0,-7.733974,112557_6_20000102,0.000438
330,0.0,-7.743012,111849_1_19990102,0.000434
331,0.0,-5.517223,113578_3_20000102,0.004001


3 359


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-6.234936,116662_1_19990102,1.955922e-03
1,0.0,0.187545,120751_1_20010102,5.467493e-01
2,0.0,-17.627247,120344_1_19990102,2.210975e-08
3,0.0,-6.055011,119956_1_19990102,2.340584e-03
4,0.0,-18.145200,120344_4_19990102,1.317164e-08
...,...,...,...,...
354,0.0,0.095859,120751_1_19990102,5.239465e-01
355,1.0,1.796200,117502_1_20010102,8.576858e-01
356,0.0,-0.761528,117265_1_19990102,3.183146e-01
357,0.0,-1.166468,116297_1_20000102,2.374939e-01


4 359


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-4.167844,121222_3_19990102,0.015249
1,0.0,0.474525,125808_1_20010102,0.616454
2,0.0,-5.672084,125757_1_19990102,0.003429
3,0.0,1.992671,124346_3_20000102,0.880025
4,0.0,-0.842809,121380_3_19990102,0.300943
...,...,...,...,...
354,0.0,2.757979,125808_2_20000102,0.940362
355,1.0,0.391775,123326_1_19990102,0.596710
356,0.0,-2.876443,122398_1_20010102,0.053330
357,0.0,-1.613669,121095_3_20010102,0.166080


5 333


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-25.377651,126110_2_20000102,9.519765e-12
1,0.0,-14.300641,127269_1_19990102,6.156165e-07
2,0.0,-13.920865,128252_2_19990102,9.000047e-07
3,1.0,1.034969,127400_1_19990102,7.378780e-01
4,0.0,-6.896164,126296_3_20000102,1.010636e-03
...,...,...,...,...
328,0.0,2.289497,128672_1_20010102,9.080034e-01
329,0.0,-1.131884,130664_4_20010102,2.438135e-01
330,0.0,-0.456469,126349_1_19990102,3.878237e-01
331,0.0,-13.333339,128252_1_19990102,1.619585e-06


6 316


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-4.750983,131988_1_19990102,8.569132e-03
1,0.0,-1.491727,130974_1_19990102,1.836626e-01
2,0.0,-3.495608,132240_8_20010102,2.943746e-02
3,0.0,-2.821428,200272_2_20000102,5.617717e-02
4,0.0,-8.222148,133998_1_20000102,2.685653e-04
...,...,...,...,...
311,0.0,-20.487880,131666_4_20010102,1.265397e-09
312,0.0,-14.218341,131238_7_20010102,6.684250e-07
313,0.0,-23.069450,131793_2_20010102,9.573377e-11
314,0.0,-4.987609,131122_1_20000102,6.775733e-03


7 304


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-3.119572,206057_3_20010102,0.042307
1,0.0,-3.617446,204168_1_20000102,0.026149
2,0.0,-5.431812,203829_1_20010102,0.004356
3,0.0,-4.096107,202311_3_20010102,0.016365
4,0.0,-8.766420,206370_2_19990102,0.000156
...,...,...,...,...
299,0.0,-4.404953,202905_3_20000102,0.012069
300,0.0,-8.566411,204759_1_20010102,0.000190
301,0.0,-3.466769,203090_2_20000102,0.030273
302,0.0,-2.800011,202949_1_20010102,0.057324


8 288


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-3.987769,211501_2_20000102,1.820351e-02
1,0.0,-2.601997,209635_2_19990102,6.901001e-02
2,0.0,-7.554817,211539_1_20000102,5.233081e-04
3,0.0,-8.600963,212964_12_20000102,1.838948e-04
4,1.0,1.298698,210218_1_20010102,7.856158e-01
...,...,...,...,...
283,0.0,4.270953,209545_1_19990102,9.862240e-01
284,0.0,-1.012374,210071_1_19990102,2.665155e-01
285,0.0,-5.669306,209307_1_19990102,3.438395e-03
286,0.0,-1.606334,211063_1_19990102,1.670982e-01


9 336


Unnamed: 0,y,y_pred,img_names,y_sigmoid
0,0.0,-0.485536,218174_1_20000102,0.380946
1,0.0,-0.459882,218862_1_19990102,0.387014
2,0.0,-3.826290,213409_2_19990102,0.021326
3,0.0,-4.897595,214361_1_19990102,0.007409
4,0.0,-3.104005,213457_1_19990102,0.042942
...,...,...,...,...
331,0.0,-2.304427,214058_2_19990102,0.090757
332,0.0,-2.836962,216447_2_20010102,0.055359
333,0.0,-5.400105,217237_3_20010102,0.004496
334,0.0,-0.135986,215138_1_20000102,0.466056


total length: 3240
