In [13]:
import pandas as pd
import os
import numpy as np

import seaborn as sns
sns.set_style("white")
from evalutils.roc import get_bootstrapped_roc_ci_curves
import matplotlib.pyplot as plt

%matplotlib inline
import sklearn.metrics as skl_metrics

## directory where results are
EXPERIMENT_DIR = f"/data/bodyct/experiments/lung-malignancy-fairness-shaurya"
NLST_PREDS = f"{EXPERIMENT_DIR}/nlst"

TEAMS_DIR = "/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst"
NLST_PREDS = TEAMS_DIR

In [14]:
MODEL_NAME = "clip_show_nod_global_nlst_20240501"
BASE_PATH = rf"{NLST_PREDS}/Tijmen-Global-ShowNodule-NLST"

In [15]:
def sigmoid(x):
    """Compute sigmoid values for each set of scores in x."""
    s = 1 / (1 + np.exp(-x))
    return s

In [16]:
def load_external_csv(model_name=None, local=True, valid=True, train=False, base_path=None, num_folds=10):
    if base_path is None:
        # Adjusted the base path to reflect the path structure seen in the screenshot
        base_path = rf"/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst/NLST_Tijmen_results/{model_name}"
    
    if model_name is not None:
        base_path = rf"{base_path}/{model_name}"

    if valid:
        base_path = rf"{base_path}/valid_fold"
    elif train:
        base_path = rf"{base_path}/train_fold"
    else:
        base_path = rf"{base_path}/fold"
    
    if local:
        column_name = 'y_local'
    else:
        column_name = 'y_pred'
    
    df = None
    # Loop over the folds
    for fold in range(num_folds):
        # Generate the path for each fold's CSV file
        file_path = rf"{base_path}{fold}/results.csv"
        
        # Read the CSV file for the current fold
        fold_df = pd.read_csv(file_path)
        
        fold_df[column_name] = sigmoid(fold_df[column_name])

        if df is None:
            df = fold_df.copy()   
        else:
            # Merge on 'img_names' and add 'y_Combined' values
            df = pd.merge(df, fold_df[['img_names', column_name, 'y']], on='img_names', how='outer', suffixes=('', f'_fold{fold}')) 

    # Ensure the column from the first fold is kept
    df[f'{column_name}_fold0'] = df[column_name]
    df[f'y_fold0'] = df['y']
    df.drop(columns=[column_name], inplace=True)

    # Calculate entropy across folds
    df['entropy'] = df[[f'{column_name}_fold{i}' for i in range(10)]].apply(
        lambda row: np.mean([-(p * np.log2(p)) - (1 - p) * np.log2(1 - p) for p in row]), 
        axis=1
    )
    
    # Calculate mean prediction across all model results for each image
    df['model_mean'] = df[[f'{column_name}_fold{i}' for i in range(10)]].mean(axis=1)
    df['label_mean'] = df[[f'y_fold{i}' for i in range(10)]].mean(axis=1)

    print(df.keys()) 

    return df

In [18]:
df_valid = load_external_csv(model_name=MODEL_NAME, valid=True, local=False, base_path=BASE_PATH)
df_valid.query('img_names == "104769_1_19990102"')

Index(['y', 'img_names', 'y_pred_fold1', 'y_fold1', 'y_pred_fold2', 'y_fold2',
       'y_pred_fold3', 'y_fold3', 'y_pred_fold4', 'y_fold4', 'y_pred_fold5',
       'y_fold5', 'y_pred_fold6', 'y_fold6', 'y_pred_fold7', 'y_fold7',
       'y_pred_fold8', 'y_fold8', 'y_pred_fold9', 'y_fold9', 'y_pred_fold0',
       'y_fold0', 'entropy', 'model_mean', 'label_mean'],
      dtype='object')


Unnamed: 0,y,img_names,y_pred_fold1,y_fold1,y_pred_fold2,y_fold2,y_pred_fold3,y_fold3,y_pred_fold4,y_fold4,...,y_fold7,y_pred_fold8,y_fold8,y_pred_fold9,y_fold9,y_pred_fold0,y_fold0,entropy,model_mean,label_mean
323,0.0,104769_1_19990102,,,,,,,,,...,,,,,,0.00043,0.0,,0.00043,0.0


In [19]:
def process_loaded_csv(df, model_name, output_file_name):
    df = df[['img_names', 'label_mean', 'model_mean']]
    mapper = {
        "img_names": "AnnotationID",
        "label_mean": "label",
        "model_mean": model_name
    }
    df = df.rename(columns=mapper)
    df.to_csv(output_file_name, index=False)
    return df

In [20]:
df_valid_final = process_loaded_csv(df_valid, "Thijmen_global_show", rf"{BASE_PATH}/combined_valid_output.csv")
df_valid_final

Unnamed: 0,AnnotationID,label,Thijmen_global_show
0,104912_1_19990102,0.0,3.561057e-07
1,103013_2_19990102,0.0,2.586391e-36
2,102676_1_20010102,1.0,9.148323e-01
3,103347_2_19990102,0.0,2.190919e-10
4,101122_1_19990102,0.0,1.632817e-17
...,...,...,...
16072,214970_1_20000102,0.0,8.989767e-01
16073,218168_2_19990102,0.0,9.621083e-09
16074,213388_9_20000102,0.0,1.598292e-26
16075,215876_1_20010102,0.0,3.864532e-08


In [29]:
df_valid_final['Thijmen_global_show'].describe()

count    1.607700e+04
mean     1.369420e-01
std      2.927513e-01
min      3.645179e-60
25%      2.074120e-12
50%      4.344576e-06
75%      3.153028e-02
max      9.986481e-01
Name: Thijmen_global_show, dtype: float64

## Merge with exsiting predictions

In [21]:
nlst_preds_all = pd.read_csv(f'{NLST_PREDS}/nlst_demo_v1_w4preds.csv')
nlst_preds_all

Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,CoordX,CoordY,CoordZ,LesionID,NoduleType,Spiculation,Diameter [mm],...,cancoral,cancpanc,cancphar,cancstom,cancthyr,canctran,study,Thijmen_local,InSybilTrain,Thijmen_global_hidden
0,100012,19990102,1.2.840.113654.2.55.24023112856488152536348979...,-32.919853,-136.22139,-90.400002,1,Solid,False,11.5,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.968842,False,0.413974
1,100012,20000102,1.2.840.113654.2.55.50761756412482430061802871...,-47.650000,-130.37000,1222.350000,1,PartSolid,False,23.4,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.984129,False,0.529322
2,100019,20000102,1.2.840.113654.2.55.12567845460924968711329545...,-103.190000,74.01000,-1129.370000,1,Solid,False,10.6,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.919277,False,0.011265
3,100035,19990102,1.2.840.113654.2.55.64360087057061878755830205...,124.670000,5.85000,-193.590000,1,Solid,False,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.002378,True,0.000186
4,100035,20000102,1.2.840.113654.2.55.33941066798745864774354503...,127.360000,-2.95000,-184.570000,1,Solid,False,4.8,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000555,False,0.000006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16072,218863,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.977896926234...,-91.340000,-44.15000,-177.200000,1,GroundGlassOpacity,False,4.7,...,0.0,0.0,0.0,0.0,0.0,0.0,2,0.002565,False,0.026826
16073,218863,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.977896926234...,69.720000,-48.07000,-86.470000,2,SemiSolid,False,6.8,...,0.0,0.0,0.0,0.0,0.0,0.0,2,0.504255,False,0.026826
16074,218863,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.796552648301...,61.210000,-7.69000,-15.050000,2,SemiSolid,False,7.6,...,0.0,0.0,0.0,0.0,0.0,0.0,2,0.477067,False,0.653735
16075,218866,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.698887010763...,31.260000,-216.76000,250.570000,1,Solid,False,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2,0.022328,False,0.353565


In [22]:
nlst_preds_w4 = nlst_preds_all.merge(df_valid_final, 
                how="left", ## 'inner' for only ones Thijmen has preds for
                on=['AnnotationID', 'label'], suffixes=(None,None))
nlst_preds_w4

Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,CoordX,CoordY,CoordZ,LesionID,NoduleType,Spiculation,Diameter [mm],...,cancpanc,cancphar,cancstom,cancthyr,canctran,study,Thijmen_local,InSybilTrain,Thijmen_global_hidden,Thijmen_global_show
0,100012,19990102,1.2.840.113654.2.55.24023112856488152536348979...,-32.919853,-136.22139,-90.400002,1,Solid,False,11.5,...,0.0,0.0,0.0,0.0,0.0,1,0.968842,False,0.413974,4.719118e-01
1,100012,20000102,1.2.840.113654.2.55.50761756412482430061802871...,-47.650000,-130.37000,1222.350000,1,PartSolid,False,23.4,...,0.0,0.0,0.0,0.0,0.0,1,0.984129,False,0.529322,9.986481e-01
2,100019,20000102,1.2.840.113654.2.55.12567845460924968711329545...,-103.190000,74.01000,-1129.370000,1,Solid,False,10.6,...,0.0,0.0,0.0,0.0,0.0,1,0.919277,False,0.011265,4.384295e-01
3,100035,19990102,1.2.840.113654.2.55.64360087057061878755830205...,124.670000,5.85000,-193.590000,1,Solid,False,5.0,...,0.0,0.0,0.0,0.0,0.0,1,0.002378,True,0.000186,7.406760e-12
4,100035,20000102,1.2.840.113654.2.55.33941066798745864774354503...,127.360000,-2.95000,-184.570000,1,Solid,False,4.8,...,0.0,0.0,0.0,0.0,0.0,1,0.000555,False,0.000006,1.409943e-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16072,218863,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.977896926234...,-91.340000,-44.15000,-177.200000,1,GroundGlassOpacity,False,4.7,...,0.0,0.0,0.0,0.0,0.0,2,0.002565,False,0.026826,1.537454e-07
16073,218863,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.977896926234...,69.720000,-48.07000,-86.470000,2,SemiSolid,False,6.8,...,0.0,0.0,0.0,0.0,0.0,2,0.504255,False,0.026826,1.537454e-07
16074,218863,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.796552648301...,61.210000,-7.69000,-15.050000,2,SemiSolid,False,7.6,...,0.0,0.0,0.0,0.0,0.0,2,0.477067,False,0.653735,4.696775e-01
16075,218866,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.698887010763...,31.260000,-216.76000,250.570000,1,Solid,False,4.0,...,0.0,0.0,0.0,0.0,0.0,2,0.022328,False,0.353565,1.465341e-03


In [23]:
nlst_preds_w4.to_csv(f'{NLST_PREDS}/nlst_demo_v1_w6preds.csv', index=False)