In [2]:
import pandas as pd
import os
import numpy as np

import seaborn as sns
sns.set_style("white")
from evalutils.roc import get_bootstrapped_roc_ci_curves
import matplotlib.pyplot as plt

%matplotlib inline
import sklearn.metrics as skl_metrics

## directory where results are
LOCAL_PC = False
root_dir = "/mnt/w" if LOCAL_PC else "/data/bodyct"
EXPERIMENT_DIR = f"{root_dir}/experiments/lung-malignancy-fairness-shaurya"
NLST_PREDS = f"{EXPERIMENT_DIR}/nlst-preds"

NLST_PREDS_LOCAL = "/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst"
NLST_PREDS = NLST_PREDS_LOCAL

In [3]:
MODEL_NAME = "final_layer_nlst_validtion_20240626"
BASE_PATH = rf"{NLST_PREDS}/Tijmen Local NLST"

In [4]:
def sigmoid(x):
    """Compute sigmoid values for each set of scores in x."""
    s = 1 / (1 + np.exp(-x))
    return s

In [5]:
def load_external_csv(model_name, local=True, valid=True, train=False, base_path=None):
    if base_path is None:
        # Adjusted the base path to reflect the path structure seen in the screenshot
        base_path = rf"/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst/NLST_Tijmen_results/{model_name}"
    
    if valid:
        base_path = rf"{base_path}/valid_fold"
    elif train:
        base_path = rf"{base_path}/train_fold"
    else:
        base_path = rf"{base_path}/fold"
    
    if local:
        column_name = 'y_local'
    else:
        column_name = 'y_pred'
    
    df = None
    # Loop over the folds
    for fold in range(10):
        # Generate the path for each fold's CSV file
        file_path = rf"{base_path}{fold}/results.csv"
        
        # Read the CSV file for the current fold
        fold_df = pd.read_csv(file_path)
        
        fold_df[column_name] = sigmoid(fold_df[column_name])

        if df is None:
            df = fold_df.copy()   
        else:
            # Merge on 'img_names' and add 'y_Combined' values
            df = pd.merge(df, fold_df[['img_names', column_name, 'y']], on='img_names', how='outer', suffixes=('', f'_fold{fold}')) 

    # Ensure the column from the first fold is kept
    df[f'{column_name}_fold0'] = df[column_name]
    df[f'y_fold0'] = df['y']
    df.drop(columns=[column_name], inplace=True)

    # Calculate entropy across folds
    df['entropy'] = df[[f'{column_name}_fold{i}' for i in range(10)]].apply(
        lambda row: np.mean([-(p * np.log2(p)) - (1 - p) * np.log2(1 - p) for p in row]), 
        axis=1
    )
    
    # Calculate mean prediction across all model results for each image
    df['model_mean'] = df[[f'{column_name}_fold{i}' for i in range(10)]].mean(axis=1)
    df['label_mean'] = df[[f'y_fold{i}' for i in range(10)]].mean(axis=1)

    print(df.keys()) 

    return df

In [6]:
df_train = load_external_csv(MODEL_NAME, valid=False, train=True, local=True, base_path=BASE_PATH)
df_train.query('img_names == "104769_1_19990102"')

Index(['y', 'img_names', 'y_local_fold1', 'y_fold1', 'y_local_fold2',
       'y_fold2', 'y_local_fold3', 'y_fold3', 'y_local_fold4', 'y_fold4',
       'y_local_fold5', 'y_fold5', 'y_local_fold6', 'y_fold6', 'y_local_fold7',
       'y_fold7', 'y_local_fold8', 'y_fold8', 'y_local_fold9', 'y_fold9',
       'y_local_fold0', 'y_fold0', 'entropy', 'model_mean', 'label_mean'],
      dtype='object')


Unnamed: 0,y,img_names,y_local_fold1,y_fold1,y_local_fold2,y_fold2,y_local_fold3,y_fold3,y_local_fold4,y_fold4,...,y_fold7,y_local_fold8,y_fold8,y_local_fold9,y_fold9,y_local_fold0,y_fold0,entropy,model_mean,label_mean
15628,,104769_1_19990102,0.351641,0.0,0.903803,0.0,0.317219,0.0,0.938126,0.0,...,0.0,0.391214,0.0,0.245816,0.0,,,,0.517156,0.0


In [7]:
df_valid = load_external_csv(MODEL_NAME, valid=True, train=False, local=True, base_path=BASE_PATH)
df_valid.query('img_names == "104769_1_19990102"')

Index(['y', 'img_names', 'y_local_fold1', 'y_fold1', 'y_local_fold2',
       'y_fold2', 'y_local_fold3', 'y_fold3', 'y_local_fold4', 'y_fold4',
       'y_local_fold5', 'y_fold5', 'y_local_fold6', 'y_fold6', 'y_local_fold7',
       'y_fold7', 'y_local_fold8', 'y_fold8', 'y_local_fold9', 'y_fold9',
       'y_local_fold0', 'y_fold0', 'entropy', 'model_mean', 'label_mean'],
      dtype='object')


Unnamed: 0,y,img_names,y_local_fold1,y_fold1,y_local_fold2,y_fold2,y_local_fold3,y_fold3,y_local_fold4,y_fold4,...,y_fold7,y_local_fold8,y_fold8,y_local_fold9,y_fold9,y_local_fold0,y_fold0,entropy,model_mean,label_mean
0,0.0,104769_1_19990102,,,,,,,,,...,,,,,,0.683814,0.0,,0.683814,0.0


In [8]:
# df_combined = load_external_csv(MODEL_NAME, valid=False, train=False, local=False, base_path=BASE_PATH)
# df_combined.info()

In [9]:
def process_loaded_csv(df, model_name, output_file_name):
    df = df[['img_names', 'label_mean', 'model_mean']]
    mapper = {
        "img_names": "AnnotationID",
        "label_mean": "label",
        "model_mean": model_name
    }
    df = df.rename(columns=mapper)
    df.to_csv(output_file_name, index=False)
    return df

In [13]:
df_valid_final = process_loaded_csv(df_valid, "Thijmen_local", rf"{NLST_PREDS}/Tijmen Local NLST/combined_valid_output.csv")
df_valid_final

Unnamed: 0,AnnotationID,label,Thijmen_local
0,104769_1_19990102,0.0,0.683814
1,101010_1_20010102,0.0,0.320513
2,100248_2_20010102,0.0,0.002520
3,102563_1_20000102,0.0,0.756166
4,101190_3_20000102,0.0,0.602210
...,...,...,...
16072,213766_3_19990102,0.0,0.006691
16073,218698_2_20000102,0.0,0.785073
16074,214553_1_19990102,1.0,0.968719
16075,217063_1_20000102,0.0,0.000158


In [14]:
df_train_final = process_loaded_csv(df_train, "Thijmen_local", rf"{NLST_PREDS}/Tijmen Local NLST/combined_train_output.csv")
df_train_final

Unnamed: 0,AnnotationID,label,Thijmen_local
0,130987_3_19990102,0.0,0.165665
1,211188_2_20000102,0.0,0.000582
2,119917_6_20010102,0.0,0.000838
3,130203_1_20010102,0.0,0.093253
4,213760_1_19990102,0.0,0.001035
...,...,...,...
16072,100597_2_19990102,0.0,0.000645
16073,103344_1_20010102,1.0,0.763849
16074,100507_2_20000102,0.0,0.040403
16075,102384_1_19990102,0.0,0.046913


## Merge with exsiting predictions

In [16]:
nlst_preds_all = pd.read_csv(f'{NLST_PREDS}/nlst_demo_v1_w3preds.csv')
nlst_preds_all

Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,CoordX,CoordY,CoordZ,LesionID,NoduleType,Spiculation,Diameter [mm],...,canclary,canclung,cancnasa,cancoral,cancpanc,cancphar,cancstom,cancthyr,canctran,study
0,100012,19990102,1.2.840.113654.2.55.24023112856488152536348979...,-32.919853,-136.22139,-90.400002,1,Solid,False,11.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,100012,20000102,1.2.840.113654.2.55.50761756412482430061802871...,-47.650000,-130.37000,1222.350000,1,PartSolid,False,23.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,100019,20000102,1.2.840.113654.2.55.12567845460924968711329545...,-103.190000,74.01000,-1129.370000,1,Solid,False,10.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,100035,19990102,1.2.840.113654.2.55.64360087057061878755830205...,124.670000,5.85000,-193.590000,1,Solid,False,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,100035,20000102,1.2.840.113654.2.55.33941066798745864774354503...,127.360000,-2.95000,-184.570000,1,Solid,False,4.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16072,218863,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.977896926234...,-91.340000,-44.15000,-177.200000,1,GroundGlassOpacity,False,4.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
16073,218863,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.977896926234...,69.720000,-48.07000,-86.470000,2,SemiSolid,False,6.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
16074,218863,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.796552648301...,61.210000,-7.69000,-15.050000,2,SemiSolid,False,7.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
16075,218866,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.698887010763...,31.260000,-216.76000,250.570000,1,Solid,False,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [17]:
nlst_preds_w4 = nlst_preds_all.merge(df_valid_final, 
                how="left", ## 'inner' for only ones Thijmen has preds for
                on=['AnnotationID', 'label'], suffixes=(None,None))
nlst_preds_w4

Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,CoordX,CoordY,CoordZ,LesionID,NoduleType,Spiculation,Diameter [mm],...,canclung,cancnasa,cancoral,cancpanc,cancphar,cancstom,cancthyr,canctran,study,Thijmen_local
0,100012,19990102,1.2.840.113654.2.55.24023112856488152536348979...,-32.919853,-136.22139,-90.400002,1,Solid,False,11.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.968842
1,100012,20000102,1.2.840.113654.2.55.50761756412482430061802871...,-47.650000,-130.37000,1222.350000,1,PartSolid,False,23.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.984129
2,100019,20000102,1.2.840.113654.2.55.12567845460924968711329545...,-103.190000,74.01000,-1129.370000,1,Solid,False,10.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.919277
3,100035,19990102,1.2.840.113654.2.55.64360087057061878755830205...,124.670000,5.85000,-193.590000,1,Solid,False,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.002378
4,100035,20000102,1.2.840.113654.2.55.33941066798745864774354503...,127.360000,-2.95000,-184.570000,1,Solid,False,4.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.000555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16072,218863,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.977896926234...,-91.340000,-44.15000,-177.200000,1,GroundGlassOpacity,False,4.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.002565
16073,218863,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.977896926234...,69.720000,-48.07000,-86.470000,2,SemiSolid,False,6.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.504255
16074,218863,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.796552648301...,61.210000,-7.69000,-15.050000,2,SemiSolid,False,7.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.477067
16075,218866,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.698887010763...,31.260000,-216.76000,250.570000,1,Solid,False,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.022328


In [18]:
nlst_preds_w4.to_csv(f'{NLST_PREDS}/nlst_demo_v1_w4preds.csv', index=False)