# Get Uncertainty info for Kiran's Model

Attaches the mean-entropy uncertainty values of Kiran's ensemble to the NLST and DLCST prediction tables. Thanks Dre!

In [76]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown
from sklearn.calibration import calibration_curve
from sklearn.calibration import CalibratedClassifierCV
from scipy.special import logit, expit
from scipy.stats import entropy
from sklearn.linear_model import LogisticRegression

import sys
sys.path.append('../')

import utils
from evalutils.roc import get_bootstrapped_roc_ci_curves
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## Directories where results are.
# Both roots are machine-specific (a mapped network drive and a personal
# OneDrive folder), so each can be overridden with an environment variable of
# the same name; the defaults are the original hard-coded locations.
CHANSEY_ROOT = os.environ.get("CHANSEY_ROOT", "V:")
CHANSEY_DIR = f"{CHANSEY_ROOT}/experiments/lung-malignancy-fairness-shaurya"
TEAMS_DIR = os.environ.get(
    "TEAMS_DIR",
    "C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results",
)

In [78]:
# Input CSVs under CHANSEY_ROOT: the DLCST "certain" and "uncertain" entropy
# tables, and the NLST internal-validation table that holds the model logits.
DLCST_CERTAIN   = f"{CHANSEY_ROOT}/experiments/amara/Technical/Uncertainty/results/LungRads/data/DLCST_certain_Entropy.csv"
DLCST_UNCERTAIN = f"{CHANSEY_ROOT}/experiments/amara/Technical/Uncertainty/results/LungRads/data/DLCST_uncertain_Entropy.csv"
NLST_LOGITS = f"{CHANSEY_ROOT}/experiments/amara/Technical/Uncertainty/dataset_csv/NLST_internal_validation.csv"

In [79]:
# Prediction tables under TEAMS_DIR that the uncertainty values get attached to.
# NOTE: NLST_PREDS is both read and written back by this notebook.
DLCST_PREDS = f"{TEAMS_DIR}/dlcst/calibrated_dlcst_thijmen_kiran_sybil_malignancy_estimation_results.csv"
NLST_PREDS = f"{TEAMS_DIR}/nlst/nlst_demov4_allmodels_cal.csv"

## Read DLCST

In [80]:
# Load the two DLCST entropy tables (the "certain" and "uncertain" subsets).
df_dlcst_certain = pd.read_csv(DLCST_CERTAIN)
df_dlcst_uncertain = pd.read_csv(DLCST_UNCERTAIN)

# DataFrame.info() prints its summary and returns None, so call it as two
# statements; joining the calls with a comma made the cell echo `(None, None)`.
df_dlcst_certain.info()
df_dlcst_uncertain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 794 entries, 0 to 793
Data columns (total 63 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   PatientID                               794 non-null    int64  
 1   StudyDate                               794 non-null    int64  
 2   SeriesInstanceUID                       794 non-null    object 
 3   CoordX                                  794 non-null    float64
 4   CoordY                                  794 non-null    float64
 5   CoordZ                                  794 non-null    float64
 6   Age                                     794 non-null    int64  
 7   Sex                                     794 non-null    int64  
 8   FamilyHistoryLungCa                     794 non-null    int64  
 9   Emphysema                               794 non-null    int64  
 10  diameter_mm                             794 non-null    float6

(None, None)

In [81]:
# Stack the uncertain rows on top of the certain rows; ignore_index=True
# already renumbers the result 0..n-1, so no separate index reset is needed.
dlcst_entropy_parts = [df_dlcst_uncertain, df_dlcst_certain]
df_dlcst_uncertainty = pd.concat(dlcst_entropy_parts, ignore_index=True)
df_dlcst_uncertainty.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 63 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   PatientID                               883 non-null    int64  
 1   StudyDate                               883 non-null    int64  
 2   SeriesInstanceUID                       883 non-null    object 
 3   CoordX                                  883 non-null    float64
 4   CoordY                                  883 non-null    float64
 5   CoordZ                                  883 non-null    float64
 6   Age                                     883 non-null    int64  
 7   Sex                                     883 non-null    int64  
 8   FamilyHistoryLungCa                     883 non-null    int64  
 9   Emphysema                               883 non-null    int64  
 10  diameter_mm                             883 non-null    float6

In [82]:
# Count distinct values per column and show the 27 columns with the most.
df_dlcst_uncertainty.nunique().sort_values(ascending=False).head(27)

i3d_05.08.2020_deepr_50mm_fold2           883
i3d_05.08.2020_deepr_50mm_fold0           883
i3d_05.08.2020_deepr_50mm_fold1           883
mvresnet50_05.08.2020_deepr_50mm_fold8    883
mvresnet50_05.08.2020_deepr_50mm_fold7    883
Mean_entropy                              883
calibratedDL                              883
DL                                        883
i3d_05.08.2020_deepr_50mm_fold9           883
i3d_05.08.2020_deepr_50mm_fold8           883
i3d_05.08.2020_deepr_50mm_fold7           883
i3d_05.08.2020_deepr_50mm_fold6           883
i3d_05.08.2020_deepr_50mm_fold5           883
i3d_05.08.2020_deepr_50mm_fold4           883
std                                       883
i3d_05.08.2020_deepr_50mm_fold3           883
mvresnet50_05.08.2020_deepr_50mm_fold4    883
mvresnet50_05.08.2020_deepr_50mm_fold2    883
mvresnet50_05.08.2020_deepr_50mm_fold3    883
mvresnet50_05.08.2020_deepr_50mm_fold1    883
mvresnet50_05.08.2020_deepr_50mm_fold0    883
mvresnet50_05.08.2020_deepr_50mm_f

In [83]:
# Load the DLCST predictions table from DLCST_PREDS and summarise its columns.
dlcst_preds = pd.read_csv(DLCST_PREDS)
dlcst_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            599 non-null    int64  
 1   StudyDate            599 non-null    int64  
 2   SeriesInstanceUID    599 non-null    object 
 3   Age                  599 non-null    int64  
 4   Sex                  599 non-null    int64  
 5   FamilyHistoryLungCa  599 non-null    int64  
 6   Emphysema            599 non-null    int64  
 7   NoduleCountPerScan   599 non-null    int64  
 8   sybil_year1          599 non-null    float64
 9   sybil_year2          599 non-null    float64
 10  sybil_year3          599 non-null    float64
 11  sybil_year4          599 non-null    float64
 12  sybil_year5          599 non-null    float64
 13  sybil_year6          599 non-null    float64
 14  PanCan2b             599 non-null    float64
 15  Ensemble_Kiran       599 non-null    flo

In [84]:
# Load the DLCST table whose AnnotationID / Ensemble columns are compared
# against the uncertainty table further down.
dlcst_preds_nodule = pd.read_csv(f"{TEAMS_DIR}/dlcst/20220816_DLCST_full_Venk21.csv")
dlcst_preds_nodule.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            883 non-null    int64  
 1   StudyDate            883 non-null    int64  
 2   SeriesInstanceUID    883 non-null    object 
 3   CoordX               883 non-null    float64
 4   CoordY               883 non-null    float64
 5   CoordZ               883 non-null    float64
 6   Age                  883 non-null    int64  
 7   Sex                  883 non-null    int64  
 8   FamilyHistoryLungCa  883 non-null    int64  
 9   Emphysema            883 non-null    int64  
 10  NoduleSizeDLCST      883 non-null    float64
 11  NoduleCountPerScan   883 non-null    int64  
 12  Spiculated           883 non-null    int64  
 13  NoduleType           883 non-null    object 
 14  Calcified            883 non-null    int64  
 15  PFN                  883 non-null    int

In [85]:
# Distinct-value count per column, largest first.
dlcst_preds.nunique().sort_values(ascending=False)

PatientID              599
SeriesInstanceUID      599
thijmen_mean_cal       599
thijmen_mean           599
Ensemble_Kiran_cal     596
Ensemble_Kiran         596
PanCan2b               478
StudyDate              221
sybil_year6            103
sybil_year5            100
sybil_year4             82
sybil_year3             82
sybil_year1             61
sybil_year2             60
Age                     23
NoduleCountPerScan       9
FamilyHistoryLungCa      2
Sex                      2
Emphysema                2
label                    2
dtype: int64

Ok, time to check the ensemble predictions.

In [86]:
# Put both tables in the same AnnotationID order with a fresh 0..n-1 index so
# they can be compared row by row.
ensemble_check_uncer = (
    df_dlcst_uncertainty[['AnnotationID', 'DL']]
    .sort_values('AnnotationID')
    .reset_index(drop=True)
)
# On the predictions side, expose Ensemble under the name DL as well so the
# column labels line up with the uncertainty table.
ensemble_check_preds = (
    dlcst_preds_nodule[['AnnotationID', 'Ensemble']]
    .sort_values('AnnotationID')
    .reset_index(drop=True)
    .assign(DL=lambda frame: frame['Ensemble'])
)

In [87]:
# DataFrame.compare reports only the cells that differ between the two frames.
# NOTE(review): the rows it lists print identical values, so the mismatch is
# presumably below the displayed precision — confirm if exact equality matters.
ensemble_check_preds[['AnnotationID', 'DL']].compare(ensemble_check_uncer[['AnnotationID', 'DL']])

Unnamed: 0_level_0,DL,DL
Unnamed: 0_level_1,self,other
615,2.268332e-07,2.268332e-07
630,2.763743e-05,2.763743e-05
792,2.06472e-05,2.06472e-05


In [88]:
# Use the first row of the predictions table as a spot check. `.iloc[0]` picks
# it by position; plain `[0]` is a label lookup that only works while the
# index still contains the label 0.
spot_check_series = dlcst_preds['SeriesInstanceUID'].iloc[0]
spot_check_series

'1.2.840.113704.1.111.4964.1106577805.10'

In [89]:
# Show the rows for the spot-check series in each of the three tables.
display(
    dlcst_preds.loc[
        dlcst_preds['SeriesInstanceUID'] == spot_check_series,
        ['SeriesInstanceUID', 'Ensemble_Kiran'],
    ]
)
display(
    dlcst_preds_nodule.loc[
        dlcst_preds_nodule['SeriesInstanceUID'] == spot_check_series,
        ['AnnotationID', 'Ensemble'],
    ]
)
display(
    df_dlcst_uncertainty.loc[
        df_dlcst_uncertainty['SeriesInstanceUID'] == spot_check_series,
        ['SeriesInstanceUID', 'AnnotationID', 'DL', 'Mean_entropy'],
    ]
)

Unnamed: 0,SeriesInstanceUID,Ensemble_Kiran
0,1.2.840.113704.1.111.4964.1106577805.10,0.082652


Unnamed: 0,AnnotationID,Ensemble
0,4_1_20050124,0.030901
1,4_2_20050124,0.001042
2,4_3_20050124,0.002834
3,4_4_20050124,6.3e-05
4,4_6_20050124,0.082652


Unnamed: 0,SeriesInstanceUID,AnnotationID,DL,Mean_entropy
89,1.2.840.113704.1.111.4964.1106577805.10,4_1_20050124,0.030901,0.438489
90,1.2.840.113704.1.111.4964.1106577805.10,4_2_20050124,0.001042,0.093826
91,1.2.840.113704.1.111.4964.1106577805.10,4_3_20050124,0.002834,0.147615
92,1.2.840.113704.1.111.4964.1106577805.10,4_4_20050124,6.3e-05,0.028098
93,1.2.840.113704.1.111.4964.1106577805.10,4_6_20050124,0.082652,0.418363


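Ok, so the predictions are the same: `compare` flags only three rows, and their values are identical at the displayed precision. For the spot-checked series, `Ensemble_Kiran` equals the largest `Ensemble` / `DL` value among its annotations.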

In [90]:
# Attach Mean_entropy to every row of the nodule table by AnnotationID.
# validate='many_to_one' makes pandas raise if an AnnotationID is repeated in
# the uncertainty table, instead of silently duplicating rows. The previous
# suffixes=('', '') argument was dropped: the only shared column is the key.
dlcst_preds_nodule_w_entropy = dlcst_preds_nodule.merge(
    right=df_dlcst_uncertainty[['AnnotationID', 'Mean_entropy']],
    how='left',
    on='AnnotationID',
    validate='many_to_one',
)
dlcst_preds_nodule_w_entropy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            883 non-null    int64  
 1   StudyDate            883 non-null    int64  
 2   SeriesInstanceUID    883 non-null    object 
 3   CoordX               883 non-null    float64
 4   CoordY               883 non-null    float64
 5   CoordZ               883 non-null    float64
 6   Age                  883 non-null    int64  
 7   Sex                  883 non-null    int64  
 8   FamilyHistoryLungCa  883 non-null    int64  
 9   Emphysema            883 non-null    int64  
 10  NoduleSizeDLCST      883 non-null    float64
 11  NoduleCountPerScan   883 non-null    int64  
 12  Spiculated           883 non-null    int64  
 13  NoduleType           883 non-null    object 
 14  Calcified            883 non-null    int64  
 15  PFN                  883 non-null    int

In [91]:
# Save the table with the merged Mean_entropy column to a new CSV (index not written).
dlcst_preds_nodule_w_entropy.to_csv(f"{TEAMS_DIR}/dlcst/20220816_DLCST_full_Venk21_w_uncertainty.csv", index=False)

In [92]:
# Save the concatenated DLCST uncertainty table as well (index not written).
df_dlcst_uncertainty.to_csv(f"{TEAMS_DIR}/dlcst/dlcst_uncertainty_Venk21.csv", index=False)

In [93]:
# Copy DL into a column named Ensemble_Kiran, the name dlcst_preds uses for
# this score, so both frames can feed the same join-key helper.
df_dlcst_uncertainty['Ensemble_Kiran'] = df_dlcst_uncertainty['DL']

In [94]:
# Preview the three columns involved in the join below.
df_dlcst_uncertainty[['SeriesInstanceUID', 'Ensemble_Kiran', 'Mean_entropy']]

Unnamed: 0,SeriesInstanceUID,Ensemble_Kiran,Mean_entropy
0,1.2.840.113704.1.111.616.1112795278.10,0.709479,0.610065
1,1.2.840.113704.1.111.4620.1102346626.10,0.322166,0.561323
2,1.2.840.113704.1.111.3652.1193760933.11,0.692819,0.628372
3,1.2.840.113704.1.111.5808.1100791833.10,0.726948,0.601210
4,1.2.840.113704.1.111.9592.1200934074.11,0.274001,0.508309
...,...,...,...
878,1.2.840.113704.1.111.5848.1144079789.11,0.000007,0.014936
879,1.2.840.113704.1.111.5848.1144079789.11,0.081857,0.462017
880,1.2.840.113704.1.111.5112.1144079519.11,0.000162,0.040947
881,1.2.840.113704.1.111.5112.1144079519.11,0.001050,0.066176


In [95]:
def make_str_join_key(row):
    """Build the string key used to match rows on series and ensemble score.

    The key is ``"<SeriesInstanceUID> <Ensemble_Kiran>"``. The score is
    formatted to 6 significant digits rather than with its full float repr,
    so two values that differ only in their trailing digits (for example after
    a round trip through different CSV files) still yield the same key.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``'SeriesInstanceUID'`` and a numeric ``'Ensemble_Kiran'``.

    Returns:
        str: The join key for this row.
    """
    score_text = format(float(row['Ensemble_Kiran']), '.6g')
    return f"{row['SeriesInstanceUID']} {score_text}"

In [96]:
# Add the same row-wise join key column to both frames that take part in the merge.
for keyed_frame in (dlcst_preds, df_dlcst_uncertainty):
    keyed_frame['joinkey'] = keyed_frame.apply(make_str_join_key, axis=1)
dlcst_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            599 non-null    int64  
 1   StudyDate            599 non-null    int64  
 2   SeriesInstanceUID    599 non-null    object 
 3   Age                  599 non-null    int64  
 4   Sex                  599 non-null    int64  
 5   FamilyHistoryLungCa  599 non-null    int64  
 6   Emphysema            599 non-null    int64  
 7   NoduleCountPerScan   599 non-null    int64  
 8   sybil_year1          599 non-null    float64
 9   sybil_year2          599 non-null    float64
 10  sybil_year3          599 non-null    float64
 11  sybil_year4          599 non-null    float64
 12  sybil_year5          599 non-null    float64
 13  sybil_year6          599 non-null    float64
 14  PanCan2b             599 non-null    float64
 15  Ensemble_Kiran       599 non-null    flo

In [97]:
# Attach Mean_entropy to the predictions table through the composite join key.
# validate='many_to_one' makes pandas raise if a key is repeated on the
# uncertainty side, instead of silently duplicating prediction rows.
dlcst_preds_w_entropy = dlcst_preds.merge(
    right=df_dlcst_uncertainty[['joinkey', 'Mean_entropy']],
    how='left',
    on='joinkey',
    validate='many_to_one',
)

# A left merge keeps rows whose key found no partner and fills Mean_entropy
# with NaN; report how many there are so a failed match cannot go unnoticed.
n_without_entropy = int(dlcst_preds_w_entropy['Mean_entropy'].isna().sum())
print(f"Rows without Mean_entropy after merge: {n_without_entropy} of {len(dlcst_preds_w_entropy)}")
dlcst_preds_w_entropy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            599 non-null    int64  
 1   StudyDate            599 non-null    int64  
 2   SeriesInstanceUID    599 non-null    object 
 3   Age                  599 non-null    int64  
 4   Sex                  599 non-null    int64  
 5   FamilyHistoryLungCa  599 non-null    int64  
 6   Emphysema            599 non-null    int64  
 7   NoduleCountPerScan   599 non-null    int64  
 8   sybil_year1          599 non-null    float64
 9   sybil_year2          599 non-null    float64
 10  sybil_year3          599 non-null    float64
 11  sybil_year4          599 non-null    float64
 12  sybil_year5          599 non-null    float64
 13  sybil_year6          599 non-null    float64
 14  PanCan2b             599 non-null    float64
 15  Ensemble_Kiran       599 non-null    flo

## NLST

In [98]:
# Load the NLST table that holds the two model logit columns.
dataset = pd.read_csv(NLST_LOGITS)

# expit (the logistic sigmoid) maps each logit to a probability in [0, 1].
data_pred = expit(dataset[['logit_mvresnet50_05.08.2020_deepr_50mm',
                            'logit_i3d_05.08.2020_deepr_50mm']])

# Binary entropy in bits for every prediction, computed for the whole table at
# once instead of row by row: stacking [1 - p, p] on a new leading axis gives
# shape (2, n_rows, n_models), and scipy's `entropy` reduces along axis 0.
model_probs = data_pred.to_numpy()
model_entropy = entropy(np.stack([1 - model_probs, model_probs]), base=2)

# Per-row uncertainty = mean of the per-model entropies.
dataset['Mean_entropy'] = model_entropy.mean(axis=1)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   PatientID                               16077 non-null  int64  
 1   SeriesInstanceUID                       16077 non-null  object 
 2   StudyDate                               16077 non-null  int64  
 3   CoordX                                  16077 non-null  float64
 4   CoordY                                  16077 non-null  float64
 5   CoordZ                                  16077 non-null  float64
 6   Diameter [mm]                           16077 non-null  float64
 7   NoduleType                              16076 non-null  object 
 8   label                                   16077 non-null  int64  
 9   LesionID                                16077 non-null  int64  
 10  NoduleID                                16077 non-null  ob

In [99]:
# Load the NLST predictions table from NLST_PREDS and summarise it.
nlst_preds_nodule = pd.read_csv(NLST_PREDS)
nlst_preds_nodule.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Columns: 126 entries, PatientID to Unfinished_ed
dtypes: bool(27), float64(85), int64(11), object(3)
memory usage: 12.6+ MB


In [101]:
# Distinct-value count per column, largest first (shows which columns can act as a row key).
dataset.nunique().sort_values(ascending=False)

Mean_logit                                16077
AnnotationID                              16077
Mean_entropy                              16077
Ensemble                                  16076
logit_i3d_05.08.2020_deepr_50mm           16075
logit_mvresnet50_05.08.2020_deepr_50mm    16075
CoordZ                                    13528
CoordY                                    13071
CoordX                                    11867
SeriesInstanceUID                         10183
NoduleID                                  10024
PatientID                                  5282
Diameter [mm]                               368
LesionID                                     18
NoduleType                                    7
StudyDate                                     3
label                                         2
dtype: int64

In [103]:
# Order both NLST tables by AnnotationID (ascending is the default).
# sort_values keeps each row's original index label.
nlst_preds_nodule = nlst_preds_nodule.sort_values('AnnotationID')
nlst_uncertainty = dataset.sort_values('AnnotationID')

In [104]:
# Assigning one Series to a column aligns on index *labels*, and sort_values
# keeps the original labels, so sorting both frames never paired rows by
# AnnotationID. Look the entropy up by AnnotationID explicitly instead.
entropy_by_annotation = nlst_uncertainty.set_index('AnnotationID')['Mean_entropy']
if not entropy_by_annotation.index.is_unique:
    raise ValueError("AnnotationID must be unique in nlst_uncertainty to map Mean_entropy")
nlst_preds_nodule['Mean_Entropy_Kiran'] = nlst_preds_nodule['AnnotationID'].map(entropy_by_annotation)
nlst_preds_nodule.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16077 entries, 7664 to 7663
Columns: 127 entries, PatientID to Mean_Entropy_Kiran
dtypes: bool(27), float64(86), int64(11), object(3)
memory usage: 12.8+ MB


In [105]:
# NOTE(review): NLST_PREDS is the same file nlst_preds_nodule was read from, so
# this overwrites the input in place (rows now sorted by AnnotationID, with the
# Mean_Entropy_Kiran column added). Confirm that is intended.
nlst_preds_nodule.to_csv(NLST_PREDS, index=False)