In [2]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown

import sys
sys.path.append('../')

from evalutils.roc import get_bootstrapped_roc_ci_curves
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from utilities import data, roc

## directory where results are
CHANSEY_ROOT = "W:"
EXPERIMENT_DIR = f"{CHANSEY_ROOT}/experiments/lung-malignancy-fairness-shaurya"
NLST_PREDS = f"{EXPERIMENT_DIR}/nlst"

TEAMS_DIR = "C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results"
NLST_PREDS = f"{TEAMS_DIR}/nlst" ## Comment out if not using Teams backup (aka Chansey is up :)
FIG_DIR = f"{TEAMS_DIR}/figs"
TAB_DIR = f"{TEAMS_DIR}/tables"

In [3]:
nlst_preds_nodule = pd.read_csv(f"{NLST_PREDS}/nlst_demov4_allmodels_cal.csv")
nlst_preds_nodule.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Columns: 131 entries, PatientID to SliceCount
dtypes: bool(27), float64(86), int64(12), object(6)
memory usage: 13.2+ MB


In [4]:
with open(f'{NLST_PREDS}/nlst_demo_v4_cols.json') as json_data:
    nlst_democols_og = json.load(json_data)
    json_data.close()

nlst_democols_og

{'num': {'demo': ['BMI', 'Age', 'height', 'weight'],
  'smoke': ['smokeage', 'smokeday', 'smokeyr', 'pkyr'],
  'nodule': ['CoordX', 'CoordZ', 'CoordY', 'Mean_Entropy_Kiran'],
  'other': ['NoduleCounts', 'Diameter_mm', 'SliceCount']},
 'cat': {'demo': ['Overweight',
   'educat',
   'Gender',
   'Married',
   'HighSchoolPlus',
   'NonHispanicWhite',
   'Unfinished_ed',
   'WhiteOrBlack',
   'marital',
   'ethnic',
   'race'],
  'smoke': ['smokelive', 'cigar', 'cigsmok', 'smokework', 'pipe'],
  'work': ['wrkbaki',
   'wrkfoun',
   'wrkchem',
   'wrkasbe',
   'wrkfire',
   'wrksand',
   'wrkfarm',
   'wrkcoal',
   'wrkpain',
   'wrkweld',
   'wrkflou',
   'wrkbutc',
   'wrkhard',
   'wrkcott'],
  'disease': ['diagasbe',
   'diagchas',
   'diagpneu',
   'diagstro',
   'diagemph',
   'diagbron',
   'diagsili',
   'diagsarc',
   'diaghear',
   'diagdiab',
   'diagadas',
   'diagcopd',
   'diagfibr',
   'diagtube',
   'diaghype',
   'diagchro'],
  'canchist': ['canckidn',
   'cancphar',
   'ca

# Materials

## Demographic characteristics of the NLST validation sets (n=5911 scans, n=1172 scans).

In [5]:
nlst_5911, _, _ = data.prep_nlst_preds(nlst_preds_nodule, nlst_democols_og, scanlevel=True, tijmen=False, sybil=True, pretty=True, bin_num=False)
nlst_1172, nlst_democols, _ = data.prep_nlst_preds(nlst_preds_nodule, nlst_democols_og, scanlevel=True, tijmen=True, sybil=True, pretty=True, bin_num=False)
len(nlst_5911), len(nlst_1172)

  df[att] = df[att].replace(binary_key)
  df[att] = df[att].replace(binary_key)


(5911, 1172)

In [6]:
nlstval_full_mal = nlst_5911.query('label == 1')
nlstval_full_ben = nlst_5911.query('label == 0')
print('full:', len(nlstval_full_ben), 'ben', len(nlstval_full_mal), 'mal')

nlstval_some_mal = nlst_1172.query('label == 1')
nlstval_some_ben = nlst_1172.query('label == 0')
print('some:', len(nlstval_some_ben), 'ben', len(nlstval_some_mal), 'mal')

full: 5330 ben 581 mal
some: 1045 ben 127 mal


In [7]:
validation_sets = {
    "Full (malignant)": nlstval_full_mal,
    "Full (benign)": nlstval_full_ben,
    "Full (all)": nlst_5911,
    "Partial (malignant)": nlstval_some_mal,
    "Partial (benign)": nlstval_some_ben,
    "Partial (all)": nlst_1172,
}

In [8]:
df = data.combine_diff_dfs(nlst_democols['cat'], data.diffs_category_prevalence, validation_sets)
df[(df['category'] == "demo") & (df['attribute'].isin(['Gender', 'race', 'educat', 'Married', 'marital', 'ethnic']))]

Unnamed: 0,category,attribute,value,Full (malignant)_freq,Full (malignant)_norm,Full (benign)_freq,Full (benign)_norm,Full (all)_freq,Full (all)_norm,Partial (malignant)_freq,...,diff_Full (benign)_Full (all),diff_Full (benign)_Partial (malignant),diff_Full (benign)_Partial (benign),diff_Full (benign)_Partial (all),diff_Full (all)_Partial (malignant),diff_Full (all)_Partial (benign),diff_Full (all)_Partial (all),diff_Partial (malignant)_Partial (benign),diff_Partial (malignant)_Partial (all),diff_Partial (benign)_Partial (all)


In [9]:


for s in validation_sets:
    df[f"{s}_info"] = df.apply(lambda x: f'{0 if np.isnan(x[f"{s}_freq"]) else int(x[f"{s}_freq"])} ({0 if np.isnan(x[f"{s}_norm"]) else np.around(x[f"{s}_norm"], 1)})', axis=1)

df_out = df[
    (df['category'] == "Demographics") 
    # & (df['attribute'].isin(['Gender', 'Race', 'Education Status', 'Marital Status', 'Ethnic']))
][['attribute', 'value'] + [f"{s}_info" for s in validation_sets]].dropna(axis=0)
df_out

Unnamed: 0,attribute,value,Full (malignant)_info,Full (benign)_info,Full (all)_info,Partial (malignant)_info,Partial (benign)_info,Partial (all)_info
0,BMI > 25,False,209 (36.0),1690 (31.7),1899 (32.1),45 (35.4),326 (31.2),371 (31.7)
1,BMI > 25,True,372 (64.0),3640 (68.3),4012 (67.9),82 (64.6),719 (68.8),801 (68.3)
2,Education Status,8th grade or less,9 (1.5),102 (1.9),111 (1.9),4 (3.1),23 (2.2),27 (2.3)
3,Education Status,9th-11th grade,32 (5.5),258 (4.8),290 (4.9),2 (1.6),65 (6.2),67 (5.7)
4,Education Status,Associate degree/ some college,126 (21.7),1175 (22.0),1301 (22.0),31 (24.4),213 (20.4),244 (20.8)
5,Education Status,Bachelors Degree,96 (16.5),817 (15.3),913 (15.4),21 (16.5),164 (15.7),185 (15.8)
6,Education Status,Graduate School,76 (13.1),778 (14.6),854 (14.4),16 (12.6),163 (15.6),179 (15.3)
7,Education Status,High school graduate/GED,141 (24.3),1338 (25.1),1479 (25.0),36 (28.3),271 (25.9),307 (26.2)
8,Education Status,"Post high school training, excluding college",87 (15.0),765 (14.4),852 (14.4),16 (12.6),128 (12.2),144 (12.3)
10,Gender,Female,244 (42.0),2226 (41.8),2470 (41.8),48 (37.8),431 (41.2),479 (40.9)


In [None]:
df_out_idx = pd.MultiIndex.from_frame(df_out[['attribute', 'value']])
df_out2 = df_out.set_index(df_out_idx)[[f"{s}_info" for s in validation_sets]]

multicol_idx = pd.MultiIndex.from_tuples([
    ('Full Dataset (n=5911 Scans)', 'Malignant (n=581)'),
    ('Full Dataset (n=5911 Scans)', 'Benign (n=5330)'),
    ('Full Dataset (n=5911 Scans)', 'Total'),
    ('Partial Dataset (n=1172 Scans)', 'Malignant (n=127)'),
    ('Partial Dataset (n=1172 Scans)', 'Benign (n=1045)'),
    ('Partial Dataset (n=1172 Scans)', 'Total'),
])

df_out2.columns = multicol_idx
df_out2

Unnamed: 0_level_0,Unnamed: 1_level_0,Full Dataset (n=5911 Scans),Full Dataset (n=5911 Scans),Full Dataset (n=5911 Scans),Partial Dataset (n=1172 Scans),Partial Dataset (n=1172 Scans),Partial Dataset (n=1172 Scans)
Unnamed: 0_level_1,Unnamed: 1_level_1,Malignant (n=581),Benign (n=5330),Total,Malignant (n=127),Benign (n=1045),Total
attribute,value,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
BMI > 25,False,209 (36.0),1690 (31.7),1899 (32.1),45 (35.4),326 (31.2),371 (31.7)
BMI > 25,True,372 (64.0),3640 (68.3),4012 (67.9),82 (64.6),719 (68.8),801 (68.3)
Education Status,8th grade or less,9 (1.5),102 (1.9),111 (1.9),4 (3.1),23 (2.2),27 (2.3)
Education Status,9th-11th grade,32 (5.5),258 (4.8),290 (4.9),2 (1.6),65 (6.2),67 (5.7)
Education Status,Associate degree/ some college,126 (21.7),1175 (22.0),1301 (22.0),31 (24.4),213 (20.4),244 (20.8)
Education Status,Bachelors Degree,96 (16.5),817 (15.3),913 (15.4),21 (16.5),164 (15.7),185 (15.8)
Education Status,Graduate School,76 (13.1),778 (14.6),854 (14.4),16 (12.6),163 (15.6),179 (15.3)
Education Status,High school graduate/GED,141 (24.3),1338 (25.1),1479 (25.0),36 (28.3),271 (25.9),307 (26.2)
Education Status,"Post high school training, excluding college",87 (15.0),765 (14.4),852 (14.4),16 (12.6),128 (12.2),144 (12.3)
Gender,Female,244 (42.0),2226 (41.8),2470 (41.8),48 (37.8),431 (41.2),479 (40.9)


In [11]:
print(df_out2.style.to_latex(buf=f"{TAB_DIR}/datasetDemos.tex",
    hrules=True, label='tab:datasetDemos', environment='table*', column_format='ll|rrr|rrr', position='h!',
    multirow_align='c', multicol_align='c',  position_float='centering', clines='skip-last;data',
    caption=f'Demographic characteristics of the NLST validation sets (n={len(nlst_5911)} scans, n={len(nlst_1172)} scans).'))

None


In [22]:
num_df = data.combine_diff_dfs(nlst_democols['num'], data.diffs_numerical_means, validation_sets)
num_df = num_df[
    (num_df['category'] == "Demographics") & (num_df['value'] == 'mean')
    # & (df['attribute'].isin(['Gender', 'Race', 'Education Status', 'Marital Status', 'Ethnic']))
][['attribute', 'value'] + [f"{s}" for s in validation_sets]].dropna(axis=0)
num_df = num_df.set_index(pd.MultiIndex.from_frame(num_df[['attribute', 'value']]))[[f"{s}" for s in validation_sets]]
num_df.columns = multicol_idx
num_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Full Dataset (n=5911 Scans),Full Dataset (n=5911 Scans),Full Dataset (n=5911 Scans),Partial Dataset (n=1172 Scans),Partial Dataset (n=1172 Scans),Partial Dataset (n=1172 Scans)
Unnamed: 0_level_1,Unnamed: 1_level_1,Malignant (n=581),Benign (n=5330),Total,Malignant (n=127),Benign (n=1045),Total
attribute,value,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
BMI,mean,26.7923,27.6376,27.5553,26.6583,27.652,27.5442
Age,mean,64.062,63.0882,63.1839,64.063,63.199,63.2927
Height,mean,67.867,67.9921,67.9798,68.4173,68.0967,68.1314
Weight,mean,176.4817,182.5108,181.923,178.2441,183.3563,182.8019


In [24]:
df_out2 = pd.concat([df_out2, num_df], axis=0)

In [25]:
print(df_out2.style.to_latex(buf=f"{TAB_DIR}/datasetDemos.tex",
    hrules=True, label='tab:datasetDemos', environment='table*', column_format='ll|rrr|rrr', position='h!',
    multirow_align='c', multicol_align='c',  position_float='centering', clines='skip-last;data',
    caption=f'Demographic characteristics of the NLST validation sets (n={len(nlst_5911)} scans, n={len(nlst_1172)} scans).'))

None


## DLCST vs. NLST

In [13]:
nlst_valset, nlst_cols, _ = data.prep_nlst_preds(nlst_preds_nodule, nlst_democols_og, scanlevel=True, tijmen=False, sybil=True, pretty=True, bin_num=True)

dlcst_democols = {
    'cat': {'demo': ['Sex'], 'other': ['FamilyHistoryLungCa', 'Emphysema']},
    'num': {'demo': ['Age'], 'other': ['NoduleCountPerScan']}
}
dlcst_preds = pd.read_csv(f"{TEAMS_DIR}/dlcst/dlcst_thijmen_kiran_sybil_malignancy_estimation_results.csv", header=0)
dlcst_valset, dlcst_cols = data.nlst_pretty_labels(dlcst_preds, dlcst_democols)
display(dlcst_valset)
dlcst_cols

  df[att] = df[att].replace(binary_key)
  df[att] = df[att].replace(binary_key)


Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,Age,Gender,Family History of LC,Emphysema in Scan,Nodules Per Scan,sybil_year1,sybil_year2,sybil_year3,sybil_year4,sybil_year5,sybil_year6,PanCan2b,Ensemble_Kiran,thijmen_mean,label
0,4,20050124,1.2.840.113704.1.111.4964.1106577805.10,55,Female,False,False,9,0.021629,0.038573,0.071919,0.079270,0.095846,0.135681,0.053366,0.082652,0.166209,0
1,35,20051208,1.2.840.113704.1.111.5776.1134059140.11,56,Male,True,True,2,0.001170,0.002554,0.007835,0.011039,0.018442,0.030460,0.009543,0.000408,0.003368,0
2,38,20060109,1.2.840.113704.1.111.2004.1136823831.14,62,Female,False,True,4,0.001784,0.003870,0.007835,0.012797,0.019229,0.032957,0.006734,0.002702,0.065888,0
3,47,20051214,1.2.840.113704.1.111.8148.1134579622.14,57,Male,False,True,1,0.003951,0.015674,0.025373,0.034010,0.040605,0.058852,0.007944,0.084158,0.423341,0
4,56,20051213,1.2.840.113704.1.111.2744.1134487263.11,64,Male,False,True,3,0.000000,0.001574,0.003791,0.006847,0.010381,0.017287,0.000899,0.000013,0.005590,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,4057,20060314,1.2.840.113704.1.111.4796.1142355218.14,69,Male,False,True,2,0.168810,0.273494,0.257961,0.294720,0.327062,0.383196,0.380198,0.893933,0.682322,0
595,4063,20060313,1.2.840.113704.1.111.5104.1142267340.10,55,Male,False,True,4,0.001965,0.006793,0.013618,0.017289,0.021685,0.035951,0.000000,0.000099,0.103020,0
596,4079,20060328,1.2.840.113704.1.111.1308.1143556124.11,52,Female,False,False,1,0.003951,0.011459,0.025373,0.034010,0.040101,0.058852,0.039054,0.070744,0.121373,0
597,4098,20060403,1.2.840.113704.1.111.5848.1144079789.11,54,Male,False,True,3,0.117795,0.167839,0.189976,0.217799,0.229753,0.300137,0.134158,0.452513,0.347016,0


{'cat': {'Demographics': ['Gender'],
  'Other': ['Family History of LC', 'Emphysema in Scan']},
 'num': {'Demographics': ['Age'], 'Other': ['Nodules Per Scan']}}

In [17]:
popshift_check = {
    "NLST_mal": nlst_valset.query('label == 1'),
    "NLST_ben": nlst_valset.query('label == 0'),
    "NLST_full": nlst_valset,
    "DLCST_mal": dlcst_valset.query('label == 1'),
    "DLCST_ben": dlcst_valset.query('label == 0'),
    "DLCST_full": dlcst_valset,   
}

popshift_multi_idx = pd.MultiIndex.from_tuples([
    ('NLST Validation (n=5911 Scans)', 'Malignant (n=581)'),
    ('NLST Validation (n=5911 Scans)', 'Benign (n=5330)'),
    ('NLST Validation (n=5911 Scans)', 'Total'),
    ('DLCST Validation (n=599 Scans)', 'Malignant (n=59)'),
    ('DLCST Validation (n=599 Scans)', 'Benign (n=540)'),
    ('DLCST Validation (n=599 Scans)', 'Total'),
])

df_popshift = data.combine_diff_dfs(dlcst_cols['cat'], dfsets=popshift_check, dispdf=False)
for s in popshift_check:
    df_popshift[f"{s}_info"] = df_popshift.apply(lambda x: f'{0 if np.isnan(x[f"{s}_freq"]) else int(x[f"{s}_freq"])} ({0 if np.isnan(x[f"{s}_norm"]) else np.around(x[f"{s}_norm"], 1)})', axis=1)

cat_multiidx = pd.MultiIndex.from_frame(df_popshift[['attribute', 'value']])
df_popshift = df_popshift.set_index(cat_multiidx)[[f"{s}_info" for s in popshift_check]]
df_popshift.columns = popshift_multi_idx
df_popshift

Unnamed: 0_level_0,Unnamed: 1_level_0,NLST Validation (n=5911 Scans),NLST Validation (n=5911 Scans),NLST Validation (n=5911 Scans),DLCST Validation (n=599 Scans),DLCST Validation (n=599 Scans),DLCST Validation (n=599 Scans)
Unnamed: 0_level_1,Unnamed: 1_level_1,Malignant (n=581),Benign (n=5330),Total,Malignant (n=59),Benign (n=540),Total
attribute,value,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Gender,Female,244 (42.0),2226 (41.8),2470 (41.8),27 (45.8),249 (46.1),276 (46.1)
Gender,Male,337 (58.0),3104 (58.2),3441 (58.2),32 (54.2),291 (53.9),323 (53.9)
Family History of LC,False,401 (69.0),3983 (74.7),4384 (74.2),43 (72.9),454 (84.1),497 (83.0)
Family History of LC,True,180 (31.0),1347 (25.3),1527 (25.8),16 (27.1),86 (15.9),102 (17.0)
Emphysema in Scan,False,317 (54.6),3438 (64.5),3755 (63.5),15 (25.4),179 (33.1),194 (32.4)
Emphysema in Scan,True,264 (45.4),1892 (35.5),2156 (36.5),44 (74.6),361 (66.9),405 (67.6)


In [32]:
num_popshift = data.combine_diff_dfs(dlcst_cols['num'], df_func=data.diffs_numerical_means, dfsets=popshift_check, dispdf=False)
num_popshift = num_popshift[
    (num_popshift['category'] == "Demographics") & (num_popshift['value'] == 'mean')
    # & (df['attribute'].isin(['Gender', 'Race', 'Education Status', 'Marital Status', 'Ethnic']))
][['attribute', 'value'] + [f"{s}" for s in popshift_check]].dropna(axis=0)
num_popshift = num_popshift.set_index(pd.MultiIndex.from_frame(num_popshift[['attribute', 'value']]))[[f"{s}" for s in popshift_check]]
num_popshift.columns = popshift_multi_idx
num_popshift

Unnamed: 0_level_0,Unnamed: 1_level_0,NLST Validation (n=5911 Scans),NLST Validation (n=5911 Scans),NLST Validation (n=5911 Scans),DLCST Validation (n=599 Scans),DLCST Validation (n=599 Scans),DLCST Validation (n=599 Scans)
Unnamed: 0_level_1,Unnamed: 1_level_1,Malignant (n=581),Benign (n=5330),Total,Malignant (n=59),Benign (n=540),Total
attribute,value,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Age,mean,64.062,63.0882,63.1839,62.0,58.0037,58.3973


In [37]:
df_popshift = pd.concat([df_popshift, num_popshift], axis=0)

In [38]:
print(df_popshift.style.to_latex(
    buf=f"{TAB_DIR}/populationShift.tex",
    hrules=True, label='tab:populationShift', environment='table*', column_format='ll|rrr|rrr', position='h!',
    multirow_align='c', multicol_align='c',  position_float='centering', clines='skip-last;data',
    caption=f'Demographic characteristics of the DLCST validation set (n=599 scans), compared to those from the NLST validation set (n=5911 scans).'))

None


# Methods