In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown

import sys
sys.path.append('../')

from evalutils.roc import get_bootstrapped_roc_ci_curves
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from utilities import data, roc

## directory where results are
CHANSEY_ROOT = "W:"
EXPERIMENT_DIR = f"{CHANSEY_ROOT}/experiments/lung-malignancy-fairness-shaurya"
NLST_PREDS = f"{EXPERIMENT_DIR}/nlst"

TEAMS_DIR = "C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results"
NLST_PREDS = f"{TEAMS_DIR}/nlst" ## Comment out if not using Teams backup (aka Chansey is up :)
FIG_DIR = f"{TEAMS_DIR}/figs"
TAB_DIR = f"{TEAMS_DIR}/tables"

In [2]:
nlst_preds_nodule = pd.read_csv(f"{NLST_PREDS}/nlst_demov4_allmodels_cal.csv")
nlst_preds_nodule.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Columns: 131 entries, PatientID to SliceCount
dtypes: bool(27), float64(86), int64(12), object(6)
memory usage: 13.2+ MB


In [3]:
with open(f'{NLST_PREDS}/nlst_demo_v4_cols.json') as json_data:
    nlst_democols_og = json.load(json_data)
    json_data.close()

nlst_democols_og

{'num': {'demo': ['BMI', 'Age', 'height', 'weight'],
  'smoke': ['smokeage', 'smokeday', 'smokeyr', 'pkyr'],
  'nodule': ['CoordX', 'CoordZ', 'CoordY', 'Mean_Entropy_Kiran'],
  'other': ['NoduleCounts', 'Diameter_mm', 'SliceCount']},
 'cat': {'demo': ['Overweight',
   'educat',
   'Gender',
   'Married',
   'HighSchoolPlus',
   'NonHispanicWhite',
   'Unfinished_ed',
   'WhiteOrBlack',
   'marital',
   'ethnic',
   'race'],
  'smoke': ['smokelive', 'cigar', 'cigsmok', 'smokework', 'pipe'],
  'work': ['wrkbaki',
   'wrkfoun',
   'wrkchem',
   'wrkasbe',
   'wrkfire',
   'wrksand',
   'wrkfarm',
   'wrkcoal',
   'wrkpain',
   'wrkweld',
   'wrkflou',
   'wrkbutc',
   'wrkhard',
   'wrkcott'],
  'disease': ['diagasbe',
   'diagchas',
   'diagpneu',
   'diagstro',
   'diagemph',
   'diagbron',
   'diagsili',
   'diagsarc',
   'diaghear',
   'diagdiab',
   'diagadas',
   'diagcopd',
   'diagfibr',
   'diagtube',
   'diaghype',
   'diagchro'],
  'canchist': ['canckidn',
   'cancphar',
   'ca

# Materials

## Demographic characteristics of the NLST validation sets (n=5911 scans, n=1172 scans).

In [4]:
nlst_5911, _, _ = data.prep_nlst_preds(nlst_preds_nodule, nlst_democols_og, scanlevel=True, tijmen=False, sybil=True, pretty=True, bin_num=True)
nlst_1172, nlst_democols, _ = data.prep_nlst_preds(nlst_preds_nodule, nlst_democols_og, scanlevel=True, tijmen=True, sybil=True, pretty=True, bin_num=True)
len(nlst_5911), len(nlst_1172)

  df[att] = df[att].replace(binary_key)


Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,LesionID,Spiculated Nodule,Diameter (mm),Age,Gender,Family History of LC,Emphysema in Scan,...,Perfissural,Semi-Solid,Solid,LC Stage,White or Black,BMI > 25,Unfinished Education Level,Manufacturer,Model Name,Slices In Scan
0,100012,19990102,1.2.840.113654.2.55.24023112856488152536348979...,1,False,11.5,61,Female,True,False,...,False,False,True,IA,White,False,False,SIEMENS,Volume Zoom,162.0
1,100012,20000102,1.2.840.113654.2.55.50761756412482430061802871...,1,False,23.4,62,Female,True,False,...,False,False,False,IA,White,False,False,SIEMENS,Volume Zoom,157.0
2,100019,20000102,1.2.840.113654.2.55.12567845460924968711329545...,1,False,10.6,62,Male,False,True,...,False,False,True,,White,False,True,Philips,Mx8000,148.0
4,100035,20000102,1.2.840.113654.2.55.33941066798745864774354503...,1,False,4.8,56,Female,False,True,...,False,False,True,,White,False,False,GE MEDICAL SYSTEMS,HiSpeed QX/i,125.0
5,100035,20010102,1.2.840.113654.2.55.17844441025190420803491337...,1,False,4.5,57,Female,False,True,...,False,False,True,,White,False,False,GE MEDICAL SYSTEMS,HiSpeed QX/i,118.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10168,218829,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.173225373237...,1,False,5.5,59,Female,False,False,...,False,False,True,,White,True,False,SIEMENS,Sensation 4,164.0
10179,218863,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.977896926234...,1,False,6.8,62,Female,True,False,...,False,True,False,,White,True,False,GE MEDICAL SYSTEMS,LightSpeed16,149.0
10180,218863,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.796552648301...,2,False,7.6,64,Female,True,False,...,False,True,False,,White,True,False,GE MEDICAL SYSTEMS,LightSpeed Pro 16,477.0
10181,218866,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.698887010763...,1,False,4.0,58,Male,True,False,...,False,False,True,,White,False,False,SIEMENS,Sensation 16,237.0


  df[att] = df[att].replace(binary_key)


Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,LesionID,Spiculated Nodule,Diameter (mm),Age,Gender,Family History of LC,Emphysema in Scan,...,Perfissural,Semi-Solid,Solid,LC Stage,White or Black,BMI > 25,Unfinished Education Level,Manufacturer,Model Name,Slices In Scan
0,100012,19990102,1.2.840.113654.2.55.24023112856488152536348979...,1,False,11.5,61,Female,True,False,...,False,False,True,IA,White,False,False,SIEMENS,Volume Zoom,162.0
1,100012,20000102,1.2.840.113654.2.55.50761756412482430061802871...,1,False,23.4,62,Female,True,False,...,False,False,False,IA,White,False,False,SIEMENS,Volume Zoom,157.0
17,100085,19990102,1.2.840.113654.2.55.29455027499886611857018480...,1,False,5.0,62,Male,False,True,...,False,False,True,,White,False,True,GE MEDICAL SYSTEMS,LightSpeed Plus,149.0
18,100085,20000102,1.2.840.113654.2.55.25718121962247241797527391...,1,False,4.2,63,Male,False,True,...,False,False,True,,White,False,True,GE MEDICAL SYSTEMS,LightSpeed Plus,143.0
19,100085,20010102,1.2.840.113654.2.55.55242834554306661252396802...,1,False,4.7,64,Male,False,True,...,False,False,True,,White,False,True,GE MEDICAL SYSTEMS,LightSpeed Plus,150.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10086,218357,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.103248834477...,1,False,10.2,66,Male,False,True,...,False,False,False,IV,Black,True,False,GE MEDICAL SYSTEMS,LightSpeed16,266.0
10087,218357,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.232556181915...,1,False,10.1,67,Male,False,True,...,False,False,False,IV,Black,True,False,GE MEDICAL SYSTEMS,LightSpeed Ultra,281.0
10088,218357,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.518652745898...,1,False,33.2,68,Male,False,False,...,False,False,True,IV,Black,True,False,GE MEDICAL SYSTEMS,LightSpeed16,426.0
10099,218398,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.280255694692...,1,False,14.7,58,Male,False,True,...,False,False,True,,White,True,False,SIEMENS,Sensation 16,358.0


(5911, 1172)

In [5]:
nlstval_full_mal = nlst_5911.query('label == 1')
nlstval_full_ben = nlst_5911.query('label == 0')
print('full:', len(nlstval_full_ben), 'ben', len(nlstval_full_mal), 'mal')

nlstval_some_mal = nlst_1172.query('label == 1')
nlstval_some_ben = nlst_1172.query('label == 0')
print('some:', len(nlstval_some_ben), 'ben', len(nlstval_some_mal), 'mal')

full: 5330 ben 581 mal
some: 1045 ben 127 mal


In [6]:
validation_sets = {
    "Full (malignant)": nlstval_full_mal,
    "Full (benign)": nlstval_full_ben,
    "Full (all)": nlst_5911,
    "Partial (malignant)": nlstval_some_mal,
    "Partial (benign)": nlstval_some_ben,
    "Partial (all)": nlst_1172,
}

In [7]:
df = data.combine_diff_dfs(nlst_democols['cat'], data.diffs_category_prevalence, validation_sets)
df[(df['category'] == "demo") & (df['attribute'].isin(['Gender', 'race', 'educat', 'Married', 'marital', 'ethnic']))]

Unnamed: 0,category,attribute,value,Full (malignant)_freq,Full (malignant)_norm,Full (benign)_freq,Full (benign)_norm,Full (all)_freq,Full (all)_norm,Partial (malignant)_freq,...,diff_Full (benign)_Full (all),diff_Full (benign)_Partial (malignant),diff_Full (benign)_Partial (benign),diff_Full (benign)_Partial (all),diff_Full (all)_Partial (malignant),diff_Full (all)_Partial (benign),diff_Full (all)_Partial (all),diff_Partial (malignant)_Partial (benign),diff_Partial (malignant)_Partial (all),diff_Partial (benign)_Partial (all)


In [8]:


for s in validation_sets:
    df[f"{s}_info"] = df.apply(lambda x: f'{0 if np.isnan(x[f"{s}_freq"]) else int(x[f"{s}_freq"])} ({0 if np.isnan(x[f"{s}_norm"]) else np.around(x[f"{s}_norm"], 1)})', axis=1)

df_out = df[
    (df['category'] == "Demographics") 
    # & (df['attribute'].isin(['Gender', 'Race', 'Education Status', 'Marital Status', 'Ethnic']))
][['attribute', 'value'] + [f"{s}_info" for s in validation_sets]].dropna(axis=0)
df_out

Unnamed: 0,attribute,value,Full (malignant)_info,Full (benign)_info,Full (all)_info,Partial (malignant)_info,Partial (benign)_info,Partial (all)_info
0,Age > 61,False,223 (38.4),2363 (44.3),2586 (43.7),49 (38.6),454 (43.4),503 (42.9)
1,Age > 61,True,358 (61.6),2967 (55.7),3325 (56.3),78 (61.4),591 (56.6),669 (57.1)
2,BMI > 25,False,209 (36.0),1690 (31.7),1899 (32.1),45 (35.4),326 (31.2),371 (31.7)
3,BMI > 25,True,372 (64.0),3640 (68.3),4012 (67.9),82 (64.6),719 (68.8),801 (68.3)
4,Education Status,8th grade or less,9 (1.5),102 (1.9),111 (1.9),4 (3.1),23 (2.2),27 (2.3)
5,Education Status,9th-11th grade,32 (5.5),258 (4.8),290 (4.9),2 (1.6),65 (6.2),67 (5.7)
6,Education Status,Associate degree/ some college,126 (21.7),1175 (22.0),1301 (22.0),31 (24.4),213 (20.4),244 (20.8)
7,Education Status,Bachelors Degree,96 (16.5),817 (15.3),913 (15.4),21 (16.5),164 (15.7),185 (15.8)
8,Education Status,Graduate School,76 (13.1),778 (14.6),854 (14.4),16 (12.6),163 (15.6),179 (15.3)
9,Education Status,High school graduate/GED,141 (24.3),1338 (25.1),1479 (25.0),36 (28.3),271 (25.9),307 (26.2)


In [9]:
df_out_idx = pd.MultiIndex.from_frame(df_out[['attribute', 'value']])
df_out2 = df_out.set_index(df_out_idx)[[f"{s}_info" for s in validation_sets]]

datasets = {'Full': nlst_5911, 'Partial': nlst_1172}
multicol_idx = pd.MultiIndex.from_tuples([
    ('Full Dataset (n=5911 Scans)', 'Malignant (n=581)'),
    ('Full Dataset (n=5911 Scans)', 'Benign (n=5330)'),
    ('Full Dataset (n=5911 Scans)', 'Total'),
    ('Partial Dataset (n=1172 Scans)', 'Malignant (n=127)'),
    ('Partial Dataset (n=1172 Scans)', 'Benign (n=1045)'),
    ('Partial Dataset (n=1172 Scans)', 'Total'),
])

df_out2.columns = multicol_idx
df_out2

Unnamed: 0_level_0,Unnamed: 1_level_0,Full Dataset (n=5911 Scans),Full Dataset (n=5911 Scans),Full Dataset (n=5911 Scans),Partial Dataset (n=1172 Scans),Partial Dataset (n=1172 Scans),Partial Dataset (n=1172 Scans)
Unnamed: 0_level_1,Unnamed: 1_level_1,Malignant (n=581),Benign (n=5330),Total,Malignant (n=127),Benign (n=1045),Total
attribute,value,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Age > 61,False,223 (38.4),2363 (44.3),2586 (43.7),49 (38.6),454 (43.4),503 (42.9)
Age > 61,True,358 (61.6),2967 (55.7),3325 (56.3),78 (61.4),591 (56.6),669 (57.1)
BMI > 25,False,209 (36.0),1690 (31.7),1899 (32.1),45 (35.4),326 (31.2),371 (31.7)
BMI > 25,True,372 (64.0),3640 (68.3),4012 (67.9),82 (64.6),719 (68.8),801 (68.3)
Education Status,8th grade or less,9 (1.5),102 (1.9),111 (1.9),4 (3.1),23 (2.2),27 (2.3)
Education Status,9th-11th grade,32 (5.5),258 (4.8),290 (4.9),2 (1.6),65 (6.2),67 (5.7)
Education Status,Associate degree/ some college,126 (21.7),1175 (22.0),1301 (22.0),31 (24.4),213 (20.4),244 (20.8)
Education Status,Bachelors Degree,96 (16.5),817 (15.3),913 (15.4),21 (16.5),164 (15.7),185 (15.8)
Education Status,Graduate School,76 (13.1),778 (14.6),854 (14.4),16 (12.6),163 (15.6),179 (15.3)
Education Status,High school graduate/GED,141 (24.3),1338 (25.1),1479 (25.0),36 (28.3),271 (25.9),307 (26.2)


In [10]:
print(df_out.to_latex(index=False, header=False))

\begin{tabular}{llllllll}
\toprule
\midrule
Age > 61 & False & 223 (38.4) & 2363 (44.3) & 2586 (43.7) & 49 (38.6) & 454 (43.4) & 503 (42.9) \\
Age > 61 & True & 358 (61.6) & 2967 (55.7) & 3325 (56.3) & 78 (61.4) & 591 (56.6) & 669 (57.1) \\
BMI > 25 & False & 209 (36.0) & 1690 (31.7) & 1899 (32.1) & 45 (35.4) & 326 (31.2) & 371 (31.7) \\
BMI > 25 & True & 372 (64.0) & 3640 (68.3) & 4012 (67.9) & 82 (64.6) & 719 (68.8) & 801 (68.3) \\
Education Status & 8th grade or less & 9 (1.5) & 102 (1.9) & 111 (1.9) & 4 (3.1) & 23 (2.2) & 27 (2.3) \\
Education Status & 9th-11th grade & 32 (5.5) & 258 (4.8) & 290 (4.9) & 2 (1.6) & 65 (6.2) & 67 (5.7) \\
Education Status & Associate degree/ some college & 126 (21.7) & 1175 (22.0) & 1301 (22.0) & 31 (24.4) & 213 (20.4) & 244 (20.8) \\
Education Status & Bachelors Degree & 96 (16.5) & 817 (15.3) & 913 (15.4) & 21 (16.5) & 164 (15.7) & 185 (15.8) \\
Education Status & Graduate School & 76 (13.1) & 778 (14.6) & 854 (14.4) & 16 (12.6) & 163 (15.6) & 179 (

In [11]:
print(df_out2.style.to_latex(
    hrules=True, label='tab:datasetDemos', environment='table*', column_format='ll|rrr|rrr', position='h!',
    multirow_align='c', multicol_align='c',  position_float='centering', clines='skip-last;data',
    caption=f'Demographic characteristics of the NLST validation sets (n={len(nlst_5911)} scans, n={len(nlst_1172)} scans).'))

\begin{table*}[h!]
\centering
\caption{Demographic characteristics of the NLST validation sets (n=5911 scans, n=1172 scans).}
\label{tab:datasetDemos}
\begin{tabular}{ll|rrr|rrr}
\toprule
 &  & \multicolumn{3}{c}{Full Dataset (n=5911 Scans)} & \multicolumn{3}{c}{Partial Dataset (n=1172 Scans)} \\
 &  & Malignant (n=581) & Benign (n=5330) & Total & Malignant (n=127) & Benign (n=1045) & Total \\
attribute & value &  &  &  &  &  &  \\
\midrule
\multirow[c]{2}{*}{Age > 61} & False & 223 (38.4) & 2363 (44.3) & 2586 (43.7) & 49 (38.6) & 454 (43.4) & 503 (42.9) \\
 & True & 358 (61.6) & 2967 (55.7) & 3325 (56.3) & 78 (61.4) & 591 (56.6) & 669 (57.1) \\
\cline{1-8}
\multirow[c]{2}{*}{BMI > 25} & False & 209 (36.0) & 1690 (31.7) & 1899 (32.1) & 45 (35.4) & 326 (31.2) & 371 (31.7) \\
 & True & 372 (64.0) & 3640 (68.3) & 4012 (67.9) & 82 (64.6) & 719 (68.8) & 801 (68.3) \\
\cline{1-8}
\multirow[c]{7}{*}{Education Status} & 8th grade or less & 9 (1.5) & 102 (1.9) & 111 (1.9) & 4 (3.1) & 23 (2.2) &

# Methods