In [4]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown
from scipy.stats import pearsonr, spearmanr, ks_2samp, mannwhitneyu, ttest_ind

import sys
sys.path.append('../')

# import utils
from evalutils.roc import get_bootstrapped_roc_ci_curves
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## Directories where results are stored.
# Primary location: the shared experiment directory.
EXPERIMENT_DIR = "/data/bodyct/experiments/lung-malignancy-fairness-shaurya"
# Backup location: local Teams/OneDrive copy of the results.
TEAMS_DIR = "C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results"

# Use the experiment directory when it is reachable from this machine and fall
# back to the Teams backup otherwise. This replaces the previous manual switch
# (commenting a second NLST_PREDS assignment in or out).
RESULTS_ROOT = EXPERIMENT_DIR if os.path.isdir(EXPERIMENT_DIR) else TEAMS_DIR
NLST_PREDS = f"{RESULTS_ROOT}/nlst"

In [3]:
# Read sybil_fn_brock_top25.csv from the NLST predictions folder and preview
# the first rows.
fn_top25_path = f"{NLST_PREDS}/sybil_fn_brock_top25.csv"
df = pd.read_csv(fn_top25_path)
df.head()

Unnamed: 0.1,Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,LesionID,Spiculation,Diameter [mm],Age,Gender,FamilyHistoryLungCa,...,Calcified,GroundGlassOpacity,NonSolid,PartSolid,Perifissural,SemiSolid,Solid,LC_stage,Sybil_pred_label,Sybil_PanCan_diff
0,1464,115238,20000102,1.2.840.113654.2.55.15839158435375720096247945...,1,False,63.7,68,1,True,...,False,False,False,False,False,False,True,310.0,0,-0.722076
1,4260,212718,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.298279943701...,1,False,27.4,64,2,True,...,False,False,False,True,False,False,False,,0,-0.612852
2,3389,200628,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.184145865626...,1,False,24.1,58,2,True,...,False,False,False,False,False,False,True,400.0,0,-0.500824
3,262,102806,19990102,1.2.840.113654.2.55.24192404935399215978590385...,1,False,63.1,58,1,False,...,False,False,False,False,False,False,True,400.0,0,-0.433243
4,2105,121786,20000102,1.2.840.113654.2.55.14538535262458232659682438...,1,False,36.3,65,1,False,...,False,False,False,False,False,False,True,400.0,0,-0.418499


In [4]:
# Columns shown in the per-case overview table.
# Left out for now: 'Squamous_cell_carcinoma', 'Large_cell_carcinoma',
# 'diagcopd', 'NoduleInUpperLung', 'Solid'.
overview_cols = [
    'SeriesInstanceUID', 'PanCan2b', 'sybil_year1',
    'Age', 'Gender', 'race', 'weight', 'BMI',
    'Emphysema', 'Adenocarcinoma', 'pkyr', 'pipe', 'cigar', 'diaghype', 'LC_stage',
    'wrknomask', 'wrkfoun', 'wrkasbe', 'diaghear',
]
df[overview_cols]

Unnamed: 0,SeriesInstanceUID,PanCan2b,sybil_year1,Age,Gender,race,weight,BMI,Emphysema,Adenocarcinoma,pkyr,pipe,cigar,diaghype,LC_stage,wrknomask,wrkfoun,wrkasbe,diaghear
0,1.2.840.113654.2.55.15839158435375720096247945...,0.724267,0.002192,68,1,,,,False,False,141.0,,,,310.0,False,,,
1,1.3.6.1.4.1.14519.5.2.1.7009.9004.298279943701...,0.670269,0.057416,64,2,1.0,110.0,20.117066,False,False,88.0,0.0,0.0,0.0,,False,0.0,0.0,0.0
2,1.3.6.1.4.1.14519.5.2.1.7009.9004.184145865626...,0.518817,0.017994,58,2,1.0,155.0,26.602783,True,True,43.0,0.0,0.0,0.0,400.0,False,0.0,0.0,0.0
3,1.2.840.113654.2.55.24192404935399215978590385...,0.461365,0.028122,58,1,1.0,145.0,22.70773,True,False,55.0,0.0,0.0,0.0,400.0,True,0.0,0.0,0.0
4,1.2.840.113654.2.55.14538535262458232659682438...,0.47272,0.054222,65,1,1.0,200.0,27.121914,False,False,61.5,0.0,0.0,0.0,400.0,False,0.0,0.0,1.0
5,1.2.840.113654.2.55.26313206686064528258313402...,0.387788,0.007145,60,1,2.0,173.0,24.125967,False,False,39.0,0.0,1.0,1.0,320.0,False,0.0,1.0,0.0
6,1.3.6.1.4.1.14519.5.2.1.7009.9004.535429846806...,0.377434,0.002824,70,2,1.0,150.0,25.744629,True,False,67.5,0.0,0.0,0.0,310.0,False,0.0,0.0,0.0
7,1.3.6.1.4.1.14519.5.2.1.7009.9004.652542111531...,0.417392,0.043411,62,1,1.0,225.0,29.681929,False,False,36.0,1.0,0.0,1.0,110.0,False,0.0,0.0,0.0
8,1.2.840.113654.2.55.28713721257308594954412027...,0.382956,0.028122,61,2,1.0,145.0,27.394518,False,True,50.0,0.0,0.0,1.0,400.0,False,0.0,0.0,0.0
9,1.3.6.1.4.1.14519.5.2.1.7009.9004.216648624768...,0.351782,0.007145,72,2,1.0,210.0,27.703134,True,True,52.0,0.0,0.0,0.0,110.0,False,0.0,0.0,0.0


In [5]:
# Sort keys, each paired with its direction (True = ascending).
sort_keys = ['race', 'Emphysema', 'Squamous_cell_carcinoma', 'Gender', 'BMI', 'diaghype']
sort_ascending = [False, False, True, True, False, True]

# Columns displayed for the sorted table.
# Left out for now: 'wrknomask', 'wrkfoun', 'wrkasbe', 'diaghear',
# 'Large_cell_carcinoma', 'diagcopd', 'NoduleInUpperLung', 'Solid'.
sorted_view_cols = [
    'SeriesInstanceUID',
    'Age', 'Gender', 'race', 'weight', 'BMI',
    'Emphysema', 'Adenocarcinoma', 'pkyr', 'pipe', 'cigar', 'diaghype', 'LC_stage',
    'Squamous_cell_carcinoma',
]

df_sorted = df.sort_values(by=sort_keys, ascending=sort_ascending)
df_sorted[sorted_view_cols]

Unnamed: 0,SeriesInstanceUID,Age,Gender,race,weight,BMI,Emphysema,Adenocarcinoma,pkyr,pipe,cigar,diaghype,LC_stage,Squamous_cell_carcinoma
11,1.2.840.113654.2.55.67095101227495808685302586...,58,2,3.0,105.0,20.504167,True,False,38.0,0.0,0.0,1.0,310.0,True
21,1.3.6.1.4.1.14519.5.2.1.7009.9004.611567201165...,58,2,2.0,151.0,22.956964,True,False,37.0,0.0,0.0,0.0,320.0,False
5,1.2.840.113654.2.55.26313206686064528258313402...,60,1,2.0,173.0,24.125967,False,False,39.0,0.0,1.0,1.0,320.0,False
20,1.3.6.1.4.1.14519.5.2.1.7009.9004.117168587881...,60,1,1.0,243.0,34.863061,True,False,37.5,0.0,0.0,0.0,320.0,False
16,1.3.6.1.4.1.14519.5.2.1.7009.9004.336832610089...,58,1,1.0,213.0,31.451166,True,False,51.0,1.0,0.0,1.0,210.0,False
22,1.3.6.1.4.1.14519.5.2.1.7009.9004.351278480511...,57,1,1.0,213.0,31.451166,True,False,51.0,1.0,0.0,1.0,210.0,False
18,1.3.6.1.4.1.14519.5.2.1.7009.9004.145215505102...,61,1,1.0,200.0,28.693878,True,False,67.5,1.0,1.0,0.0,110.0,False
3,1.2.840.113654.2.55.24192404935399215978590385...,58,1,1.0,145.0,22.70773,True,False,55.0,0.0,0.0,0.0,400.0,False
10,1.3.6.1.4.1.14519.5.2.1.7009.9004.181715871926...,71,2,1.0,195.0,38.079167,True,False,47.0,0.0,0.0,0.0,400.0,False
13,1.2.840.113654.2.55.17570252566187162689078480...,56,2,1.0,205.0,31.166739,True,True,105.0,0.0,0.0,0.0,220.0,False


In [6]:
# Hand-picked ordering of cases, expressed as positional indices that are later
# passed to `df.iloc[...]`. The line breaks keep the original grouping.
# NOTE(review): the groups presumably correspond to priority tiers — confirm.
priority_idxs = [
    24, 20, 1, 2, 6, 21, 5, 0, 14, 17,
    11, 13, 16, 22,
    7, 9,
    3, 4, 8, 10, 12, 15, 18, 19, 23,
]

# The list is typed by hand, so guard against a duplicated or forgotten index:
# it must contain every position 0..len-1 exactly once.
assert sorted(priority_idxs) == list(range(len(priority_idxs))), (
    "priority_idxs must list every position exactly once (duplicate or missing index)"
)

In [7]:
# Sanity check: show how many entries the hand-written priority ordering has.
len(priority_idxs)

25

In [8]:
# Columns displayed for the prioritised table.
# Left out for now: 'wrknomask', 'wrkfoun', 'wrkasbe', 'diaghear',
# 'Large_cell_carcinoma', 'diagcopd', 'NoduleInUpperLung', 'Solid'.
priority_view_cols = [
    'SeriesInstanceUID',
    'Age', 'Gender', 'race', 'weight', 'BMI',
    'Emphysema', 'Adenocarcinoma', 'pkyr', 'pipe', 'cigar', 'diaghype', 'LC_stage',
    'Squamous_cell_carcinoma',
]

# Reorder the rows positionally by the priority list, then keep the columns above.
df_prioritized = df.iloc[priority_idxs]
df_prioritized[priority_view_cols]

Unnamed: 0,SeriesInstanceUID,Age,Gender,race,weight,BMI,Emphysema,Adenocarcinoma,pkyr,pipe,cigar,diaghype,LC_stage,Squamous_cell_carcinoma
24,1.2.840.113654.2.55.81136962133262551156371928...,59,1,1.0,178.0,28.726814,False,False,60.0,1.0,0.0,1.0,310.0,False
20,1.3.6.1.4.1.14519.5.2.1.7009.9004.117168587881...,60,1,1.0,243.0,34.863061,True,False,37.5,0.0,0.0,0.0,320.0,False
1,1.3.6.1.4.1.14519.5.2.1.7009.9004.298279943701...,64,2,1.0,110.0,20.117066,False,False,88.0,0.0,0.0,0.0,,False
2,1.3.6.1.4.1.14519.5.2.1.7009.9004.184145865626...,58,2,1.0,155.0,26.602783,True,True,43.0,0.0,0.0,0.0,400.0,False
6,1.3.6.1.4.1.14519.5.2.1.7009.9004.535429846806...,70,2,1.0,150.0,25.744629,True,False,67.5,0.0,0.0,0.0,310.0,False
21,1.3.6.1.4.1.14519.5.2.1.7009.9004.611567201165...,58,2,2.0,151.0,22.956964,True,False,37.0,0.0,0.0,0.0,320.0,False
5,1.2.840.113654.2.55.26313206686064528258313402...,60,1,2.0,173.0,24.125967,False,False,39.0,0.0,1.0,1.0,320.0,False
0,1.2.840.113654.2.55.15839158435375720096247945...,68,1,,,,False,False,141.0,,,,310.0,True
14,1.3.6.1.4.1.14519.5.2.1.7009.9004.265076645294...,63,1,1.0,175.0,25.84016,True,False,82.0,1.0,1.0,0.0,320.0,True
17,1.2.840.113654.2.55.55905915368746417680608580...,59,1,1.0,165.0,22.375579,False,False,76.0,0.0,1.0,0.0,110.0,True


In [8]:
# Load nlst_demo_v4_cols.json from the NLST predictions folder into
# `nlst_democols` (later indexed as nlst_democols['cat'][...]).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{NLST_PREDS}/nlst_demo_v4_cols.json') as json_data:
    nlst_democols = json.load(json_data)

In [10]:
# Print the SeriesInstanceUIDs in priority order as a 1-based numbered list.
series_list = df['SeriesInstanceUID'].iloc[priority_idxs].tolist()
for rank, series_id in enumerate(series_list, start=1):
    print(f"{rank}) \t {series_id}")

1) 	 1.2.840.113654.2.55.81136962133262551156371928517766508555
2) 	 1.3.6.1.4.1.14519.5.2.1.7009.9004.117168587881036721196334662566
3) 	 1.3.6.1.4.1.14519.5.2.1.7009.9004.298279943701275302166863540472
4) 	 1.3.6.1.4.1.14519.5.2.1.7009.9004.184145865626087655808498226297
5) 	 1.3.6.1.4.1.14519.5.2.1.7009.9004.535429846806879730758167799634
6) 	 1.3.6.1.4.1.14519.5.2.1.7009.9004.611567201165109552550650414957
7) 	 1.2.840.113654.2.55.263132066860645282583134020193623337530
8) 	 1.2.840.113654.2.55.158391584353757200962479457670501546291
9) 	 1.3.6.1.4.1.14519.5.2.1.7009.9004.265076645294278420451491835992
10) 	 1.2.840.113654.2.55.55905915368746417680608580988846234578
11) 	 1.2.840.113654.2.55.67095101227495808685302586770473894957
12) 	 1.2.840.113654.2.55.175702525661871626890784804368160596565
13) 	 1.3.6.1.4.1.14519.5.2.1.7009.9004.336832610089461934819661898582
14) 	 1.3.6.1.4.1.14519.5.2.1.7009.9004.351278480511322788417778234687
15) 	 1.3.6.1.4.1.14519.5.2.1.7009.9004.65254211

In [5]:
# Load nlst_demov4_allmodels_cal.csv through NLST_PREDS, like the other reads
# in this notebook, so the results-directory switch in the config cell applies
# here too instead of always using the Teams backup path.
# NOTE: this rebinds `df`. Cells above that use `df.iloc[priority_idxs]` expect
# the top-25 table and should not be re-run after this cell.
df = pd.read_csv(f"{NLST_PREDS}/nlst_demov4_allmodels_cal.csv")

In [None]:
# SeriesInstanceUID of the scan to inspect; fill in before running this cell.
# With the empty default no row matches and the result is an empty table.
series_id = ""

# Column groups shown for the selected series, in display order.
id_cols = ['PanCan2b', 'NoduleID', 'LesionID', 'AnnotationID']
pancan_cols = ['label', 'Age', 'Gender', 'race', 'FamilyHistoryLungCa', 'height', 'Emphysema', 'Diameter_mm', 'NoduleInUpperLung', 'PartSolid', 'NoduleCounts', 'Spiculation']
coord_cols = ['CoordX', 'CoordY', 'CoordZ']
detail_cols = (
    id_cols
    + pancan_cols
    + nlst_democols['cat']['nodule']
    + coord_cols
    + nlst_democols['cat']['lungcanc']
)

# Select rows with a boolean mask rather than building a query string from
# `series_id`; the mask compares the value directly, so quoting inside the
# value cannot break the expression. Transpose so each field is one row.
df[df['SeriesInstanceUID'] == series_id][detail_cols].T

Unnamed: 0,15920
PanCan2b,0.081762
NoduleID,218357_1
LesionID,1
AnnotationID,218357_1_19990102
label,1
Age,66
Gender,1
race,2.0
FamilyHistoryLungCa,False
height,71.0


In [10]:
# Fields shown for the selected series. 'Squamous_cell_carcinoma' used to be
# listed twice, which produced a duplicated row in the transposed output; it is
# now listed once.
series_view_cols = [
    'SeriesInstanceUID',
    'Age', 'Gender', 'race', 'weight', 'BMI',
    'Emphysema', 'Adenocarcinoma', 'pkyr', 'pipe', 'cigar', 'diaghype', 'LC_stage',
    'Squamous_cell_carcinoma',
    'wrknomask', 'wrkfoun', 'wrkasbe', 'diaghear',
    'Large_cell_carcinoma', 'diagcopd', 'NoduleInUpperLung', 'Solid',
]

# Select rows with a boolean mask rather than building a query string from
# `series_id`, then transpose so each field is one row.
df[df['SeriesInstanceUID'] == series_id][series_view_cols].T

Unnamed: 0,5589
SeriesInstanceUID,1.2.840.113654.2.55.10728645898414765445001934...
Age,61
Gender,2
race,3.0
weight,162.0
BMI,26.144628
Emphysema,False
Adenocarcinoma,False
pkyr,37.0
pipe,0.0
