In [1]:
import pandas as pd
import os
import numpy as np
import json

import seaborn as sns
sns.set_style("white")
from evalutils.roc import get_bootstrapped_roc_ci_curves
import matplotlib.pyplot as plt

import sys
sys.path.append('../')
import utils

%matplotlib inline
import sklearn.metrics as skl_metrics
from IPython.display import display, Markdown

import scipy.stats

## Directories where the results are.
# Location of the experiment results on the server.
EXPERIMENT_DIR = "/data/bodyct/experiments/lung-malignancy-fairness-shaurya"
# Teams/OneDrive backup copy of the NLST results.
TEAMS_DIR = "C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst"

# Explicit switch instead of commenting a line in/out:
# set to False if not using the Teams backup (aka Chansey is up :)
USE_TEAMS_BACKUP = True
NLST_PREDS = TEAMS_DIR if USE_TEAMS_BACKUP else f"{EXPERIMENT_DIR}/nlst"

In [5]:
# Read the all-models prediction table from the results directory and
# print its schema summary (row count, column count, dtypes).
nlst_preds_csv = f"{NLST_PREDS}/nlst_demov3_allmodels_cal.csv"
nlst_preds = pd.read_csv(nlst_preds_csv)
nlst_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Columns: 108 entries, PatientID to Thijmen_mean_cal
dtypes: bool(10), float64(83), int64(11), object(4)
memory usage: 12.2+ MB


In [4]:
# Load the column-group definitions: a nested dict whose 'num' / 'cat'
# entries map a group name to a list of column names (an 'info' entry is
# also read further down). The `with` block closes the file on exit, so
# no explicit close() call is needed.
with open(f'{NLST_PREDS}/nlst_demo_v3_cols.json') as json_data:
    nlst_democols = json.load(json_data)

In [6]:
# Load the full participant table and keep only participants that have at
# least one row in the prediction table (participant `pid` matches the
# predictions' `PatientID`).
# NOTE(review): the recorded output of this cell echoes the read_csv line,
# which looks like pandas' mixed-type DtypeWarning; low_memory=False makes
# pandas infer each column's dtype from the whole file. Confirm that this was
# the warning being raised.
nlst_dict_full = pd.read_csv(f"{NLST_PREDS}/participant_d040722.csv", low_memory=False)
in_predictions = nlst_dict_full['pid'].isin(nlst_preds['PatientID'])
df = nlst_dict_full[in_predictions].reset_index(drop=True)
df.info()

  nlst_dict_full = pd.read_csv(f"{NLST_PREDS}/participant_d040722.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5282 entries, 0 to 5281
Columns: 356 entries, cen to evpsent
dtypes: float64(283), int64(64), object(9)
memory usage: 14.3+ MB


In [12]:
# Participant-table columns whose value counts are tabulated by sex in the
# next cell (presumably lung-cancer descriptors, going by the `lc_` name).
lc_cols = [
    'de_type',
    'de_stag_7thed',
    'histology_cat',
    'de_grade',
]

In [27]:
# For every lung-cancer column, show one table with a row per sex
# (gender code 1 -> 'M', 2 -> 'F') and a column per observed value,
# NaN included.
for col in lc_cols:
    counts_per_sex = []
    for sex_code in (1, 2):
        values_for_sex = df.loc[df['gender'] == sex_code, col]
        counts_per_sex.append(values_for_sex.value_counts(dropna=False))
    display(pd.DataFrame(counts_per_sex, index=['M', 'F']))

de_type,NaN,8140.0,8070.0,8046.0,8250.0,8041.0,8012.0,8560.0,8013.0,8252.0,...,8240.0,8323.0,8084.0,8075.0,8490.0,8083.0,8072.0,8000.0,8045.0,8570.0
M,2611.0,143.0,102.0,31.0,30.0,19.0,9.0,8.0,8.0,7.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,
F,1991.0,120.0,27.0,24.0,46.0,12.0,6.0,3.0,5.0,6.0,...,3.0,1.0,,,2.0,,,2.0,,1.0


de_stag_7thed,NaN,110.0,400.0,310.0,210.0,120.0,320.0,220.0,900.0,999.0
M,2618,179,69,46,39,35,13,9,1,1
F,1991,157,27,35,11,29,7,12,1,2


histology_cat,NaN,3.0,2.0,4.0,9.0,1.0,5.0,6.0,8.0
M,2611,160,110,42,39,22,17,8,1
F,1991,132,32,57,29,14,11,3,3


de_grade,NaN,4.0,3.0,2.0,6.0,9.0,1.0,5.0,8.0
M,2608,133,122,48,30,30,20,11,8
F,1988,72,76,57,20,33,12,8,6


In [30]:
# Suffixes of the `loc<suffix>` indicator columns in the participant table.
locations = [
    'car', 'lhil', 'lin', 'llow', 'lmsb', 'lup', 'med',
    'oth', 'rhil', 'rlow', 'rmid', 'rmsb', 'rup', 'unk',
]

# Count, per sex (gender code 1 -> 'M', 2 -> 'F'), the participants whose
# `loc<suffix>` flag equals 1.
location_counts = {}
for suffix in locations:
    flagged = df[f'loc{suffix}'] == 1
    location_counts[suffix] = [
        int((flagged & (df['gender'] == sex_code)).sum()) for sex_code in (1, 2)
    ]
pd.DataFrame(location_counts, index=['M', 'F'])

Unnamed: 0,car,lhil,lin,llow,lmsb,lup,med,oth,rhil,rlow,rmid,rmsb,rup,unk
M,2,7,1,55,4,91,4,4,8,66,28,3,150,4
F,2,5,2,43,4,70,6,2,4,55,18,1,103,1


In [31]:
# Subset of location indicator columns (full column names this time),
# tabulated the same way: participants per sex with the flag equal to 1.
use_locations = [
    'locllow',
    'loclup',
    'locrlow',
    'locrup',
    'locrmid',
]

is_sex = {label: df['gender'] == sex_code for label, sex_code in (('M', 1), ('F', 2))}
use_location_counts = {
    loc_col: [int(((df[loc_col] == 1) & is_sex[label]).sum()) for label in ('M', 'F')]
    for loc_col in use_locations
}
pd.DataFrame(use_location_counts, index=['M', 'F'])

Unnamed: 0,locllow,loclup,locrlow,locrup,locrmid
M,55,91,66,150,28
F,43,70,55,103,18


In [46]:
# Lookup from the numeric `histology_cat` code to a readable label.
# The codes run contiguously from 1 to 9, so the labels are listed in code
# order and numbered with enumerate(start=1).
histology_cat_key = dict(enumerate(
    [
        "Small_cell_carcinoma",           # 1
        "Squamous_cell_carcinoma",        # 2
        "Adenocarcinoma",                 # 3
        "Bronchiolo-alveolar_carcinoma",  # 4
        "Large_cell_carcinoma",           # 5
        "Adenosquamous_carcinoma",        # 6
        "Pleomorphic/sarcomatoid",        # 7
        "Carcinoid_tumor",                # 8
        "Unclassified_carcinoma",         # 9
    ],
    start=1,
))

In [35]:
# Translate the numeric histology code into its label; values without an
# entry in the lookup (e.g. NaN) are left unchanged by `replace`.
df['LC_type'] = df['histology_cat'].replace(to_replace=histology_cat_key)

# Same per-sex tabulation as before, now on the readable labels
# (gender code 1 -> 'M', 2 -> 'F').
lc_type_counts = [
    df.loc[df['gender'] == sex_code, 'LC_type'].value_counts(dropna=False)
    for sex_code in (1, 2)
]
display(pd.DataFrame(lc_type_counts, index=['M', 'F']))

LC_type,NaN,Adenocarcinoma,Squamous_cell_carcinoma,Bronchiolo-alveolar_carcinoma,Unclassified_carcinoma,Small_cell_carcinoma,Large_cell_carcinoma,Adenosquamous_carcinoma,Carcinoid_tumor
M,2611,160,110,42,39,22,17,8,1
F,1991,132,32,57,29,14,11,3,3


In [None]:
# One-hot encode `LC_type`: the column is replaced by one boolean column per
# observed label. With an empty prefix and separator the new columns are named
# exactly after the labels.
type_cols = pd.get_dummies(
    df,
    columns=['LC_type'],
    prefix='',
    prefix_sep='',
)
type_cols

Unnamed: 0,cen,dataset_version,elig,ineligible,pid,rndgroup,study,age,educat,ethnic,...,evpsel,evpsent,Adenocarcinoma,Adenosquamous_carcinoma,Bronchiolo-alveolar_carcinoma,Carcinoid_tumor,Large_cell_carcinoma,Small_cell_carcinoma,Squamous_cell_carcinoma,Unclassified_carcinoma
0,AV,2011.02.03/04.07.22,2,,100012,1,1,61,7,2,...,,,True,False,False,False,False,False,False,False
1,AF,2011.02.03/04.07.22,2,,100019,1,1,61,5,2,...,,,False,False,False,False,False,False,False,False
2,BD,2011.02.03/04.07.22,2,,100035,1,1,55,4,2,...,,,False,False,False,False,False,False,False,False
3,BC,2011.02.03/04.07.22,2,,100040,1,1,60,7,2,...,,,False,False,False,False,False,False,False,False
4,AR,2011.02.03/04.07.22,2,,100053,1,1,62,2,2,...,,,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,AE,2011.02.03/04.07.22,2,,218860,1,3,68,3,2,...,,,False,False,False,False,False,False,False,False
5278,AM,2011.02.03/04.07.22,2,,218862,1,3,57,4,2,...,,,False,False,False,False,False,False,False,False
5279,BF,2011.02.03/04.07.22,2,,218863,1,2,62,3,2,...,,,False,False,False,False,False,False,False,False
5280,AG,2011.02.03/04.07.22,2,,218866,1,2,57,7,2,...,1.0,1.0,False,False,False,False,False,False,False,False


In [40]:
# Rebind `df` to the one-hot encoded frame, so from here on `df` no longer
# has an `LC_type` column but one indicator column per cancer-type label.
# NOTE(review): reusing the name makes the earlier cells non-idempotent;
# re-running the get_dummies cell after this one fails on the missing column.
df = type_cols

In [47]:
# Register the cancer-type indicator columns as a new categorical group.
# "Pleomorphic/sarcomatoid" is left out — presumably because histology code 7
# does not occur in this cohort, so the one-hot encoding produced no such
# column (TODO confirm against the value counts above).
excluded_label = "Pleomorphic/sarcomatoid"
nlst_democols['cat']['lungcanc'] = [
    label for label in histology_cat_key.values() if label != excluded_label
]
nlst_democols

{'num': {'demo': ['height', 'weight', 'Age', 'BMI'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr'],
  'nodule': ['CoordX', 'CoordY', 'CoordZ'],
  'other': ['NoduleCounts', 'Diameter [mm]']},
 'cat': {'demo': ['marital',
   'educat',
   'race',
   'ethnic',
   'Gender',
   'Married',
   'NonHispanicWhite',
   'HighSchoolPlus'],
  'smoke': ['cigar', 'cigsmok', 'pipe', 'smokelive', 'smokework'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'diagtube'],
  'canchist': ['cancblad',
   'cancbrea',
   'canccerv',
   'canccolo',
   'cancesop',
   'canckidn',
   'canclary',
   'canclung',
   'ca

In [42]:
# Expose the participant id under the name used by the prediction table,
# so the two frames can be merged on a common `PatientID` key.
df['PatientID'] = df['pid']

In [43]:
# Inspect the participant frame after adding the indicator and key columns.
df

Unnamed: 0,cen,dataset_version,elig,ineligible,pid,rndgroup,study,age,educat,ethnic,...,evpsent,Adenocarcinoma,Adenosquamous_carcinoma,Bronchiolo-alveolar_carcinoma,Carcinoid_tumor,Large_cell_carcinoma,Small_cell_carcinoma,Squamous_cell_carcinoma,Unclassified_carcinoma,PatientID
0,AV,2011.02.03/04.07.22,2,,100012,1,1,61,7,2,...,,True,False,False,False,False,False,False,False,100012
1,AF,2011.02.03/04.07.22,2,,100019,1,1,61,5,2,...,,False,False,False,False,False,False,False,False,100019
2,BD,2011.02.03/04.07.22,2,,100035,1,1,55,4,2,...,,False,False,False,False,False,False,False,False,100035
3,BC,2011.02.03/04.07.22,2,,100040,1,1,60,7,2,...,,False,False,False,False,False,False,False,False,100040
4,AR,2011.02.03/04.07.22,2,,100053,1,1,62,2,2,...,,False,False,False,False,False,False,False,False,100053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5277,AE,2011.02.03/04.07.22,2,,218860,1,3,68,3,2,...,,False,False,False,False,False,False,False,False,218860
5278,AM,2011.02.03/04.07.22,2,,218862,1,3,57,4,2,...,,False,False,False,False,False,False,False,False,218862
5279,BF,2011.02.03/04.07.22,2,,218863,1,2,62,3,2,...,,False,False,False,False,False,False,False,False,218863
5280,AG,2011.02.03/04.07.22,2,,218866,1,2,57,7,2,...,1.0,False,False,False,False,False,False,False,False,218866


In [49]:
# Attach the per-participant cancer-type indicator columns to every
# prediction row. A left join keeps all prediction rows, including any
# without a matching participant. `validate='many_to_one'` makes pandas raise
# a MergeError if `PatientID` is not unique on the participant side, instead
# of silently duplicating prediction rows.
lc_merge_cols = nlst_democols['cat']['lungcanc'] + ['PatientID']
nlst_preds_plus_lc = nlst_preds.merge(
    df[lc_merge_cols],
    on='PatientID',
    how='left',
    validate='many_to_one',
)

In [50]:
# Inspect the merged frame: prediction columns followed by the cancer-type indicators.
nlst_preds_plus_lc

Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,CoordX,CoordY,CoordZ,LesionID,NoduleType,Spiculation,Diameter [mm],...,Thijmen_global_show_cal,Thijmen_mean_cal,Small_cell_carcinoma,Squamous_cell_carcinoma,Adenocarcinoma,Bronchiolo-alveolar_carcinoma,Large_cell_carcinoma,Adenosquamous_carcinoma,Carcinoid_tumor,Unclassified_carcinoma
0,100019,20000102,1.2.840.113654.2.55.12567845460924968711329545...,-103.19,74.01,-1129.37,1,Solid,False,10.6,...,0.231420,,False,False,False,False,False,False,False,False
1,100035,20000102,1.2.840.113654.2.55.33941066798745864774354503...,127.36,-2.95,-184.57,1,Solid,False,4.8,...,0.001354,,False,False,False,False,False,False,False,False
2,100035,20010102,1.2.840.113654.2.55.17844441025190420803491337...,131.75,19.73,-212.86,1,Solid,False,4.5,...,0.003395,,False,False,False,False,False,False,False,False
3,100040,20000102,1.2.840.113654.2.55.29596035092976721211031255...,-20.26,-132.44,-155.64,1,Solid,False,5.3,...,0.033014,,False,False,False,False,False,False,False,False
4,100040,20010102,1.2.840.113654.2.55.13087519151671351584835330...,-8.00,-146.34,-173.76,1,Solid,False,5.8,...,0.062095,,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16072,218860,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.326137593491...,-37.67,100.27,-175.23,1,Solid,False,5.4,...,0.102411,,False,False,False,False,False,False,False,False
16073,218860,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.166160744798...,-34.33,142.29,-116.53,1,Solid,False,5.3,...,0.040909,,False,False,False,False,False,False,False,False
16074,218862,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.323224967275...,30.22,138.38,1481.77,1,Solid,False,17.7,...,0.035349,0.173441,False,False,False,False,False,False,False,False
16075,218862,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.223578926541...,19.95,137.02,1657.81,1,Solid,False,5.7,...,0.006291,0.034600,False,False,False,False,False,False,False,False


In [52]:
# List the distinct nodule-type labels; these become the one-hot column names below.
nlst_preds_plus_lc['NoduleType'].unique()

array(['Solid', 'GroundGlassOpacity', 'Perifissural', 'NonSolid',
       'PartSolid', 'SemiSolid', 'Calcified'], dtype=object)

In [51]:
# One-hot encode `NoduleType`: the column is replaced by one boolean column
# per nodule type, named exactly after the label (empty prefix and separator).
noduletype_cols = pd.get_dummies(
    nlst_preds_plus_lc,
    columns=['NoduleType'],
    prefix='',
    prefix_sep='',
)
noduletype_cols

Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,CoordX,CoordY,CoordZ,LesionID,Spiculation,Diameter [mm],Age,...,Adenosquamous_carcinoma,Carcinoid_tumor,Unclassified_carcinoma,Calcified,GroundGlassOpacity,NonSolid,PartSolid,Perifissural,SemiSolid,Solid
0,100019,20000102,1.2.840.113654.2.55.12567845460924968711329545...,-103.19,74.01,-1129.37,1,False,10.6,62,...,False,False,False,False,False,False,False,False,False,True
1,100035,20000102,1.2.840.113654.2.55.33941066798745864774354503...,127.36,-2.95,-184.57,1,False,4.8,56,...,False,False,False,False,False,False,False,False,False,True
2,100035,20010102,1.2.840.113654.2.55.17844441025190420803491337...,131.75,19.73,-212.86,1,False,4.5,57,...,False,False,False,False,False,False,False,False,False,True
3,100040,20000102,1.2.840.113654.2.55.29596035092976721211031255...,-20.26,-132.44,-155.64,1,False,5.3,61,...,False,False,False,False,False,False,False,False,False,True
4,100040,20010102,1.2.840.113654.2.55.13087519151671351584835330...,-8.00,-146.34,-173.76,1,False,5.8,62,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16072,218860,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.326137593491...,-37.67,100.27,-175.23,1,False,5.4,69,...,False,False,False,False,False,False,False,False,False,True
16073,218860,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.166160744798...,-34.33,142.29,-116.53,1,False,5.3,70,...,False,False,False,False,False,False,False,False,False,True
16074,218862,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.323224967275...,30.22,138.38,1481.77,1,False,17.7,57,...,False,False,False,False,False,False,False,False,False,True
16075,218862,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.223578926541...,19.95,137.02,1657.81,1,False,5.7,58,...,False,False,False,False,False,False,False,False,False,True


In [59]:
# Rebind `nlst_preds` to the enriched frame (cancer-type indicators added,
# `NoduleType` replaced by its one-hot columns).
# NOTE(review): after this the name no longer refers to the table as loaded
# from disk, so re-running the merge cell above would operate on the
# already-enriched frame.
nlst_preds = noduletype_cols

In [54]:
# Swap the single `NoduleType` entry in the categorical nodule group for the
# one-hot column names that replaced it in the data.
# The update is idempotent: a plain remove()/extend() raises ValueError and
# duplicates entries when the cell is run a second time.
print(nlst_democols['cat']['nodule'])
nodule_type_dummies = ['Solid', 'GroundGlassOpacity', 'Perifissural', 'NonSolid', 'PartSolid', 'SemiSolid', 'Calcified']
nodule_cols = nlst_democols['cat']['nodule']  # same list object, updated in place
if 'NoduleType' in nodule_cols:
    nodule_cols.remove('NoduleType')
nodule_cols.extend([name for name in nodule_type_dummies if name not in nodule_cols])
nlst_democols['cat']['nodule']

['Spiculation', 'NoduleType', 'NoduleInUpperLung']


['Spiculation',
 'NoduleInUpperLung',
 'Solid',
 'GroundGlassOpacity',
 'Perifissural',
 'NonSolid',
 'PartSolid',
 'SemiSolid',
 'Calcified']

In [55]:
# Persist the updated column-group definitions as the v4 column file.
with open(f'{NLST_PREDS}/nlst_demo_v4_cols.json', 'w') as cols_file:
    cols_file.write(json.dumps(nlst_democols))

In [56]:
# Flatten the per-group column lists into one list of numeric and one list of
# categorical columns, then append the 'info' columns to get the full export list.
cols_num = [col for group in nlst_democols['num'].values() for col in group]
cols_cat = [col for group in nlst_democols['cat'].values() for col in group]
cols_list = [*cols_num, *cols_cat, *nlst_democols['info']]
len(cols_list)

94

In [60]:
# Export only the selected columns (numeric + categorical + info) as the v4
# table, without writing the row index.
nlst_preds[cols_list].to_csv(f'{NLST_PREDS}/nlst_demo_v4.csv', index=False)

In [61]:
# Write the full enriched frame back to disk.
# NOTE(review): this overwrites the very file this notebook loads at the top
# (nlst_demov3_allmodels_cal.csv). After it runs, the file no longer has a
# `NoduleType` column and already contains the cancer-type indicators, so a
# Restart & Run All would fail at the get_dummies step. Consider writing to a
# new, versioned file name instead — confirm what downstream readers expect
# before changing the path.
nlst_preds.to_csv(f"{NLST_PREDS}/nlst_demov3_allmodels_cal.csv", index=False)