In [3]:
import pandas as pd
import os
import numpy as np
import json

import seaborn as sns
sns.set_style("white")
from evalutils.roc import get_bootstrapped_roc_ci_curves
import matplotlib.pyplot as plt

import sys
sys.path.append('../')
import utils

%matplotlib inline
import sklearn.metrics as skl_metrics

## directory where results are
CHANSEY_DIR = f"V:/experiments/lung-malignancy-fairness-shaurya"
NLST_PREDS = f"{CHANSEY_DIR}/nlst"

TEAMS_DIR = "C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst"
NLST_PREDS = TEAMS_DIR ## Comment out if not using Teams backup (aka Chansey is up :)

In [4]:
with open(f'{NLST_PREDS}/nlst_demo_v4_cols.json') as json_data:
    nlst_democols = json.load(json_data)
    json_data.close()

nlst_democols

{'num': {'demo': ['height', 'weight', 'Age', 'BMI'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr'],
  'nodule': ['CoordX', 'CoordY', 'CoordZ'],
  'other': ['NoduleCounts', 'Diameter [mm]'],
  'lungcanc': ['LC_stage']},
 'cat': {'demo': ['marital',
   'educat',
   'race',
   'ethnic',
   'Gender',
   'Married',
   'NonHispanicWhite',
   'HighSchoolPlus'],
  'smoke': ['cigar', 'cigsmok', 'pipe', 'smokelive', 'smokework'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'diagtube'],
  'canchist': ['cancblad',
   'cancbrea',
   'canccerv',
   'canccolo',
   'cancesop',
   'canckidn',
   'canc

In [5]:
df = pd.read_csv(f"{NLST_PREDS}/nlst_demov4_allmodels_cal.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Columns: 123 entries, PatientID to LC_stage
dtypes: bool(25), float64(84), int64(11), object(3)
memory usage: 12.4+ MB


## Aggregations

In [6]:
# worktypes = ['asbe', 'baki', 'butc', 'chem', 'coal', 'cott', 'farm', 'fire', 'flou', 'foun', 'hard', 'pain', 'sand', 'weld']
# for t in worktypes:
#     nomask = df[f'res{t}'] == 0
#     df[f'wrknomask{t}'] = nomask

# df['wrknomask'] = df[[f'wrknomask{t}' for t in worktypes]].any(axis=1)

In [7]:
# df['NonHispanicWhite'] = (df['race'] == 1.0) & (df['ethnic'] == 2.0)
# df['NonHispanicWhite'].value_counts(dropna=False)

In [8]:
# df['HighSchoolPlus'] = (df['educat'] != 1.0)
# df['HighSchoolPlus'].value_counts(dropna=False)

In [9]:
# df['Married'] = (df['marital'] == 2.0)
# df['Married'].value_counts(dropna=False)

In [10]:
onlycancertypes = list(set(nlst_democols['cat']['canchist']) - set(['FamilyHistoryLungCa'])) 
df['PersonalCancerHist'] = df[onlycancertypes].any(axis=1)
df['PersonalCancerHist'].value_counts(dropna=False)

PersonalCancerHist
False    15361
True       716
Name: count, dtype: int64

In [11]:
df['WhiteOrBlack'] = df['race'].replace([3, 4, 5, 6], value=np.nan, inplace=False)
df['WhiteOrBlack'].value_counts()
nlst_democols['cat']['demo'].append('WhiteOrBlack')

In [12]:
df['Overweight'] = (df['BMI'] >= 25)
nlst_democols['cat']['demo'].append('Overweight')

In [13]:
df['Unfinished_ed'] = (df['educat'] == 2) | (df['educat'] == 5)
nlst_democols['cat']['demo'].append('Unfinished_ed')

## Save

In [14]:
nlst_democols['num'].pop('lungcanc')

['LC_stage']

In [15]:
for typ in ['cat', 'num']:
    for c in nlst_democols[typ]:
        nlst_democols[typ][c] = list(set(nlst_democols[typ][c]))

nlst_democols

{'num': {'demo': ['BMI', 'Age', 'height', 'weight'],
  'smoke': ['smokeage', 'smokeday', 'smokeyr', 'pkyr'],
  'nodule': ['CoordX', 'CoordZ', 'CoordY'],
  'other': ['NoduleCounts', 'Diameter [mm]']},
 'cat': {'demo': ['Overweight',
   'educat',
   'Gender',
   'Married',
   'HighSchoolPlus',
   'NonHispanicWhite',
   'Unfinished_ed',
   'WhiteOrBlack',
   'marital',
   'ethnic',
   'race'],
  'smoke': ['smokelive', 'cigar', 'cigsmok', 'smokework', 'pipe'],
  'work': ['wrkbaki',
   'wrkfoun',
   'wrkchem',
   'wrkasbe',
   'wrkfire',
   'wrksand',
   'wrkfarm',
   'wrkcoal',
   'wrkpain',
   'wrkweld',
   'wrkflou',
   'wrkbutc',
   'wrkhard',
   'wrkcott'],
  'disease': ['diagasbe',
   'diagchas',
   'diagpneu',
   'diagstro',
   'diagemph',
   'diagbron',
   'diagsili',
   'diagsarc',
   'diaghear',
   'diagdiab',
   'diagadas',
   'diagcopd',
   'diagfibr',
   'diagtube',
   'diaghype',
   'diagchro'],
  'canchist': ['canckidn',
   'cancphar',
   'canccolo',
   'cancoral',
   'cancpa

In [16]:
df

Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,CoordX,CoordY,CoordZ,LesionID,Spiculation,Diameter [mm],Age,...,GroundGlassOpacity,NonSolid,PartSolid,Perifissural,SemiSolid,Solid,LC_stage,WhiteOrBlack,Overweight,Unfinished_ed
0,100019,20000102,1.2.840.113654.2.55.12567845460924968711329545...,-103.19,74.01,-1129.37,1,False,10.6,62,...,False,False,False,False,False,True,,1.0,False,True
1,100035,20000102,1.2.840.113654.2.55.33941066798745864774354503...,127.36,-2.95,-184.57,1,False,4.8,56,...,False,False,False,False,False,True,,1.0,False,False
2,100035,20010102,1.2.840.113654.2.55.17844441025190420803491337...,131.75,19.73,-212.86,1,False,4.5,57,...,False,False,False,False,False,True,,1.0,False,False
3,100040,20000102,1.2.840.113654.2.55.29596035092976721211031255...,-20.26,-132.44,-155.64,1,False,5.3,61,...,False,False,False,False,False,True,,1.0,True,False
4,100040,20010102,1.2.840.113654.2.55.13087519151671351584835330...,-8.00,-146.34,-173.76,1,False,5.8,62,...,False,False,False,False,False,True,,1.0,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16072,218860,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.326137593491...,-37.67,100.27,-175.23,1,False,5.4,69,...,False,False,False,False,False,True,,1.0,True,False
16073,218860,20010102,1.3.6.1.4.1.14519.5.2.1.7009.9004.166160744798...,-34.33,142.29,-116.53,1,False,5.3,70,...,False,False,False,False,False,True,,1.0,True,False
16074,218862,19990102,1.3.6.1.4.1.14519.5.2.1.7009.9004.323224967275...,30.22,138.38,1481.77,1,False,17.7,57,...,False,False,False,False,False,True,,1.0,True,False
16075,218862,20000102,1.3.6.1.4.1.14519.5.2.1.7009.9004.223578926541...,19.95,137.02,1657.81,1,False,5.7,58,...,False,False,False,False,False,True,,1.0,True,False


In [17]:
with open(f'{NLST_PREDS}/nlst_demo_v4_cols.json', 'w') as f:
    json.dump(nlst_democols, f)

In [19]:
cols_num = sum(nlst_democols['num'].values(), start=[])
cols_cat = sum(nlst_democols['cat'].values(), start=[])
cols_list = cols_num + cols_cat + nlst_democols['info'] + ['AnnotationID']
len(cols_list)

99

In [20]:
df[cols_list].to_csv(f'{NLST_PREDS}/nlst_demo_v4_csv', index=False)

In [21]:
df.to_csv(f"{NLST_PREDS}/nlst_demov4_allmodels_cal.csv", index=False)