In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown

import sys
sys.path.append('../')

from evalutils.roc import get_bootstrapped_roc_ci_curves
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from utilities import data, roc

## Directories where results are stored.
## Each root can be overridden with an environment variable so the notebook can
## run on another machine without editing this cell; the defaults are the
## original hard-coded locations.
EXPERIMENT_DIR = os.environ.get(
    "MALIGNANCY_EXPERIMENT_DIR",
    "/data/bodyct/experiments/lung-malignancy-fairness-shaurya",
)
TEAMS_DIR = os.environ.get(
    "MALIGNANCY_TEAMS_DIR",
    "C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results",
)

## Set to False if not using the Teams backup (aka Chansey is up :)
## NLST predictions are then read from EXPERIMENT_DIR instead of TEAMS_DIR.
USE_TEAMS_BACKUP = True
NLST_PREDS = f"{TEAMS_DIR}/nlst" if USE_TEAMS_BACKUP else f"{EXPERIMENT_DIR}/nlst"

## Result CSVs written by the analysis cells below go here.
RESULTS_DIR = f"{TEAMS_DIR}/fairness-analysis-results"

## DLCST

In [2]:
# Read the DLCST results CSV; the first row of the file holds the column names.
dlcst_csv_path = f"{TEAMS_DIR}/dlcst/dlcst_thijmen_kiran_sybil_malignancy_estimation_results.csv"
dlcst_preds = pd.read_csv(dlcst_csv_path, header=0)
dlcst_preds

Unnamed: 0,PatientID,StudyDate,SeriesInstanceUID,Age,Sex,FamilyHistoryLungCa,Emphysema,NoduleCountPerScan,sybil_year1,sybil_year2,sybil_year3,sybil_year4,sybil_year5,sybil_year6,PanCan2b,Ensemble_Kiran,thijmen_mean,label
0,4,20050124,1.2.840.113704.1.111.4964.1106577805.10,55,2,0,0,9,0.021629,0.038573,0.071919,0.079270,0.095846,0.135681,0.053366,0.082652,0.166209,0
1,35,20051208,1.2.840.113704.1.111.5776.1134059140.11,56,1,1,1,2,0.001170,0.002554,0.007835,0.011039,0.018442,0.030460,0.009543,0.000408,0.003368,0
2,38,20060109,1.2.840.113704.1.111.2004.1136823831.14,62,2,0,1,4,0.001784,0.003870,0.007835,0.012797,0.019229,0.032957,0.006734,0.002702,0.065888,0
3,47,20051214,1.2.840.113704.1.111.8148.1134579622.14,57,1,0,1,1,0.003951,0.015674,0.025373,0.034010,0.040605,0.058852,0.007944,0.084158,0.423341,0
4,56,20051213,1.2.840.113704.1.111.2744.1134487263.11,64,1,0,1,3,0.000000,0.001574,0.003791,0.006847,0.010381,0.017287,0.000899,0.000013,0.005590,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,4057,20060314,1.2.840.113704.1.111.4796.1142355218.14,69,1,0,1,2,0.168810,0.273494,0.257961,0.294720,0.327062,0.383196,0.380198,0.893933,0.682322,0
595,4063,20060313,1.2.840.113704.1.111.5104.1142267340.10,55,1,0,1,4,0.001965,0.006793,0.013618,0.017289,0.021685,0.035951,0.000000,0.000099,0.103020,0
596,4079,20060328,1.2.840.113704.1.111.1308.1143556124.11,52,2,0,0,1,0.003951,0.011459,0.025373,0.034010,0.040101,0.058852,0.039054,0.070744,0.121373,0
597,4098,20060403,1.2.840.113704.1.111.5848.1144079789.11,54,1,0,1,3,0.117795,0.167839,0.189976,0.217799,0.229753,0.300137,0.134158,0.452513,0.347016,0


In [3]:
# DLCST columns to analyse, keyed first by column type ('cat' = categorical,
# 'num' = numerical) and then by group ('demo' / 'other').
dlcst_democols = dict(
    cat=dict(
        demo=['Sex'],
        other=['FamilyHistoryLungCa', 'Emphysema'],
    ),
    num=dict(
        demo=['Age'],
        other=['NoduleCountPerScan'],
    ),
)

In [4]:
# Run the project's binning helper on the numerical columns. In the recorded
# output it adds thresholded entries ('Age > 61', 'NoduleCountPerScan > 1') to
# the categorical groups while leaving the 'num' groups listed as before.
# NOTE(review): both inputs are overwritten by the result, so re-running this
# cell feeds the already-binned spec back in — confirm the helper tolerates that.
binned_result = data.bin_numerical_columns(dlcst_preds, dlcst_democols)
dlcst_preds, dlcst_democols = binned_result
dlcst_democols

{'cat': {'demo': ['Age > 61', 'Sex'],
  'other': ['Emphysema', 'FamilyHistoryLungCa', 'NoduleCountPerScan > 1']},
 'num': {'demo': ['Age'], 'other': ['NoduleCountPerScan']}}

In [5]:
# Display name of each model -> column in dlcst_preds holding its score.
# Only Sybil's year-1 output is evaluated here; the CSV also carries
# sybil_year2 ... sybil_year6, which can be added as ("Sybil year N", "sybil_yearN").
DLCST_MODELCOLS = dict([
    ("Venkadesh", "Ensemble_Kiran"),
    ("de Haas", "thijmen_mean"),
    ("PanCan2b", "PanCan2b"),
    ("Sybil year 1", "sybil_year1"),
])

In [6]:
# Subgroup AUROC comparison for every DLCST model over the categorical columns.
# The output CSV name carries the number of rows analysed.
dlcst_auroc_csv = f"{RESULTS_DIR}/auroc-dlcst-{len(dlcst_preds)}.csv"
result_df = roc.all_results_subgroups_models(
    dlcst_preds,
    dlcst_democols['cat'],
    models=DLCST_MODELCOLS,
    csvpath=dlcst_auroc_csv,
)
result_df

Unnamed: 0,p,z,Group_1,AUC_1,AUC-CI-lo_1,AUC-CI-hi_1,Group_2,AUC_2,AUC-CI-lo_2,AUC-CI-hi_2,...,Group_1_ben,Group_1_pct,Group_1_pct_mal,Group_2_mal,Group_2_ben,Group_2_pct,Group_2_pct_mal,col,attribute,category
Venkadesh,0.909707,0.113408,True,0.923066,0.868841,0.969003,False,0.928394,0.877238,0.970634,...,133,27.378965,18.902439,28,407,72.621035,6.436782,Ensemble_Kiran,Age > 61,demo
de Haas,0.68605,0.404221,True,0.912013,0.860069,0.964839,False,0.931365,0.887097,0.97577,...,133,27.378965,18.902439,28,407,72.621035,6.436782,thijmen_mean,Age > 61,demo
PanCan2b,0.419335,0.807575,True,0.851815,0.792921,0.915105,False,0.898209,0.841399,0.945332,...,133,27.378965,18.902439,28,407,72.621035,6.436782,PanCan2b,Age > 61,demo
Sybil year 1,0.928492,0.089743,True,0.860758,0.78427,0.930278,False,0.866137,0.785758,0.923974,...,133,27.378965,18.902439,28,407,72.621035,6.436782,sybil_year1,Age > 61,demo
Venkadesh,0.437917,0.775716,1.0,0.908254,0.837521,0.970516,2.0,0.944307,0.908594,0.968446,...,291,53.923205,9.907121,27,249,46.076795,9.782609,Ensemble_Kiran,Sex,demo
de Haas,0.353367,0.928078,1.0,0.90296,0.839925,0.965447,2.0,0.946357,0.913565,0.974769,...,291,53.923205,9.907121,27,249,46.076795,9.782609,thijmen_mean,Sex,demo
PanCan2b,0.35083,-0.932981,1.0,0.909613,0.85165,0.953015,2.0,0.856363,0.792547,0.90008,...,291,53.923205,9.907121,27,249,46.076795,9.782609,PanCan2b,Sex,demo
Sybil year 1,0.88113,-0.149537,1.0,0.866212,0.77639,0.946607,2.0,0.857134,0.782818,0.922584,...,291,53.923205,9.907121,27,249,46.076795,9.782609,sybil_year1,Sex,demo
Venkadesh,0.210925,1.251025,1.0,0.91071,0.853562,0.954101,0.0,0.965888,0.940037,0.988043,...,361,67.612688,10.864198,15,179,32.387312,7.731959,Ensemble_Kiran,Emphysema,other
de Haas,0.508438,0.661272,1.0,0.915596,0.866233,0.960734,0.0,0.948117,0.902625,0.978142,...,361,67.612688,10.864198,15,179,32.387312,7.731959,thijmen_mean,Emphysema,other


## NLST

In [7]:
# NLST predictions table (presumably calibrated scores for all models, going by
# the file name — TODO confirm).
nlst_preds_nodule = pd.read_csv(f"{NLST_PREDS}/nlst_demov4_allmodels_cal.csv")

# Column groupings for NLST, stored as JSON next to the predictions: a dict
# keyed by 'num' / 'cat', each mapping a group name to a list of columns.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{NLST_PREDS}/nlst_demo_v4_cols.json') as json_data:
    nlst_democols = json.load(json_data)

# Drop the 'lungcanc' group from the categorical columns — presumably because
# it is the outcome rather than a subgroup attribute; TODO confirm.
nlst_democols['cat'].pop('lungcanc')
nlst_democols

{'num': {'demo': ['BMI', 'Age', 'height', 'weight'],
  'smoke': ['smokeage', 'smokeday', 'smokeyr', 'pkyr'],
  'nodule': ['CoordX', 'CoordZ', 'CoordY', 'Mean_Entropy_Kiran'],
  'other': ['NoduleCounts', 'Diameter_mm']},
 'cat': {'demo': ['Overweight',
   'educat',
   'Gender',
   'Married',
   'HighSchoolPlus',
   'NonHispanicWhite',
   'Unfinished_ed',
   'WhiteOrBlack',
   'marital',
   'ethnic',
   'race'],
  'smoke': ['smokelive', 'cigar', 'cigsmok', 'smokework', 'pipe'],
  'work': ['wrkbaki',
   'wrkfoun',
   'wrkchem',
   'wrkasbe',
   'wrkfire',
   'wrksand',
   'wrkfarm',
   'wrkcoal',
   'wrkpain',
   'wrkweld',
   'wrkflou',
   'wrkbutc',
   'wrkhard',
   'wrkcott'],
  'disease': ['diagasbe',
   'diagchas',
   'diagpneu',
   'diagstro',
   'diagemph',
   'diagbron',
   'diagsili',
   'diagsarc',
   'diaghear',
   'diagdiab',
   'diagadas',
   'diagcopd',
   'diagfibr',
   'diagtube',
   'diaghype',
   'diagchro'],
  'canchist': ['canckidn',
   'cancphar',
   'canccolo',
   'c

### Nodule-level

In [8]:
# Nodule-level NLST analysis with tijmen=False and sybil=False.
# NOTE(review): nlst_democols is overwritten by the helper's return value and is
# passed in again by later cells — confirm repeated calls are safe.
nlst_preds, nlst_democols, MODELS = data.prep_nlst_preds(
    nlst_preds_nodule,
    nlst_democols,
    scanlevel=False,
    tijmen=False,
    sybil=False,
)
print(len(nlst_preds), " nodules")

# Subgroup AUROC comparison; the CSV name carries the number of rows analysed.
nodule_auroc_csv = f"{RESULTS_DIR}/auroc-nlst-{len(nlst_preds)}.csv"
result_df = roc.all_results_subgroups_models(
    nlst_preds,
    nlst_democols['cat'],
    models=MODELS,
    csvpath=nodule_auroc_csv,
)
result_df

16077  nodules


  se = np.sqrt(


Unnamed: 0,p,z,Group_1,AUC_1,AUC-CI-lo_1,AUC-CI-hi_1,Group_2,AUC_2,AUC-CI-lo_2,AUC-CI-hi_2,...,Group_1_ben,Group_1_pct,Group_1_pct_mal,Group_2_mal,Group_2_ben,Group_2_pct,Group_2_pct_mal,col,attribute,category
Venkadesh,0.185623,1.323639,True,0.902498,0.890859,0.91109,False,0.917669,0.906832,0.927024,...,8398,57.237047,8.737231,445,6430,42.762953,6.472727,DL_cal,Age > 61,demo
de Haas Local,0.474735,0.714797,True,0.892337,0.883819,0.902983,False,0.901062,0.889233,0.912455,...,8398,57.237047,8.737231,445,6430,42.762953,6.472727,Thijmen_local_cal,Age > 61,demo
de Haas Global (hidden nodule),0.358868,0.917524,True,0.858178,0.849051,0.868449,False,0.870622,0.857232,0.882986,...,8398,57.237047,8.737231,445,6430,42.762953,6.472727,Thijmen_global_hidden_cal,Age > 61,demo
de Haas Global (shown nodule),0.187259,-1.318732,True,0.895447,0.88708,0.905291,False,0.878542,0.86528,0.891992,...,8398,57.237047,8.737231,445,6430,42.762953,6.472727,Thijmen_global_show_cal,Age > 61,demo
PanCan2b,0.485667,0.697217,True,0.83037,0.817448,0.841415,False,0.840516,0.823942,0.860333,...,8398,57.237047,8.737231,445,6430,42.762953,6.472727,PanCan2b,Age > 61,demo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venkadesh,0.199439,1.283152,LightSpeed QX/i,0.895073,0.869483,0.915284,Volume Zoom,0.919862,0.89897,0.937618,...,2041,14.194190,10.560911,181,3036,20.009952,5.626360,DL_cal,ManufacturersModelName,scanner
de Haas Local,0.458104,0.741973,LightSpeed QX/i,0.87524,0.851591,0.900752,Volume Zoom,0.891072,0.865797,0.913374,...,2041,14.194190,10.560911,181,3036,20.009952,5.626360,Thijmen_local_cal,ManufacturersModelName,scanner
de Haas Global (hidden nodule),0.852734,0.185631,LightSpeed QX/i,0.863618,0.844484,0.883522,Volume Zoom,0.86781,0.848926,0.882675,...,2041,14.194190,10.560911,181,3036,20.009952,5.626360,Thijmen_global_hidden_cal,ManufacturersModelName,scanner
de Haas Global (shown nodule),0.744952,0.325303,LightSpeed QX/i,0.896691,0.873727,0.913598,Volume Zoom,0.903232,0.884644,0.922774,...,2041,14.194190,10.560911,181,3036,20.009952,5.626360,Thijmen_global_show_cal,ManufacturersModelName,scanner


In [None]:
# Nodule-level NLST analysis with tijmen=True and sybil=False.
nlst_preds, nlst_democols, MODELS = data.prep_nlst_preds(nlst_preds_nodule, nlst_democols, scanlevel=False, tijmen=True, sybil=False)

# The recorded run of this cell printed "3  nodules" and then failed inside the
# ROC helper with "'tuple' object has no attribute 'groupby'", i.e. the first
# returned value was a tuple rather than a table. Fail fast with an actionable
# message instead of erroring deep inside the analysis.
# NOTE(review): the root cause is in data.prep_nlst_preds, which is not visible
# from this notebook — confirm what it returns for this argument combination.
if not isinstance(nlst_preds, pd.DataFrame):
    raise TypeError(
        "data.prep_nlst_preds(..., scanlevel=False, tijmen=True, sybil=False) returned "
        f"{type(nlst_preds).__name__} for the predictions; expected a pandas DataFrame"
    )
print(len(nlst_preds), " nodules")

result_df = roc.all_results_subgroups_models(nlst_preds, nlst_democols['cat'], models=MODELS, csvpath=f"{RESULTS_DIR}/auroc-nlst-{len(nlst_preds)}.csv")
result_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[query_string] = df.eval(query_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[query_string] = df.eval(query_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[query_string] = df.eval(query_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

3  nodules


AttributeError: 'tuple' object has no attribute 'groupby'

### Scan-level

In [None]:
# Scan-level NLST analysis with tijmen=False and sybil=True.
# NOTE(review): nlst_democols is overwritten by the helper's return value and
# was already passed through earlier cells — confirm repeated calls are safe.
nlst_preds, nlst_democols, MODELS = data.prep_nlst_preds(
    nlst_preds_nodule,
    nlst_democols,
    scanlevel=True,
    tijmen=False,
    sybil=True,
)
print(len(nlst_preds), " scans")

# Subgroup AUROC comparison; the CSV name carries the number of rows analysed.
scan_auroc_csv = f"{RESULTS_DIR}/auroc-nlst-{len(nlst_preds)}.csv"
result_df = roc.all_results_subgroups_models(
    nlst_preds,
    nlst_democols['cat'],
    models=MODELS,
    csvpath=scan_auroc_csv,
)
result_df

5911  scans


Unnamed: 0,p,z,Group_1,AUC_1,AUC-CI-lo_1,AUC-CI-hi_1,Group_2,AUC_2,AUC-CI-lo_2,AUC-CI-hi_2,...,Group_1_ben,Group_1_pct,Group_1_pct_mal,Group_2_mal,Group_2_ben,Group_2_pct,Group_2_pct_mal,col,attribute,category
Venkadesh,0.275465,-1.090564,True,0.894097,0.876698,0.910466,False,0.873489,0.845453,0.895642,...,3640,67.873456,9.272183,209,1690,32.126544,11.005793,DL_cal,Overweight,demo
de Haas Local,0.964593,0.04439,True,0.869771,0.851916,0.887,False,0.870638,0.849109,0.890377,...,3640,67.873456,9.272183,209,1690,32.126544,11.005793,Thijmen_local_cal,Overweight,demo
de Haas Global (hidden nodule),0.641519,-0.465576,True,0.800075,0.781818,0.818452,False,0.789479,0.761269,0.817362,...,3640,67.873456,9.272183,209,1690,32.126544,11.005793,Thijmen_global_hidden_cal,Overweight,demo
de Haas Global (shown nodule),0.026891,-2.213102,True,0.86711,0.851198,0.883208,False,0.82043,0.793541,0.844531,...,3640,67.873456,9.272183,209,1690,32.126544,11.005793,Thijmen_global_show_cal,Overweight,demo
PanCan2b,0.000797,-3.353769,True,0.80803,0.788701,0.83022,False,0.729121,0.695184,0.763285,...,3640,67.873456,9.272183,209,1690,32.126544,11.005793,PanCan2b,Overweight,demo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Venkadesh,0.749085,-0.319846,Volume Zoom,0.905674,0.872976,0.939315,Sensation 16,0.896457,0.851294,0.928748,...,1204,22.128235,7.951070,96,719,13.787853,11.779141,DL_cal,ManufacturersModelName,scanner
de Haas Local,0.967376,0.0409,Volume Zoom,0.885999,0.846497,0.924116,Sensation 16,0.887243,0.849386,0.917924,...,1204,22.128235,7.951070,96,719,13.787853,11.779141,Thijmen_local_cal,ManufacturersModelName,scanner
de Haas Global (hidden nodule),0.951275,-0.061105,Volume Zoom,0.805651,0.769375,0.836721,Sensation 16,0.803406,0.767096,0.844879,...,1204,22.128235,7.951070,96,719,13.787853,11.779141,Thijmen_global_hidden_cal,ManufacturersModelName,scanner
de Haas Global (shown nodule),0.488925,-0.69202,Volume Zoom,0.872788,0.837737,0.9042,Sensation 16,0.85009,0.809006,0.882188,...,1204,22.128235,7.951070,96,719,13.787853,11.779141,Thijmen_global_show_cal,ManufacturersModelName,scanner


In [None]:
# Scan-level NLST analysis with tijmen=True and sybil=True; plotting is turned
# off for this run via plot=False.
nlst_preds, nlst_democols, MODELS = data.prep_nlst_preds(
    nlst_preds_nodule,
    nlst_democols,
    scanlevel=True,
    tijmen=True,
    sybil=True,
)
print(len(nlst_preds), " scans")

# Subgroup AUROC comparison; the CSV name carries the number of rows analysed.
scan_tijmen_auroc_csv = f"{RESULTS_DIR}/auroc-nlst-{len(nlst_preds)}.csv"
result_df = roc.all_results_subgroups_models(
    nlst_preds,
    nlst_democols['cat'],
    models=MODELS,
    csvpath=scan_tijmen_auroc_csv,
    plot=False,
)
result_df

## Load a result DataFrame

In [None]:
# Pick a previously saved result file by dataset name and row count; 5911 is
# the row count printed by the scan-level NLST run above ("5911  scans").
dataset_name = 'nlst'
dataset_len = 5911

# File naming mirrors the csvpath pattern used when the results were written.
filename = f"{RESULTS_DIR}/auroc-{dataset_name}-{dataset_len}.csv"
result_df = pd.read_csv(filename)
result_df