In [2]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown

from utilities import data, roc, threshold, output
from utilities.info import *

In [3]:
# TEAMS_DIR and FILE_DIR are not defined in this notebook; presumably they come
# from the star import of utilities.info -- TODO confirm.
RESULTS_DIR = f"{TEAMS_DIR}/temp-results"
# Only referenced by the (currently commented-out) isolation calls in
# analyze_confounders; 2 looks like a quick smoke-test value -- TODO confirm
# before producing real results.
NUM_BOOTSTRAPS = 2
# Echo the resolved directories so the reader can check where files are read and written.
FILE_DIR, RESULTS_DIR

('C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/files',
 'C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/temp-results')

In [4]:
# Read the NLST CSV (the file name suggests model predictions plus demographics).
nlst_nodule = pd.read_csv(f"{FILE_DIR}/nlst_allmodels_demos.csv")

# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{FILE_DIR}/nlst_democols.json') as json_data:
    nlst_demos_original = json.load(json_data)

# prep_nlst_preds returns the prepared frame, the demographic-column dict and
# the model mapping that the rest of the notebook uses.
nlst_data, nlst_demos, nlst_models = data.prep_nlst_preds(
    nlst_nodule,
    nlst_demos_original,
    scanlevel=True,
    sybil=True,
    tijmen=False,
    bin_num=True,
)
print(len(nlst_data))
nlst_demos

5911


{'num': {'demo': ['weight', 'height', 'BMI', 'Age'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr'],
  'other': ['Diameter_mm', 'NoduleCounts']},
 'cat': {'demo': ['Age > 61',
   'Gender',
   'HighSchoolPlus',
   'Married',
   'NonHispanicWhite',
   'Overweight',
   'Unfinished_ed',
   'WhiteOrBlack',
   'educat',
   'ethnic',
   'height > 68',
   'marital',
   'race',
   'weight > 180'],
  'smoke': ['cigar',
   'cigsmok',
   'pipe',
   'pkyr > 55',
   'smokeage > 16',
   'smokeday > 25',
   'smokelive',
   'smokework',
   'smokeyr > 40'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'd

In [5]:
# Show the model mapping returned by data.prep_nlst_preds in the previous cell.
nlst_models

{'Venkadesh': 'DL_cal',
 'de Haas Local': 'Thijmen_local_cal',
 'de Haas Global (hidden nodule)': 'Thijmen_global_hidden_cal',
 'de Haas Global (shown nodule)': 'Thijmen_global_show_cal',
 'Sybil year 1': 'sybil_year1',
 'PanCan2b': 'PanCan2b'}

In [6]:
# Build the per-model policy table for the models above. THRESHOLD_POLICIES is not
# defined in this notebook (presumably from utilities.info -- TODO confirm).
# The second return value is discarded.
nlst_policies, _ = threshold.get_threshold_policies(nlst_data, models=nlst_models, policies=THRESHOLD_POLICIES, brock=True)
nlst_policies

Unnamed: 0,Sensitivity=0.9,Specificity=0.9,Brock
Venkadesh,0.049,0.222,0.06
de Haas Local,0.045,0.226,0.06
de Haas Global (hidden nodule),0.066,0.265,0.06
de Haas Global (shown nodule),0.073,0.312,0.06
Sybil year 1,0.003,0.058,0.06
PanCan2b,0.015,0.165,0.06


In [7]:
def analyze_confounders(
        df=nlst_data, demos=nlst_demos, models=nlst_models,
        democol='Gender', demosavename='gender', plot_roc=False, plot_thres=False):
    """Display how other attributes differ between the groups of ``democol``.

    The frame is split with ``df.groupby(democol)`` and the first two group
    keys are compared. Four tables are displayed: the 20 largest and 20
    smallest categorical differences, then the 10 largest and 10 smallest
    numerical differences (mean/SD and median/IQR rows only).

    Parameters
    ----------
    df : DataFrame to split; must contain ``democol``.
    demos : mapping with ``'cat'`` and ``'num'`` entries, each passed to
        ``data.combine_diff_dfs``.
    models, demosavename, plot_roc, plot_thres : only used by the ROC and
        threshold isolation calls, which are currently commented out.
    democol : name of the column whose groups are compared.

    Returns
    -------
    (roc_df, thres_df) : both are ``None`` while the isolation calls below
        stay commented out.

    Raises
    ------
    ValueError : if ``democol`` yields fewer than two groups.
    """
    split_groups = {k: v for k, v in df.groupby(democol)}
    sg = list(split_groups.keys())
    if len(sg) < 2:
        raise ValueError(
            f"Need at least two groups in '{democol}' to compare, found {len(sg)}.")
    # Difference column naming follows the pattern seen in combine_diff_dfs output.
    diff_col = f'diff_{sg[0]}_{sg[1]}'

    # Use the `demos` argument (not the notebook-level nlst_demos) so callers
    # can analyse a different set of columns.
    cat_df = data.combine_diff_dfs(demos['cat'], data.diffs_category_prevalence, split_groups)
    cat_df = cat_df.query('value != 0').dropna(subset=['value'])

    display(Markdown("### Categorical Confounders"))
    display(cat_df.sort_values(by=diff_col, ascending=False).head(20))
    display(cat_df.sort_values(by=diff_col, ascending=True).head(20))

    display(Markdown("### Numerical Confounders"))
    num_df = data.combine_diff_dfs(demos['num'], data.diffs_numerical_means, split_groups)
    # Keep only the two summary rows per attribute.
    num_summary = num_df[num_df['value'].isin(['Median (IQR)', 'Mean (SD)'])]
    display(num_summary.sort_values(by=diff_col, ascending=False).head(10))
    display(num_summary.sort_values(by=diff_col, ascending=True).head(10))

    # NOTE: the ROC isolation is disabled; the messages are still printed and
    # roc_df stays None.
    print("ROC Isolations ...", end='\r')
    roc_df = None
    # roc_df = roc.save_results_isolate_confounders(
    #     df, democol, demos['cat'], models, 
    #     csvpath=f'{RESULTS_DIR}/auroc-{demosavename}-by-factors-nlst-{len(df)}.csv',
    #     plot=plot_roc, num_bootstraps=NUM_BOOTSTRAPS)
    print("ROC Isolations done!")

    # NOTE: the threshold isolation is disabled as well; thres_df stays None.
    print("Threshold Isolations ...", end='\r')
    thres_df = None
    # thres_df = threshold.save_results_isolate_confounders(
    #     df, democol, demos['cat'], nlst_policies, models, 
    #     csvpath=f'{RESULTS_DIR}/threshold-{demosavename}-by-factors-nlst-{len(df)}.csv', 
    #     plot=plot_thres, num_bootstraps=NUM_BOOTSTRAPS)
    print("Threshold isolations done!")

    return roc_df, thres_df

## Gender

In [8]:
# Compare attribute distributions between the groups of the 'Gender' column.
# Both returned values are None while the isolation calls inside
# analyze_confounders remain commented out.
roc_gender, thres_gender = analyze_confounders(democol='Gender', demosavename='gender', plot_thres=False)

### Categorical Confounders

Unnamed: 0,category,attribute,value,1_freq,1_norm,2_freq,2_norm,diff_1_2
2,demo,Gender,1.0,3441.0,100.0,0.0,0.0,100.0
29,demo,height > 68,1.0,2621.0,76.1697,146.0,5.9109,70.2588
44,demo,weight > 180,1.0,2226.0,64.6905,590.0,23.8866,40.8039
51,smoke,pipe,1.0,1256.0,36.501,45.0,1.8219,34.6791
46,smoke,cigar,1.0,1065.0,30.9503,83.0,3.3603,27.59
7,demo,Married,1.0,2669.0,77.5647,1305.0,52.834,24.7307
31,demo,marital,2.0,2669.0,77.5647,1305.0,52.834,24.7307
236,other,wrknomask,1.0,1184.0,34.4086,320.0,12.9555,21.4531
54,smoke,pkyr > 55,1.0,1699.0,49.3752,822.0,33.2794,16.0958
11,demo,Overweight,1.0,2538.0,73.7576,1474.0,59.6761,14.0815


Unnamed: 0,category,attribute,value,1_freq,1_norm,2_freq,2_norm,diff_1_2
3,demo,Gender,2.0,0.0,0.0,2470.0,100.0,-100.0
34,demo,marital,5.0,477.0,13.8622,646.0,26.1538,-12.2916
32,demo,marital,3.0,127.0,3.6908,380.0,15.3846,-11.6938
60,smoke,smokelive,1.0,2877.0,83.6094,2334.0,94.4939,-10.8845
143,disease,diagpneu,1.0,652.0,18.948,730.0,29.5547,-10.6067
56,smoke,smokeage > 16,1.0,1407.0,40.8893,1248.0,50.5263,-9.637
122,disease,diagchro,1.0,222.0,6.4516,397.0,16.0729,-9.6213
240,nodule,GroundGlassOpacity,1.0,818.0,23.7722,818.0,33.1174,-9.3452
110,disease,diagadas,1.0,117.0,3.4002,258.0,10.4453,-7.0451
19,demo,educat,3.0,768.0,22.3191,711.0,28.7854,-6.4663


### Numerical Confounders

Unnamed: 0,category,attribute,value,1,2,diff_1_2
3,demo,weight,Mean (SD),197.1 (35.0),160.7 (34.8),36.3716
4,demo,weight,Median (IQR),192 (46),157 (45),35.0
46,smoke,smokeday,Median (IQR),30 (20),20 (10),10.0
31,smoke,pkyr,Mean (SD),62.4 (27.1),52.6 (20.9),9.7587
32,smoke,pkyr,Median (IQR),55 (35),46 (22),9.0
11,demo,height,Median (IQR),71 (3),64 (3),7.0
10,demo,height,Mean (SD),70.5 (2.7),64.5 (2.6),5.9899
45,smoke,smokeday,Mean (SD),30.3 (12.3),26.3 (10.4),3.9389
53,smoke,smokeyr,Median (IQR),42 (10),40 (9),2.0
52,smoke,smokeyr,Mean (SD),41.6 (7.6),40.5 (7.0),1.0913


Unnamed: 0,category,attribute,value,1,2,diff_1_2
38,smoke,smokeage,Mean (SD),16.0 (3.4),17.1 (3.6),-1.083
39,smoke,smokeage,Median (IQR),16 (4),17 (4),-1.0
65,other,NoduleCounts,Median (IQR),1 (1),1 (1),0.0
64,other,NoduleCounts,Mean (SD),1.9 (1.3),1.9 (1.2),0.0014
60,other,Diameter_mm,Median (IQR),7 (5),7 (4),0.2
24,demo,Age,Mean (SD),63.4 (5.3),62.9 (5.1),0.5531
59,other,Diameter_mm,Mean (SD),9.3 (6.5),8.7 (5.2),0.6355
17,demo,BMI,Mean (SD),27.9 (4.4),27.1 (5.5),0.7129
18,demo,BMI,Median (IQR),27 (5),26 (7),0.888
25,demo,Age,Median (IQR),63 (8),62 (8),1.0


ROC Isolations done!
Threshold isolations done!


## Race

In [9]:
# Compare attribute distributions between the groups of the 'WhiteOrBlack' column.
# Both returned values are None while the isolation calls inside
# analyze_confounders remain commented out.
roc_race, thres_race = analyze_confounders(democol='WhiteOrBlack', demosavename='race', plot_thres=False)

### Categorical Confounders

Unnamed: 0,category,attribute,value,1.0_freq,1.0_norm,2.0_freq,2.0_norm,diff_1.0_2.0
14,demo,WhiteOrBlack,1.0,5523.0,100.0,0.0,0.0,100.0
35,demo,race,1.0,5523.0,100.0,0.0,0.0,100.0
9,demo,NonHispanicWhite,1.0,5430.0,98.3161,0.0,0.0,98.3161
30,demo,marital,2.0,3759.0,68.0608,75.0,39.8936,28.1672
7,demo,Married,1.0,3759.0,68.0608,75.0,39.8936,28.1672
52,smoke,smokeday > 25,1.0,2667.0,48.289,41.0,21.8085,26.4805
48,smoke,pkyr > 55,1.0,2397.0,43.4003,40.0,21.2766,22.1237
45,smoke,pipe,1.0,1259.0,22.7956,17.0,9.0426,13.753
2,demo,Gender,1.0,3221.0,58.3198,85.0,45.2128,13.107
28,demo,height > 68,1.0,2637.0,47.7458,71.0,37.766,9.9798


Unnamed: 0,category,attribute,value,1.0_freq,1.0_norm,2.0_freq,2.0_norm,diff_1.0_2.0
15,demo,WhiteOrBlack,2.0,0.0,0.0,188.0,100.0,-100.0
36,demo,race,2.0,0.0,0.0,188.0,100.0,-100.0
121,disease,diaghype,1.0,1813.0,32.8264,102.0,54.2553,-21.4289
43,smoke,cigsmok,1.0,2734.0,49.5021,129.0,68.617,-19.1149
13,demo,Unfinished_ed,1.0,1466.0,26.5435,79.0,42.0213,-15.4778
33,demo,marital,5.0,1015.0,18.3777,63.0,33.5106,-15.1329
3,demo,Gender,2.0,2302.0,41.6802,103.0,54.7872,-13.107
20,demo,educat,5.0,1200.0,21.7273,61.0,32.4468,-10.7195
109,disease,diagdiab,1.0,462.0,8.365,35.0,18.617,-10.252
50,smoke,smokeage > 16,1.0,2468.0,44.6859,99.0,52.6596,-7.9737


### Numerical Confounders

Unnamed: 0,category,attribute,value,1.0,2.0,diff_1.0_2.0
31,smoke,pkyr,Mean (SD),58.6 (25.0),48.6 (18.6),10.0402
32,smoke,pkyr,Median (IQR),51 (29),43 (18),8.0
45,smoke,smokeday,Mean (SD),28.8 (11.7),23.1 (7.9),5.7245
46,smoke,smokeday,Median (IQR),25 (17),20 (4),5.0
11,demo,height,Median (IQR),68 (6),67 (7),1.0
24,demo,Age,Mean (SD),63.2 (5.3),62.7 (4.9),0.5507
10,demo,height,Mean (SD),68.0 (4.0),67.6 (4.0),0.4392
64,other,NoduleCounts,Mean (SD),1.9 (1.3),1.8 (1.1),0.1098
65,other,NoduleCounts,Median (IQR),1 (1),1 (1),0.0
4,demo,weight,Median (IQR),180 (50),180 (47),0.0


Unnamed: 0,category,attribute,value,1.0,2.0,diff_1.0_2.0
3,demo,weight,Mean (SD),182.2 (38.9),183.7 (41.3),-1.5515
52,smoke,smokeyr,Mean (SD),41.1 (7.4),42.1 (6.6),-1.0355
39,smoke,smokeage,Median (IQR),16 (4),17 (4),-1.0
53,smoke,smokeyr,Median (IQR),41 (10),42 (9),-1.0
59,other,Diameter_mm,Mean (SD),9.0 (5.9),9.9 (6.8),-0.8664
18,demo,BMI,Median (IQR),26 (6),27 (6),-0.7401
17,demo,BMI,Mean (SD),27.6 (4.9),28.3 (5.8),-0.7176
38,smoke,smokeage,Mean (SD),16.5 (3.5),17.2 (3.5),-0.6908
60,other,Diameter_mm,Median (IQR),7 (5),7 (6),-0.65
25,demo,Age,Median (IQR),62 (8),62 (7),-0.5


ROC Isolations done!
Threshold isolations done!


## BMI

In [10]:
# Compare attribute distributions between the groups of the 'Overweight' column
# (plot_thres is left at its default of False here).
# Both returned values are None while the isolation calls inside
# analyze_confounders remain commented out.
roc_bmi, thres_bmi = analyze_confounders(democol='Overweight', demosavename='bmi')

### Categorical Confounders

Unnamed: 0,category,attribute,value,False_freq,False_norm,True_freq,True_norm,diff_False_True
49,smoke,cigsmok,1.0,1183.0,62.2959,1798.0,44.8156,17.4803
3,demo,Gender,2.0,996.0,52.4487,1474.0,36.7398,15.7089
227,other,Emphysema,1.0,892.0,46.9721,1264.0,31.5055,15.4666
66,smoke,smokeyr > 40,1.0,1125.0,59.2417,2030.0,50.5982,8.6435
34,demo,marital,5.0,448.0,23.5914,675.0,16.8245,6.7669
56,smoke,smokeage > 16,1.0,910.0,47.92,1745.0,43.4945,4.4255
143,disease,diagpneu,1.0,501.0,26.3823,881.0,21.9591,4.4232
247,nodule,SemiSolid,1.0,290.0,15.2712,462.0,11.5155,3.7557
131,disease,diagemph,1.0,252.0,13.2701,383.0,9.5464,3.7237
251,nodule,NoduleInUpperLung,1.0,1031.0,54.2917,2030.0,50.5982,3.6935


Unnamed: 0,category,attribute,value,False_freq,False_norm,True_freq,True_norm,diff_False_True
11,demo,Overweight,1.0,0.0,0.0,4012.0,100.0,-100.0
44,demo,weight > 180,1.0,100.0,5.2659,2716.0,67.6969,-62.431
2,demo,Gender,1.0,903.0,47.5513,2538.0,63.2602,-15.7089
140,disease,diaghype,1.0,459.0,24.1706,1534.0,38.2353,-14.0647
29,demo,height > 68,1.0,751.0,39.5471,2016.0,50.2493,-10.7022
7,demo,Married,1.0,1152.0,60.6635,2822.0,70.339,-9.6755
31,demo,marital,2.0,1152.0,60.6635,2822.0,70.339,-9.6755
128,disease,diagdiab,1.0,68.0,3.5808,477.0,11.8893,-8.3085
51,smoke,pipe,1.0,314.0,16.535,987.0,24.6012,-8.0662
46,smoke,cigar,1.0,265.0,13.9547,883.0,22.009,-8.0543


### Numerical Confounders

Unnamed: 0,category,attribute,value,False,True,diff_False_True
52,smoke,smokeyr,Mean (SD),42.3 (7.2),40.6 (7.4),1.7294
25,demo,Age,Median (IQR),63 (8),62 (8),1.0
53,smoke,smokeyr,Median (IQR),42 (10),41 (9),1.0
59,other,Diameter_mm,Mean (SD),9.5 (6.7),8.8 (5.6),0.6977
38,smoke,smokeage,Mean (SD),16.7 (3.6),16.4 (3.5),0.3161
60,other,Diameter_mm,Median (IQR),7 (5),7 (4),0.3
24,demo,Age,Mean (SD),63.3 (5.3),63.1 (5.2),0.231
64,other,NoduleCounts,Mean (SD),1.9 (1.2),1.9 (1.3),0.0147
39,smoke,smokeage,Median (IQR),16 (3),16 (4),0.0
65,other,NoduleCounts,Median (IQR),1 (1),1 (1),0.0


Unnamed: 0,category,attribute,value,False,True,diff_False_True
3,demo,weight,Mean (SD),145.7 (22.7),198.9 (33.4),-53.1974
4,demo,weight,Median (IQR),145 (32),195 (45),-50.0
17,demo,BMI,Mean (SD),22.5 (1.9),29.9 (4.1),-7.3951
18,demo,BMI,Median (IQR),22 (2),29 (5),-6.0586
46,smoke,smokeday,Median (IQR),20 (10),25 (20),-5.0
11,demo,height,Median (IQR),67 (6),69 (5),-2.0
45,smoke,smokeday,Mean (SD),27.4 (11.3),29.2 (11.8),-1.8699
31,smoke,pkyr,Mean (SD),57.4 (24.5),58.7 (25.4),-1.318
10,demo,height,Mean (SD),67.3 (4.0),68.3 (3.9),-1.0464
32,smoke,pkyr,Median (IQR),50 (29),51 (29),-1.0


ROC Isolations done!
Threshold isolations done!


## Age

In [11]:
# Compare attribute distributions between the groups of the 'Age > 61' column.
# Both returned values are None while the isolation calls inside
# analyze_confounders remain commented out.
roc_age, thres_age = analyze_confounders(democol='Age > 61', demosavename='age', plot_thres=False) 

### Categorical Confounders

Unnamed: 0,category,attribute,value,False_freq,False_norm,True_freq,True_norm,diff_False_True
49,smoke,cigsmok,1.0,1480.0,57.2312,1501.0,45.1429,12.0883
58,smoke,smokeday > 25,1.0,1300.0,50.2707,1492.0,44.8722,5.3985
34,demo,marital,5.0,559.0,21.6164,564.0,16.9624,4.654
21,demo,educat,5.0,629.0,24.3233,672.0,20.2105,4.1128
240,nodule,GroundGlassOpacity,1.0,773.0,29.8917,863.0,25.9549,3.9368
3,demo,Gender,2.0,1125.0,43.5035,1345.0,40.4511,3.0524
60,smoke,smokelive,1.0,2324.0,89.8685,2887.0,86.8271,3.0414
22,demo,educat,6.0,437.0,16.8987,476.0,14.3158,2.5829
119,disease,diagchas,1.0,118.0,4.563,67.0,2.015,2.548
63,smoke,smokework,1.0,2286.0,88.3991,2855.0,85.8647,2.5344


Unnamed: 0,category,attribute,value,False_freq,False_norm,True_freq,True_norm,diff_False_True
1,demo,Age > 61,1.0,0.0,0.0,3325.0,100.0,-100.0
66,smoke,smokeyr > 40,1.0,868.0,33.5654,2287.0,68.782,-35.2166
140,disease,diaghype,1.0,758.0,29.3117,1235.0,37.1429,-7.8312
226,other,Diameter_mm > 6,1.0,1537.0,59.4354,2212.0,66.5263,-7.0909
56,smoke,smokeage > 16,1.0,1059.0,40.9513,1596.0,48.0,-7.0487
228,other,Emphysema,1.0,845.0,32.6759,1311.0,39.4286,-6.7527
32,demo,marital,3.0,125.0,4.8337,382.0,11.4887,-6.655
137,disease,diaghear,1.0,257.0,9.9381,545.0,16.391,-6.4529
54,smoke,pkyr > 55,1.0,1012.0,39.1338,1509.0,45.3835,-6.2497
131,disease,diagemph,1.0,195.0,7.5406,440.0,13.2331,-5.6925


### Numerical Confounders

Unnamed: 0,category,attribute,value,False,True,diff_False_True
46,smoke,smokeday,Median (IQR),30 (15),22 (15),8.0
3,demo,weight,Mean (SD),183.7 (40.8),180.5 (38.0),3.1649
45,smoke,smokeday,Mean (SD),29.1 (11.4),28.3 (11.9),0.7708
18,demo,BMI,Median (IQR),27 (6),26 (5),0.474
17,demo,BMI,Mean (SD),27.8 (5.2),27.4 (4.7),0.4276
10,demo,height,Mean (SD),68.0 (4.0),68.0 (4.0),0.0565
11,demo,height,Median (IQR),68 (6),68 (6),0.0
4,demo,weight,Median (IQR),180 (55),180 (48),0.0
39,smoke,smokeage,Median (IQR),16 (4),16 (3),0.0
65,other,NoduleCounts,Mean (SD),1.8 (1.2),2.0 (1.3),-0.1323


Unnamed: 0,category,attribute,value,False,True,diff_False_True
24,demo,Age,Mean (SD),58.3 (1.8),67.0 (3.7),-8.6547
25,demo,Age,Median (IQR),58 (3),66 (6),-8.0
31,smoke,pkyr,Mean (SD),54.1 (22.0),61.6 (26.9),-7.4457
53,smoke,smokeyr,Median (IQR),38 (8),45 (10),-6.5
52,smoke,smokeyr,Mean (SD),37.6 (5.8),43.9 (7.3),-6.325
32,smoke,pkyr,Median (IQR),46 (25),52 (32),-6.25
66,other,NoduleCounts,Median (IQR),1 (1),2 (1),-1.0
59,other,Diameter_mm,Mean (SD),8.6 (5.7),9.4 (6.2),-0.7442
60,other,Diameter_mm,Median (IQR),6 (4),7 (5),-0.7
38,smoke,smokeage,Mean (SD),16.2 (3.2),16.7 (3.7),-0.4957


ROC Isolations done!
Threshold isolations done!
