In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown

from utilities import data, roc, threshold, output
from utilities.info import *

In [None]:
# Directory that the ROC/threshold result CSVs below are written into.
# EXPERIMENT_DIR is not defined in this notebook — presumably it comes from
# `from utilities.info import *`; verify there.
RESULTS_DIR = f"{EXPERIMENT_DIR}/temp-results"
# Bootstrap count forwarded as `num_bootstraps` to the isolation helpers.
# NOTE(review): 2 looks like a quick-run setting — confirm before reporting
# confidence intervals computed with it.
NUM_BOOTSTRAPS = 2
# Echo the resolved input/output directories as the cell output
# (FILE_DIR is likewise defined outside this notebook).
FILE_DIR, RESULTS_DIR

('C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/files',
 'C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/temp-results')

In [3]:
# Raw table read from the CSV of NLST model outputs and demographics.
nlst_nodule = pd.read_csv(f"{FILE_DIR}/nlst_allmodels_demos.csv")

# Column grouping loaded from JSON. The `with` block closes the file on exit
# (also when json.load raises), so no explicit close() call is needed.
with open(f'{FILE_DIR}/nlst_democols.json') as json_data:
    nlst_demos_original = json.load(json_data)

# prep_nlst_preds returns the prepared frame, the demographic column grouping
# (a dict keyed by 'num' / 'cat', as shown in the output) and a model mapping.
# NOTE(review): the exact effect of scanlevel / sybil / bin_num is defined in
# utilities.data — confirm there.
nlst_data, nlst_demos, nlst_models = data.prep_nlst_preds(nlst_nodule, nlst_demos_original, scanlevel=True, sybil=True, bin_num=True)
print(len(nlst_data))
nlst_demos

5911


{'num': {'demo': ['weight', 'height', 'BMI', 'Age'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr'],
  'other': ['Diameter_mm', 'NoduleCounts']},
 'cat': {'demo': ['Age > 61',
   'Gender',
   'HighSchoolPlus',
   'Married',
   'NonHispanicWhite',
   'Overweight',
   'Unfinished_ed',
   'WhiteOrBlack',
   'educat',
   'ethnic',
   'height > 68',
   'marital',
   'race',
   'weight > 180'],
  'smoke': ['cigar',
   'cigsmok',
   'pipe',
   'pkyr > 55',
   'smokeage > 16',
   'smokeday > 25',
   'smokelive',
   'smokework',
   'smokeyr > 40'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'd

In [4]:
# Restrict the analysis to three models. This replaces the `nlst_models`
# mapping returned by data.prep_nlst_preds in the loading cell.
# Keys are the display names; values appear to be the score column names
# (they match the `col` field in the result tables) — verify against utilities.
nlst_models = dict([
    ('Venkadesh', 'DL_cal'),
    ('Sybil year 1', 'sybil_year1'),
    ('PanCan2b', 'PanCan2b'),
])

In [5]:
# Table of decision thresholds: one row per model, one column per policy
# (plus the Brock column because brock=True). The second return value is
# not used in this notebook.
nlst_policies, _ = threshold.get_threshold_policies(
    nlst_data,
    models=nlst_models,
    policies=THRESHOLD_POLICIES,
    brock=True,
)
nlst_policies

Unnamed: 0,Sensitivity=0.9,Specificity=0.9,Brock
Venkadesh,0.049,0.222,0.06
Sybil year 1,0.003,0.058,0.06
PanCan2b,0.015,0.165,0.06


In [6]:
def analyze_confounders(
        df=nlst_data, demos=nlst_demos, models=nlst_models,
        democol='Gender', demosavename='gender', plot_roc=False, plot_thres=False,
        policies=None):
    """Profile how the groups of ``democol`` differ, then run the ROC and threshold isolations.

    The first part displays, for the first two groups of ``democol`` (in
    ``groupby`` order), the categorical and numerical attributes sorted by the
    ``diff_<first>_<second>`` column, largest and smallest first. The second
    part delegates to ``roc.save_results_isolate_confounders`` and
    ``threshold.save_results_isolate_confounders``, which are given a CSV path
    under ``RESULTS_DIR``.

    Note that the defaults for ``df``, ``demos`` and ``models`` are bound when
    this cell is executed; re-run the cell after changing those globals.

    Args:
        df: Frame to analyse; must contain ``democol``.
        demos: Column grouping with ``'cat'`` and ``'num'`` entries. It is used
            for the descriptive tables as well as for the isolation analyses.
        models: Mapping of model display name to model identifier, passed
            through to the isolation helpers.
        democol: Column whose groups are compared.
        demosavename: Short name embedded in the output CSV file names.
        plot_roc: Forwarded as ``plot`` to the ROC isolation helper.
        plot_thres: Forwarded as ``plot`` to the threshold isolation helper.
        policies: Threshold table passed to the threshold isolation helper.
            ``None`` (the default) uses the notebook-level ``nlst_policies``.

    Returns:
        Tuple ``(roc_df, thres_df)`` holding the results of the two isolation
        helpers.

    Raises:
        ValueError: If ``democol`` yields fewer than two groups in ``df``.
    """
    # Looked up at call time so that re-running the policies cell takes effect.
    if policies is None:
        policies = nlst_policies

    split_groups = {key: group for key, group in df.groupby(democol)}
    group_keys = list(split_groups.keys())
    if len(group_keys) < 2:
        raise ValueError(
            f"'{democol}' has {len(group_keys)} group(s) in df; "
            "at least two are needed for a comparison.")
    # Only the first two groups are compared in the tables below.
    diff_col = f'diff_{group_keys[0]}_{group_keys[1]}'

    # Use the `demos` argument (not the notebook global) so that the tables
    # describe the same column set that the isolation analyses receive.
    cat_df = data.combine_diff_dfs(demos['cat'], data.diffs_category_prevalence, split_groups)
    # Keep only rows with a non-zero, non-missing category value.
    cat_df = cat_df.query('value != 0').dropna(subset=['value'], axis=0)

    display(Markdown("### Categorical Confounders"))
    display(cat_df.sort_values(by=diff_col, ascending=False).head(20))
    display(cat_df.sort_values(by=diff_col, ascending=True).head(20))

    display(Markdown("### Numerical Confounders"))
    num_df = data.combine_diff_dfs(demos['num'], data.diffs_numerical_means, split_groups)
    # Show only the two summary rows per attribute.
    num_summary = num_df[num_df['value'].isin(['Median (IQR)', 'Mean (SD)'])]
    display(num_summary.sort_values(by=diff_col, ascending=False).head(10))
    display(num_summary.sort_values(by=diff_col, ascending=True).head(10))

    # end='\r' lets the "done" message overwrite the progress line.
    print("ROC Isolations ...", end='\r')
    roc_df = roc.save_results_isolate_confounders(
        df, democol, demos['cat'], models,
        csvpath=f'{RESULTS_DIR}/auroc-{demosavename}-by-factors-nlst-{len(df)}.csv',
        plot=plot_roc, num_bootstraps=NUM_BOOTSTRAPS)
    print("ROC Isolations done!")

    print("Threshold Isolations ...", end='\r')
    thres_df = threshold.save_results_isolate_confounders(
        df, democol, demos['cat'], policies, models,
        csvpath=f'{RESULTS_DIR}/threshold-{demosavename}-by-factors-nlst-{len(df)}.csv',
        plot=plot_thres, num_bootstraps=NUM_BOOTSTRAPS)
    print("Threshold isolations done!")

    return roc_df, thres_df

## Gender

In [7]:
# Compare the groups of the 'Gender' column; threshold plots stay disabled.
roc_gender, thres_gender = analyze_confounders(
    democol='Gender',
    demosavename='gender',
    plot_thres=False,
)

### Categorical Confounders

Unnamed: 0,category,attribute,value,1_freq,1_norm,2_freq,2_norm,diff_1_2
2,demo,Gender,1.0,3441.0,100.0,0.0,0.0,100.0
29,demo,height > 68,1.0,2621.0,76.1697,146.0,5.9109,70.2588
44,demo,weight > 180,1.0,2226.0,64.6905,590.0,23.8866,40.8039
51,smoke,pipe,1.0,1256.0,36.501,45.0,1.8219,34.6791
46,smoke,cigar,1.0,1065.0,30.9503,83.0,3.3603,27.59
7,demo,Married,1.0,2669.0,77.5647,1305.0,52.834,24.7307
31,demo,marital,2.0,2669.0,77.5647,1305.0,52.834,24.7307
236,other,wrknomask,1.0,1184.0,34.4086,320.0,12.9555,21.4531
54,smoke,pkyr > 55,1.0,1699.0,49.3752,822.0,33.2794,16.0958
11,demo,Overweight,1.0,2538.0,73.7576,1474.0,59.6761,14.0815


Unnamed: 0,category,attribute,value,1_freq,1_norm,2_freq,2_norm,diff_1_2
3,demo,Gender,2.0,0.0,0.0,2470.0,100.0,-100.0
34,demo,marital,5.0,477.0,13.8622,646.0,26.1538,-12.2916
32,demo,marital,3.0,127.0,3.6908,380.0,15.3846,-11.6938
60,smoke,smokelive,1.0,2877.0,83.6094,2334.0,94.4939,-10.8845
143,disease,diagpneu,1.0,652.0,18.948,730.0,29.5547,-10.6067
56,smoke,smokeage > 16,1.0,1407.0,40.8893,1248.0,50.5263,-9.637
122,disease,diagchro,1.0,222.0,6.4516,397.0,16.0729,-9.6213
240,nodule,GroundGlassOpacity,1.0,818.0,23.7722,818.0,33.1174,-9.3452
110,disease,diagadas,1.0,117.0,3.4002,258.0,10.4453,-7.0451
19,demo,educat,3.0,768.0,22.3191,711.0,28.7854,-6.4663


### Numerical Confounders

Unnamed: 0,category,attribute,value,1,2,diff_1_2
3,demo,weight,Mean (SD),197.1 (35.0),160.7 (34.8),36.3716
4,demo,weight,Median (IQR),192 (46),157 (45),35.0
46,smoke,smokeday,Median (IQR),30 (20),20 (10),10.0
31,smoke,pkyr,Mean (SD),62.4 (27.1),52.6 (20.9),9.7587
32,smoke,pkyr,Median (IQR),55 (35),46 (22),9.0
11,demo,height,Median (IQR),71 (3),64 (3),7.0
10,demo,height,Mean (SD),70.5 (2.7),64.5 (2.6),5.9899
45,smoke,smokeday,Mean (SD),30.3 (12.3),26.3 (10.4),3.9389
53,smoke,smokeyr,Median (IQR),42 (10),40 (9),2.0
52,smoke,smokeyr,Mean (SD),41.6 (7.6),40.5 (7.0),1.0913


Unnamed: 0,category,attribute,value,1,2,diff_1_2
38,smoke,smokeage,Mean (SD),16.0 (3.4),17.1 (3.6),-1.083
39,smoke,smokeage,Median (IQR),16 (4),17 (4),-1.0
65,other,NoduleCounts,Median (IQR),1 (1),1 (1),0.0
64,other,NoduleCounts,Mean (SD),1.9 (1.3),1.9 (1.2),0.0014
60,other,Diameter_mm,Median (IQR),7 (5),7 (4),0.2
24,demo,Age,Mean (SD),63.4 (5.3),62.9 (5.1),0.5531
59,other,Diameter_mm,Mean (SD),9.3 (6.5),8.7 (5.2),0.6355
17,demo,BMI,Mean (SD),27.9 (4.4),27.1 (5.5),0.7129
18,demo,BMI,Median (IQR),27 (5),26 (7),0.888
25,demo,Age,Median (IQR),63 (8),62 (8),1.0


ROC Isolations ...

KeyboardInterrupt: 

## Race

In [None]:
# Compare the groups of the 'WhiteOrBlack' column; threshold plots stay disabled.
roc_race, thres_race = analyze_confounders(
    democol='WhiteOrBlack',
    demosavename='race',
    plot_thres=False,
)

### Categorical Confounders

Unnamed: 0,category,attribute,value,1.0_freq,1.0_norm,2.0_freq,2.0_norm,diff_1.0_2.0
14,demo,WhiteOrBlack,1.0,5523.0,100.0,0.0,0.0,100.0
35,demo,race,1.0,5523.0,100.0,0.0,0.0,100.0
9,demo,NonHispanicWhite,1.0,5430.0,98.3161,0.0,0.0,98.3161
30,demo,marital,2.0,3759.0,68.0608,75.0,39.8936,28.1672
7,demo,Married,1.0,3759.0,68.0608,75.0,39.8936,28.1672
52,smoke,smokeday > 25,1.0,2667.0,48.289,41.0,21.8085,26.4805
48,smoke,pkyr > 55,1.0,2397.0,43.4003,40.0,21.2766,22.1237
45,smoke,pipe,1.0,1259.0,22.7956,17.0,9.0426,13.753
2,demo,Gender,1.0,3221.0,58.3198,85.0,45.2128,13.107
28,demo,height > 68,1.0,2637.0,47.7458,71.0,37.766,9.9798


Unnamed: 0,category,attribute,value,1.0_freq,1.0_norm,2.0_freq,2.0_norm,diff_1.0_2.0
15,demo,WhiteOrBlack,2.0,0.0,0.0,188.0,100.0,-100.0
36,demo,race,2.0,0.0,0.0,188.0,100.0,-100.0
121,disease,diaghype,1.0,1813.0,32.8264,102.0,54.2553,-21.4289
43,smoke,cigsmok,1.0,2734.0,49.5021,129.0,68.617,-19.1149
13,demo,Unfinished_ed,1.0,1466.0,26.5435,79.0,42.0213,-15.4778
33,demo,marital,5.0,1015.0,18.3777,63.0,33.5106,-15.1329
3,demo,Gender,2.0,2302.0,41.6802,103.0,54.7872,-13.107
20,demo,educat,5.0,1200.0,21.7273,61.0,32.4468,-10.7195
109,disease,diagdiab,1.0,462.0,8.365,35.0,18.617,-10.252
50,smoke,smokeage > 16,1.0,2468.0,44.6859,99.0,52.6596,-7.9737


### Numerical Confounders

Unnamed: 0,category,attribute,value,1.0,2.0,diff_1.0_2.0
30,smoke,pkyr,75%,70.0,54.0,16.0
44,smoke,smokeday,75%,37.0,24.25,12.75
31,smoke,pkyr,Mean (SD),58.6 (25.0),48.6 (18.6),10.0402
33,smoke,pkyr,mean,58.6096,48.5694,10.0402
32,smoke,pkyr,Median (IQR),51 (29),43 (18),8.0
29,smoke,pkyr,50%,51.0,43.0,8.0
34,smoke,pkyr,std,25.0058,18.5518,6.454
45,smoke,smokeday,Mean (SD),28.8 (11.7),23.1 (7.9),5.7245
47,smoke,smokeday,mean,28.8309,23.1064,5.7245
43,smoke,smokeday,50%,25.0,20.0,5.0


Unnamed: 0,category,attribute,value,1.0,2.0,diff_1.0_2.0
6,demo,weight,std,38.9205,41.2818,-2.3613
3,demo,weight,Mean (SD),182.2 (38.9),183.7 (41.3),-1.5515
5,demo,weight,mean,182.1825,183.734,-1.5515
58,other,Diameter_mm,75%,10.5,12.05,-1.55
54,smoke,smokeyr,mean,41.1028,42.1383,-1.0355
52,smoke,smokeyr,Mean (SD),41.1 (7.4),42.1 (6.6),-1.0355
35,smoke,smokeage,25%,14.0,15.0,-1.0
49,smoke,smokeyr,25%,36.0,37.0,-1.0
36,smoke,smokeage,50%,16.0,17.0,-1.0
37,smoke,smokeage,75%,18.0,19.0,-1.0


ROC Isolations ...

  se = np.sqrt(
  se = np.sqrt(
  se = np.sqrt(
  se = np.sqrt(
  se = np.sqrt(
  se = np.sqrt(
  se = np.sqrt(
  se = np.sqrt(
  se = np.sqrt(


ROC Isolations done!
Threshold isolations done!


Unnamed: 0,p,z,Group_1,AUC_1,AUC-CI-lo_1,AUC-CI-hi_1,Group_2,AUC_2,AUC-CI-lo_2,AUC-CI-hi_2,...,Group_1_pct,Group_1_pct_mal,Group_2_mal,Group_2_ben,Group_2_pct,Group_2_pct_mal,col,filter_by,filter_val,category
Venkadesh,0.944684,0.069384,1.0,0.905566,0.888317,0.924462,2.0,0.909625,0.907197,0.912438,...,93.503480,8.395368,12,70,3.170920,14.634146,DL_cal,Age > 61,0.0,demo
de Haas Local,0.450898,0.753919,1.0,0.888868,0.885428,0.893977,2.0,0.929125,0.908458,0.955492,...,93.503480,8.395368,12,70,3.170920,14.634146,Thijmen_local_cal,Age > 61,0.0,demo
de Haas Global (hidden nodule),0.514560,-0.651753,1.0,0.800613,0.786387,0.815407,2.0,0.747271,0.664773,0.826866,...,93.503480,8.395368,12,70,3.170920,14.634146,Thijmen_global_hidden_cal,Age > 61,0.0,demo
de Haas Global (shown nodule),0.560550,-0.582025,1.0,0.855118,0.847355,0.863359,2.0,0.810875,0.775124,0.852273,...,93.503480,8.395368,12,70,3.170920,14.634146,Thijmen_global_show_cal,Age > 61,0.0,demo
Sybil year 1,0.656202,0.445163,1.0,0.837989,0.833856,0.842958,2.0,0.868263,0.838542,0.900995,...,93.503480,8.395368,12,70,3.170920,14.634146,sybil_year1,Age > 61,0.0,demo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
de Haas Local,0.498664,-0.676593,1.0,0.878687,0.876119,0.882025,2.0,0.847508,0.804663,0.890752,...,93.423729,9.433962,28,160,3.186441,14.893617,Thijmen_local_cal,Spiculation,False,nodule
de Haas Global (hidden nodule),0.355223,0.924505,1.0,0.803051,0.798046,0.808368,2.0,0.846142,0.820859,0.872713,...,93.423729,9.433962,28,160,3.186441,14.893617,Thijmen_global_hidden_cal,Spiculation,False,nodule
de Haas Global (shown nodule),0.920681,0.099576,1.0,0.856241,0.850842,0.861898,2.0,0.860708,0.852515,0.871443,...,93.423729,9.433962,28,160,3.186441,14.893617,Thijmen_global_show_cal,Spiculation,False,nodule
Sybil year 1,0.863189,0.172316,1.0,0.844076,0.834528,0.854253,2.0,0.851976,0.847434,0.858037,...,93.423729,9.433962,28,160,3.186441,14.893617,sybil_year1,Spiculation,False,nodule


## BMI

In [None]:
# Compare the groups of the boolean 'Overweight' column (the BMI split).
roc_bmi, thres_bmi = analyze_confounders(
    democol='Overweight',
    demosavename='bmi',
)

### Categorical Confounders

Unnamed: 0,category,attribute,value,False_freq,False_norm,True_freq,True_norm,diff_False_True
49,smoke,cigsmok,1.0,1183.0,62.2959,1798.0,44.8156,17.4803
3,demo,Gender,2.0,996.0,52.4487,1474.0,36.7398,15.7089
227,other,Emphysema,1.0,892.0,46.9721,1264.0,31.5055,15.4666
66,smoke,smokeyr > 40,1.0,1125.0,59.2417,2030.0,50.5982,8.6435
34,demo,marital,5.0,448.0,23.5914,675.0,16.8245,6.7669
56,smoke,smokeage > 16,1.0,910.0,47.92,1745.0,43.4945,4.4255
143,disease,diagpneu,1.0,501.0,26.3823,881.0,21.9591,4.4232
247,nodule,SemiSolid,1.0,290.0,15.2712,462.0,11.5155,3.7557
131,disease,diagemph,1.0,252.0,13.2701,383.0,9.5464,3.7237
251,nodule,NoduleInUpperLung,1.0,1031.0,54.2917,2030.0,50.5982,3.6935


Unnamed: 0,category,attribute,value,False_freq,False_norm,True_freq,True_norm,diff_False_True
11,demo,Overweight,1.0,0.0,0.0,4012.0,100.0,-100.0
44,demo,weight > 180,1.0,100.0,5.2659,2716.0,67.6969,-62.431
2,demo,Gender,1.0,903.0,47.5513,2538.0,63.2602,-15.7089
140,disease,diaghype,1.0,459.0,24.1706,1534.0,38.2353,-14.0647
29,demo,height > 68,1.0,751.0,39.5471,2016.0,50.2493,-10.7022
7,demo,Married,1.0,1152.0,60.6635,2822.0,70.339,-9.6755
31,demo,marital,2.0,1152.0,60.6635,2822.0,70.339,-9.6755
128,disease,diagdiab,1.0,68.0,3.5808,477.0,11.8893,-8.3085
51,smoke,pipe,1.0,314.0,16.535,987.0,24.6012,-8.0662
46,smoke,cigar,1.0,265.0,13.9547,883.0,22.009,-8.0543


### Numerical Confounders

Unnamed: 0,category,attribute,value,False,True,diff_False_True
51,smoke,smokeyr,75%,47.0,45.0,2.0
52,smoke,smokeyr,Mean (SD),42.3 (7.2),40.6 (7.4),1.7294
54,smoke,smokeyr,mean,42.3423,40.6129,1.7294
62,other,Diameter_mm,std,6.6986,5.6105,1.0881
58,other,Diameter_mm,75%,11.2,10.2,1.0
50,smoke,smokeyr,50%,42.0,41.0,1.0
49,smoke,smokeyr,25%,37.0,36.0,1.0
53,smoke,smokeyr,Median (IQR),42 (10),41 (9),1.0
25,demo,Age,Median (IQR),63 (8),62 (8),1.0
35,smoke,smokeage,25%,15.0,14.0,1.0


Unnamed: 0,category,attribute,value,False,True,diff_False_True
2,demo,weight,75%,162.0,220.0,-58.0
3,demo,weight,Mean (SD),145.7 (22.7),198.9 (33.4),-53.1974
5,demo,weight,mean,145.7365,198.9339,-53.1974
1,demo,weight,50%,145.0,195.0,-50.0
4,demo,weight,Median (IQR),145 (32),195 (45),-50.0
0,demo,weight,25%,130.0,175.0,-45.0
6,demo,weight,std,22.7088,33.419,-10.7102
44,smoke,smokeday,75%,30.0,40.0,-10.0
16,demo,BMI,75%,24.1029,32.075,-7.9721
17,demo,BMI,Mean (SD),22.5 (1.9),29.9 (4.1),-7.3951


ROC Isolations done!
Threshold isolations done!


Unnamed: 0,p,z,Group_1,AUC_1,AUC-CI-lo_1,AUC-CI-hi_1,Group_2,AUC_2,AUC-CI-lo_2,AUC-CI-hi_2,...,Group_1_pct,Group_1_pct_mal,Group_2_mal,Group_2_ben,Group_2_pct,Group_2_pct_mal,col,filter_by,filter_val,category
Venkadesh,0.22874,-1.203611,True,0.917784,0.912753,0.926002,False,0.882592,0.854259,0.911966,...,68.561485,8.178229,78,735,31.438515,9.594096,DL_cal,Age > 61,0.0,demo
de Haas Local,0.816013,-0.232676,True,0.887044,0.872174,0.902321,False,0.879916,0.874962,0.887563,...,68.561485,8.178229,78,735,31.438515,9.594096,Thijmen_local_cal,Age > 61,0.0,demo
de Haas Global (hidden nodule),0.458246,0.741739,True,0.806968,0.795267,0.819399,False,0.833185,0.827682,0.838561,...,68.561485,8.178229,78,735,31.438515,9.594096,Thijmen_global_hidden_cal,Age > 61,0.0,demo
de Haas Global (shown nodule),0.446495,-0.761271,True,0.858444,0.854458,0.863902,False,0.832461,0.82408,0.840712,...,68.561485,8.178229,78,735,31.438515,9.594096,Thijmen_global_show_cal,Age > 61,0.0,demo
Sybil year 1,0.87656,0.155332,True,0.830781,0.818938,0.843465,False,0.836169,0.820668,0.851884,...,68.561485,8.178229,78,735,31.438515,9.594096,sybil_year1,Age > 61,0.0,demo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
de Haas Local,0.231767,-1.19582,True,0.874237,0.872236,0.877085,False,0.849839,0.839835,0.860832,...,67.932203,9.206587,202,1690,32.067797,10.676533,Thijmen_local_cal,Spiculation,False,nodule
de Haas Global (hidden nodule),0.762175,-0.302625,True,0.801893,0.797336,0.807163,False,0.794954,0.785373,0.804471,...,67.932203,9.206587,202,1690,32.067797,10.676533,Thijmen_global_hidden_cal,Spiculation,False,nodule
de Haas Global (shown nodule),0.010949,-2.544334,True,0.86798,0.865156,0.871212,False,0.813165,0.802845,0.824276,...,67.932203,9.206587,202,1690,32.067797,10.676533,Thijmen_global_show_cal,Spiculation,False,nodule
Sybil year 1,0.006224,-2.735737,True,0.857643,0.844898,0.870926,False,0.797217,0.796382,0.798378,...,67.932203,9.206587,202,1690,32.067797,10.676533,sybil_year1,Spiculation,False,nodule
