In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown

from utilities import data, roc, threshold, output
from utilities.info import *

In [2]:
# Input files and intermediate result files both live under TEAMS_DIR
# (TEAMS_DIR is not defined in this notebook; presumably it comes from utilities.info).
FILE_DIR = "{}/files".format(TEAMS_DIR)
RESULTS_DIR = "{}/temp-results".format(TEAMS_DIR)
FILE_DIR, RESULTS_DIR

('C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/files',
 'C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/temp-results')

In [3]:
# Passed as `num_bootstraps` to the ROC and threshold isolation helpers further down.
# NOTE(review): 2 looks like a quick-run setting -- confirm the value before
# relying on the reported confidence intervals.
NUM_BOOTSTRAPS = 2
# Not defined in this notebook; presumably provided by `from utilities.info import *`.
THRESHOLD_POLICIES

(('Sensitivity', 0.9), ('Specificity', 0.9))

In [4]:
# Load the model predictions joined with the demographic columns.
nlst_nodule = pd.read_csv(f"{FILE_DIR}/nlst_allmodels_demos.csv")

# Load the demographic column grouping. The `with` block closes the file on
# exit, so no explicit close() call is needed.
with open(f'{FILE_DIR}/nlst_democols.json') as json_data:
    nlst_demos_original = json.load(json_data)

# Build the analysis frame plus the demographic-column and model mappings
# used by every later cell.
nlst_data, nlst_demos, nlst_models = data.prep_nlst_preds(
    nlst_nodule,
    nlst_demos_original,
    scanlevel=True,
    sybil=True,
    tijmen=False,
    bin_num=True,
)
print(len(nlst_data))
nlst_demos

5911


{'num': {'demo': ['weight', 'height', 'BMI', 'Age'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr'],
  'other': ['Diameter_mm', 'NoduleCounts']},
 'cat': {'demo': ['Age > 61',
   'Gender',
   'HighSchoolPlus',
   'Married',
   'NonHispanicWhite',
   'Overweight',
   'Unfinished_ed',
   'WhiteOrBlack',
   'educat',
   'ethnic',
   'height > 68',
   'marital',
   'race',
   'weight > 180'],
  'smoke': ['cigar',
   'cigsmok',
   'pipe',
   'pkyr > 55',
   'smokeage > 16',
   'smokeday > 25',
   'smokelive',
   'smokework',
   'smokeyr > 40'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'd

In [5]:
# Mapping returned by data.prep_nlst_preds: model display name -> identifier string
# (presumably the prediction column in nlst_data -- TODO confirm).
nlst_models

{'Venkadesh': 'DL_cal',
 'de Haas Local': 'Thijmen_local_cal',
 'de Haas Global (hidden nodule)': 'Thijmen_global_hidden_cal',
 'de Haas Global (shown nodule)': 'Thijmen_global_show_cal',
 'Sybil year 1': 'sybil_year1',
 'PanCan2b': 'PanCan2b'}

In [6]:
# Thresholds per model for every policy in THRESHOLD_POLICIES (plus the Brock
# column requested via brock=True). The second return value is not used here.
nlst_policies, _ = threshold.get_threshold_policies(
    nlst_data,
    models=nlst_models,
    policies=THRESHOLD_POLICIES,
    brock=True,
)
nlst_policies

Unnamed: 0,Sensitivity=0.9,Specificity=0.9,Brock
Venkadesh,0.049,0.222,0.06
de Haas Local,0.045,0.226,0.06
de Haas Global (hidden nodule),0.066,0.265,0.06
de Haas Global (shown nodule),0.073,0.312,0.06
Sybil year 1,0.003,0.058,0.06
PanCan2b,0.015,0.165,0.06


In [None]:
def analyze_confounders(
        df=nlst_data, demos=nlst_demos, models=nlst_models,
        democol='Gender', demosavename='gender', plot_roc=False, plot_thres=False,
        policies=None):
    """Show how the subgroups of ``democol`` differ, then run the isolation analyses.

    The function displays, for the first two subgroups of ``democol``, the
    largest positive and negative differences in categorical and numerical
    attributes, and then calls the ROC and threshold
    ``save_results_isolate_confounders`` helpers.

    Parameters
    ----------
    df : DataFrame
        Data to analyse; grouped by ``democol``.
    demos : dict
        Attribute grouping with at least the keys ``'cat'`` and ``'num'``.
        Both summary tables and both isolation helpers read from this argument.
    models : dict
        Model mapping forwarded to the ROC and threshold helpers.
    democol : str
        Column of ``df`` whose values define the subgroups.
    demosavename : str
        Short label inserted into the result CSV file names.
    plot_roc, plot_thres : bool
        Forwarded as ``plot`` to the ROC / threshold helper respectively.
    policies : optional
        Threshold policies forwarded to the threshold helper. When ``None``
        (the default) the notebook-level ``nlst_policies`` is used.

    Returns
    -------
    tuple
        ``(roc_df, thres_df)`` exactly as returned by the two helpers.

    Raises
    ------
    ValueError
        If ``democol`` yields fewer than two groups, since the difference
        column cannot be named in that case.
    """
    if policies is None:
        # Resolved at call time so a recomputed nlst_policies is picked up
        # without re-running this definition cell.
        policies = nlst_policies

    split_groups = {k: v for k, v in df.groupby(democol)}
    sg = list(split_groups.keys())
    if len(sg) < 2:
        raise ValueError(
            f"'{democol}' must split the data into at least two groups; found {len(sg)}.")
    # The comparison tables are sorted on the difference between the first two groups.
    diff_col = f'diff_{sg[0]}_{sg[1]}'

    # Use the `demos` argument (not the notebook-level nlst_demos) so that the
    # summary tables match what the isolation helpers below receive.
    cat_df = data.combine_diff_dfs(demos['cat'], data.diffs_category_prevalence, split_groups)
    cat_df = cat_df.query('value != 0').dropna(subset=['value'])

    display(Markdown("### Categorical Confounders"))
    display(cat_df.sort_values(by=diff_col, ascending=False).head(20))
    display(cat_df.sort_values(by=diff_col, ascending=True).head(20))

    display(Markdown("### Numerical Confounders"))
    num_df = data.combine_diff_dfs(demos['num'], data.diffs_numerical_means, split_groups)
    display(num_df.sort_values(by=diff_col, ascending=False).head(10))
    display(num_df.sort_values(by=diff_col, ascending=True).head(10))

    # end='\r' lets the following "done" message overwrite the progress line.
    print("ROC Isolations ...", end='\r')
    roc_df = roc.save_results_isolate_confounders(
        df, democol, demos['cat'], models,
        csvpath=f'{RESULTS_DIR}/auroc-{demosavename}-by-factors-nlst-{len(df)}.csv',
        plot=plot_roc, num_bootstraps=NUM_BOOTSTRAPS)
    print("ROC Isolations done!")

    print("Threshold Isolations ...", end='\r')
    thres_df = threshold.save_results_isolate_confounders(
        df, democol, demos['cat'], policies, models,
        csvpath=f'{RESULTS_DIR}/threshold-{demosavename}-by-factors-nlst-{len(df)}.csv',
        plot=plot_thres, num_bootstraps=NUM_BOOTSTRAPS)
    print("Threshold isolations done!")

    return roc_df, thres_df

## Gender

In [None]:
# Gender subgroups; threshold plots enabled, ROC plots left at the default (off).
roc_gender, thres_gender = analyze_confounders(
    democol='Gender',
    demosavename='gender',
    plot_thres=True,
)

### Categorical Confounders

Unnamed: 0,category,attribute,value,1_freq,1_norm,2_freq,2_norm,diff_1_2
2,demo,Gender,1.0,3441.0,100.0,0.0,0.0,100.0
29,demo,height > 68,1.0,2621.0,76.1697,146.0,5.9109,70.2588
44,demo,weight > 180,1.0,2226.0,64.6905,590.0,23.8866,40.8039
51,smoke,pipe,1.0,1256.0,36.501,45.0,1.8219,34.6791
46,smoke,cigar,1.0,1065.0,30.9503,83.0,3.3603,27.59
7,demo,Married,1.0,2669.0,77.5647,1305.0,52.834,24.7307
31,demo,marital,2.0,2669.0,77.5647,1305.0,52.834,24.7307
236,other,wrknomask,1.0,1184.0,34.4086,320.0,12.9555,21.4531
54,smoke,pkyr > 55,1.0,1699.0,49.3752,822.0,33.2794,16.0958
11,demo,Overweight,1.0,2538.0,73.7576,1474.0,59.6761,14.0815


Unnamed: 0,category,attribute,value,1_freq,1_norm,2_freq,2_norm,diff_1_2
3,demo,Gender,2.0,0.0,0.0,2470.0,100.0,-100.0
34,demo,marital,5.0,477.0,13.8622,646.0,26.1538,-12.2916
32,demo,marital,3.0,127.0,3.6908,380.0,15.3846,-11.6938
60,smoke,smokelive,1.0,2877.0,83.6094,2334.0,94.4939,-10.8845
143,disease,diagpneu,1.0,652.0,18.948,730.0,29.5547,-10.6067
56,smoke,smokeage > 16,1.0,1407.0,40.8893,1248.0,50.5263,-9.637
122,disease,diagchro,1.0,222.0,6.4516,397.0,16.0729,-9.6213
240,nodule,GroundGlassOpacity,1.0,818.0,23.7722,818.0,33.1174,-9.3452
110,disease,diagadas,1.0,117.0,3.4002,258.0,10.4453,-7.0451
19,demo,educat,3.0,768.0,22.3191,711.0,28.7854,-6.4663


### Numerical Confounders

Unnamed: 0,category,attribute,value,1,2,diff_1_2
2,demo,weight,75%,218.0,180.0,38.0
0,demo,weight,25%,172.0,135.0,37.0
3,demo,weight,Mean (SD),197.1 (35.0),160.7 (34.8),36.3716
5,demo,weight,mean,197.1118,160.7402,36.3716
1,demo,weight,50%,192.0,157.0,35.0
4,demo,weight,Median (IQR),192 (46),157 (45),35.0
30,smoke,pkyr,75%,77.5,61.5,16.0
43,smoke,smokeday,50%,30.0,20.0,10.0
44,smoke,smokeday,75%,40.0,30.0,10.0
46,smoke,smokeday,Median (IQR),30 (20),20 (10),10.0


Unnamed: 0,category,attribute,value,1,2,diff_1_2
20,demo,BMI,std,4.4279,5.5428,-1.1149
38,smoke,smokeage,Mean (SD),16.0 (3.4),17.1 (3.6),-1.083
40,smoke,smokeage,mean,16.0401,17.1231,-1.083
36,smoke,smokeage,50%,16.0,17.0,-1.0
35,smoke,smokeage,25%,14.0,15.0,-1.0
39,smoke,smokeage,Median (IQR),16 (4),17 (4),-1.0
37,smoke,smokeage,75%,18.0,19.0,-1.0
41,smoke,smokeage,std,3.3902,3.578,-0.1878
42,smoke,smokeday,25%,20.0,20.0,0.0
21,demo,Age,25%,59.0,59.0,0.0


ROC Isolations done!
Threshold isolations done!


Unnamed: 0,p,z,Group_1,AUC_1,AUC-CI-lo_1,AUC-CI-hi_1,Group_2,AUC_2,AUC-CI-lo_2,AUC-CI-hi_2,...,Group_1_pct,Group_1_pct_mal,Group_2_mal,Group_2_ben,Group_2_pct,Group_2_pct_mal,col,filter_by,filter_val,category
Venkadesh,0.519877,-0.643535,1.0,0.908955,0.884559,0.935383,2.0,0.891310,0.871964,0.921738,...,56.496520,7.871321,108,1017,43.503480,9.60000,DL_cal,Age > 61,0.0,demo
de Haas Local,0.721480,0.356482,1.0,0.874786,0.838950,0.904574,2.0,0.885287,0.870441,0.920113,...,56.496520,7.871321,108,1017,43.503480,9.60000,Thijmen_local_cal,Age > 61,0.0,demo
de Haas Global (hidden nodule),0.028325,-2.192759,1.0,0.849653,0.828740,0.875709,2.0,0.774529,0.729972,0.799704,...,56.496520,7.871321,108,1017,43.503480,9.60000,Thijmen_global_hidden_cal,Age > 61,0.0,demo
de Haas Global (shown nodule),0.420301,-0.805899,1.0,0.863675,0.847727,0.899981,2.0,0.837937,0.816790,0.865408,...,56.496520,7.871321,108,1017,43.503480,9.60000,Thijmen_global_show_cal,Age > 61,0.0,demo
Sybil year 1,0.002247,3.055440,1.0,0.799464,0.769397,0.839103,2.0,0.896069,0.867745,0.916460,...,56.496520,7.871321,108,1017,43.503480,9.60000,sybil_year1,Age > 61,0.0,demo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
de Haas Local,0.376950,0.883530,1.0,0.867683,0.843868,0.892168,2.0,0.884276,0.867528,0.910229,...,58.220339,9.665211,239,2226,41.779661,9.69574,Thijmen_local_cal,Spiculation,False,nodule
de Haas Global (hidden nodule),0.085238,-1.721071,1.0,0.815428,0.783724,0.833336,2.0,0.776973,0.760925,0.794626,...,58.220339,9.665211,239,2226,41.779661,9.69574,Thijmen_global_hidden_cal,Spiculation,False,nodule
de Haas Global (shown nodule),0.855418,-0.182210,1.0,0.854583,0.837892,0.868223,2.0,0.850920,0.834993,0.872737,...,58.220339,9.665211,239,2226,41.779661,9.69574,Thijmen_global_show_cal,Spiculation,False,nodule
Sybil year 1,0.000058,4.020423,1.0,0.802734,0.777393,0.825869,2.0,0.883308,0.860305,0.901211,...,58.220339,9.665211,239,2226,41.779661,9.69574,sybil_year1,Spiculation,False,nodule


## Race

In [None]:
# Race subgroups, split on the 'WhiteOrBlack' column.
# NOTE(review): the saved output of this cell ends in
# "ValueError: Found array with 0 sample(s)" during the threshold isolations,
# so roc_race / thres_race are never assigned. Presumably one filtered subgroup
# is empty -- TODO confirm inside threshold.save_results_isolate_confounders.
roc_race, thres_race = analyze_confounders(democol='WhiteOrBlack', demosavename='race', plot_thres=True)
roc_race

### Categorical Confounders

Unnamed: 0,category,attribute,value,1.0_freq,1.0_norm,2.0_freq,2.0_norm,diff_1.0_2.0
14,demo,WhiteOrBlack,1.0,5523.0,100.0,0.0,0.0,100.0
35,demo,race,1.0,5523.0,100.0,0.0,0.0,100.0
9,demo,NonHispanicWhite,1.0,5430.0,98.3161,0.0,0.0,98.3161
30,demo,marital,2.0,3759.0,68.0608,75.0,39.8936,28.1672
7,demo,Married,1.0,3759.0,68.0608,75.0,39.8936,28.1672
52,smoke,smokeday > 25,1.0,2667.0,48.289,41.0,21.8085,26.4805
48,smoke,pkyr > 55,1.0,2397.0,43.4003,40.0,21.2766,22.1237
45,smoke,pipe,1.0,1259.0,22.7956,17.0,9.0426,13.753
2,demo,Gender,1.0,3221.0,58.3198,85.0,45.2128,13.107
28,demo,height > 68,1.0,2637.0,47.7458,71.0,37.766,9.9798


Unnamed: 0,category,attribute,value,1.0_freq,1.0_norm,2.0_freq,2.0_norm,diff_1.0_2.0
15,demo,WhiteOrBlack,2.0,0.0,0.0,188.0,100.0,-100.0
36,demo,race,2.0,0.0,0.0,188.0,100.0,-100.0
121,disease,diaghype,1.0,1813.0,32.8264,102.0,54.2553,-21.4289
43,smoke,cigsmok,1.0,2734.0,49.5021,129.0,68.617,-19.1149
13,demo,Unfinished_ed,1.0,1466.0,26.5435,79.0,42.0213,-15.4778
33,demo,marital,5.0,1015.0,18.3777,63.0,33.5106,-15.1329
3,demo,Gender,2.0,2302.0,41.6802,103.0,54.7872,-13.107
20,demo,educat,5.0,1200.0,21.7273,61.0,32.4468,-10.7195
109,disease,diagdiab,1.0,462.0,8.365,35.0,18.617,-10.252
50,smoke,smokeage > 16,1.0,2468.0,44.6859,99.0,52.6596,-7.9737


### Numerical Confounders

Unnamed: 0,category,attribute,value,1.0,2.0,diff_1.0_2.0
30,smoke,pkyr,75%,70.0,54.0,16.0
44,smoke,smokeday,75%,37.0,24.25,12.75
31,smoke,pkyr,Mean (SD),58.6 (25.0),48.6 (18.6),10.0402
33,smoke,pkyr,mean,58.6096,48.5694,10.0402
32,smoke,pkyr,Median (IQR),51 (29),43 (18),8.0
29,smoke,pkyr,50%,51.0,43.0,8.0
34,smoke,pkyr,std,25.0058,18.5518,6.454
45,smoke,smokeday,Mean (SD),28.8 (11.7),23.1 (7.9),5.7245
47,smoke,smokeday,mean,28.8309,23.1064,5.7245
43,smoke,smokeday,50%,25.0,20.0,5.0


Unnamed: 0,category,attribute,value,1.0,2.0,diff_1.0_2.0
6,demo,weight,std,38.9205,41.2818,-2.3613
3,demo,weight,Mean (SD),182.2 (38.9),183.7 (41.3),-1.5515
5,demo,weight,mean,182.1825,183.734,-1.5515
58,other,Diameter_mm,75%,10.5,12.05,-1.55
54,smoke,smokeyr,mean,41.1028,42.1383,-1.0355
52,smoke,smokeyr,Mean (SD),41.1 (7.4),42.1 (6.6),-1.0355
35,smoke,smokeage,25%,14.0,15.0,-1.0
49,smoke,smokeyr,25%,36.0,37.0,-1.0
36,smoke,smokeage,50%,16.0,17.0,-1.0
37,smoke,smokeage,75%,18.0,19.0,-1.0


ROC Isolations done!
Threshold Isolations ...

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

## BMI

In [None]:
# BMI subgroups, split on the 'Overweight' column; no plots requested.
roc_bmi, thres_bmi = analyze_confounders(
    democol='Overweight',
    demosavename='bmi',
)
roc_bmi