In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown

from utilities import data, roc, threshold, output
from utilities.info import *

In [2]:
# Input files and generated result CSVs both live under TEAMS_DIR.
# NOTE(review): TEAMS_DIR is not defined in this notebook; presumably it is
# provided by the star import from utilities.info -- confirm.
FILE_DIR = f"{TEAMS_DIR}/files"
RESULTS_DIR = f"{TEAMS_DIR}/temp-results"
# Echo the resolved paths so the reader can see where data is read and written.
FILE_DIR, RESULTS_DIR

('C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/files',
 'C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/temp-results')

In [None]:
# Forwarded as `num_bootstraps=` to the ROC and threshold isolation helpers.
NUM_BOOTSTRAPS = 100
# NOTE(review): THRESHOLD_POLICIES is not defined in this notebook; presumably
# it is provided by the star import from utilities.info -- confirm.
THRESHOLD_POLICIES

(('Sensitivity', 0.9), ('Specificity', 0.9))

In [4]:
# Raw NLST table read from disk (model outputs plus demographic columns,
# judging by the file name -- confirm against the data dictionary).
nlst_nodule = pd.read_csv(f"{FILE_DIR}/nlst_allmodels_demos.csv")

# Grouping of demographic column names. The `with` block closes the file on
# exit, so no explicit close() call is needed.
with open(f'{FILE_DIR}/nlst_democols.json') as json_data:
    nlst_demos_original = json.load(json_data)

# prep_nlst_preds returns the prepared frame, the demographic-column grouping
# and the model-name mapping used by every later cell.
# NOTE(review): the meaning of the scanlevel/sybil/tijmen/bin_num flags is
# defined in utilities.data and is not visible here.
nlst_data, nlst_demos, nlst_models = data.prep_nlst_preds(
    nlst_nodule, nlst_demos_original,
    scanlevel=True, sybil=True, tijmen=False, bin_num=True)
print(len(nlst_data))
nlst_demos

5911


{'num': {'demo': ['weight', 'height', 'BMI', 'Age'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr'],
  'other': ['Diameter_mm', 'NoduleCounts']},
 'cat': {'demo': ['Age > 61',
   'Gender',
   'HighSchoolPlus',
   'Married',
   'NonHispanicWhite',
   'Overweight',
   'Unfinished_ed',
   'WhiteOrBlack',
   'educat',
   'ethnic',
   'height > 68',
   'marital',
   'race',
   'weight > 180'],
  'smoke': ['cigar',
   'cigsmok',
   'pipe',
   'pkyr > 55',
   'smokeage > 16',
   'smokeday > 25',
   'smokelive',
   'smokework',
   'smokeyr > 40'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'd

In [5]:
# Mapping of model display name -> string key, as returned by
# data.prep_nlst_preds (the values look like column names -- confirm).
nlst_models

{'Venkadesh': 'DL_cal',
 'de Haas Local': 'Thijmen_local_cal',
 'de Haas Global (hidden nodule)': 'Thijmen_global_hidden_cal',
 'de Haas Global (shown nodule)': 'Thijmen_global_show_cal',
 'Sybil year 1': 'sybil_year1',
 'PanCan2b': 'PanCan2b'}

In [6]:
# Threshold table with one row per model and one column per policy in
# THRESHOLD_POLICIES, plus a 'Brock' column (requested via brock=True).
# The second return value is discarded here.
nlst_policies, _ = threshold.get_threshold_policies(nlst_data, models=nlst_models, policies=THRESHOLD_POLICIES, brock=True)
nlst_policies

Unnamed: 0,Sensitivity=0.9,Specificity=0.9,Brock
Venkadesh,0.049,0.222,0.06
de Haas Local,0.045,0.226,0.06
de Haas Global (hidden nodule),0.066,0.265,0.06
de Haas Global (shown nodule),0.073,0.312,0.06
Sybil year 1,0.003,0.058,0.06
PanCan2b,0.015,0.165,0.06


In [None]:
def analyze_confounders(
        df=nlst_data, demos=nlst_demos, models=nlst_models,
        democol='Gender', demosavename='gender', plot_roc=False, plot_thres=False,
        policies=None):
    """Compare the subgroups of ``democol`` and run the isolation analyses.

    The function first displays how categorical and numerical attributes
    differ between the first two subgroups of ``democol`` (largest positive
    and largest negative differences), then calls the ROC and threshold
    ``save_results_isolate_confounders`` helpers.

    Parameters
    ----------
    df : DataFrame
        Data to analyse; it is grouped by ``democol``.
    demos : dict
        Attribute grouping with ``'cat'`` and ``'num'`` entries. Both the
        difference tables and the isolation helpers read from this argument.
    models : dict
        Model mapping forwarded to the ROC and threshold helpers.
    democol : str
        Column of ``df`` to split on.
    demosavename : str
        Short tag inserted into the result CSV paths.
    plot_roc, plot_thres : bool
        Forwarded as ``plot=`` to the ROC and threshold helper respectively.
    policies : optional
        Threshold table forwarded to the threshold helper. When ``None``,
        the notebook-level ``nlst_policies`` is looked up at call time.

    Returns
    -------
    tuple
        ``(roc_df, thres_df)`` exactly as returned by the two helpers.

    Raises
    ------
    ValueError
        If ``democol`` splits ``df`` into fewer than two groups, since the
        ``diff_<a>_<b>`` column needs two group labels.
    """
    if policies is None:
        # Resolved lazily so that a re-computed nlst_policies is picked up.
        policies = nlst_policies

    split_groups = {k: v for k, v in df.groupby(democol)}
    sg = list(split_groups.keys())
    if len(sg) < 2:
        raise ValueError(
            f"'{democol}' yields {len(sg)} group(s); at least two are needed to compare.")
    # Column holding the difference between the first two subgroups.
    diff_col = f'diff_{sg[0]}_{sg[1]}'

    # Use the `demos` argument (not the global) so a custom grouping is honoured.
    cat_df = data.combine_diff_dfs(demos['cat'], data.diffs_category_prevalence, split_groups)
    cat_df = cat_df.query('value != 0').dropna(subset=['value'])

    display(Markdown("### Categorical Confounders"))
    display(cat_df.sort_values(by=diff_col, ascending=False).head(20))
    display(cat_df.sort_values(by=diff_col, ascending=True).head(20))

    display(Markdown("### Numerical Confounders"))
    num_df = data.combine_diff_dfs(demos['num'], data.diffs_numerical_means, split_groups)
    display(num_df.sort_values(by=diff_col, ascending=False).head(10))
    display(num_df.sort_values(by=diff_col, ascending=True).head(10))

    # end='\r' lets the matching "done" message overwrite the progress line.
    print("ROC Isolations ...", end='\r')
    roc_df = roc.save_results_isolate_confounders(
        df, democol, demos['cat'], models,
        csvpath=f'{RESULTS_DIR}/auroc-{demosavename}-by-factors-nlst-{len(df)}.csv',
        plot=plot_roc, num_bootstraps=NUM_BOOTSTRAPS)
    print("ROC Isolations done!")

    print("Threshold Isolations ...", end='\r')
    thres_df = threshold.save_results_isolate_confounders(
        df, democol, demos['cat'], policies, models,
        csvpath=f'{RESULTS_DIR}/threshold-{demosavename}-by-factors-nlst-{len(df)}.csv',
        plot=plot_thres, num_bootstraps=NUM_BOOTSTRAPS)
    print("Threshold isolations done!")

    return roc_df, thres_df

## Gender

In [10]:
# Confounder analysis split on the 'Gender' column; 'gender' tags the result CSV paths.
roc_gender, thres_gender = analyze_confounders(democol='Gender', demosavename='gender', plot_thres=False)

### Categorical Confounders

Unnamed: 0,category,attribute,value,1_freq,1_norm,2_freq,2_norm,diff_1_2
2,demo,Gender,1.0,3441.0,100.0,0.0,0.0,100.0
29,demo,height > 68,1.0,2621.0,76.1697,146.0,5.9109,70.2588
44,demo,weight > 180,1.0,2226.0,64.6905,590.0,23.8866,40.8039
51,smoke,pipe,1.0,1256.0,36.501,45.0,1.8219,34.6791
46,smoke,cigar,1.0,1065.0,30.9503,83.0,3.3603,27.59
7,demo,Married,1.0,2669.0,77.5647,1305.0,52.834,24.7307
31,demo,marital,2.0,2669.0,77.5647,1305.0,52.834,24.7307
236,other,wrknomask,1.0,1184.0,34.4086,320.0,12.9555,21.4531
54,smoke,pkyr > 55,1.0,1699.0,49.3752,822.0,33.2794,16.0958
11,demo,Overweight,1.0,2538.0,73.7576,1474.0,59.6761,14.0815


Unnamed: 0,category,attribute,value,1_freq,1_norm,2_freq,2_norm,diff_1_2
3,demo,Gender,2.0,0.0,0.0,2470.0,100.0,-100.0
34,demo,marital,5.0,477.0,13.8622,646.0,26.1538,-12.2916
32,demo,marital,3.0,127.0,3.6908,380.0,15.3846,-11.6938
60,smoke,smokelive,1.0,2877.0,83.6094,2334.0,94.4939,-10.8845
143,disease,diagpneu,1.0,652.0,18.948,730.0,29.5547,-10.6067
56,smoke,smokeage > 16,1.0,1407.0,40.8893,1248.0,50.5263,-9.637
122,disease,diagchro,1.0,222.0,6.4516,397.0,16.0729,-9.6213
240,nodule,GroundGlassOpacity,1.0,818.0,23.7722,818.0,33.1174,-9.3452
110,disease,diagadas,1.0,117.0,3.4002,258.0,10.4453,-7.0451
19,demo,educat,3.0,768.0,22.3191,711.0,28.7854,-6.4663


### Numerical Confounders

Unnamed: 0,category,attribute,value,1,2,diff_1_2
2,demo,weight,75%,218.0,180.0,38.0
0,demo,weight,25%,172.0,135.0,37.0
3,demo,weight,Mean (SD),197.1 (35.0),160.7 (34.8),36.3716
5,demo,weight,mean,197.1118,160.7402,36.3716
1,demo,weight,50%,192.0,157.0,35.0
4,demo,weight,Median (IQR),192 (46),157 (45),35.0
30,smoke,pkyr,75%,77.5,61.5,16.0
43,smoke,smokeday,50%,30.0,20.0,10.0
44,smoke,smokeday,75%,40.0,30.0,10.0
46,smoke,smokeday,Median (IQR),30 (20),20 (10),10.0


Unnamed: 0,category,attribute,value,1,2,diff_1_2
20,demo,BMI,std,4.4279,5.5428,-1.1149
38,smoke,smokeage,Mean (SD),16.0 (3.4),17.1 (3.6),-1.083
40,smoke,smokeage,mean,16.0401,17.1231,-1.083
36,smoke,smokeage,50%,16.0,17.0,-1.0
35,smoke,smokeage,25%,14.0,15.0,-1.0
39,smoke,smokeage,Median (IQR),16 (4),17 (4),-1.0
37,smoke,smokeage,75%,18.0,19.0,-1.0
41,smoke,smokeage,std,3.3902,3.578,-0.1878
42,smoke,smokeday,25%,20.0,20.0,0.0
21,demo,Age,25%,59.0,59.0,0.0


ROC Isolations done!
Threshold isolations done!


## Race

In [None]:
# Confounder analysis split on the 'WhiteOrBlack' column; 'race' tags the result CSV paths.
roc_race, thres_race = analyze_confounders(democol='WhiteOrBlack', demosavename='race', plot_thres=False)
roc_race

## BMI

In [12]:
# Confounder analysis split on the 'Overweight' column; 'bmi' tags the result CSV paths.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt during the
# threshold step, so roc_bmi/thres_bmi were not assigned in that run -- re-run to completion.
roc_bmi, thres_bmi = analyze_confounders(democol='Overweight', demosavename='bmi')
roc_bmi

### Categorical Confounders

Unnamed: 0,category,attribute,value,False_freq,False_norm,True_freq,True_norm,diff_False_True
49,smoke,cigsmok,1.0,1183.0,62.2959,1798.0,44.8156,17.4803
3,demo,Gender,2.0,996.0,52.4487,1474.0,36.7398,15.7089
227,other,Emphysema,1.0,892.0,46.9721,1264.0,31.5055,15.4666
66,smoke,smokeyr > 40,1.0,1125.0,59.2417,2030.0,50.5982,8.6435
34,demo,marital,5.0,448.0,23.5914,675.0,16.8245,6.7669
56,smoke,smokeage > 16,1.0,910.0,47.92,1745.0,43.4945,4.4255
143,disease,diagpneu,1.0,501.0,26.3823,881.0,21.9591,4.4232
247,nodule,SemiSolid,1.0,290.0,15.2712,462.0,11.5155,3.7557
131,disease,diagemph,1.0,252.0,13.2701,383.0,9.5464,3.7237
251,nodule,NoduleInUpperLung,1.0,1031.0,54.2917,2030.0,50.5982,3.6935


Unnamed: 0,category,attribute,value,False_freq,False_norm,True_freq,True_norm,diff_False_True
11,demo,Overweight,1.0,0.0,0.0,4012.0,100.0,-100.0
44,demo,weight > 180,1.0,100.0,5.2659,2716.0,67.6969,-62.431
2,demo,Gender,1.0,903.0,47.5513,2538.0,63.2602,-15.7089
140,disease,diaghype,1.0,459.0,24.1706,1534.0,38.2353,-14.0647
29,demo,height > 68,1.0,751.0,39.5471,2016.0,50.2493,-10.7022
7,demo,Married,1.0,1152.0,60.6635,2822.0,70.339,-9.6755
31,demo,marital,2.0,1152.0,60.6635,2822.0,70.339,-9.6755
128,disease,diagdiab,1.0,68.0,3.5808,477.0,11.8893,-8.3085
51,smoke,pipe,1.0,314.0,16.535,987.0,24.6012,-8.0662
46,smoke,cigar,1.0,265.0,13.9547,883.0,22.009,-8.0543


### Numerical Confounders

Unnamed: 0,category,attribute,value,False,True,diff_False_True
51,smoke,smokeyr,75%,47.0,45.0,2.0
52,smoke,smokeyr,Mean (SD),42.3 (7.2),40.6 (7.4),1.7294
54,smoke,smokeyr,mean,42.3423,40.6129,1.7294
62,other,Diameter_mm,std,6.6986,5.6105,1.0881
58,other,Diameter_mm,75%,11.2,10.2,1.0
50,smoke,smokeyr,50%,42.0,41.0,1.0
49,smoke,smokeyr,25%,37.0,36.0,1.0
53,smoke,smokeyr,Median (IQR),42 (10),41 (9),1.0
25,demo,Age,Median (IQR),63 (8),62 (8),1.0
35,smoke,smokeage,25%,15.0,14.0,1.0


Unnamed: 0,category,attribute,value,False,True,diff_False_True
2,demo,weight,75%,162.0,220.0,-58.0
3,demo,weight,Mean (SD),145.7 (22.7),198.9 (33.4),-53.1974
5,demo,weight,mean,145.7365,198.9339,-53.1974
1,demo,weight,50%,145.0,195.0,-50.0
4,demo,weight,Median (IQR),145 (32),195 (45),-50.0
0,demo,weight,25%,130.0,175.0,-45.0
6,demo,weight,std,22.7088,33.419,-10.7102
44,smoke,smokeday,75%,30.0,40.0,-10.0
16,demo,BMI,75%,24.1029,32.075,-7.9721
17,demo,BMI,Mean (SD),22.5 (1.9),29.9 (4.1),-7.3951


ROC Isolations done!
Threshold Isolations ...

KeyboardInterrupt: 