In [1]:
import pandas as pd
import os
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from IPython.display import display, Markdown

from utilities import data
from utilities.info import *

%matplotlib inline

## Grab Data for Each Model

In [2]:
venk21_nodule = pd.read_csv(f"{FILE_DIR}/nlst_allmodels_demos.csv")

with open(f'{FILE_DIR}/nlst_democols.json') as json_data:
    venk21_demos_original = json.load(json_data)
    json_data.close()

venk21_data, venk21_demos, _ = data.prep_nlst_preds(venk21_nodule, venk21_demos_original, scanlevel=True, sybil=False, bin_num=False)
venk21_demos['cat']['other'].append('label')
venk21_demos

{'num': {'demo': ['weight', 'height', 'BMI', 'Age'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr'],
  'other': ['Diameter_mm', 'NoduleCounts']},
 'cat': {'demo': ['Overweight',
   'race',
   'ethnic',
   'Unfinished_ed',
   'educat',
   'NonHispanicWhite',
   'WhiteOrBlack',
   'marital',
   'Married',
   'HighSchoolPlus',
   'Gender'],
  'smoke': ['cigar', 'cigsmok', 'pipe', 'smokelive', 'smokework'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'diagtube'],
  'canchist': ['cancblad',
   'cancbrea',
   'canccerv',
   'canccolo',
   'cancesop',
   'canckidn',
   'canclary',
   'canclun

In [3]:
sybil_data = pd.read_csv(f"{FILE_DIR}/nlst_sybil_demos.csv")

with open(f'{FILE_DIR}/nlst_sybil_democols.json') as json_data:
    sybil_demos = json.load(json_data)
    json_data.close()

sybil_demos

{'num': {'demo': ['weight', 'height', 'BMI', 'Age'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr']},
 'cat': {'demo': ['Overweight',
   'race',
   'ethnic',
   'Unfinished_ed',
   'educat',
   'NonHispanicWhite',
   'WhiteOrBlack',
   'marital',
   'Married',
   'HighSchoolPlus',
   'Gender'],
  'smoke': ['cigar', 'cigsmok', 'pipe', 'smokelive', 'smokework'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'diagtube'],
  'canchist': ['cancblad',
   'cancbrea',
   'canccerv',
   'canccolo',
   'cancesop',
   'canckidn',
   'canclary',
   'canclung',
   'cancnasa',
   'cancoral',
   'cancpa

In [4]:
sybil_splits = {s: sybil_data.query(f'split == "{s}"') for s in ['train', 'dev', 'test']}
for s in ['train', 'dev', 'test']:
    print(s, len(sybil_splits[s]), 'Scans')

train 28160 Scans
dev 6838 Scans
test 6282 Scans


## Model Training Sets

In [5]:
training_sets = {
    'Venk21': venk21_data,
    'Sybil': sybil_splits['train']
}

### Categorical columns

In [6]:
cat_demo_splits = data.combine_diff_dfs(sybil_demos['cat'], data.diffs_category_prevalence, training_sets).dropna(subset='value', axis=0).query('value != 0')
display(cat_demo_splits.sort_values(by='diff_Venk21_Sybil', ascending=False).head(10))
display(cat_demo_splits.sort_values(by='diff_Venk21_Sybil', ascending=True ).head(10))

Unnamed: 0,category,attribute,value,Venk21_freq,Venk21_norm,Sybil_freq,Sybil_norm,diff_Venk21_Sybil
204,lungcanc,LC_stage,110.0,596.0,5.8529,549.0,1.9496,3.9033
218,other,FamilyHistoryLungCa,1.0,2622.0,25.7488,6232.0,22.1307,3.6181
193,lungcanc,Adenocarcinoma,1.0,539.0,5.2931,558.0,1.9815,3.3116
43,smoke,cigsmok,1.0,5088.0,49.9656,13386.0,47.5355,2.4301
117,disease,diagemph,1.0,1022.0,10.0363,2317.0,8.228,1.8083
38,demo,Gender,2.0,4349.0,42.7084,11588.0,41.1506,1.5578
16,demo,educat,3.0,2538.0,24.9239,6581.0,23.37,1.5539
111,disease,diagcopd,1.0,676.0,6.6385,1477.0,5.245,1.3935
195,lungcanc,Bronchiolo-alveolar_carcinoma,1.0,180.0,1.7677,123.0,0.4368,1.3309
29,demo,marital,3.0,864.0,8.4847,2055.0,7.2976,1.1871


Unnamed: 0,category,attribute,value,Venk21_freq,Venk21_norm,Sybil_freq,Sybil_norm,diff_Venk21_Sybil
1,demo,Overweight,1.0,6963.0,68.3787,19898.0,70.6605,-2.2818
40,smoke,cigar,1.0,1934.0,18.9924,5942.0,21.1009,-2.1085
19,demo,educat,6.0,1585.0,15.5652,4836.0,17.1733,-1.6081
37,demo,Gender,1.0,5834.0,57.2916,16572.0,58.8494,-1.5578
34,demo,Married,1.0,6855.0,67.3181,19282.0,68.473,-1.1549
28,demo,marital,2.0,6855.0,67.3181,19282.0,68.473,-1.1549
18,demo,educat,5.0,2287.0,22.459,6647.0,23.6044,-1.1454
45,smoke,pipe,1.0,2221.0,21.8109,6429.0,22.8303,-1.0194
13,demo,Unfinished_ed,1.0,2805.0,27.5459,7912.0,28.0966,-0.5507
27,demo,marital,1.0,416.0,4.0852,1274.0,4.5241,-0.4389


In [7]:
display(cat_demo_splits.query('category == "demo"').sort_values(by='diff_Venk21_Sybil', ascending=False).head(10))
cat_demo_splits.query('category == "demo"').sort_values(by='diff_Venk21_Sybil', ascending=True).head(10)

Unnamed: 0,category,attribute,value,Venk21_freq,Venk21_norm,Sybil_freq,Sybil_norm,diff_Venk21_Sybil
38,demo,Gender,2.0,4349.0,42.7084,11588.0,41.1506,1.5578
16,demo,educat,3.0,2538.0,24.9239,6581.0,23.37,1.5539
29,demo,marital,3.0,864.0,8.4847,2055.0,7.2976,1.1871
23,demo,NonHispanicWhite,1.0,9311.0,91.4367,25461.0,90.4155,1.0212
2,demo,race,1.0,9461.0,92.9098,25919.0,92.0419,0.8679
24,demo,WhiteOrBlack,1.0,9461.0,92.9098,25919.0,92.0419,0.8679
15,demo,educat,2.0,518.0,5.0869,1265.0,4.4922,0.5947
14,demo,educat,1.0,164.0,1.6105,353.0,1.2536,0.3569
20,demo,educat,7.0,1465.0,14.3867,3976.0,14.1193,0.2674
10,demo,ethnic,2.0,9977.0,97.977,27529.0,97.7592,0.2178


Unnamed: 0,category,attribute,value,Venk21_freq,Venk21_norm,Sybil_freq,Sybil_norm,diff_Venk21_Sybil
1,demo,Overweight,1.0,6963.0,68.3787,19898.0,70.6605,-2.2818
19,demo,educat,6.0,1585.0,15.5652,4836.0,17.1733,-1.6081
37,demo,Gender,1.0,5834.0,57.2916,16572.0,58.8494,-1.5578
34,demo,Married,1.0,6855.0,67.3181,19282.0,68.473,-1.1549
28,demo,marital,2.0,6855.0,67.3181,19282.0,68.473,-1.1549
18,demo,educat,5.0,2287.0,22.459,6647.0,23.6044,-1.1454
13,demo,Unfinished_ed,1.0,2805.0,27.5459,7912.0,28.0966,-0.5507
27,demo,marital,1.0,416.0,4.0852,1274.0,4.5241,-0.4389
4,demo,race,3.0,169.0,1.6596,575.0,2.0419,-0.3823
3,demo,race,2.0,338.0,3.3193,1036.0,3.679,-0.3597


### Numerical columns

In [8]:
num_demo_splits = data.combine_diff_dfs(sybil_demos['num'], data.diffs_numerical_means, training_sets)
display(num_demo_splits.sort_values(by='diff_Venk21_Sybil', ascending=False).query('diff_Venk21_Sybil > 0'))
num_demo_splits.sort_values(by='diff_Venk21_Sybil', ascending=True).query('diff_Venk21_Sybil < 0')

Unnamed: 0,category,attribute,value,Venk21,Sybil,diff_Venk21_Sybil
30,smoke,pkyr,75%,69.0,66.5,2.5
23,demo,Age,75%,67.0,65.0,2.0
44,smoke,smokeday,75%,35.0,33.0,2.0
21,demo,Age,25%,59.0,57.0,2.0
33,smoke,pkyr,mean,57.9729,56.1487,1.8242
31,smoke,pkyr,Mean (SD),58.0 (24.6),56.1 (23.7),1.8242
26,demo,Age,mean,63.2049,61.6496,1.5553
24,demo,Age,Mean (SD),63.2 (5.2),61.6 (5.1),1.5553
54,smoke,smokeyr,mean,41.0055,39.9523,1.0532
52,smoke,smokeyr,Mean (SD),41.0 (7.4),40.0 (7.4),1.0532


Unnamed: 0,category,attribute,value,Venk21,Sybil,diff_Venk21_Sybil
2,demo,weight,75%,205.0,208.0,-3.0
3,demo,weight,Mean (SD),181.8 (39.0),183.4 (39.1),-1.6016
5,demo,weight,mean,181.8113,183.4129,-1.6016
17,demo,BMI,Mean (SD),27.6 (5.0),27.9 (5.0),-0.2404
19,demo,BMI,mean,27.6277,27.8681,-0.2404
18,demo,BMI,Median (IQR),27 (6),27 (6),-0.2395
15,demo,BMI,50%,27.0197,27.2592,-0.2395
16,demo,BMI,75%,30.4066,30.559,-0.1524
6,demo,weight,std,38.9777,39.129,-0.1513
14,demo,BMI,25%,24.2738,24.4051,-0.1313


## Model Validation Sets

In [9]:
val_sets = {
    'Venk21': venk21_data,
    'SybilDev': sybil_splits['dev'],
    'SybilTest': sybil_splits['test']
}

### Categorical columns

In [10]:
cat_demo_val = data.combine_diff_dfs(sybil_demos['cat'], data.diffs_category_prevalence, val_sets).dropna(subset='value', axis=0).query('value != 0')
display(cat_demo_val.sort_values(by='diff_Venk21_SybilTest', ascending=False).head(10))
display(cat_demo_val.sort_values(by='diff_Venk21_SybilTest', ascending=True ).head(10))

Unnamed: 0,category,attribute,value,Venk21_freq,Venk21_norm,SybilDev_freq,SybilDev_norm,SybilTest_freq,SybilTest_norm,diff_Venk21_SybilDev,diff_Venk21_SybilTest,diff_SybilDev_SybilTest
217,other,FamilyHistoryLungCa,1.0,2622,25.7488,1469.0,21.4829,1362.0,21.681,4.2659,4.0678,-0.1981
203,lungcanc,LC_stage,110.0,596,5.8529,127.0,1.8573,113.0,1.7988,3.9956,4.0541,0.0585
192,lungcanc,Adenocarcinoma,1.0,539,5.2931,82.0,1.1992,107.0,1.7033,4.0939,3.5898,-0.5041
38,demo,Gender,2.0,4349,42.7084,2822.0,41.2694,2513.0,40.0032,1.439,2.7052,1.2662
51,smoke,smokework,1.0,8820,86.6149,5909.0,86.4142,5273.0,83.9382,0.2007,2.6767,2.476
117,disease,diagemph,1.0,1022,10.0363,576.0,8.4235,484.0,7.7046,1.6128,2.3317,0.7189
43,smoke,cigsmok,1.0,5088,49.9656,3241.0,47.3969,3026.0,48.1694,2.5687,1.7962,-0.7725
108,disease,diagchro,1.0,1064,10.4488,711.0,10.3978,551.0,8.7711,0.051,1.6777,1.6267
48,smoke,smokelive,1.0,8965,88.0389,5961.0,87.1746,5436.0,86.533,0.8643,1.5059,0.6416
194,lungcanc,Bronchiolo-alveolar_carcinoma,1.0,180,1.7677,59.0,0.8628,20.0,0.3184,0.9049,1.4493,0.5444


Unnamed: 0,category,attribute,value,Venk21_freq,Venk21_norm,SybilDev_freq,SybilDev_norm,SybilTest_freq,SybilTest_norm,diff_Venk21_SybilDev,diff_Venk21_SybilTest,diff_SybilDev_SybilTest
40,smoke,cigar,1.0,1934,18.9924,1450.0,21.205,1376.0,21.9039,-2.2126,-2.9115,-0.6989
37,demo,Gender,1.0,5834,57.2916,4016.0,58.7306,3769.0,59.9968,-1.439,-2.7052,-1.2662
45,smoke,pipe,1.0,2221,21.8109,1584.0,23.1647,1508.0,24.0051,-1.3538,-2.1942,-0.8404
34,demo,Married,1.0,6855,67.3181,4628.0,67.6806,4295.0,68.3699,-0.3625,-1.0518,-0.6893
28,demo,marital,2.0,6855,67.3181,4628.0,67.6806,4295.0,68.3699,-0.3625,-1.0518,-0.6893
1,demo,Overweight,1.0,6963,68.3787,4859.0,71.0588,4352.0,69.2773,-2.6801,-0.8986,1.7815
105,disease,diagchas,1.0,328,3.2211,188.0,2.7493,245.0,3.9,0.4718,-0.6789,-1.1507
20,demo,educat,7.0,1465,14.3867,1002.0,14.6534,942.0,14.9952,-0.2667,-0.6085,-0.3418
4,demo,race,3.0,169,1.6596,175.0,2.5592,142.0,2.2604,-0.8996,-0.6008,0.2988
7,demo,race,6.0,111,1.0901,100.0,1.4624,104.0,1.6555,-0.3723,-0.5654,-0.1931


### Numerical columns

In [11]:
num_demo_val = data.combine_diff_dfs(sybil_demos['num'], data.diffs_numerical_means, val_sets)    
display(num_demo_val.sort_values(by='diff_Venk21_SybilTest', ascending=False).head(10))
display(num_demo_val.sort_values(by='diff_Venk21_SybilTest', ascending=True ).head(10))

Unnamed: 0,category,attribute,value,Venk21,SybilDev,SybilTest,diff_Venk21_SybilDev,diff_Venk21_SybilTest,diff_SybilDev_SybilTest
23,demo,Age,75%,67.0,65.0,65.0,2.0,2.0,0.0
21,demo,Age,25%,59.0,57.0,57.0,2.0,2.0,0.0
26,demo,Age,mean,63.2049,61.6,61.5807,1.6049,1.6242,0.0193
24,demo,Age,Mean (SD),63.2 (5.2),61.6 (5.1),61.6 (5.1),1.6049,1.6242,0.0193
25,demo,Age,Median (IQR),62 (8),61 (8),61 (8),1.0,1.0,0.0
35,smoke,smokeage,25%,15.0,15.0,14.0,0.0,1.0,1.0
53,smoke,smokeyr,Median (IQR),41 (10),40 (10),40 (10),1.0,1.0,0.0
51,smoke,smokeyr,75%,46.0,45.0,45.0,1.0,1.0,0.0
50,smoke,smokeyr,50%,41.0,40.0,40.0,1.0,1.0,0.0
49,smoke,smokeyr,25%,36.0,35.0,35.0,1.0,1.0,0.0


Unnamed: 0,category,attribute,value,Venk21,SybilDev,SybilTest,diff_Venk21_SybilDev,diff_Venk21_SybilTest,diff_SybilDev_SybilTest
2,demo,weight,75%,205.0,205.0,210.0,0.0,-5.0,-5.0
3,demo,weight,Mean (SD),181.8 (39.0),183.7 (39.8),183.7 (40.2),-1.8493,-1.8751,-0.0258
5,demo,weight,mean,181.8113,183.6606,183.6864,-1.8493,-1.8751,-0.0258
6,demo,weight,std,38.9777,39.7804,40.1892,-0.8027,-1.2115,-0.4088
48,smoke,smokeday,std,11.4641,11.6416,11.9578,-0.1775,-0.4937,-0.3162
47,smoke,smokeday,mean,28.574,28.539,28.9032,0.035,-0.3292,-0.3642
45,smoke,smokeday,Mean (SD),28.6 (11.5),28.5 (11.6),28.9 (12.0),0.035,-0.3292,-0.3642
16,demo,BMI,75%,30.4066,30.6635,30.7242,-0.2569,-0.3176,-0.0607
34,smoke,pkyr,std,24.6293,25.1968,24.9038,-0.5675,-0.2745,0.293
17,demo,BMI,Mean (SD),27.6 (5.0),28.0 (5.1),27.9 (5.1),-0.339,-0.2397,0.0993


## Sybil Train vs. Validation Sets

In [12]:
sybil_splits["eval"] = venk21_data

In [13]:
cat_demo_shift = data.combine_diff_dfs(sybil_demos['cat'], data.diffs_category_prevalence, sybil_splits).dropna(subset='value', axis=0).query('value != 0')
display(cat_demo_shift.sort_values(by='diff_train_test', ascending=False).head(10))
cat_demo_shift.sort_values(by='diff_train_test', ascending=True).head(10)

Unnamed: 0,category,attribute,value,train_freq,train_norm,dev_freq,dev_norm,test_freq,test_norm,eval_freq,eval_norm,diff_train_dev,diff_train_test,diff_train_eval,diff_dev_test,diff_dev_eval,diff_test_eval
13,demo,Unfinished_ed,1.0,7912.0,28.0966,1801.0,26.3381,1658.0,26.3929,2805.0,27.5459,1.7585,1.7037,0.5507,-0.0548,-1.2078,-1.153
51,smoke,smokework,1.0,24110.0,85.6179,5909.0,86.4142,5273.0,83.9382,8820.0,86.6149,-0.7963,1.6797,-0.997,2.476,-0.2007,-2.6767
18,demo,educat,5.0,6647.0,23.6044,1521.0,22.2433,1394.0,22.1904,2287.0,22.459,1.3611,1.414,1.1454,0.0529,-0.2157,-0.2686
1,demo,Overweight,1.0,19898.0,70.6605,4859.0,71.0588,4352.0,69.2773,6963.0,68.3787,-0.3983,1.3832,2.2818,1.7815,2.6801,0.8986
38,demo,Gender,2.0,11588.0,41.1506,2822.0,41.2694,2513.0,40.0032,4349.0,42.7084,-0.1188,1.1474,-1.5578,1.2662,-1.439,-2.7052
19,demo,educat,6.0,4836.0,17.1733,1177.0,17.2126,1008.0,16.0458,1585.0,15.5652,-0.0393,1.1275,1.6081,1.1668,1.6474,0.4806
126,disease,diaghype,1.0,9856.0,35.0,2388.0,34.9225,2131.0,33.9223,3562.0,34.9799,0.0775,1.0777,0.0201,1.0002,-0.0574,-1.0576
108,disease,diagchro,1.0,2750.0,9.7656,711.0,10.3978,551.0,8.7711,1064.0,10.4488,-0.6322,0.9945,-0.6832,1.6267,-0.051,-1.6777
48,smoke,smokelive,1.0,24625.0,87.4467,5961.0,87.1746,5436.0,86.533,8965.0,88.0389,0.2721,0.9137,-0.5922,0.6416,-0.8643,-1.5059
25,demo,WhiteOrBlack,2.0,1036.0,3.679,290.0,4.241,187.0,2.9768,338.0,3.3193,-0.562,0.7022,0.3597,1.2642,0.9217,-0.3425


Unnamed: 0,category,attribute,value,train_freq,train_norm,dev_freq,dev_norm,test_freq,test_norm,eval_freq,eval_norm,diff_train_dev,diff_train_test,diff_train_eval,diff_dev_test,diff_dev_eval,diff_test_eval
16,demo,educat,3.0,6581.0,23.37,1682.0,24.5978,1571.0,25.008,2538.0,24.9239,-1.2278,-1.638,-1.5539,-0.4102,-0.3261,0.0841
45,smoke,pipe,1.0,6429.0,22.8303,1584.0,23.1647,1508.0,24.0051,2221.0,21.8109,-0.3344,-1.1748,1.0194,-0.8404,1.3538,2.1942
37,demo,Gender,1.0,16572.0,58.8494,4016.0,58.7306,3769.0,59.9968,5834.0,57.2916,0.1188,-1.1474,1.5578,-1.2662,1.439,2.7052
20,demo,educat,7.0,3976.0,14.1193,1002.0,14.6534,942.0,14.9952,1465.0,14.3867,-0.5341,-0.8759,-0.2674,-0.3418,0.2667,0.6085
40,smoke,cigar,1.0,5942.0,21.1009,1450.0,21.205,1376.0,21.9039,1934.0,18.9924,-0.1041,-0.803,2.1085,-0.6989,2.2126,2.9115
93,work,wrkweld,1.0,1597.0,5.6712,408.0,5.9667,401.0,6.3833,596.0,5.8529,-0.2955,-0.7121,-0.1817,-0.4166,0.1138,0.5304
43,smoke,cigsmok,1.0,13386.0,47.5355,3241.0,47.3969,3026.0,48.1694,5088.0,49.9656,0.1386,-0.6339,-2.4301,-0.7725,-2.5687,-1.7962
129,disease,diagpneu,1.0,6278.0,22.294,1613.0,23.5888,1439.0,22.9067,2356.0,23.1366,-1.2948,-0.6127,-0.8426,0.6821,0.4522,-0.2299
23,demo,NonHispanicWhite,1.0,25461.0,90.4155,6076.0,88.8564,5718.0,91.022,9311.0,91.4367,1.5591,-0.6065,-1.0212,-2.1656,-2.5803,-0.4147
10,demo,ethnic,2.0,27529.0,97.7592,6664.0,97.4554,6174.0,98.2808,9977.0,97.977,0.3038,-0.5216,-0.2178,-0.8254,-0.5216,0.3038


In [14]:
num_demo_shift = data.combine_diff_dfs(sybil_demos['num'], data.diffs_numerical_means, sybil_splits)
display(num_demo_shift.sort_values(by='diff_train_test', ascending=False).head(10))
num_demo_shift.sort_values(by='diff_train_test', ascending=True).head(10)

Unnamed: 0,category,attribute,value,train,dev,test,eval,diff_train_dev,diff_train_test,diff_train_eval,diff_dev_test,diff_dev_eval,diff_test_eval
35,smoke,smokeage,25%,15.0,15.0,14.0,15.0,0.0,1.0,0.0,1.0,0.0,-1.0
18,demo,BMI,Median (IQR),27 (6),27 (6),27 (6),27 (6),-0.0575,0.1665,0.2395,0.224,0.297,0.073
15,demo,BMI,50%,27.2592,27.3167,27.0927,27.0197,-0.0575,0.1665,0.2395,0.224,0.297,0.073
38,smoke,smokeage,Mean (SD),16.7 (3.6),16.7 (3.6),16.6 (3.7),16.6 (3.5),0.0048,0.072,0.1208,0.0672,0.116,0.0488
40,smoke,smokeage,mean,16.7199,16.7151,16.6479,16.5991,0.0048,0.072,0.1208,0.0672,0.116,0.0488
26,demo,Age,mean,61.6496,61.6,61.5807,63.2049,0.0496,0.0689,-1.5553,0.0193,-1.6049,-1.6242
24,demo,Age,Mean (SD),61.6 (5.1),61.6 (5.1),61.6 (5.1),63.2 (5.2),0.0496,0.0689,-1.5553,0.0193,-1.6049,-1.6242
14,demo,BMI,25%,24.4051,24.5371,24.3636,24.2738,-0.132,0.0415,0.1313,0.1735,0.2633,0.0898
19,demo,BMI,mean,27.8681,27.9667,27.8674,27.6277,-0.0986,0.0007,0.2404,0.0993,0.339,0.2397
17,demo,BMI,Mean (SD),27.9 (5.0),28.0 (5.1),27.9 (5.1),27.6 (5.0),-0.0986,0.0007,0.2404,0.0993,0.339,0.2397


Unnamed: 0,category,attribute,value,train,dev,test,eval,diff_train_dev,diff_train_test,diff_train_eval,diff_dev_test,diff_dev_eval,diff_test_eval
44,smoke,smokeday,75%,33.0,35.0,35.0,35.0,-2.0,-2.0,-2.0,0.0,0.0,0.0
2,demo,weight,75%,208.0,205.0,210.0,205.0,3.0,-2.0,3.0,-5.0,0.0,5.0
30,smoke,pkyr,75%,66.5,67.5,68.0,69.0,-1.0,-1.5,-2.5,-0.5,-1.5,-1.0
34,smoke,pkyr,std,23.6767,25.1968,24.9038,24.6293,-1.5201,-1.2271,-0.9526,0.293,0.5675,0.2745
31,smoke,pkyr,Mean (SD),56.1 (23.7),56.4 (25.2),57.2 (24.9),58.0 (24.6),-0.2614,-1.0742,-1.8242,-0.8128,-1.5628,-0.75
33,smoke,pkyr,mean,56.1487,56.4101,57.2229,57.9729,-0.2614,-1.0742,-1.8242,-0.8128,-1.5628,-0.75
6,demo,weight,std,39.129,39.7804,40.1892,38.9777,-0.6514,-1.0602,0.1513,-0.4088,0.8027,1.2115
48,smoke,smokeday,std,11.2621,11.6416,11.9578,11.4641,-0.3795,-0.6957,-0.202,-0.3162,0.1775,0.4937
29,smoke,pkyr,50%,49.0,48.0,49.5,50.0,1.0,-0.5,-1.0,-1.5,-2.0,-0.5
32,smoke,pkyr,Median (IQR),49 (26),48 (28),49 (28),50 (28),1.0,-0.5,-1.0,-1.5,-2.0,-0.5


Conclusion: not much demographic shift (1-2% overall not much, really). Besides family history.

## NLST vs. DLCST

In [15]:
dlcst_preds = pd.read_csv(f"{FILE_DIR}/dlcst_allmodels_cal.csv", header=0)
dlcst_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            599 non-null    int64  
 1   StudyDate            599 non-null    int64  
 2   SeriesInstanceUID    599 non-null    object 
 3   Age                  599 non-null    int64  
 4   Sex                  599 non-null    int64  
 5   FamilyHistoryLungCa  599 non-null    int64  
 6   Emphysema            599 non-null    int64  
 7   NoduleCountPerScan   599 non-null    int64  
 8   sybil_year1          599 non-null    float64
 9   sybil_year2          599 non-null    float64
 10  sybil_year3          599 non-null    float64
 11  sybil_year4          599 non-null    float64
 12  sybil_year5          599 non-null    float64
 13  sybil_year6          599 non-null    float64
 14  PanCan2b             599 non-null    float64
 15  Ensemble_Kiran       599 non-null    flo

In [16]:
venk21_data['Sex'] = venk21_data['Gender']
venk21_data['NoduleCountPerScan'] = venk21_data['NoduleCounts']
venk21_data['Emphysema'] = venk21_data['Emphysema'].astype(int)

In [17]:
screening_sets = {
    # "nlst_sybil": sybil_demos,
    "nlst": venk21_data,
    "dlcst": dlcst_preds
}

In [18]:
cat_demo_dlcst = data.combine_diff_dfs(DLCST_DEMOCOLS['cat'], data.diffs_category_prevalence, screening_sets)
display(cat_demo_dlcst.sort_values(by='diff_nlst_dlcst', ascending=False).head(10))
cat_demo_dlcst.sort_values(by='diff_nlst_dlcst', ascending=True).head(10)

Unnamed: 0,category,attribute,value,nlst_freq,nlst_norm,dlcst_freq,dlcst_norm,diff_nlst_dlcst
4,other,Emphysema,0,6486,63.6944,194,32.3873,31.3071
3,other,FamilyHistoryLungCa,1,2622,25.7488,102,17.0284,8.7204
0,demo,Sex,1,5834,57.2916,323,53.9232,3.3684
1,demo,Sex,2,4349,42.7084,276,46.0768,-3.3684
2,other,FamilyHistoryLungCa,0,7561,74.2512,497,82.9716,-8.7204
5,other,Emphysema,1,3697,36.3056,405,67.6127,-31.3071


Unnamed: 0,category,attribute,value,nlst_freq,nlst_norm,dlcst_freq,dlcst_norm,diff_nlst_dlcst
5,other,Emphysema,1,3697,36.3056,405,67.6127,-31.3071
2,other,FamilyHistoryLungCa,0,7561,74.2512,497,82.9716,-8.7204
1,demo,Sex,2,4349,42.7084,276,46.0768,-3.3684
0,demo,Sex,1,5834,57.2916,323,53.9232,3.3684
3,other,FamilyHistoryLungCa,1,2622,25.7488,102,17.0284,8.7204
4,other,Emphysema,0,6486,63.6944,194,32.3873,31.3071


In [19]:
num_demo_dlcst = data.combine_diff_dfs(DLCST_DEMOCOLS['num'], data.diffs_numerical_means, screening_sets)
display(num_demo_dlcst.sort_values(by='diff_nlst_dlcst', ascending=False).head(10))
num_demo_dlcst.sort_values(by='diff_nlst_dlcst', ascending=True).head(10)

Unnamed: 0,category,attribute,value,nlst,dlcst,diff_nlst_dlcst
0,demo,Age,25%,59.0,54.0,5.0
2,demo,Age,75%,67.0,62.0,5.0
3,demo,Age,Mean (SD),63.2 (5.2),58.4 (4.9),4.8076
5,demo,Age,mean,63.2049,58.3973,4.8076
1,demo,Age,50%,62.0,58.0,4.0
4,demo,Age,Median (IQR),62 (8),58 (8),4.0
6,demo,Age,std,5.2401,4.9461,0.294
8,other,NoduleCountPerScan,Mean (SD),1.9 (1.3),1.8 (1.2),0.1139
10,other,NoduleCountPerScan,mean,1.8835,1.7696,0.1139
11,other,NoduleCountPerScan,std,1.256,1.1654,0.0906


Unnamed: 0,category,attribute,value,nlst,dlcst,diff_nlst_dlcst
7,other,NoduleCountPerScan,75%,2.0,2.0,0.0
9,other,NoduleCountPerScan,Median (IQR),1 (1),1 (1),0.0
11,other,NoduleCountPerScan,std,1.256,1.1654,0.0906
8,other,NoduleCountPerScan,Mean (SD),1.9 (1.3),1.8 (1.2),0.1139
10,other,NoduleCountPerScan,mean,1.8835,1.7696,0.1139
6,demo,Age,std,5.2401,4.9461,0.294
1,demo,Age,50%,62.0,58.0,4.0
4,demo,Age,Median (IQR),62 (8),58 (8),4.0
3,demo,Age,Mean (SD),63.2 (5.2),58.4 (4.9),4.8076
5,demo,Age,mean,63.2049,58.3973,4.8076


### Different validation sets

In [20]:
all_nodules = pd.read_csv(f"{FILE_DIR}/nlst_allmodels_demos.csv")
all_nodules, nlst_democols_nodules, _ = data.prep_nlst_preds(all_nodules, democols=venk21_demos, scanlevel=False, sybil=False)
print(len(all_nodules))

16077


In [21]:
all_scans, _, _ = data.prep_nlst_preds(all_nodules, democols=venk21_demos_original, scanlevel=True, sybil=True)
print(len(all_scans))

5911


In [22]:
valsets = {
    "allnodules": all_nodules,
    "allscans": all_scans,
}

In [23]:
cat_demo_shift = data.combine_diff_dfs(venk21_demos['cat'], data.diffs_category_prevalence, valsets)
num_demo_shift = data.combine_diff_dfs(venk21_demos['num'], data.diffs_numerical_means, valsets)

#### Difference between Nodule sets and Scan sets

In [24]:
display(cat_demo_shift.sort_values(by='diff_allnodules_allscans', ascending=False).head(10))
cat_demo_shift.sort_values(by='diff_allnodules_allscans', ascending=True).head(10)

Unnamed: 0,category,attribute,value,allnodules_freq,allnodules_norm,allscans_freq,allscans_norm,diff_allnodules_allscans
236,nodule,NoduleInUpperLung,0.0,9331,58.0394,2850.0,48.2152,9.8242
222,nodule,Solid,0.0,6391,39.7524,1911.0,32.3296,7.4228
224,nodule,GroundGlassOpacity,0.0,12466,77.5393,4275.0,72.3228,5.2165
232,nodule,SemiSolid,0.0,14605,90.8441,5159.0,87.278,3.5661
226,nodule,Perifissural,0.0,15148,94.2216,5387.0,91.1352,3.0864
220,other,label,0.0,14828,92.2311,5330.0,90.1709,2.0602
211,lungcanc,LC_stage,,14859,92.424,5345.0,90.4246,1.9994
218,other,Emphysema,0.0,10385,64.5954,3755.0,63.5256,1.0698
114,disease,diagdiab,1.0,1632,10.1511,545.0,9.2201,0.931
39,smoke,cigar,0.0,13031,81.0537,4738.0,80.1556,0.8981


Unnamed: 0,category,attribute,value,allnodules_freq,allnodules_norm,allscans_freq,allscans_norm,diff_allnodules_allscans
237,nodule,NoduleInUpperLung,1.0,6746,41.9606,3061.0,51.7848,-9.8242
223,nodule,Solid,1.0,9686,60.2476,4000.0,67.6704,-7.4228
225,nodule,GroundGlassOpacity,1.0,3611,22.4607,1636.0,27.6772,-5.2165
233,nodule,SemiSolid,1.0,1472,9.1559,752.0,12.722,-3.5661
227,nodule,Perifissural,1.0,929,5.7784,524.0,8.8648,-3.0864
221,other,label,1.0,1249,7.7689,581.0,9.8291,-2.0602
203,lungcanc,LC_stage,110.0,625,3.8875,295.0,4.9907,-1.1032
219,other,Emphysema,1.0,5692,35.4046,2156.0,36.4744,-1.0698
113,disease,diagdiab,0.0,14428,89.7431,5362.0,90.7122,-0.9691
125,disease,diaghype,0.0,10489,65.2423,3909.0,66.1309,-0.8886


In [25]:
display(num_demo_shift.sort_values(by='diff_allnodules_allscans', ascending=False).head(10))
num_demo_shift.sort_values(by='diff_allnodules_allscans', ascending=True).head(10)

Unnamed: 0,category,attribute,value,allnodules,allscans,diff_allnodules_allscans
35,smoke,smokeage,25%,15.0,14.0,1.0
22,demo,Age,50%,63.0,62.0,1.0
66,other,NoduleCounts,Median (IQR),2 (2),1 (1),1.0
64,other,NoduleCounts,75%,3.0,2.0,1.0
63,other,NoduleCounts,50%,2.0,1.0,1.0
25,demo,Age,Median (IQR),63 (8),62 (8),1.0
67,other,NoduleCounts,mean,2.555,1.9,0.655
65,other,NoduleCounts,Mean (SD),2.6 (1.6),1.9 (1.3),0.655
3,demo,weight,Mean (SD),182.4 (38.9),181.9 (39.2),0.4928
5,demo,weight,mean,182.4158,181.923,0.4928


Unnamed: 0,category,attribute,value,allnodules,allscans,diff_allnodules_allscans
58,other,Diameter_mm,75%,9.1,10.5,-1.4
61,other,Diameter_mm,mean,8.0177,9.048,-1.0303
59,other,Diameter_mm,Mean (SD),8.0 (5.3),9.0 (6.0),-1.0303
60,other,Diameter_mm,Median (IQR),6 (4),7 (5),-0.9
57,other,Diameter_mm,50%,6.2,7.1,-0.9
34,smoke,pkyr,std,24.3554,25.1504,-0.795
62,other,Diameter_mm,std,5.2795,5.99,-0.7105
30,smoke,pkyr,75%,69.0,69.5,-0.5
56,other,Diameter_mm,25%,4.9,5.3,-0.4
6,demo,weight,std,38.8744,39.2448,-0.3704


## What about men vs. women?

### Training sets

In [26]:
gender_train_sets = {
    "M":sybil_splits['train'].query('Gender == 1'),
    "F":sybil_splits['train'].query('Gender == 2'),
}

In [27]:
cat_demo_gender = data.combine_diff_dfs(sybil_demos['cat'], data.diffs_category_prevalence, gender_train_sets, include_stat=True).query('value != 0')
display(cat_demo_gender.sort_values(by='diff_M_F', ascending=False))
cat_demo_gender.sort_values(by='diff_M_F', ascending=True)

  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  statistic = diff_stat / np.sqrt(var)
  zstat = value / std
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F,stat_M_F,p_M_F
37,demo,Gender,1.0,16572.0,100.0000,0.0,0.0000,100.0000,inf,0.000000e+00
45,smoke,pipe,1.0,6186.0,37.3280,243.0,2.0970,35.2310,88.386383,0.000000e+00
40,smoke,cigar,1.0,5465.0,32.9773,477.0,4.1163,28.8610,70.533064,0.000000e+00
34,demo,Married,1.0,12987.0,78.3671,6295.0,54.3234,24.0437,42.742945,0.000000e+00
28,demo,marital,2.0,12987.0,78.3671,6295.0,54.3234,24.0437,42.742945,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...
129,disease,diagpneu,1.0,3095.0,18.6761,3183.0,27.4681,-8.7920,-17.125074,9.648654e-66
48,smoke,smokelive,1.0,13873.0,83.7135,10752.0,92.7856,-9.0721,-24.243129,7.813098e-130
29,demo,marital,3.0,566.0,3.4154,1489.0,12.8495,-9.4341,-27.634809,4.249472e-168
31,demo,marital,5.0,2148.0,12.9616,3057.0,26.3807,-13.4191,-27.641877,3.494461e-168


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F,stat_M_F,p_M_F
38,demo,Gender,2.0,0.0,0.0000,11588.0,100.0000,-100.0000,-inf,0.000000e+00
31,demo,marital,5.0,2148.0,12.9616,3057.0,26.3807,-13.4191,-27.641877,3.494461e-168
29,demo,marital,3.0,566.0,3.4154,1489.0,12.8495,-9.4341,-27.634809,4.249472e-168
48,smoke,smokelive,1.0,13873.0,83.7135,10752.0,92.7856,-9.0721,-24.243129,7.813098e-130
129,disease,diagpneu,1.0,3095.0,18.6761,3183.0,27.4681,-8.7920,-17.125074,9.648654e-66
...,...,...,...,...,...,...,...,...,...,...
34,demo,Married,1.0,12987.0,78.3671,6295.0,54.3234,24.0437,42.742945,0.000000e+00
28,demo,marital,2.0,12987.0,78.3671,6295.0,54.3234,24.0437,42.742945,0.000000e+00
40,smoke,cigar,1.0,5465.0,32.9773,477.0,4.1163,28.8610,70.533064,0.000000e+00
45,smoke,pipe,1.0,6186.0,37.3280,243.0,2.0970,35.2310,88.386383,0.000000e+00


In [28]:
display(cat_demo_gender.sort_values(by='diff_M_F', ascending=False).query('category == "lungcanc"'))
cat_demo_gender.sort_values(by='diff_M_F', ascending=True).query('category == "lungcanc"')

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F,stat_M_F,p_M_F
190,lungcanc,Squamous_cell_carcinoma,1.0,250.0,1.5086,106.0,0.9147,0.5939,4.583207,5e-06
209,lungcanc,LC_stage,400.0,254.0,1.5327,130.0,1.1219,0.4108,3.006118,0.002646
205,lungcanc,LC_stage,210.0,70.0,0.4224,28.0,0.2416,0.1808,2.66004,0.007813
211,lungcanc,LC_stage,,15737.0,94.9614,10988.0,94.8222,0.1392,0.521339,0.602131
188,lungcanc,Small_cell_carcinoma,1.0,119.0,0.7181,80.0,0.6904,0.0277,0.274115,0.783996
206,lungcanc,LC_stage,220.0,29.0,0.175,18.0,0.1553,0.0197,0.401954,0.687718
198,lungcanc,Adenosquamous_carcinoma,1.0,14.0,0.0845,10.0,0.0863,-0.0018,-0.051304,0.959084
210,lungcanc,LC_stage,900.0,3.0,0.0181,3.0,0.0259,-0.0078,-0.426947,0.669418
202,lungcanc,Unclassified_carcinoma,1.0,92.0,0.5552,66.0,0.5696,-0.0144,-0.158859,0.87378
207,lungcanc,LC_stage,310.0,102.0,0.6155,74.0,0.6386,-0.0231,-0.241224,0.809382


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F,stat_M_F,p_M_F
203,lungcanc,LC_stage,110.0,284.0,1.7137,265.0,2.2868,-0.5731,-3.339789,0.000838
194,lungcanc,Bronchiolo-alveolar_carcinoma,1.0,47.0,0.2836,76.0,0.6559,-0.3723,-4.348065,1.4e-05
192,lungcanc,Adenocarcinoma,1.0,305.0,1.8405,253.0,2.1833,-0.3428,-2.001829,0.045303
208,lungcanc,LC_stage,320.0,32.0,0.1931,32.0,0.2761,-0.083,-1.395978,0.162721
204,lungcanc,LC_stage,120.0,61.0,0.3681,50.0,0.4315,-0.0634,-0.823843,0.410029
196,lungcanc,Large_cell_carcinoma,1.0,18.0,0.1086,17.0,0.1467,-0.0381,-0.869463,0.384594
200,lungcanc,Carcinoid_tumor,1.0,1.0,0.006,4.0,0.0345,-0.0285,-1.558146,0.119199
207,lungcanc,LC_stage,310.0,102.0,0.6155,74.0,0.6386,-0.0231,-0.241224,0.809382
202,lungcanc,Unclassified_carcinoma,1.0,92.0,0.5552,66.0,0.5696,-0.0144,-0.158859,0.87378
210,lungcanc,LC_stage,900.0,3.0,0.0181,3.0,0.0259,-0.0078,-0.426947,0.669418


In [29]:
num_demo_gender = data.combine_diff_dfs(sybil_demos['num'], data.diffs_numerical_means, gender_train_sets, include_stat=True)
display(num_demo_gender.sort_values(by='diff_M_F', ascending=False).head(10))
num_demo_gender.sort_values(by='diff_M_F', ascending=True).head(10)

Unnamed: 0,category,attribute,value,M,F,diff_M_F,stat_M_F,p_M_F
2,demo,weight,75%,220.0,180.0,40.0,85.449791,0.0
4,demo,weight,Median (IQR),195 (45),157 (42),38.0,85.449791,0.0
1,demo,weight,50%,195.0,157.0,38.0,85.449791,0.0
0,demo,weight,25%,175.0,138.0,37.0,85.449791,0.0
3,demo,weight,Mean (SD),198.3 (35.1),162.1 (34.5),36.1275,85.449791,0.0
5,demo,weight,mean,198.2521,162.1246,36.1275,85.449791,0.0
30,smoke,pkyr,75%,72.0,60.0,12.0,30.427542,4.14073e-200
46,smoke,smokeday,Median (IQR),30 (20),20 (10),10.0,26.078155,3.715016e-148
43,smoke,smokeday,50%,30.0,20.0,10.0,26.078155,3.715016e-148
44,smoke,smokeday,75%,40.0,30.0,10.0,26.078155,3.715016e-148


Unnamed: 0,category,attribute,value,M,F,diff_M_F,stat_M_F,p_M_F
38,smoke,smokeage,Mean (SD),16.3 (3.6),17.4 (3.5),-1.0866,-25.186671,1.915804e-138
40,smoke,smokeage,mean,16.2727,17.3593,-1.0866,-25.186671,1.915804e-138
37,smoke,smokeage,75%,18.0,19.0,-1.0,-25.186671,1.915804e-138
35,smoke,smokeage,25%,14.0,15.0,-1.0,-25.186671,1.915804e-138
36,smoke,smokeage,50%,16.0,17.0,-1.0,-25.186671,1.915804e-138
39,smoke,smokeage,Median (IQR),16 (4),17 (4),-1.0,-25.186671,1.915804e-138
20,demo,BMI,std,4.5235,5.5232,-0.9997,12.938133,3.5265779999999996e-38
42,smoke,smokeday,25%,20.0,20.0,0.0,26.078155,3.715016e-148
49,smoke,smokeyr,25%,35.0,35.0,0.0,10.415331,2.34694e-25
21,demo,Age,25%,57.0,57.0,0.0,7.108716,1.199168e-12


### Evaluation sets (Venk21 data)

In [30]:
gender_eval_sets = {
    "M":venk21_data.query('Gender == 1'),
    "F":venk21_data.query('Gender == 2'),
}

In [31]:
cat_gender_eval = data.combine_diff_dfs(venk21_demos['cat'], data.diffs_category_prevalence, gender_eval_sets).query('value != 0')
display(cat_gender_eval.sort_values(by='diff_M_F', ascending=False).head(40))
cat_gender_eval.sort_values(by='diff_M_F', ascending=True).head(40)

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
37,demo,Gender,1.0,5834.0,100.0,0.0,0.0,100.0
45,smoke,pipe,1.0,2133.0,36.5615,88.0,2.0235,34.538
40,smoke,cigar,1.0,1795.0,30.7679,139.0,3.1961,27.5718
34,demo,Married,1.0,4531.0,77.6654,2324.0,53.4376,24.2278
28,demo,marital,2.0,4531.0,77.6654,2324.0,53.4376,24.2278
213,other,wrknomask,1.0,2005.0,34.3675,577.0,13.2674,21.1001
1,demo,Overweight,1.0,4318.0,74.0144,2645.0,60.8186,13.1958
72,work,wrkfarm,1.0,907.0,15.5468,262.0,6.0244,9.5224
223,nodule,Solid,1.0,4074.0,69.832,2626.0,60.3817,9.4503
123,disease,diaghear,1.0,983.0,16.8495,349.0,8.0248,8.8247


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
38,demo,Gender,2.0,0.0,0.0,4349.0,100.0,-100.0
31,demo,marital,5.0,797.0,13.6613,1107.0,25.4541,-11.7928
29,demo,marital,3.0,205.0,3.5139,659.0,15.1529,-11.639
129,disease,diagpneu,1.0,1086.0,18.615,1270.0,29.2021,-10.5871
48,smoke,smokelive,1.0,4887.0,83.7676,4078.0,93.7687,-10.0011
225,nodule,GroundGlassOpacity,1.0,1406.0,24.1001,1453.0,33.41,-9.3099
108,disease,diagchro,1.0,393.0,6.7364,671.0,15.4288,-8.6924
96,disease,diagadas,1.0,205.0,3.5139,461.0,10.6001,-7.0862
16,demo,educat,3.0,1284.0,22.0089,1254.0,28.8342,-6.8253
215,other,PersonalCancerHist,1.0,120.0,2.0569,357.0,8.2088,-6.1519


In [32]:
display(cat_gender_eval.query('category == "nodule"').sort_values(by='diff_M_F', ascending=False).head(40))
cat_gender_eval.query('category == "nodule"').sort_values(by='diff_M_F', ascending=True).head(40)

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
223,nodule,Solid,1.0,4074.0,69.832,2626.0,60.3817,9.4503
227,nodule,Perifissural,1.0,564.0,9.6675,261.0,6.0014,3.6661
235,nodule,Calcified,1.0,5.0,0.0857,3.0,0.069,0.0167
239,nodule,Spiculation,1.0,10.0,0.1714,10.0,0.2299,-0.0585
229,nodule,NonSolid,1.0,66.0,1.1313,58.0,1.3336,-0.2023
231,nodule,PartSolid,1.0,101.0,1.7312,139.0,3.1961,-1.4649
233,nodule,SemiSolid,1.0,666.0,11.4158,640.0,14.716,-3.3002
237,nodule,NoduleInUpperLung,1.0,2919.0,50.0343,2343.0,53.8745,-3.8402
225,nodule,GroundGlassOpacity,1.0,1406.0,24.1001,1453.0,33.41,-9.3099


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
225,nodule,GroundGlassOpacity,1.0,1406.0,24.1001,1453.0,33.41,-9.3099
237,nodule,NoduleInUpperLung,1.0,2919.0,50.0343,2343.0,53.8745,-3.8402
233,nodule,SemiSolid,1.0,666.0,11.4158,640.0,14.716,-3.3002
231,nodule,PartSolid,1.0,101.0,1.7312,139.0,3.1961,-1.4649
229,nodule,NonSolid,1.0,66.0,1.1313,58.0,1.3336,-0.2023
239,nodule,Spiculation,1.0,10.0,0.1714,10.0,0.2299,-0.0585
235,nodule,Calcified,1.0,5.0,0.0857,3.0,0.069,0.0167
227,nodule,Perifissural,1.0,564.0,9.6675,261.0,6.0014,3.6661
223,nodule,Solid,1.0,4074.0,69.832,2626.0,60.3817,9.4503


In [33]:
display(cat_gender_eval.query('attribute == "LC_stage"').sort_values(by='diff_M_F', ascending=False).head(40))
cat_gender_eval.query('attribute == "LC_stage"').sort_values(by='diff_M_F', ascending=True).head(40)

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
209,lungcanc,LC_stage,400.0,125.0,2.1426,44.0,1.0117,1.1309
205,lungcanc,LC_stage,210.0,65.0,1.1142,19.0,0.4369,0.6773
208,lungcanc,LC_stage,320.0,20.0,0.3428,11.0,0.2529,0.0899
207,lungcanc,LC_stage,310.0,78.0,1.337,58.0,1.3336,0.0034
210,lungcanc,LC_stage,900.0,1.0,0.0171,3.0,0.069,-0.0519
204,lungcanc,LC_stage,120.0,61.0,1.0456,51.0,1.1727,-0.1271
206,lungcanc,LC_stage,220.0,18.0,0.3085,22.0,0.5059,-0.1974
211,lungcanc,LC_stage,,5154.0,88.3442,3857.0,88.6871,-0.3429
203,lungcanc,LC_stage,110.0,312.0,5.348,284.0,6.5302,-1.1822


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
203,lungcanc,LC_stage,110.0,312.0,5.348,284.0,6.5302,-1.1822
211,lungcanc,LC_stage,,5154.0,88.3442,3857.0,88.6871,-0.3429
206,lungcanc,LC_stage,220.0,18.0,0.3085,22.0,0.5059,-0.1974
204,lungcanc,LC_stage,120.0,61.0,1.0456,51.0,1.1727,-0.1271
210,lungcanc,LC_stage,900.0,1.0,0.0171,3.0,0.069,-0.0519
207,lungcanc,LC_stage,310.0,78.0,1.337,58.0,1.3336,0.0034
208,lungcanc,LC_stage,320.0,20.0,0.3428,11.0,0.2529,0.0899
205,lungcanc,LC_stage,210.0,65.0,1.1142,19.0,0.4369,0.6773
209,lungcanc,LC_stage,400.0,125.0,2.1426,44.0,1.0117,1.1309


In [34]:
num_gender_eval = data.combine_diff_dfs(venk21_demos['num'], data.diffs_numerical_means, gender_eval_sets)
display(num_gender_eval.sort_values(by='diff_M_F', ascending=False).head(10))
num_gender_eval.sort_values(by='diff_M_F', ascending=True).head(10)

Unnamed: 0,category,attribute,value,M,F,diff_M_F
2,demo,weight,75%,217.5,180.0,37.5
4,demo,weight,Median (IQR),193 (45),157 (43),36.0
1,demo,weight,50%,193.0,157.0,36.0
3,demo,weight,Mean (SD),196.8 (34.9),161.7 (34.9),35.0666
5,demo,weight,mean,196.796,161.7294,35.0666
0,demo,weight,25%,172.0,137.0,35.0
30,smoke,pkyr,75%,76.0,61.25,14.75
46,smoke,smokeday,Median (IQR),30 (20),20 (10),10.0
44,smoke,smokeday,75%,40.0,30.0,10.0
43,smoke,smokeday,50%,30.0,20.0,10.0


Unnamed: 0,category,attribute,value,M,F,diff_M_F
20,demo,BMI,std,4.4235,5.6322,-1.2087
38,smoke,smokeage,Mean (SD),16.1 (3.5),17.2 (3.5),-1.0729
40,smoke,smokeage,mean,16.1409,17.2138,-1.0729
35,smoke,smokeage,25%,14.0,15.0,-1.0
36,smoke,smokeage,50%,16.0,17.0,-1.0
37,smoke,smokeage,75%,18.0,19.0,-1.0
39,smoke,smokeage,Median (IQR),16 (4),17 (4),-1.0
41,smoke,smokeage,std,3.4612,3.5421,-0.0809
64,other,NoduleCounts,Mean (SD),1.9 (1.3),1.9 (1.2),-0.0038
66,other,NoduleCounts,mean,1.8819,1.8857,-0.0038


## What about BMI?

### Sybil training set

In [35]:
sybil_train_over = sybil_splits['train'].query('Overweight == 1')
sybil_train_normal = sybil_splits['train'].query('Overweight == 0')

overweight_train_sets = {
    "over":sybil_train_over,
    "normal":sybil_train_normal,
}

In [36]:
cat_demo_overweight = data.combine_diff_dfs(sybil_demos['cat'], data.diffs_category_prevalence, overweight_train_sets).query('value != 0')
display(cat_demo_overweight.sort_values(by='diff_over_normal', ascending=False).head(40))
cat_demo_overweight.sort_values(by='diff_over_normal', ascending=True).head(40)

Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
1,demo,Overweight,1.0,19898.0,100.0,0.0,0.0,100.0
37,demo,Gender,1.0,12764.0,64.1472,3808.0,46.0905,18.0567
126,disease,diaghype,1.0,7824.0,39.3205,2032.0,24.5945,14.726
114,disease,diagdiab,1.0,2441.0,12.2676,340.0,4.1152,8.1524
34,demo,Married,1.0,14090.0,70.8111,5192.0,62.8419,7.9692
28,demo,marital,2.0,14090.0,70.8111,5192.0,62.8419,7.9692
45,smoke,pipe,1.0,4998.0,25.1181,1431.0,17.3203,7.7978
213,other,wrknomask,1.0,5338.0,26.8268,1643.0,19.8862,6.9406
40,smoke,cigar,1.0,4601.0,23.1229,1341.0,16.2309,6.892
123,disease,diaghear,1.0,2893.0,14.5391,761.0,9.2108,5.3283


Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
43,smoke,cigsmok,1.0,8339.0,41.9087,5047.0,61.0869,-19.1782
38,demo,Gender,2.0,7134.0,35.8528,4454.0,53.9095,-18.0567
31,demo,marital,5.0,3347.0,16.8208,1858.0,22.4885,-5.6677
111,disease,diagcopd,1.0,925.0,4.6487,552.0,6.6812,-2.0325
117,disease,diagemph,1.0,1533.0,7.7043,784.0,9.4892,-1.7849
48,smoke,smokelive,1.0,17307.0,86.9786,7318.0,88.5742,-1.5956
20,demo,educat,7.0,2732.0,13.73,1244.0,15.0569,-1.3269
27,demo,marital,1.0,835.0,4.1964,439.0,5.3135,-1.1171
192,lungcanc,Adenocarcinoma,1.0,333.0,1.6735,225.0,2.7233,-1.0498
4,demo,race,3.0,350.0,1.759,225.0,2.7233,-0.9643


In [37]:
display(cat_demo_overweight.sort_values(by='diff_over_normal', ascending=False).query('category == "lungcanc"'))
cat_demo_overweight.sort_values(by='diff_over_normal', ascending=True).query('category == "lungcanc"')

Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
211,lungcanc,LC_stage,,18989.0,95.4317,7736.0,93.6335,1.7982
198,lungcanc,Adenosquamous_carcinoma,1.0,20.0,0.1005,4.0,0.0484,0.0521
208,lungcanc,LC_stage,320.0,46.0,0.2312,18.0,0.2179,0.0133
200,lungcanc,Carcinoid_tumor,1.0,4.0,0.0201,1.0,0.0121,0.008
210,lungcanc,LC_stage,900.0,3.0,0.0151,3.0,0.0363,-0.0212
206,lungcanc,LC_stage,220.0,27.0,0.1357,20.0,0.2421,-0.1064
205,lungcanc,LC_stage,210.0,63.0,0.3166,35.0,0.4236,-0.107
196,lungcanc,Large_cell_carcinoma,1.0,18.0,0.0905,17.0,0.2058,-0.1153
207,lungcanc,LC_stage,310.0,117.0,0.588,59.0,0.7141,-0.1261
204,lungcanc,LC_stage,120.0,70.0,0.3518,41.0,0.4962,-0.1444


Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
192,lungcanc,Adenocarcinoma,1.0,333.0,1.6735,225.0,2.7233,-1.0498
203,lungcanc,LC_stage,110.0,342.0,1.7188,207.0,2.5054,-0.7866
209,lungcanc,LC_stage,400.0,241.0,1.2112,143.0,1.7308,-0.5196
202,lungcanc,Unclassified_carcinoma,1.0,98.0,0.4925,60.0,0.7262,-0.2337
194,lungcanc,Bronchiolo-alveolar_carcinoma,1.0,75.0,0.3769,48.0,0.581,-0.2041
188,lungcanc,Small_cell_carcinoma,1.0,131.0,0.6584,68.0,0.823,-0.1646
190,lungcanc,Squamous_cell_carcinoma,1.0,243.0,1.2212,113.0,1.3677,-0.1465
204,lungcanc,LC_stage,120.0,70.0,0.3518,41.0,0.4962,-0.1444
207,lungcanc,LC_stage,310.0,117.0,0.588,59.0,0.7141,-0.1261
196,lungcanc,Large_cell_carcinoma,1.0,18.0,0.0905,17.0,0.2058,-0.1153


In [38]:
num_demo_overweight = data.combine_diff_dfs(sybil_demos['num'], data.diffs_numerical_means, overweight_train_sets)
display(num_demo_overweight.sort_values(by='diff_over_normal', ascending=False).head(10))
num_demo_overweight.sort_values(by='diff_over_normal', ascending=True).head(10)

Unnamed: 0,category,attribute,value,over,normal,diff_over_normal
2,demo,weight,75%,220.0,160.0,60.0
3,demo,weight,Mean (SD),198.7 (34.2),146.2 (21.4),52.4938
5,demo,weight,mean,198.7192,146.2254,52.4938
4,demo,weight,Median (IQR),195 (45),145 (30),50.0
1,demo,weight,50%,195.0,145.0,50.0
0,demo,weight,25%,175.0,130.0,45.0
6,demo,weight,std,34.2041,21.3641,12.84
43,smoke,smokeday,50%,30.0,20.0,10.0
46,smoke,smokeday,Median (IQR),30 (20),20 (10),10.0
44,smoke,smokeday,75%,40.0,30.0,10.0


Unnamed: 0,category,attribute,value,over,normal,diff_over_normal
51,smoke,smokeyr,75%,44.0,46.0,-2.0
52,smoke,smokeyr,Mean (SD),39.5 (7.4),41.1 (7.3),-1.5877
54,smoke,smokeyr,mean,39.4865,41.0742,-1.5877
53,smoke,smokeyr,Median (IQR),40 (9),41 (10),-1.0
50,smoke,smokeyr,50%,40.0,41.0,-1.0
49,smoke,smokeyr,25%,35.0,36.0,-1.0
39,smoke,smokeage,Median (IQR),16 (3),17 (4),-1.0
37,smoke,smokeage,75%,18.0,19.0,-1.0
36,smoke,smokeage,50%,16.0,17.0,-1.0
38,smoke,smokeage,Mean (SD),16.6 (3.6),17.0 (3.6),-0.3418


### Evaluation set (venk21 Data)

In [39]:
venk21_data_over = venk21_data.query('Overweight == 1')
venk21_data_normal = venk21_data.query('Overweight == 0')

overweight_eval_sets = {
    "over":venk21_data_over,
    "normal":venk21_data_normal,
}

In [40]:
cat_eval_overweight = data.combine_diff_dfs(venk21_demos['cat'], data.diffs_category_prevalence, overweight_eval_sets).query('value != 0')
display(cat_eval_overweight.sort_values(by='diff_over_normal', ascending=False).head(40))
cat_eval_overweight.sort_values(by='diff_over_normal', ascending=True).head(40)

Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
1,demo,Overweight,1.0,6963.0,100.0,0.0,0.0,100.0
37,demo,Gender,1.0,4318.0,62.0135,1516.0,47.0807,14.9328
126,disease,diaghype,1.0,2749.0,39.4801,813.0,25.2484,14.2317
28,demo,marital,2.0,4878.0,70.056,1977.0,61.3975,8.6585
34,demo,Married,1.0,4878.0,70.056,1977.0,61.3975,8.6585
114,disease,diagdiab,1.0,867.0,12.4515,124.0,3.8509,8.6006
45,smoke,pipe,1.0,1679.0,24.1132,542.0,16.8323,7.2809
40,smoke,cigar,1.0,1473.0,21.1547,461.0,14.3168,6.8379
123,disease,diaghear,1.0,1051.0,15.0941,281.0,8.7267,6.3674
213,other,wrknomask,1.0,1835.0,26.3536,747.0,23.1988,3.1548


Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
43,smoke,cigsmok,1.0,3084.0,44.2913,2004.0,62.236,-17.9447
219,other,Emphysema,1.0,2195.0,31.5238,1502.0,46.646,-15.1222
38,demo,Gender,2.0,2645.0,37.9865,1704.0,52.9193,-14.9328
31,demo,marital,5.0,1176.0,16.8893,728.0,22.6087,-5.7194
237,nodule,NoduleInUpperLung,1.0,3499.0,50.2513,1763.0,54.7516,-4.5003
129,disease,diagpneu,1.0,1522.0,21.8584,834.0,25.9006,-4.0422
117,disease,diagemph,1.0,629.0,9.0335,393.0,12.205,-3.1715
111,disease,diagcopd,1.0,397.0,5.7016,279.0,8.6646,-2.963
233,nodule,SemiSolid,1.0,831.0,11.9345,475.0,14.7516,-2.8171
221,other,label,1.0,768.0,11.0297,431.0,13.3851,-2.3554


In [41]:
display(cat_eval_overweight.sort_values(by='diff_over_normal', ascending=False).query('category == "nodule"'))
cat_eval_overweight.sort_values(by='diff_over_normal', ascending=True).query('category == "nodule"')

Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
227,nodule,Perifissural,1.0,625.0,8.976,200.0,6.2112,2.7648
225,nodule,GroundGlassOpacity,1.0,1989.0,28.5653,870.0,27.0186,1.5467
229,nodule,NonSolid,1.0,89.0,1.2782,35.0,1.087,0.1912
235,nodule,Calcified,1.0,3.0,0.0431,5.0,0.1553,-0.1122
223,nodule,Solid,1.0,4577.0,65.7332,2123.0,65.9317,-0.1985
239,nodule,Spiculation,1.0,7.0,0.1005,13.0,0.4037,-0.3032
231,nodule,PartSolid,1.0,138.0,1.9819,102.0,3.1677,-1.1858
233,nodule,SemiSolid,1.0,831.0,11.9345,475.0,14.7516,-2.8171
237,nodule,NoduleInUpperLung,1.0,3499.0,50.2513,1763.0,54.7516,-4.5003


Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
237,nodule,NoduleInUpperLung,1.0,3499.0,50.2513,1763.0,54.7516,-4.5003
233,nodule,SemiSolid,1.0,831.0,11.9345,475.0,14.7516,-2.8171
231,nodule,PartSolid,1.0,138.0,1.9819,102.0,3.1677,-1.1858
239,nodule,Spiculation,1.0,7.0,0.1005,13.0,0.4037,-0.3032
223,nodule,Solid,1.0,4577.0,65.7332,2123.0,65.9317,-0.1985
235,nodule,Calcified,1.0,3.0,0.0431,5.0,0.1553,-0.1122
229,nodule,NonSolid,1.0,89.0,1.2782,35.0,1.087,0.1912
225,nodule,GroundGlassOpacity,1.0,1989.0,28.5653,870.0,27.0186,1.5467
227,nodule,Perifissural,1.0,625.0,8.976,200.0,6.2112,2.7648


In [42]:
num_eval_overweight = data.combine_diff_dfs(venk21_demos['num'], data.diffs_numerical_means, overweight_eval_sets)
display(num_eval_overweight.sort_values(by='diff_over_normal', ascending=False).head(10))
num_eval_overweight.sort_values(by='diff_over_normal', ascending=True).head(10)

Unnamed: 0,category,attribute,value,over,normal,diff_over_normal
2,demo,weight,75%,218.0,160.0,58.0
3,demo,weight,Mean (SD),198.4 (33.6),145.8 (21.7),52.6235
5,demo,weight,mean,198.3737,145.7502,52.6235
1,demo,weight,50%,195.0,145.0,50.0
4,demo,weight,Median (IQR),195 (43),145 (30),50.0
0,demo,weight,25%,175.0,130.0,45.0
6,demo,weight,std,33.5956,21.7271,11.8685
44,smoke,smokeday,75%,40.0,30.0,10.0
46,smoke,smokeday,Median (IQR),28 (20),20 (10),8.0
43,smoke,smokeday,50%,28.0,20.0,8.0


Unnamed: 0,category,attribute,value,over,normal,diff_over_normal
51,smoke,smokeyr,75%,45.0,47.0,-2.0
50,smoke,smokeyr,50%,40.0,42.0,-2.0
49,smoke,smokeyr,25%,35.0,37.0,-2.0
53,smoke,smokeyr,Median (IQR),40 (10),42 (10),-2.0
54,smoke,smokeyr,mean,40.4237,42.2637,-1.84
52,smoke,smokeyr,Mean (SD),40.4 (7.4),42.3 (7.2),-1.84
62,other,Diameter_mm,std,5.7077,6.7521,-1.0444
25,demo,Age,Median (IQR),62 (8),63 (8),-1.0
35,smoke,smokeage,25%,14.0,15.0,-1.0
37,smoke,smokeage,75%,18.0,19.0,-1.0


## What about race?

### venk21 data

In [43]:
venk21_data_white = venk21_data.query('race == 1')
venk21_data_black = venk21_data.query('race == 2')

race_venk21_sets = {
    "white":venk21_data_white,
    "black":venk21_data_black,
}

In [44]:
cat_race_venk21 = data.combine_diff_dfs(venk21_demos['cat'], data.diffs_category_prevalence, race_venk21_sets, include_stat=True).query('value != 0')
display(cat_race_venk21.sort_values(by='diff_white_black', ascending=False).head(40))
cat_race_venk21.sort_values(by='diff_white_black', ascending=True).head(40)

  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  statistic = diff_stat / np.sqrt(var)
  zstat = value / std
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  statistic = diff_stat / np.sqrt(var)
  zstat = value / std
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  ratio = p1 / p2
  odds_ratio = p1 / (1 - p1) / p2 * (1 - 

Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
2,demo,race,1.0,9461.0,100.0,0.0,0.0,100.0,inf,0.0
19,demo,WhiteOrBlack,1.0,9461.0,100.0,0.0,0.0,100.0,inf,0.0
18,demo,NonHispanicWhite,1.0,9311.0,98.4145,0.0,0.0,98.4145,766.339224,0.0
28,demo,Married,1.0,6464.0,68.3226,127.0,37.574,30.7486,11.484578,1.577044e-30
22,demo,marital,2.0,6464.0,68.3226,127.0,37.574,30.7486,11.484578,1.577044e-30
39,smoke,pipe,1.0,2147.0,22.6932,27.0,7.9882,14.705,9.572134,1.047217e-21
31,demo,Gender,1.0,5432.0,57.4146,149.0,44.0828,13.3318,4.851527,1.225147e-06
62,work,wrkfarm,1.0,1125.0,11.8909,9.0,2.6627,9.2282,9.851018,6.785323000000001e-23
11,demo,educat,3.0,2410.0,25.473,61.0,18.0473,7.4257,3.471116,0.0005182993
34,smoke,cigar,1.0,1838.0,19.4271,41.0,12.1302,7.2969,4.005365,6.192181e-05


Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
3,demo,race,2.0,0.0,0.0,338.0,100.0,-100.0,-inf,0.0
20,demo,WhiteOrBlack,2.0,0.0,0.0,338.0,100.0,-100.0,-inf,0.0
109,disease,diaghype,1.0,3203.0,33.8548,195.0,57.6923,-23.8375,-8.728667,2.5769140000000002e-18
37,smoke,cigsmok,1.0,4665.0,49.3077,232.0,68.6391,-19.3314,-7.506103,6.091341e-14
8,demo,Unfinished_ed,1.0,2547.0,26.921,152.0,44.9704,-18.0494,-6.577752,4.776144e-11
25,demo,marital,5.0,1715.0,18.127,114.0,33.7278,-15.6008,-5.995891,2.023732e-09
32,demo,Gender,2.0,4029.0,42.5854,189.0,55.9172,-13.3318,-4.851527,1.225147e-06
97,disease,diagdiab,1.0,831.0,8.7834,67.0,19.8225,-11.0391,-5.045557,4.522031e-07
10,demo,educat,2.0,454.0,4.7986,48.0,14.2012,-9.4026,-4.919388,8.681501e-07
13,demo,educat,5.0,2093.0,22.1224,104.0,30.7692,-8.6468,-3.395645,0.0006846721


In [45]:
display(cat_race_venk21.query('category == "nodule"').sort_values(by='diff_white_black', ascending=False).head(40))
cat_race_venk21.query('category == "nodule"').sort_values(by='diff_white_black', ascending=True).head(40)

Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
210,nodule,Perifissural,1.0,779.0,8.2338,18.0,5.3254,2.9084,2.319991,0.020341
206,nodule,Solid,1.0,6275.0,66.3249,218.0,64.497,1.8279,0.690341,0.48998
214,nodule,PartSolid,1.0,226.0,2.3888,5.0,1.4793,0.9095,1.34705,0.177964
212,nodule,NonSolid,1.0,120.0,1.2684,3.0,0.8876,0.3808,0.728127,0.466536
222,nodule,Spiculation,1.0,19.0,0.2008,0.0,0.0,0.2008,4.363282,1.3e-05
218,nodule,Calcified,1.0,8.0,0.0846,0.0,0.0,0.0846,2.829624,0.00466
220,nodule,NoduleInUpperLung,1.0,4879.0,51.5696,179.0,52.9586,-1.389,-0.502695,0.615179
208,nodule,GroundGlassOpacity,1.0,2632.0,27.8195,99.0,29.2899,-1.4704,-0.584011,0.559213
216,nodule,SemiSolid,1.0,1194.0,12.6202,50.0,14.7929,-2.1727,-1.107909,0.267901


Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
216,nodule,SemiSolid,1.0,1194.0,12.6202,50.0,14.7929,-2.1727,-1.107909,0.267901
208,nodule,GroundGlassOpacity,1.0,2632.0,27.8195,99.0,29.2899,-1.4704,-0.584011,0.559213
220,nodule,NoduleInUpperLung,1.0,4879.0,51.5696,179.0,52.9586,-1.389,-0.502695,0.615179
218,nodule,Calcified,1.0,8.0,0.0846,0.0,0.0,0.0846,2.829624,0.00466
222,nodule,Spiculation,1.0,19.0,0.2008,0.0,0.0,0.2008,4.363282,1.3e-05
212,nodule,NonSolid,1.0,120.0,1.2684,3.0,0.8876,0.3808,0.728127,0.466536
214,nodule,PartSolid,1.0,226.0,2.3888,5.0,1.4793,0.9095,1.34705,0.177964
206,nodule,Solid,1.0,6275.0,66.3249,218.0,64.497,1.8279,0.690341,0.48998
210,nodule,Perifissural,1.0,779.0,8.2338,18.0,5.3254,2.9084,2.319991,0.020341


In [46]:
display(cat_race_venk21.query('attribute == "LC_stage"').sort_values(by='diff_white_black', ascending=False).head(40))
cat_race_venk21.query('attribute == "LC_stage"').sort_values(by='diff_white_black', ascending=True).head(40)

Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
194,lungcanc,LC_stage,,8387.0,88.6481,282.0,83.432,5.2161,2.546442,0.010883
193,lungcanc,LC_stage,900.0,4.0,0.0423,0.0,0.0,0.0423,2.000423,0.045455
187,lungcanc,LC_stage,120.0,106.0,1.1204,4.0,1.1834,-0.063,-0.10541,0.916051
188,lungcanc,LC_stage,210.0,77.0,0.8139,3.0,0.8876,-0.0737,-0.142165,0.88695
189,lungcanc,LC_stage,220.0,36.0,0.3805,2.0,0.5917,-0.2112,-0.500559,0.616682
192,lungcanc,LC_stage,400.0,157.0,1.6594,7.0,2.071,-0.4116,-0.523833,0.600395
190,lungcanc,LC_stage,310.0,119.0,1.2578,6.0,1.7751,-0.5173,-0.711313,0.47689
191,lungcanc,LC_stage,320.0,26.0,0.2748,5.0,1.4793,-1.2045,-1.828154,0.067526
186,lungcanc,LC_stage,110.0,549.0,5.8028,29.0,8.5799,-2.7771,-1.800737,0.071744


Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
186,lungcanc,LC_stage,110.0,549.0,5.8028,29.0,8.5799,-2.7771,-1.800737,0.071744
191,lungcanc,LC_stage,320.0,26.0,0.2748,5.0,1.4793,-1.2045,-1.828154,0.067526
190,lungcanc,LC_stage,310.0,119.0,1.2578,6.0,1.7751,-0.5173,-0.711313,0.47689
192,lungcanc,LC_stage,400.0,157.0,1.6594,7.0,2.071,-0.4116,-0.523833,0.600395
189,lungcanc,LC_stage,220.0,36.0,0.3805,2.0,0.5917,-0.2112,-0.500559,0.616682
188,lungcanc,LC_stage,210.0,77.0,0.8139,3.0,0.8876,-0.0737,-0.142165,0.88695
187,lungcanc,LC_stage,120.0,106.0,1.1204,4.0,1.1834,-0.063,-0.10541,0.916051
193,lungcanc,LC_stage,900.0,4.0,0.0423,0.0,0.0,0.0423,2.000423,0.045455
194,lungcanc,LC_stage,,8387.0,88.6481,282.0,83.432,5.2161,2.546442,0.010883


In [47]:
num_race_venk21 = data.combine_diff_dfs(venk21_demos['num'], data.diffs_numerical_means, race_venk21_sets, include_stat=True)
display(num_race_venk21.sort_values(by='diff_white_black', ascending=False).head(10))
num_race_venk21.sort_values(by='diff_white_black', ascending=True).head(10)

Unnamed: 0,category,attribute,value,white,black,diff_white_black,stat_white_black,p_white_black
30,smoke,pkyr,75%,70.0,54.0,16.0,6.577601,5.02514e-11
44,smoke,smokeday,75%,35.0,24.0,11.0,8.534595,1.6152390000000002e-17
33,smoke,pkyr,mean,58.2982,49.4726,8.8256,6.577601,5.02514e-11
31,smoke,pkyr,Mean (SD),58.3 (24.4),49.5 (20.4),8.8256,6.577601,5.02514e-11
32,smoke,pkyr,Median (IQR),51 (29),44 (17),7.0,6.577601,5.02514e-11
29,smoke,pkyr,50%,51.0,44.0,7.0,6.577601,5.02514e-11
47,smoke,smokeday,mean,28.7498,23.3994,5.3504,8.534595,1.6152390000000002e-17
45,smoke,smokeday,Mean (SD),28.7 (11.4),23.4 (8.4),5.3504,8.534595,1.6152390000000002e-17
46,smoke,smokeday,Median (IQR),25 (15),20 (4),5.0,8.534595,1.6152390000000002e-17
43,smoke,smokeday,50%,25.0,20.0,5.0,8.534595,1.6152390000000002e-17


Unnamed: 0,category,attribute,value,white,black,diff_white_black,stat_white_black,p_white_black
49,smoke,smokeyr,25%,36.0,38.0,-2.0,-3.247603,0.001168
37,smoke,smokeage,75%,18.0,19.75,-1.75,-1.577698,0.114667
20,demo,BMI,std,4.9116,6.4535,-1.5419,-2.577897,0.009955
52,smoke,smokeyr,Mean (SD),41.0 (7.3),42.3 (6.7),-1.3173,-3.247603,0.001168
54,smoke,smokeyr,mean,40.9904,42.3077,-1.3173,-3.247603,0.001168
16,demo,BMI,75%,30.3765,31.6659,-1.2894,-2.577897,0.009955
58,other,Diameter_mm,75%,10.9,12.0,-1.1,-1.900713,0.057369
39,smoke,smokeage,Median (IQR),16 (3),17 (5),-1.0,-1.577698,0.114667
53,smoke,smokeyr,Median (IQR),41 (10),42 (8),-1.0,-3.247603,0.001168
36,smoke,smokeage,50%,16.0,17.0,-1.0,-1.577698,0.114667
