In [5]:
import pandas as pd
import os
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from IPython.display import display, Markdown

from utilities import data
from utilities.info import *

%matplotlib inline

FILE_DIR = f"{TEAMS_DIR}/files"
FILE_DIR

'C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/files'

## Grab Data for Each Model

In [6]:
# Predictions of all models on NLST together with demographic columns.
kiran_nodule = pd.read_csv(f"{FILE_DIR}/nlst_allmodels_demos.csv")

# Column groupings (numerical / categorical, by topic) for the file above.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{FILE_DIR}/nlst_democols.json') as json_data:
    kiran_demos_original = json.load(json_data)

# Prepare the scan-level table; the third return value is not used here.
kiran_data, kiran_demos, _ = data.prep_nlst_preds(
    kiran_nodule,
    kiran_demos_original,
    scanlevel=True,
    sybil=False,
    tijmen=False,
    bin_num=False,
)

# Track the outcome label as a categorical "other" column as well.
# The membership check keeps this cell idempotent when it is re-run
# (NOTE(review): matters if prep_nlst_preds hands back a shared dict -- confirm).
if 'label' not in kiran_demos['cat']['other']:
    kiran_demos['cat']['other'].append('label')
kiran_demos

{'num': {'demo': ['weight', 'height', 'BMI', 'Age'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr'],
  'other': ['Diameter_mm', 'NoduleCounts']},
 'cat': {'demo': ['Overweight',
   'race',
   'ethnic',
   'Unfinished_ed',
   'educat',
   'NonHispanicWhite',
   'WhiteOrBlack',
   'marital',
   'Married',
   'HighSchoolPlus',
   'Gender'],
  'smoke': ['cigar', 'cigsmok', 'pipe', 'smokelive', 'smokework'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'diagtube'],
  'canchist': ['cancblad',
   'cancbrea',
   'canccerv',
   'canccolo',
   'cancesop',
   'canckidn',
   'canclary',
   'canclun

In [7]:
# Sybil predictions on NLST together with demographic columns.
sybil_data = pd.read_csv(f"{FILE_DIR}/nlst_sybil_demos.csv")

# Column groupings for the Sybil table.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{FILE_DIR}/nlst_sybil_democols.json') as json_data:
    sybil_demos = json.load(json_data)

sybil_demos

{'num': {'demo': ['weight', 'height', 'BMI', 'Age'],
  'smoke': ['pkyr', 'smokeage', 'smokeday', 'smokeyr']},
 'cat': {'demo': ['Overweight',
   'race',
   'ethnic',
   'Unfinished_ed',
   'educat',
   'NonHispanicWhite',
   'WhiteOrBlack',
   'marital',
   'Married',
   'HighSchoolPlus',
   'Gender'],
  'smoke': ['cigar', 'cigsmok', 'pipe', 'smokelive', 'smokework'],
  'work': ['wrkasbe',
   'wrkbaki',
   'wrkbutc',
   'wrkchem',
   'wrkcoal',
   'wrkcott',
   'wrkfarm',
   'wrkfire',
   'wrkflou',
   'wrkfoun',
   'wrkhard',
   'wrkpain',
   'wrksand',
   'wrkweld'],
  'disease': ['diagadas',
   'diagasbe',
   'diagbron',
   'diagchas',
   'diagchro',
   'diagcopd',
   'diagdiab',
   'diagemph',
   'diagfibr',
   'diaghear',
   'diaghype',
   'diagpneu',
   'diagsarc',
   'diagsili',
   'diagstro',
   'diagtube'],
  'canchist': ['cancblad',
   'cancbrea',
   'canccerv',
   'canccolo',
   'cancesop',
   'canckidn',
   'canclary',
   'canclung',
   'cancnasa',
   'cancoral',
   'cancpa

Get data for Tijmen's linear layer.

In [8]:
# Rows with a missing 'Thijmen_mean' go to the train subset, rows with a value go to val.
has_tijmen_score = kiran_data['Thijmen_mean'].notna()

tijmen_train = kiran_data[~has_tijmen_score]
print("train:", len(tijmen_train), "Scans")

tijmen_val = kiran_data[has_tijmen_score]
print("val:", len(tijmen_val), "Scans")

train: 8121 Scans
val: 2062 Scans


In [9]:
# Slice the Sybil table by its 'split' column and report the size of each part.
sybil_splits = {}
for split_name in ('train', 'dev', 'test'):
    sybil_splits[split_name] = sybil_data.query(f'split == "{split_name}"')
    print(split_name, len(sybil_splits[split_name]), 'Scans')

train 28160 Scans
dev 6838 Scans
test 6282 Scans


## Model Training Sets

In [10]:
# Data frames compared in this section, keyed by model name.
training_sets = dict(
    Kiran=kiran_data,
    Tijmen=tijmen_train,
    Sybil=sybil_splits['train'],
)

### Categorical columns

In [11]:
# Per-set prevalence of each categorical value plus pairwise differences.
# Rows with a missing 'value' or with value == 0 are dropped.
cat_demo_splits = (
    data.combine_diff_dfs(sybil_demos['cat'], data.diffs_category_prevalence, training_sets)
    .dropna(subset='value', axis=0)
    .query('value != 0')
)

# Ten largest Kiran-minus-Sybil differences first, then the ten smallest.
for ascending in (False, True):
    display(cat_demo_splits.sort_values(by='diff_Kiran_Sybil', ascending=ascending).head(10))

Unnamed: 0,category,attribute,value,Kiran_freq,Kiran_norm,Tijmen_freq,Tijmen_norm,Sybil_freq,Sybil_norm,diff_Kiran_Tijmen,diff_Kiran_Sybil,diff_Tijmen_Sybil
204,lungcanc,LC_stage,110.0,596.0,5.8529,458.0,5.6397,549.0,1.9496,0.2132,3.9033,3.6901
218,other,FamilyHistoryLungCa,1.0,2622.0,25.7488,2110.0,25.982,6232.0,22.1307,-0.2332,3.6181,3.8513
193,lungcanc,Adenocarcinoma,1.0,539.0,5.2931,410.0,5.0486,558.0,1.9815,0.2445,3.3116,3.0671
43,smoke,cigsmok,1.0,5088.0,49.9656,4028.0,49.5998,13386.0,47.5355,0.3658,2.4301,2.0643
117,disease,diagemph,1.0,1022.0,10.0363,817.0,10.0603,2317.0,8.228,-0.024,1.8083,1.8323
38,demo,Gender,2.0,4349.0,42.7084,3466.0,42.6795,11588.0,41.1506,0.0289,1.5578,1.5289
16,demo,educat,3.0,2538.0,24.9239,2001.0,24.6398,6581.0,23.37,0.2841,1.5539,1.2698
111,disease,diagcopd,1.0,676.0,6.6385,533.0,6.5632,1477.0,5.245,0.0753,1.3935,1.3182
195,lungcanc,Bronchiolo-alveolar_carcinoma,1.0,180.0,1.7677,161.0,1.9825,123.0,0.4368,-0.2148,1.3309,1.5457
29,demo,marital,3.0,864.0,8.4847,659.0,8.1148,2055.0,7.2976,0.3699,1.1871,0.8172


Unnamed: 0,category,attribute,value,Kiran_freq,Kiran_norm,Tijmen_freq,Tijmen_norm,Sybil_freq,Sybil_norm,diff_Kiran_Tijmen,diff_Kiran_Sybil,diff_Tijmen_Sybil
1,demo,Overweight,1.0,6963.0,68.3787,5568.0,68.563,19898.0,70.6605,-0.1843,-2.2818,-2.0975
40,smoke,cigar,1.0,1934.0,18.9924,1550.0,19.0863,5942.0,21.1009,-0.0939,-2.1085,-2.0146
19,demo,educat,6.0,1585.0,15.5652,1253.0,15.4291,4836.0,17.1733,0.1361,-1.6081,-1.7442
37,demo,Gender,1.0,5834.0,57.2916,4655.0,57.3205,16572.0,58.8494,-0.0289,-1.5578,-1.5289
34,demo,Married,1.0,6855.0,67.3181,5467.0,67.3193,19282.0,68.473,-0.0012,-1.1549,-1.1537
28,demo,marital,2.0,6855.0,67.3181,5467.0,67.3193,19282.0,68.473,-0.0012,-1.1549,-1.1537
18,demo,educat,5.0,2287.0,22.459,1858.0,22.879,6647.0,23.6044,-0.42,-1.1454,-0.7254
45,smoke,pipe,1.0,2221.0,21.8109,1764.0,21.7215,6429.0,22.8303,0.0894,-1.0194,-1.1088
13,demo,Unfinished_ed,1.0,2805.0,27.5459,2259.0,27.8168,7912.0,28.0966,-0.2709,-0.5507,-0.2798
27,demo,marital,1.0,416.0,4.0852,338.0,4.162,1274.0,4.5241,-0.0768,-0.4389,-0.3621


In [12]:
# Same ranking as the previous cell, restricted to rows in the "demo" category.
demo_only = cat_demo_splits.query('category == "demo"')
display(demo_only.sort_values(by='diff_Kiran_Sybil', ascending=False).head(10))
demo_only.sort_values(by='diff_Kiran_Sybil', ascending=True).head(10)

Unnamed: 0,category,attribute,value,Kiran_freq,Kiran_norm,Tijmen_freq,Tijmen_norm,Sybil_freq,Sybil_norm,diff_Kiran_Tijmen,diff_Kiran_Sybil,diff_Tijmen_Sybil
38,demo,Gender,2.0,4349.0,42.7084,3466.0,42.6795,11588.0,41.1506,0.0289,1.5578,1.5289
16,demo,educat,3.0,2538.0,24.9239,2001.0,24.6398,6581.0,23.37,0.2841,1.5539,1.2698
29,demo,marital,3.0,864.0,8.4847,659.0,8.1148,2055.0,7.2976,0.3699,1.1871,0.8172
23,demo,NonHispanicWhite,1.0,9311.0,91.4367,7402.0,91.1464,25461.0,90.4155,0.2903,1.0212,0.7309
2,demo,race,1.0,9461.0,92.9098,7528.0,92.6979,25919.0,92.0419,0.2119,0.8679,0.656
24,demo,WhiteOrBlack,1.0,9461.0,92.9098,7528.0,92.6979,25919.0,92.0419,0.2119,0.8679,0.656
15,demo,educat,2.0,518.0,5.0869,401.0,4.9378,1265.0,4.4922,0.1491,0.5947,0.4456
14,demo,educat,1.0,164.0,1.6105,122.0,1.5023,353.0,1.2536,0.1082,0.3569,0.2487
20,demo,educat,7.0,1465.0,14.3867,1156.0,14.2347,3976.0,14.1193,0.152,0.2674,0.1154
10,demo,ethnic,2.0,9977.0,97.977,7945.0,97.8328,27529.0,97.7592,0.1442,0.2178,0.0736


Unnamed: 0,category,attribute,value,Kiran_freq,Kiran_norm,Tijmen_freq,Tijmen_norm,Sybil_freq,Sybil_norm,diff_Kiran_Tijmen,diff_Kiran_Sybil,diff_Tijmen_Sybil
1,demo,Overweight,1.0,6963.0,68.3787,5568.0,68.563,19898.0,70.6605,-0.1843,-2.2818,-2.0975
19,demo,educat,6.0,1585.0,15.5652,1253.0,15.4291,4836.0,17.1733,0.1361,-1.6081,-1.7442
37,demo,Gender,1.0,5834.0,57.2916,4655.0,57.3205,16572.0,58.8494,-0.0289,-1.5578,-1.5289
28,demo,marital,2.0,6855.0,67.3181,5467.0,67.3193,19282.0,68.473,-0.0012,-1.1549,-1.1537
34,demo,Married,1.0,6855.0,67.3181,5467.0,67.3193,19282.0,68.473,-0.0012,-1.1549,-1.1537
18,demo,educat,5.0,2287.0,22.459,1858.0,22.879,6647.0,23.6044,-0.42,-1.1454,-0.7254
13,demo,Unfinished_ed,1.0,2805.0,27.5459,2259.0,27.8168,7912.0,28.0966,-0.2709,-0.5507,-0.2798
27,demo,marital,1.0,416.0,4.0852,338.0,4.162,1274.0,4.5241,-0.0768,-0.4389,-0.3621
4,demo,race,3.0,169.0,1.6596,146.0,1.7978,575.0,2.0419,-0.1382,-0.3823,-0.2441
3,demo,race,2.0,338.0,3.3193,274.0,3.374,1036.0,3.679,-0.0547,-0.3597,-0.305


### Numerical columns

In [13]:
# Summary statistics of the numerical columns per training set, with pairwise differences.
num_demo_splits = data.combine_diff_dfs(
    sybil_demos['num'],
    data.diffs_numerical_means,
    training_sets,
)

# Positive Kiran-minus-Sybil differences (largest first) ...
kiran_higher = (
    num_demo_splits
    .sort_values(by='diff_Kiran_Sybil', ascending=False)
    .query('diff_Kiran_Sybil > 0')
)
display(kiran_higher)

# ... then negative differences (most negative first).
kiran_lower = (
    num_demo_splits
    .sort_values(by='diff_Kiran_Sybil', ascending=True)
    .query('diff_Kiran_Sybil < 0')
)
kiran_lower

Unnamed: 0,category,attribute,value,Kiran,Tijmen,Sybil,diff_Kiran_Tijmen,diff_Kiran_Sybil,diff_Tijmen_Sybil
30,smoke,pkyr,75%,69.0,69.0,66.5,0.0,2.5,2.5
23,demo,Age,75%,67.0,67.0,65.0,0.0,2.0,2.0
44,smoke,smokeday,75%,35.0,35.0,33.0,0.0,2.0,2.0
21,demo,Age,25%,59.0,59.0,57.0,0.0,2.0,2.0
31,smoke,pkyr,Mean (SD),58.0 (24.6),58.0 (25.0),56.1 (23.7),0.013,1.8242,1.8112
33,smoke,pkyr,mean,57.9729,57.9599,56.1487,0.013,1.8242,1.8112
24,demo,Age,Mean (SD),63.2 (5.2),63.2 (5.2),61.6 (5.1),0.0298,1.5553,1.5255
26,demo,Age,mean,63.2049,63.1751,61.6496,0.0298,1.5553,1.5255
52,smoke,smokeyr,Mean (SD),41.0 (7.4),41.0 (7.3),40.0 (7.4),0.0211,1.0532,1.0321
54,smoke,smokeyr,mean,41.0055,40.9844,39.9523,0.0211,1.0532,1.0321


Unnamed: 0,category,attribute,value,Kiran,Tijmen,Sybil,diff_Kiran_Tijmen,diff_Kiran_Sybil,diff_Tijmen_Sybil
2,demo,weight,75%,205.0,205.0,208.0,0.0,-3.0,-3.0
3,demo,weight,Mean (SD),181.8 (39.0),181.9 (39.0),183.4 (39.1),-0.0556,-1.6016,-1.546
5,demo,weight,mean,181.8113,181.8669,183.4129,-0.0556,-1.6016,-1.546
17,demo,BMI,Mean (SD),27.6 (5.0),27.6 (5.0),27.9 (5.0),-0.0199,-0.2404,-0.2205
19,demo,BMI,mean,27.6277,27.6476,27.8681,-0.0199,-0.2404,-0.2205
18,demo,BMI,Median (IQR),27 (6),27 (6),27 (6),-0.0349,-0.2395,-0.2046
15,demo,BMI,50%,27.0197,27.0546,27.2592,-0.0349,-0.2395,-0.2046
16,demo,BMI,75%,30.4066,30.4066,30.559,0.0,-0.1524,-0.1524
6,demo,weight,std,38.9777,38.9825,39.129,-0.0048,-0.1513,-0.1465
14,demo,BMI,25%,24.2738,24.3253,24.4051,-0.0515,-0.1313,-0.0798


## Model Validation Sets

In [14]:
# Data frames compared in this section, keyed by the label used in the output columns.
val_sets = dict(
    Kiran=kiran_data,
    Tijmen=tijmen_val,
    SybilDev=sybil_splits['dev'],
    SybilTest=sybil_splits['test'],
)

### Categorical columns

In [15]:
# Categorical prevalence per validation set; rows with a missing or zero 'value' are dropped.
cat_demo_val = (
    data.combine_diff_dfs(sybil_demos['cat'], data.diffs_category_prevalence, val_sets)
    .dropna(subset='value', axis=0)
    .query('value != 0')
)

# Ten largest Kiran-minus-SybilTest differences first, then the ten smallest.
for ascending in (False, True):
    display(cat_demo_val.sort_values(by='diff_Kiran_SybilTest', ascending=ascending).head(10))

Unnamed: 0,category,attribute,value,Kiran_freq,Kiran_norm,Tijmen_freq,Tijmen_norm,SybilDev_freq,SybilDev_norm,SybilTest_freq,SybilTest_norm,diff_Kiran_Tijmen,diff_Kiran_SybilDev,diff_Kiran_SybilTest,diff_Tijmen_SybilDev,diff_Tijmen_SybilTest,diff_SybilDev_SybilTest
217,other,FamilyHistoryLungCa,1.0,2622,25.7488,512.0,24.8303,1469.0,21.4829,1362.0,21.681,0.9185,4.2659,4.0678,3.3474,3.1493,-0.1981
203,lungcanc,LC_stage,110.0,596,5.8529,138.0,6.6925,127.0,1.8573,113.0,1.7988,-0.8396,3.9956,4.0541,4.8352,4.8937,0.0585
192,lungcanc,Adenocarcinoma,1.0,539,5.2931,129.0,6.2561,82.0,1.1992,107.0,1.7033,-0.963,4.0939,3.5898,5.0569,4.5528,-0.5041
38,demo,Gender,2.0,4349,42.7084,883.0,42.8225,2822.0,41.2694,2513.0,40.0032,-0.1141,1.439,2.7052,1.5531,2.8193,1.2662
51,smoke,smokework,1.0,8820,86.6149,1779.0,86.2755,5909.0,86.4142,5273.0,83.9382,0.3394,0.2007,2.6767,-0.1387,2.3373,2.476
117,disease,diagemph,1.0,1022,10.0363,205.0,9.9418,576.0,8.4235,484.0,7.7046,0.0945,1.6128,2.3317,1.5183,2.2372,0.7189
43,smoke,cigsmok,1.0,5088,49.9656,1060.0,51.4064,3241.0,47.3969,3026.0,48.1694,-1.4408,2.5687,1.7962,4.0095,3.237,-0.7725
108,disease,diagchro,1.0,1064,10.4488,226.0,10.9602,711.0,10.3978,551.0,8.7711,-0.5114,0.051,1.6777,0.5624,2.1891,1.6267
48,smoke,smokelive,1.0,8965,88.0389,1796.0,87.0999,5961.0,87.1746,5436.0,86.533,0.939,0.8643,1.5059,-0.0747,0.5669,0.6416
194,lungcanc,Bronchiolo-alveolar_carcinoma,1.0,180,1.7677,19.0,0.9214,59.0,0.8628,20.0,0.3184,0.8463,0.9049,1.4493,0.0586,0.603,0.5444


Unnamed: 0,category,attribute,value,Kiran_freq,Kiran_norm,Tijmen_freq,Tijmen_norm,SybilDev_freq,SybilDev_norm,SybilTest_freq,SybilTest_norm,diff_Kiran_Tijmen,diff_Kiran_SybilDev,diff_Kiran_SybilTest,diff_Tijmen_SybilDev,diff_Tijmen_SybilTest,diff_SybilDev_SybilTest
40,smoke,cigar,1.0,1934,18.9924,384.0,18.6227,1450.0,21.205,1376.0,21.9039,0.3697,-2.2126,-2.9115,-2.5823,-3.2812,-0.6989
37,demo,Gender,1.0,5834,57.2916,1179.0,57.1775,4016.0,58.7306,3769.0,59.9968,0.1141,-1.439,-2.7052,-1.5531,-2.8193,-1.2662
45,smoke,pipe,1.0,2221,21.8109,457.0,22.1629,1584.0,23.1647,1508.0,24.0051,-0.352,-1.3538,-2.1942,-1.0018,-1.8422,-0.8404
28,demo,marital,2.0,6855,67.3181,1388.0,67.3133,4628.0,67.6806,4295.0,68.3699,0.0048,-0.3625,-1.0518,-0.3673,-1.0566,-0.6893
34,demo,Married,1.0,6855,67.3181,1388.0,67.3133,4628.0,67.6806,4295.0,68.3699,0.0048,-0.3625,-1.0518,-0.3673,-1.0566,-0.6893
1,demo,Overweight,1.0,6963,68.3787,1395.0,67.6528,4859.0,71.0588,4352.0,69.2773,0.7259,-2.6801,-0.8986,-3.406,-1.6245,1.7815
105,disease,diagchas,1.0,328,3.2211,72.0,3.4918,188.0,2.7493,245.0,3.9,-0.2707,0.4718,-0.6789,0.7425,-0.4082,-1.1507
20,demo,educat,7.0,1465,14.3867,309.0,14.9855,1002.0,14.6534,942.0,14.9952,-0.5988,-0.2667,-0.6085,0.3321,-0.0097,-0.3418
4,demo,race,3.0,169,1.6596,23.0,1.1154,175.0,2.5592,142.0,2.2604,0.5442,-0.8996,-0.6008,-1.4438,-1.145,0.2988
7,demo,race,6.0,111,1.0901,16.0,0.7759,100.0,1.4624,104.0,1.6555,0.3142,-0.3723,-0.5654,-0.6865,-0.8796,-0.1931


### Numerical columns

In [16]:
# Numerical summary statistics per validation set, with pairwise differences.
num_demo_val = data.combine_diff_dfs(
    sybil_demos['num'],
    data.diffs_numerical_means,
    val_sets,
)

# Ten largest Kiran-minus-SybilTest differences first, then the ten smallest.
for ascending in (False, True):
    display(num_demo_val.sort_values(by='diff_Kiran_SybilTest', ascending=ascending).head(10))

Unnamed: 0,category,attribute,value,Kiran,Tijmen,SybilDev,SybilTest,diff_Kiran_Tijmen,diff_Kiran_SybilDev,diff_Kiran_SybilTest,diff_Tijmen_SybilDev,diff_Tijmen_SybilTest,diff_SybilDev_SybilTest
23,demo,Age,75%,67.0,67.0,65.0,65.0,0.0,2.0,2.0,2.0,2.0,0.0
21,demo,Age,25%,59.0,59.0,57.0,57.0,0.0,2.0,2.0,2.0,2.0,0.0
26,demo,Age,mean,63.2049,63.3225,61.6,61.5807,-0.1176,1.6049,1.6242,1.7225,1.7418,0.0193
24,demo,Age,Mean (SD),63.2 (5.2),63.3 (5.2),61.6 (5.1),61.6 (5.1),-0.1176,1.6049,1.6242,1.7225,1.7418,0.0193
30,smoke,pkyr,75%,69.0,69.75,67.5,68.0,-0.75,1.5,1.0,2.25,1.75,-0.5
25,demo,Age,Median (IQR),62 (8),63 (8),61 (8),61 (8),-1.0,1.0,1.0,2.0,2.0,0.0
22,demo,Age,50%,62.0,63.0,61.0,61.0,-1.0,1.0,1.0,2.0,2.0,0.0
35,smoke,smokeage,25%,15.0,14.0,15.0,14.0,1.0,0.0,1.0,-1.0,0.0,1.0
53,smoke,smokeyr,Median (IQR),41 (10),41 (10),40 (10),40 (10),0.0,1.0,1.0,1.0,1.0,0.0
51,smoke,smokeyr,75%,46.0,46.0,45.0,45.0,0.0,1.0,1.0,1.0,1.0,0.0


Unnamed: 0,category,attribute,value,Kiran,Tijmen,SybilDev,SybilTest,diff_Kiran_Tijmen,diff_Kiran_SybilDev,diff_Kiran_SybilTest,diff_Tijmen_SybilDev,diff_Tijmen_SybilTest,diff_SybilDev_SybilTest
2,demo,weight,75%,205.0,205.0,205.0,210.0,0.0,0.0,-5.0,0.0,-5.0,-5.0
3,demo,weight,Mean (SD),181.8 (39.0),181.6 (39.0),183.7 (39.8),183.7 (40.2),0.2186,-1.8493,-1.8751,-2.0679,-2.0937,-0.0258
5,demo,weight,mean,181.8113,181.5927,183.6606,183.6864,0.2186,-1.8493,-1.8751,-2.0679,-2.0937,-0.0258
6,demo,weight,std,38.9777,38.9676,39.7804,40.1892,0.0101,-0.8027,-1.2115,-0.8128,-1.2216,-0.4088
48,smoke,smokeday,std,11.4641,11.4531,11.6416,11.9578,0.011,-0.1775,-0.4937,-0.1885,-0.5047,-0.3162
45,smoke,smokeday,Mean (SD),28.6 (11.5),28.7 (11.5),28.5 (11.6),28.9 (12.0),-0.1166,0.035,-0.3292,0.1516,-0.2126,-0.3642
47,smoke,smokeday,mean,28.574,28.6906,28.539,28.9032,-0.1166,0.035,-0.3292,0.1516,-0.2126,-0.3642
16,demo,BMI,75%,30.4066,30.4066,30.6635,30.7242,0.0,-0.2569,-0.3176,-0.2569,-0.3176,-0.0607
34,smoke,pkyr,std,24.6293,23.2674,25.1968,24.9038,1.3619,-0.5675,-0.2745,-1.9294,-1.6364,0.293
17,demo,BMI,Mean (SD),27.6 (5.0),27.5 (5.0),28.0 (5.1),27.9 (5.1),0.0785,-0.339,-0.2397,-0.4175,-0.3182,0.0993


## Sybil Train vs. Validation Sets

In [17]:
# Add the Kiran scan table as a fourth entry, "eval", next to train/dev/test.
# This mutates the sybil_splits dict created in an earlier cell.
sybil_splits.update(eval=kiran_data)

In [18]:
# Categorical prevalence across the sybil_splits entries; missing/zero 'value' rows are dropped.
cat_demo_shift = (
    data.combine_diff_dfs(sybil_demos['cat'], data.diffs_category_prevalence, sybil_splits)
    .dropna(subset='value', axis=0)
    .query('value != 0')
)

# Largest train-minus-test differences, then the smallest.
train_test_desc = cat_demo_shift.sort_values(by='diff_train_test', ascending=False)
display(train_test_desc.head(10))
train_test_asc = cat_demo_shift.sort_values(by='diff_train_test', ascending=True)
train_test_asc.head(10)

Unnamed: 0,category,attribute,value,train_freq,train_norm,dev_freq,dev_norm,test_freq,test_norm,eval_freq,eval_norm,diff_train_dev,diff_train_test,diff_train_eval,diff_dev_test,diff_dev_eval,diff_test_eval
13,demo,Unfinished_ed,1.0,7912.0,28.0966,1801.0,26.3381,1658.0,26.3929,2805.0,27.5459,1.7585,1.7037,0.5507,-0.0548,-1.2078,-1.153
51,smoke,smokework,1.0,24110.0,85.6179,5909.0,86.4142,5273.0,83.9382,8820.0,86.6149,-0.7963,1.6797,-0.997,2.476,-0.2007,-2.6767
18,demo,educat,5.0,6647.0,23.6044,1521.0,22.2433,1394.0,22.1904,2287.0,22.459,1.3611,1.414,1.1454,0.0529,-0.2157,-0.2686
1,demo,Overweight,1.0,19898.0,70.6605,4859.0,71.0588,4352.0,69.2773,6963.0,68.3787,-0.3983,1.3832,2.2818,1.7815,2.6801,0.8986
38,demo,Gender,2.0,11588.0,41.1506,2822.0,41.2694,2513.0,40.0032,4349.0,42.7084,-0.1188,1.1474,-1.5578,1.2662,-1.439,-2.7052
19,demo,educat,6.0,4836.0,17.1733,1177.0,17.2126,1008.0,16.0458,1585.0,15.5652,-0.0393,1.1275,1.6081,1.1668,1.6474,0.4806
126,disease,diaghype,1.0,9856.0,35.0,2388.0,34.9225,2131.0,33.9223,3562.0,34.9799,0.0775,1.0777,0.0201,1.0002,-0.0574,-1.0576
108,disease,diagchro,1.0,2750.0,9.7656,711.0,10.3978,551.0,8.7711,1064.0,10.4488,-0.6322,0.9945,-0.6832,1.6267,-0.051,-1.6777
48,smoke,smokelive,1.0,24625.0,87.4467,5961.0,87.1746,5436.0,86.533,8965.0,88.0389,0.2721,0.9137,-0.5922,0.6416,-0.8643,-1.5059
25,demo,WhiteOrBlack,2.0,1036.0,3.679,290.0,4.241,187.0,2.9768,338.0,3.3193,-0.562,0.7022,0.3597,1.2642,0.9217,-0.3425


Unnamed: 0,category,attribute,value,train_freq,train_norm,dev_freq,dev_norm,test_freq,test_norm,eval_freq,eval_norm,diff_train_dev,diff_train_test,diff_train_eval,diff_dev_test,diff_dev_eval,diff_test_eval
16,demo,educat,3.0,6581.0,23.37,1682.0,24.5978,1571.0,25.008,2538.0,24.9239,-1.2278,-1.638,-1.5539,-0.4102,-0.3261,0.0841
45,smoke,pipe,1.0,6429.0,22.8303,1584.0,23.1647,1508.0,24.0051,2221.0,21.8109,-0.3344,-1.1748,1.0194,-0.8404,1.3538,2.1942
37,demo,Gender,1.0,16572.0,58.8494,4016.0,58.7306,3769.0,59.9968,5834.0,57.2916,0.1188,-1.1474,1.5578,-1.2662,1.439,2.7052
20,demo,educat,7.0,3976.0,14.1193,1002.0,14.6534,942.0,14.9952,1465.0,14.3867,-0.5341,-0.8759,-0.2674,-0.3418,0.2667,0.6085
40,smoke,cigar,1.0,5942.0,21.1009,1450.0,21.205,1376.0,21.9039,1934.0,18.9924,-0.1041,-0.803,2.1085,-0.6989,2.2126,2.9115
93,work,wrkweld,1.0,1597.0,5.6712,408.0,5.9667,401.0,6.3833,596.0,5.8529,-0.2955,-0.7121,-0.1817,-0.4166,0.1138,0.5304
43,smoke,cigsmok,1.0,13386.0,47.5355,3241.0,47.3969,3026.0,48.1694,5088.0,49.9656,0.1386,-0.6339,-2.4301,-0.7725,-2.5687,-1.7962
129,disease,diagpneu,1.0,6278.0,22.294,1613.0,23.5888,1439.0,22.9067,2356.0,23.1366,-1.2948,-0.6127,-0.8426,0.6821,0.4522,-0.2299
23,demo,NonHispanicWhite,1.0,25461.0,90.4155,6076.0,88.8564,5718.0,91.022,9311.0,91.4367,1.5591,-0.6065,-1.0212,-2.1656,-2.5803,-0.4147
10,demo,ethnic,2.0,27529.0,97.7592,6664.0,97.4554,6174.0,98.2808,9977.0,97.977,0.3038,-0.5216,-0.2178,-0.8254,-0.5216,0.3038


In [19]:
# Numerical summary statistics across the sybil_splits entries.
num_demo_shift = data.combine_diff_dfs(
    sybil_demos['num'],
    data.diffs_numerical_means,
    sybil_splits,
)

# Largest train-minus-test differences, then the smallest.
train_test_desc = num_demo_shift.sort_values(by='diff_train_test', ascending=False)
display(train_test_desc.head(10))
train_test_asc = num_demo_shift.sort_values(by='diff_train_test', ascending=True)
train_test_asc.head(10)

Unnamed: 0,category,attribute,value,train,dev,test,eval,diff_train_dev,diff_train_test,diff_train_eval,diff_dev_test,diff_dev_eval,diff_test_eval
35,smoke,smokeage,25%,15.0,15.0,14.0,15.0,0.0,1.0,0.0,1.0,0.0,-1.0
18,demo,BMI,Median (IQR),27 (6),27 (6),27 (6),27 (6),-0.0575,0.1665,0.2395,0.224,0.297,0.073
15,demo,BMI,50%,27.2592,27.3167,27.0927,27.0197,-0.0575,0.1665,0.2395,0.224,0.297,0.073
38,smoke,smokeage,Mean (SD),16.7 (3.6),16.7 (3.6),16.6 (3.7),16.6 (3.5),0.0048,0.072,0.1208,0.0672,0.116,0.0488
40,smoke,smokeage,mean,16.7199,16.7151,16.6479,16.5991,0.0048,0.072,0.1208,0.0672,0.116,0.0488
24,demo,Age,Mean (SD),61.6 (5.1),61.6 (5.1),61.6 (5.1),63.2 (5.2),0.0496,0.0689,-1.5553,0.0193,-1.6049,-1.6242
26,demo,Age,mean,61.6496,61.6,61.5807,63.2049,0.0496,0.0689,-1.5553,0.0193,-1.6049,-1.6242
14,demo,BMI,25%,24.4051,24.5371,24.3636,24.2738,-0.132,0.0415,0.1313,0.1735,0.2633,0.0898
19,demo,BMI,mean,27.8681,27.9667,27.8674,27.6277,-0.0986,0.0007,0.2404,0.0993,0.339,0.2397
17,demo,BMI,Mean (SD),27.9 (5.0),28.0 (5.1),27.9 (5.1),27.6 (5.0),-0.0986,0.0007,0.2404,0.0993,0.339,0.2397


Unnamed: 0,category,attribute,value,train,dev,test,eval,diff_train_dev,diff_train_test,diff_train_eval,diff_dev_test,diff_dev_eval,diff_test_eval
2,demo,weight,75%,208.0,205.0,210.0,205.0,3.0,-2.0,3.0,-5.0,0.0,5.0
44,smoke,smokeday,75%,33.0,35.0,35.0,35.0,-2.0,-2.0,-2.0,0.0,0.0,0.0
30,smoke,pkyr,75%,66.5,67.5,68.0,69.0,-1.0,-1.5,-2.5,-0.5,-1.5,-1.0
34,smoke,pkyr,std,23.6767,25.1968,24.9038,24.6293,-1.5201,-1.2271,-0.9526,0.293,0.5675,0.2745
31,smoke,pkyr,Mean (SD),56.1 (23.7),56.4 (25.2),57.2 (24.9),58.0 (24.6),-0.2614,-1.0742,-1.8242,-0.8128,-1.5628,-0.75
33,smoke,pkyr,mean,56.1487,56.4101,57.2229,57.9729,-0.2614,-1.0742,-1.8242,-0.8128,-1.5628,-0.75
6,demo,weight,std,39.129,39.7804,40.1892,38.9777,-0.6514,-1.0602,0.1513,-0.4088,0.8027,1.2115
48,smoke,smokeday,std,11.2621,11.6416,11.9578,11.4641,-0.3795,-0.6957,-0.202,-0.3162,0.1775,0.4937
29,smoke,pkyr,50%,49.0,48.0,49.5,50.0,1.0,-0.5,-1.0,-1.5,-2.0,-0.5
32,smoke,pkyr,Median (IQR),49 (26),48 (28),49 (28),50 (28),1.0,-0.5,-1.0,-1.5,-2.0,-0.5


Conclusion: there is not much demographic shift between these sets; most differences are only about 1-2% overall, which is small. The main exception is family history of lung cancer.

## NLST vs. DLCST

In [20]:
# DLCST table; the first CSV row is used as the header.
dlcst_csv_path = f"{FILE_DIR}/dlcst_allmodels_cal.csv"
dlcst_preds = pd.read_csv(dlcst_csv_path, header=0)

# Column names, non-null counts and dtypes.
dlcst_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 599 entries, 0 to 598
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            599 non-null    int64  
 1   StudyDate            599 non-null    int64  
 2   SeriesInstanceUID    599 non-null    object 
 3   Age                  599 non-null    int64  
 4   Sex                  599 non-null    int64  
 5   FamilyHistoryLungCa  599 non-null    int64  
 6   Emphysema            599 non-null    int64  
 7   NoduleCountPerScan   599 non-null    int64  
 8   sybil_year1          599 non-null    float64
 9   sybil_year2          599 non-null    float64
 10  sybil_year3          599 non-null    float64
 11  sybil_year4          599 non-null    float64
 12  sybil_year5          599 non-null    float64
 13  sybil_year6          599 non-null    float64
 14  PanCan2b             599 non-null    float64
 15  Ensemble_Kiran       599 non-null    flo

In [21]:
# Expose NLST columns under the names that appear in dlcst_preds so both tables can be compared.
# Columns are added in place on kiran_data, which earlier dicts also reference.
for dlcst_name, nlst_name in (('Sex', 'Gender'), ('NoduleCountPerScan', 'NoduleCounts')):
    kiran_data[dlcst_name] = kiran_data[nlst_name]

# Cast Emphysema to int (dlcst_preds stores it as int64).
kiran_data['Emphysema'] = kiran_data['Emphysema'].astype(int)

In [22]:
# The two cohorts compared below. An earlier "nlst_sybil": sybil_demos entry is disabled.
screening_sets = dict(
    nlst=kiran_data,
    dlcst=dlcst_preds,
)

In [23]:
# Categorical prevalence in NLST vs. DLCST for the columns listed in DLCST_DEMOCOLS['cat'].
cat_demo_dlcst = data.combine_diff_dfs(
    DLCST_DEMOCOLS['cat'],
    data.diffs_category_prevalence,
    screening_sets,
)

# Largest NLST-minus-DLCST differences, then the smallest.
nlst_dlcst_desc = cat_demo_dlcst.sort_values(by='diff_nlst_dlcst', ascending=False)
display(nlst_dlcst_desc.head(10))
nlst_dlcst_asc = cat_demo_dlcst.sort_values(by='diff_nlst_dlcst', ascending=True)
nlst_dlcst_asc.head(10)

Unnamed: 0,category,attribute,value,nlst_freq,nlst_norm,dlcst_freq,dlcst_norm,diff_nlst_dlcst
4,other,Emphysema,0,6486,63.6944,194,32.3873,31.3071
3,other,FamilyHistoryLungCa,1,2622,25.7488,102,17.0284,8.7204
0,demo,Sex,1,5834,57.2916,323,53.9232,3.3684
1,demo,Sex,2,4349,42.7084,276,46.0768,-3.3684
2,other,FamilyHistoryLungCa,0,7561,74.2512,497,82.9716,-8.7204
5,other,Emphysema,1,3697,36.3056,405,67.6127,-31.3071


Unnamed: 0,category,attribute,value,nlst_freq,nlst_norm,dlcst_freq,dlcst_norm,diff_nlst_dlcst
5,other,Emphysema,1,3697,36.3056,405,67.6127,-31.3071
2,other,FamilyHistoryLungCa,0,7561,74.2512,497,82.9716,-8.7204
1,demo,Sex,2,4349,42.7084,276,46.0768,-3.3684
0,demo,Sex,1,5834,57.2916,323,53.9232,3.3684
3,other,FamilyHistoryLungCa,1,2622,25.7488,102,17.0284,8.7204
4,other,Emphysema,0,6486,63.6944,194,32.3873,31.3071


In [24]:
# Numerical summary statistics in NLST vs. DLCST for the columns in DLCST_DEMOCOLS['num'].
num_demo_dlcst = data.combine_diff_dfs(
    DLCST_DEMOCOLS['num'],
    data.diffs_numerical_means,
    screening_sets,
)

# Largest NLST-minus-DLCST differences, then the smallest.
nlst_dlcst_desc = num_demo_dlcst.sort_values(by='diff_nlst_dlcst', ascending=False)
display(nlst_dlcst_desc.head(10))
nlst_dlcst_asc = num_demo_dlcst.sort_values(by='diff_nlst_dlcst', ascending=True)
nlst_dlcst_asc.head(10)

Unnamed: 0,category,attribute,value,nlst,dlcst,diff_nlst_dlcst
0,demo,Age,25%,59.0,54.0,5.0
2,demo,Age,75%,67.0,62.0,5.0
3,demo,Age,Mean (SD),63.2 (5.2),58.4 (4.9),4.8076
5,demo,Age,mean,63.2049,58.3973,4.8076
1,demo,Age,50%,62.0,58.0,4.0
4,demo,Age,Median (IQR),62 (8),58 (8),4.0
6,demo,Age,std,5.2401,4.9461,0.294
8,other,NoduleCountPerScan,Mean (SD),1.9 (1.3),1.8 (1.2),0.1139
10,other,NoduleCountPerScan,mean,1.8835,1.7696,0.1139
11,other,NoduleCountPerScan,std,1.256,1.1654,0.0906


Unnamed: 0,category,attribute,value,nlst,dlcst,diff_nlst_dlcst
7,other,NoduleCountPerScan,75%,2.0,2.0,0.0
9,other,NoduleCountPerScan,Median (IQR),1 (1),1 (1),0.0
11,other,NoduleCountPerScan,std,1.256,1.1654,0.0906
8,other,NoduleCountPerScan,Mean (SD),1.9 (1.3),1.8 (1.2),0.1139
10,other,NoduleCountPerScan,mean,1.8835,1.7696,0.1139
6,demo,Age,std,5.2401,4.9461,0.294
1,demo,Age,50%,62.0,58.0,4.0
4,demo,Age,Median (IQR),62 (8),58 (8),4.0
5,demo,Age,mean,63.2049,58.3973,4.8076
3,demo,Age,Mean (SD),63.2 (5.2),58.4 (4.9),4.8076


### Different validation sets

In [25]:
# Fresh read of the all-models NLST file (the same file loaded into kiran_nodule earlier).
nodules_csv_path = f"{FILE_DIR}/nlst_allmodels_demos.csv"
all_nodules = pd.read_csv(nodules_csv_path)

# Non-scan-level preparation with tijmen=True; the third return value is unused.
some_nodules, nlst_democols_nodules, _ = data.prep_nlst_preds(
    all_nodules,
    democols=kiran_demos,
    scanlevel=False,
    tijmen=True,
    sybil=False,
)

# Row counts before and after preparation.
print(len(all_nodules), len(some_nodules))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, query_string] = df.eval(query_string_backticks)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, query_string] = df.eval(query_string_backticks)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, query_string] = df.eval(query_string_backticks)
A value is trying to be set on 

16077 3240


In [27]:
# Two scan-level preparations that differ only in the tijmen flag.
scan_level_kwargs = dict(democols=kiran_demos_original, scanlevel=True, sybil=True)

all_scans, _, _ = data.prep_nlst_preds(all_nodules, tijmen=False, **scan_level_kwargs)
some_scans, _, _ = data.prep_nlst_preds(all_nodules, tijmen=True, **scan_level_kwargs)

# Row counts of the two scan tables.
print(len(all_scans), len(some_scans))

5911 1172


In [28]:
# Four tables to compare: nodule tables vs. scan tables, each in an "all" and a "some" variant.
valsets = dict(
    allnodules=all_nodules,
    somenodules=some_nodules,
    allscans=all_scans,
    somescans=some_scans,
)

In [29]:
# NOTE: rebinds cat_demo_shift / num_demo_shift, which an earlier cell used for the
# Sybil split comparison; from here on they hold the `valsets` comparison instead.
cat_demo_shift = data.combine_diff_dfs(
    kiran_demos['cat'],
    data.diffs_category_prevalence,
    valsets,
)
num_demo_shift = data.combine_diff_dfs(
    kiran_demos['num'],
    data.diffs_numerical_means,
    valsets,
)

#### Difference between Nodule sets and Scan sets

In [30]:
# Categorical values ranked by the allnodules-minus-allscans difference: top ten, then bottom ten.
nodules_vs_scans_col = 'diff_allnodules_allscans'
display(cat_demo_shift.sort_values(by=nodules_vs_scans_col, ascending=False).head(10))
cat_demo_shift.sort_values(by=nodules_vs_scans_col, ascending=True).head(10)

Unnamed: 0,category,attribute,value,allnodules_freq,allnodules_norm,somenodules_freq,somenodules_norm,allscans_freq,allscans_norm,somescans_freq,somescans_norm,diff_allnodules_somenodules,diff_allnodules_allscans,diff_allnodules_somescans,diff_somenodules_allscans,diff_somenodules_somescans,diff_allscans_somescans
236,nodule,NoduleInUpperLung,0.0,9331,58.0394,1866.0,57.5926,2850.0,48.2152,572.0,48.8055,0.4468,9.8242,9.2339,9.3774,8.7871,-0.5903
222,nodule,Solid,0.0,6391,39.7524,1256.0,38.7654,1911.0,32.3296,376.0,32.0819,0.987,7.4228,7.6705,6.4358,6.6835,0.2477
224,nodule,GroundGlassOpacity,0.0,12466,77.5393,2513.0,77.5617,4275.0,72.3228,841.0,71.7577,-0.0224,5.2165,5.7816,5.2389,5.804,0.5651
232,nodule,SemiSolid,0.0,14605,90.8441,2954.0,91.1728,5159.0,87.278,1029.0,87.7986,-0.3287,3.5661,3.0455,3.8948,3.3742,-0.5206
226,nodule,Perifissural,0.0,15148,94.2216,3067.0,94.6605,5387.0,91.1352,1090.0,93.0034,-0.4389,3.0864,1.2182,3.5253,1.6571,-1.8682
220,other,label,0.0,14828,92.2311,2959.0,91.3272,5330.0,90.1709,1045.0,89.1638,0.9039,2.0602,3.0673,1.1563,2.1634,1.0071
211,lungcanc,LC_stage,,14859,92.424,2963.0,91.4506,5345.0,90.4246,1047.0,89.3345,0.9734,1.9994,3.0895,1.026,2.1161,1.0901
218,other,Emphysema,0.0,10385,64.5954,2118.0,65.3704,3755.0,63.5256,733.0,62.5427,-0.775,1.0698,2.0527,1.8448,2.8277,0.9829
114,disease,diagdiab,1.0,1632,10.1511,360.0,11.1111,545.0,9.2201,126.0,10.7509,-0.96,0.931,-0.5998,1.891,0.3602,-1.5308
39,smoke,cigar,0.0,13031,81.0537,2657.0,82.0062,4738.0,80.1556,931.0,79.4369,-0.9525,0.8981,1.6168,1.8506,2.5693,0.7187


Unnamed: 0,category,attribute,value,allnodules_freq,allnodules_norm,somenodules_freq,somenodules_norm,allscans_freq,allscans_norm,somescans_freq,somescans_norm,diff_allnodules_somenodules,diff_allnodules_allscans,diff_allnodules_somescans,diff_somenodules_allscans,diff_somenodules_somescans,diff_allscans_somescans
237,nodule,NoduleInUpperLung,1.0,6746,41.9606,1374.0,42.4074,3061.0,51.7848,600.0,51.1945,-0.4468,-9.8242,-9.2339,-9.3774,-8.7871,0.5903
223,nodule,Solid,1.0,9686,60.2476,1984.0,61.2346,4000.0,67.6704,796.0,67.9181,-0.987,-7.4228,-7.6705,-6.4358,-6.6835,-0.2477
225,nodule,GroundGlassOpacity,1.0,3611,22.4607,727.0,22.4383,1636.0,27.6772,331.0,28.2423,0.0224,-5.2165,-5.7816,-5.2389,-5.804,-0.5651
233,nodule,SemiSolid,1.0,1472,9.1559,286.0,8.8272,752.0,12.722,143.0,12.2014,0.3287,-3.5661,-3.0455,-3.8948,-3.3742,0.5206
227,nodule,Perifissural,1.0,929,5.7784,173.0,5.3395,524.0,8.8648,82.0,6.9966,0.4389,-3.0864,-1.2182,-3.5253,-1.6571,1.8682
221,other,label,1.0,1249,7.7689,281.0,8.6728,581.0,9.8291,127.0,10.8362,-0.9039,-2.0602,-3.0673,-1.1563,-2.1634,-1.0071
203,lungcanc,LC_stage,110.0,625,3.8875,147.0,4.537,295.0,4.9907,72.0,6.1433,-0.6495,-1.1032,-2.2558,-0.4537,-1.6063,-1.1526
219,other,Emphysema,1.0,5692,35.4046,1122.0,34.6296,2156.0,36.4744,439.0,37.4573,0.775,-1.0698,-2.0527,-1.8448,-2.8277,-0.9829
113,disease,diagdiab,0.0,14428,89.7431,2880.0,88.8889,5362.0,90.7122,1046.0,89.2491,0.8542,-0.9691,0.494,-1.8233,-0.3602,1.4631
125,disease,diaghype,0.0,10489,65.2423,2185.0,67.4383,3909.0,66.1309,778.0,66.3823,-2.196,-0.8886,-1.14,1.3074,1.056,-0.2514


In [31]:
# Numerical statistics ranked by the allnodules-minus-allscans difference: top ten, then bottom ten.
nodules_vs_scans_col = 'diff_allnodules_allscans'
display(num_demo_shift.sort_values(by=nodules_vs_scans_col, ascending=False).head(10))
num_demo_shift.sort_values(by=nodules_vs_scans_col, ascending=True).head(10)

Unnamed: 0,category,attribute,value,allnodules,somenodules,allscans,somescans,diff_allnodules_somenodules,diff_allnodules_allscans,diff_allnodules_somescans,diff_somenodules_allscans,diff_somenodules_somescans,diff_allscans_somescans
64,other,NoduleCounts,75%,3.0,3.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0
66,other,NoduleCounts,Median (IQR),2 (2),2 (2),1 (1),1 (1),0.0,1.0,1.0,1.0,1.0,0.0
22,demo,Age,50%,63.0,63.0,62.0,63.0,0.0,1.0,0.0,1.0,0.0,-1.0
35,smoke,smokeage,25%,15.0,14.0,14.0,14.0,1.0,1.0,1.0,0.0,0.0,0.0
25,demo,Age,Median (IQR),63 (8),63 (8),62 (8),63 (8),0.0,1.0,0.0,1.0,0.0,-1.0
63,other,NoduleCounts,50%,2.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
65,other,NoduleCounts,Mean (SD),2.6 (1.6),2.6 (1.7),1.9 (1.3),1.9 (1.3),0.0035,0.655,0.6719,0.6515,0.6684,0.0169
67,other,NoduleCounts,mean,2.555,2.5515,1.9,1.8831,0.0035,0.655,0.6719,0.6515,0.6684,0.0169
5,demo,weight,mean,182.4158,182.1708,181.923,182.8019,0.245,0.4928,-0.3861,0.2478,-0.6311,-0.8789
3,demo,weight,Mean (SD),182.4 (38.9),182.2 (39.0),181.9 (39.2),182.8 (39.5),0.245,0.4928,-0.3861,0.2478,-0.6311,-0.8789


Unnamed: 0,category,attribute,value,allnodules,somenodules,allscans,somescans,diff_allnodules_somenodules,diff_allnodules_allscans,diff_allnodules_somescans,diff_somenodules_allscans,diff_somenodules_somescans,diff_allscans_somescans
58,other,Diameter_mm,75%,9.1,9.0,10.5,10.8,0.1,-1.4,-1.7,-1.5,-1.8,-0.3
59,other,Diameter_mm,Mean (SD),8.0 (5.3),8.0 (5.3),9.0 (6.0),9.4 (6.6),-0.0197,-1.0303,-1.3999,-1.0106,-1.3802,-0.3696
61,other,Diameter_mm,mean,8.0177,8.0374,9.048,9.4176,-0.0197,-1.0303,-1.3999,-1.0106,-1.3802,-0.3696
57,other,Diameter_mm,50%,6.2,6.3,7.1,7.5,-0.1,-0.9,-1.3,-0.8,-1.2,-0.4
60,other,Diameter_mm,Median (IQR),6 (4),6 (4),7 (5),7 (5),-0.1,-0.9,-1.3,-0.8,-1.2,-0.4
34,smoke,pkyr,std,24.3554,23.2994,25.1504,23.3406,1.056,-0.795,1.0148,-1.851,-0.0412,1.8098
62,other,Diameter_mm,std,5.2795,5.2981,5.99,6.62,-0.0186,-0.7105,-1.3405,-0.6919,-1.3219,-0.63
30,smoke,pkyr,75%,69.0,68.0,69.5,69.0,1.0,-0.5,0.0,-1.5,-1.0,0.5
56,other,Diameter_mm,25%,4.9,5.0,5.3,5.6,-0.1,-0.4,-0.7,-0.3,-0.6,-0.3
6,demo,weight,std,38.8744,39.0324,39.2448,39.5068,-0.158,-0.3704,-0.6324,-0.2124,-0.4744,-0.262


#### Diff between sets for Tijmen's combined model vs. the rest

In [32]:
# Categorical attributes ranked by the all-scans vs. some-scans difference:
# ten largest values first, then the ten smallest as the cell result.
display(
    cat_demo_shift
    .sort_values('diff_allscans_somescans', ascending=False)
    .iloc[:10]
)
cat_demo_shift.sort_values('diff_allscans_somescans').iloc[:10]

Unnamed: 0,category,attribute,value,allnodules_freq,allnodules_norm,somenodules_freq,somenodules_norm,allscans_freq,allscans_norm,somescans_freq,somescans_norm,diff_allnodules_somenodules,diff_allnodules_allscans,diff_allnodules_somescans,diff_somenodules_allscans,diff_somenodules_somescans,diff_allscans_somescans
53,work,wrkasbe,0.0,15277,95.0239,2980.0,91.9753,5610.0,94.9078,1082.0,92.3208,3.0486,0.1161,2.7031,-2.9325,-0.3455,2.587
44,smoke,pipe,0.0,12542,78.0121,2519.0,77.7469,4575.0,77.3981,882.0,75.256,0.2652,0.614,2.7561,0.3488,2.4909,2.1421
17,demo,educat,4.0,2307,14.3497,443.0,13.6728,852.0,14.4138,144.0,12.2867,0.6769,-0.0641,2.063,-0.741,1.3861,2.1271
227,nodule,Perifissural,1.0,929,5.7784,173.0,5.3395,524.0,8.8648,82.0,6.9966,0.4389,-3.0864,-1.2182,-3.5253,-1.6571,1.8682
42,smoke,cigsmok,0.0,7911,49.2069,1592.0,49.1358,2930.0,49.5686,560.0,47.7816,0.0711,-0.3617,1.4253,-0.4328,1.3542,1.787
31,demo,marital,5.0,2957,18.3927,574.0,17.716,1123.0,18.9985,202.0,17.2355,0.6767,-0.6058,1.1572,-1.2825,0.4805,1.763
113,disease,diagdiab,0.0,14428,89.7431,2880.0,88.8889,5362.0,90.7122,1046.0,89.2491,0.8542,-0.9691,0.494,-1.8233,-0.3602,1.4631
18,demo,educat,5.0,3539,22.0128,680.0,20.9877,1301.0,22.0098,244.0,20.8191,1.0251,0.003,1.1937,-1.0221,0.1686,1.1907
201,lungcanc,Unclassified_carcinoma,0.0,15953,99.2287,3194.0,98.5802,5840.0,98.7988,1145.0,97.6962,0.6485,0.4299,1.5325,-0.2186,0.884,1.1026
116,disease,diagemph,0.0,14283,88.8412,2908.0,89.7531,5265.0,89.0712,1031.0,87.9693,-0.9119,-0.23,0.8719,0.6819,1.7838,1.1019


Unnamed: 0,category,attribute,value,allnodules_freq,allnodules_norm,somenodules_freq,somenodules_norm,allscans_freq,allscans_norm,somescans_freq,somescans_norm,diff_allnodules_somenodules,diff_allnodules_allscans,diff_allnodules_somescans,diff_somenodules_allscans,diff_somenodules_somescans,diff_allscans_somescans
54,work,wrkasbe,1.0,789,4.9076,259.0,7.9938,300.0,5.0753,90.0,7.6792,-3.0862,-0.1677,-2.7716,2.9185,0.3146,-2.6039
45,smoke,pipe,1.0,3438,21.3846,702.0,21.6667,1301.0,22.0098,283.0,24.1468,-0.2821,-0.6252,-2.7622,-0.3431,-2.4801,-2.137
226,nodule,Perifissural,0.0,15148,94.2216,3067.0,94.6605,5387.0,91.1352,1090.0,93.0034,-0.4389,3.0864,1.2182,3.5253,1.6571,-1.8682
43,smoke,cigsmok,1.0,8166,50.7931,1648.0,50.8642,2981.0,50.4314,612.0,52.2184,-0.0711,0.3617,-1.4253,0.4328,-1.3542,-1.787
29,demo,marital,3.0,1327,8.254,315.0,9.7222,507.0,8.5772,120.0,10.2389,-1.4682,-0.3232,-1.9849,1.145,-0.5167,-1.6617
114,disease,diagdiab,1.0,1632,10.1511,360.0,11.1111,545.0,9.2201,126.0,10.7509,-0.96,0.931,-0.5998,1.891,0.3602,-1.5308
16,demo,educat,3.0,4077,25.3592,801.0,24.7222,1479.0,25.0211,307.0,26.1945,0.637,0.3381,-0.8353,-0.2989,-1.4723,-1.1734
203,lungcanc,LC_stage,110.0,625,3.8875,147.0,4.537,295.0,4.9907,72.0,6.1433,-0.6495,-1.1032,-2.2558,-0.4537,-1.6063,-1.1526
10,demo,ethnic,2.0,15779,98.1464,3200.0,98.7654,5779.0,97.7669,1159.0,98.8908,-0.619,0.3795,-0.7444,0.9985,-0.1254,-1.1239
202,lungcanc,Unclassified_carcinoma,1.0,124,0.7713,46.0,1.4198,71.0,1.2012,27.0,2.3038,-0.6485,-0.4299,-1.5325,0.2186,-0.884,-1.1026


In [33]:
# Same ranking as the categorical cell, but over the numerical summary table:
# ten largest diff_allscans_somescans rows, then the ten smallest.
display(
    num_demo_shift
    .sort_values('diff_allscans_somescans', ascending=False)
    .iloc[:10]
)
num_demo_shift.sort_values('diff_allscans_somescans').iloc[:10]

Unnamed: 0,category,attribute,value,allnodules,somenodules,allscans,somescans,diff_allnodules_somenodules,diff_allnodules_allscans,diff_allnodules_somescans,diff_somenodules_allscans,diff_somenodules_somescans,diff_allscans_somescans
34,smoke,pkyr,std,24.3554,23.2994,25.1504,23.3406,1.056,-0.795,1.0148,-1.851,-0.0412,1.8098
30,smoke,pkyr,75%,69.0,68.0,69.5,69.0,1.0,-0.5,0.0,-1.5,-1.0,0.5
33,smoke,pkyr,mean,58.114,57.6895,58.3153,58.1227,0.4245,-0.2013,-0.0087,-0.6258,-0.4332,0.1926
31,smoke,pkyr,Mean (SD),58.1 (24.4),57.7 (23.3),58.3 (25.2),58.1 (23.3),0.4245,-0.2013,-0.0087,-0.6258,-0.4332,0.1926
48,smoke,smokeday,std,11.4249,11.8276,11.7108,11.5505,-0.4027,-0.2859,-0.1256,0.1168,0.2771,0.1603
13,demo,height,std,3.9278,3.7122,3.9714,3.8432,0.2156,-0.0436,0.0846,-0.2592,-0.131,0.1282
20,demo,BMI,std,4.9761,5.0482,4.9368,4.8659,-0.0721,0.0393,0.1102,0.1114,0.1823,0.0709
40,smoke,smokeage,mean,16.5716,16.5861,16.4926,16.4241,-0.0145,0.079,0.1475,0.0935,0.162,0.0685
38,smoke,smokeage,Mean (SD),16.6 (3.5),16.6 (3.4),16.5 (3.5),16.4 (3.4),-0.0145,0.079,0.1475,0.0935,0.162,0.0685
41,smoke,smokeage,std,3.4878,3.4069,3.5105,3.4498,0.0809,-0.0227,0.038,-0.1036,-0.0429,0.0607


Unnamed: 0,category,attribute,value,allnodules,somenodules,allscans,somescans,diff_allnodules_somenodules,diff_allnodules_allscans,diff_allnodules_somescans,diff_somenodules_allscans,diff_somenodules_somescans,diff_allscans_somescans
44,smoke,smokeday,75%,35.0,32.0,35.0,40.0,3.0,0.0,-5.0,-3.0,-8.0,-5.0
0,demo,weight,25%,155.0,155.0,155.0,157.0,0.0,0.0,-2.0,0.0,-2.0,-2.0
25,demo,Age,Median (IQR),63 (8),63 (8),62 (8),63 (8),0.0,1.0,0.0,1.0,0.0,-1.0
29,smoke,pkyr,50%,51.0,51.0,51.0,52.0,0.0,0.0,-1.0,0.0,-1.0,-1.0
32,smoke,pkyr,Median (IQR),51 (28),51 (27),51 (29),52 (28),0.0,0.0,-1.0,0.0,-1.0,-1.0
22,demo,Age,50%,63.0,63.0,62.0,63.0,0.0,1.0,0.0,1.0,0.0,-1.0
5,demo,weight,mean,182.4158,182.1708,181.923,182.8019,0.245,0.4928,-0.3861,0.2478,-0.6311,-0.8789
3,demo,weight,Mean (SD),182.4 (38.9),182.2 (39.0),181.9 (39.2),182.8 (39.5),0.245,0.4928,-0.3861,0.2478,-0.6311,-0.8789
62,other,Diameter_mm,std,5.2795,5.2981,5.99,6.62,-0.0186,-0.7105,-1.3405,-0.6919,-1.3219,-0.63
57,other,Diameter_mm,50%,6.2,6.3,7.1,7.5,-0.1,-0.9,-1.3,-0.8,-1.2,-0.4


## What about men vs. women?

### Training sets

In [None]:
# Split the Sybil training split by the Gender code:
# rows with Gender == 1 are keyed "M", rows with Gender == 2 are keyed "F".
gender_train_sets = dict(
    M=sybil_splits['train'].query('Gender == 1'),
    F=sybil_splits['train'].query('Gender == 2'),
)

In [None]:
# Categorical prevalence table for the M / F training subsets, with the
# statistic columns requested (include_stat=True); rows with value == 0 are dropped.
cat_demo_gender = (
    data.combine_diff_dfs(
        nlst_democols_sybil['cat'],
        data.diffs_category_prevalence,
        gender_train_sets,
        include_stat=True,
    )
    .query('value != 0')
)
# Largest diff_M_F first, then the same table sorted from the smallest diff_M_F.
display(cat_demo_gender.sort_values('diff_M_F', ascending=False))
cat_demo_gender.sort_values('diff_M_F')

  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F,stat_M_F,p_M_F
60,smoke,pipe,1.0,6186.0,37.3280,243.0,2.0970,35.2310,88.386383,0.0
52,smoke,cigar,1.0,5465.0,32.9773,477.0,4.1163,28.8610,70.533064,0.0
14,demo,Married,1.0,12987.0,78.3671,6295.0,54.3234,24.0437,42.742945,0.0
29,demo,marital,2.0,12987.0,78.3671,6295.0,54.3234,24.0437,42.742945,0.0
201,other,wrknomask,1.0,5486.0,33.1040,1495.0,12.9013,20.2027,42.070788,0.0
...,...,...,...,...,...,...,...,...,...,...
167,canchist,canccerv,1.0,,,393.0,3.3914,,,
170,canchist,cancstom,1.0,6.0,0.0362,,,,,
176,canchist,canctran,1.0,,,7.0,0.0604,,,
182,canchist,canclary,1.0,13.0,0.0784,,,,,


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F,stat_M_F,p_M_F
32,demo,marital,5.0,2148.0,12.9616,3057.0,26.3807,-13.4191,-27.641877,3.494461e-168
30,demo,marital,3.0,566.0,3.4154,1489.0,12.8495,-9.4341,-27.634809,4.249472e-168
49,smoke,smokelive,1.0,13873.0,83.7135,10752.0,92.7856,-9.0721,-24.243129,7.813098e-130
111,disease,diagpneu,1.0,3095.0,18.6761,3183.0,27.4681,-8.7920,-17.125074,9.648654e-66
150,disease,diagchro,1.0,1132.0,6.8308,1618.0,13.9627,-7.1319,-18.921346,7.608662e-80
...,...,...,...,...,...,...,...,...,...,...
167,canchist,canccerv,1.0,,,393.0,3.3914,,,
170,canchist,cancstom,1.0,6.0,0.0362,,,,,
176,canchist,canctran,1.0,,,7.0,0.0604,,,
182,canchist,canclary,1.0,13.0,0.0784,,,,,


In [None]:
# Lung-cancer rows only ("lungcanc" category), sorted by diff_M_F in both directions.
# The sort runs before the filter, as in the original ordering of operations.
display(
    cat_demo_gender
    .sort_values('diff_M_F', ascending=False)
    .query('category == "lungcanc"')
)
cat_demo_gender.sort_values('diff_M_F').query('category == "lungcanc"')

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F,stat_M_F,p_M_F
225,lungcanc,Squamous_cell_carcinoma,1.0,250.0,1.5086,106.0,0.9147,0.5939,4.583207,5e-06
211,lungcanc,LC_stage,400.0,254.0,1.5327,130.0,1.1219,0.4108,3.006118,0.002646
207,lungcanc,LC_stage,210.0,70.0,0.4224,28.0,0.2416,0.1808,2.66004,0.007813
213,lungcanc,LC_stage,,15737.0,94.9614,10988.0,94.8222,0.1392,0.521339,0.602131
217,lungcanc,Small_cell_carcinoma,1.0,119.0,0.7181,80.0,0.6904,0.0277,0.274115,0.783996
208,lungcanc,LC_stage,220.0,29.0,0.175,18.0,0.1553,0.0197,0.401954,0.687718
215,lungcanc,Adenosquamous_carcinoma,1.0,14.0,0.0845,10.0,0.0863,-0.0018,-0.051304,0.959084
212,lungcanc,LC_stage,900.0,3.0,0.0181,3.0,0.0259,-0.0078,-0.426947,0.669418
227,lungcanc,Unclassified_carcinoma,1.0,92.0,0.5552,66.0,0.5696,-0.0144,-0.158859,0.87378
209,lungcanc,LC_stage,310.0,102.0,0.6155,74.0,0.6386,-0.0231,-0.241224,0.809382


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F,stat_M_F,p_M_F
205,lungcanc,LC_stage,110.0,284.0,1.7137,265.0,2.2868,-0.5731,-3.339789,0.000838
219,lungcanc,Bronchiolo-alveolar_carcinoma,1.0,47.0,0.2836,76.0,0.6559,-0.3723,-4.348065,1.4e-05
223,lungcanc,Adenocarcinoma,1.0,305.0,1.8405,253.0,2.1833,-0.3428,-2.001829,0.045303
210,lungcanc,LC_stage,320.0,32.0,0.1931,32.0,0.2761,-0.083,-1.395978,0.162721
206,lungcanc,LC_stage,120.0,61.0,0.3681,50.0,0.4315,-0.0634,-0.823843,0.410029
229,lungcanc,Large_cell_carcinoma,1.0,18.0,0.1086,17.0,0.1467,-0.0381,-0.869463,0.384594
221,lungcanc,Carcinoid_tumor,1.0,1.0,0.006,4.0,0.0345,-0.0285,-1.558146,0.119199
209,lungcanc,LC_stage,310.0,102.0,0.6155,74.0,0.6386,-0.0231,-0.241224,0.809382
227,lungcanc,Unclassified_carcinoma,1.0,92.0,0.5552,66.0,0.5696,-0.0144,-0.158859,0.87378
212,lungcanc,LC_stage,900.0,3.0,0.0181,3.0,0.0259,-0.0078,-0.426947,0.669418


In [None]:
# Numerical summary table for the M / F training subsets, with the
# statistic columns requested (include_stat=True).
num_demo_gender = data.combine_diff_dfs(
    nlst_democols_sybil['num'],
    data.diffs_numerical_means,
    gender_train_sets,
    include_stat=True,
)
# Ten largest diff_M_F rows, followed by the ten smallest.
display(num_demo_gender.sort_values('diff_M_F', ascending=False).iloc[:10])
num_demo_gender.sort_values('diff_M_F').iloc[:10]

Unnamed: 0,category,attribute,value,M,F,diff_M_F,stat_M_F,p_M_F
6,demo,weight,50%,195.0,157.0,38.0,85.449791,0.0
7,demo,weight,mean,198.2521,162.1246,36.1275,85.449791,0.0
10,smoke,smokeday,50%,30.0,20.0,10.0,26.078155,3.715016e-148
15,smoke,pkyr,mean,59.6811,51.097,8.5841,30.427542,4.14073e-200
14,smoke,pkyr,50%,52.0,45.0,7.0,30.427542,4.14073e-200
4,demo,height,50%,70.0,64.0,6.0,181.182113,0.0
5,demo,height,mean,70.274,64.4574,5.8166,181.182113,0.0
11,smoke,smokeday,mean,29.8936,26.3792,3.5144,26.078155,3.715016e-148
2,demo,Age,50%,61.0,60.0,1.0,7.108716,1.199168e-12
0,demo,BMI,50%,27.5461,26.5684,0.9777,12.938133,3.5265779999999996e-38


Unnamed: 0,category,attribute,value,M,F,diff_M_F,stat_M_F,p_M_F
9,smoke,smokeage,mean,16.2727,17.3593,-1.0866,-25.186671,1.915804e-138
8,smoke,smokeage,50%,16.0,17.0,-1.0,-25.186671,1.915804e-138
12,smoke,smokeyr,50%,40.0,40.0,0.0,10.415331,2.34694e-25
3,demo,Age,mean,61.8288,61.3933,0.4355,7.108716,1.199168e-12
1,demo,BMI,mean,28.1879,27.4096,0.7783,12.938133,3.5265779999999996e-38
13,smoke,smokeyr,mean,40.3346,39.4056,0.929,10.415331,2.34694e-25
0,demo,BMI,50%,27.5461,26.5684,0.9777,12.938133,3.5265779999999996e-38
2,demo,Age,50%,61.0,60.0,1.0,7.108716,1.199168e-12
11,smoke,smokeday,mean,29.8936,26.3792,3.5144,26.078155,3.715016e-148
5,demo,height,mean,70.274,64.4574,5.8166,181.182113,0.0


In [None]:
# Training-set gender differences restricted to the attributes listed in
# sybil_worse_df['col'], keeping only value == 1 rows, smallest diff_M_F first.
(
    cat_demo_gender
    .loc[cat_demo_gender['attribute'].isin(set(sybil_worse_df['col']))]
    .query('value == 1')
    .sort_values('diff_M_F')
)

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F,stat_M_F,p_M_F
111,disease,diagpneu,1.0,3095.0,18.6761,3183.0,27.4681,-8.792,-17.125074,9.648654000000002e-66
135,disease,diagadas,1.0,658.0,3.9706,1072.0,9.2509,-5.2803,-17.090989,1.7321139999999999e-65
197,other,FamilyHistoryLungCa,1.0,3476.0,20.9751,2756.0,23.7832,-2.8081,-5.545105,2.93777e-08
138,disease,diagcopd,1.0,722.0,4.3567,755.0,6.5154,-2.1587,-7.743676,9.658305e-15
203,other,Emphysema,1.0,1444.0,8.7135,873.0,7.5337,1.1798,3.588256,0.000332897
66,work,wrkfoun,1.0,1139.0,6.873,94.0,0.8112,6.0618,28.397602,2.1647980000000002e-177
72,work,wrkasbe,1.0,1256.0,7.579,65.0,0.5609,7.0181,32.344239,1.67183e-229
201,other,wrknomask,1.0,5486.0,33.104,1495.0,12.9013,20.2027,42.070788,0.0
14,demo,Married,1.0,12987.0,78.3671,6295.0,54.3234,24.0437,42.742945,0.0
11,demo,Gender,1.0,16572.0,100.0,,,,,


### Evaluation sets (Kiran data)

In [None]:
# Split the Kiran evaluation data by the Gender code:
# rows with Gender == 1 are keyed "M", rows with Gender == 2 are keyed "F".
gender_eval_sets = dict(
    M=kiran_data.query('Gender == 1'),
    F=kiran_data.query('Gender == 2'),
)

In [None]:
# Categorical prevalence table for the M / F evaluation subsets;
# rows with value == 0 are dropped.
cat_gender_eval = (
    data.combine_diff_dfs(
        nlst_democols_val['cat'],
        data.diffs_category_prevalence,
        gender_eval_sets,
    )
    .query('value != 0')
)
# Up to 40 rows with the largest diff_M_F, then up to 40 with the smallest.
display(cat_gender_eval.sort_values('diff_M_F', ascending=False).iloc[:40])
cat_gender_eval.sort_values('diff_M_F').iloc[:40]

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
51,smoke,pipe,1.0,2133.0,36.5615,88.0,2.0235,34.538
43,smoke,cigar,1.0,1795.0,30.7679,139.0,3.1961,27.5718
24,demo,marital,2.0,4531.0,77.6654,2324.0,53.4376,24.2278
13,demo,Married,True,4531.0,77.6654,2324.0,53.4376,24.2278
210,other,wrknomask,True,2005.0,34.3675,577.0,13.2674,21.1001
1,demo,Overweight,True,4318.0,74.0144,2645.0,60.8186,13.1958
72,work,wrkfarm,1.0,907.0,15.5468,262.0,6.0244,9.5224
202,nodule,Solid,True,4074.0,69.832,2626.0,60.3817,9.4503
120,disease,diaghear,1.0,983.0,16.8495,349.0,8.0248,8.8247
81,work,wrkweld,1.0,552.0,9.4618,44.0,1.0117,8.4501


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
27,demo,marital,5.0,797.0,13.6613,1107.0,25.4541,-11.7928
25,demo,marital,3.0,205.0,3.5139,659.0,15.1529,-11.639
102,disease,diagpneu,1.0,1086.0,18.615,1270.0,29.2021,-10.5871
40,smoke,smokelive,1.0,4887.0,83.7676,4078.0,93.7687,-10.0011
188,nodule,GroundGlassOpacity,True,1406.0,24.1001,1453.0,33.41,-9.3099
141,disease,diagchro,1.0,393.0,6.7364,671.0,15.4288,-8.6924
126,disease,diagadas,1.0,205.0,3.5139,461.0,10.6001,-7.0862
4,demo,educat,3.0,1284.0,22.0089,1254.0,28.8342,-6.8253
208,other,PersonalCancerHist,True,120.0,2.0569,357.0,8.2088,-6.1519
206,other,FamilyHistoryLungCa,True,1384.0,23.723,1238.0,28.4663,-4.7433


In [None]:
# Nodule-category rows of the evaluation gender table, sorted by diff_M_F
# in descending and then ascending order (at most 40 rows each).
display(
    cat_gender_eval
    .query('category == "nodule"')
    .sort_values('diff_M_F', ascending=False)
    .iloc[:40]
)
(
    cat_gender_eval
    .query('category == "nodule"')
    .sort_values('diff_M_F')
    .iloc[:40]
)

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
202,nodule,Solid,True,4074.0,69.832,2626.0,60.3817,9.4503
192,nodule,Perifissural,True,564.0,9.6675,261.0,6.0014,3.6661
196,nodule,Calcified,True,5.0,0.0857,3.0,0.069,0.0167
198,nodule,Spiculation,True,10.0,0.1714,10.0,0.2299,-0.0585
194,nodule,NonSolid,True,66.0,1.1313,58.0,1.3336,-0.2023
200,nodule,PartSolid,True,101.0,1.7312,139.0,3.1961,-1.4649
204,nodule,SemiSolid,True,666.0,11.4158,640.0,14.716,-3.3002
190,nodule,NoduleInUpperLung,True,2919.0,50.0343,2343.0,53.8745,-3.8402
188,nodule,GroundGlassOpacity,True,1406.0,24.1001,1453.0,33.41,-9.3099


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
188,nodule,GroundGlassOpacity,True,1406.0,24.1001,1453.0,33.41,-9.3099
190,nodule,NoduleInUpperLung,True,2919.0,50.0343,2343.0,53.8745,-3.8402
204,nodule,SemiSolid,True,666.0,11.4158,640.0,14.716,-3.3002
200,nodule,PartSolid,True,101.0,1.7312,139.0,3.1961,-1.4649
194,nodule,NonSolid,True,66.0,1.1313,58.0,1.3336,-0.2023
198,nodule,Spiculation,True,10.0,0.1714,10.0,0.2299,-0.0585
196,nodule,Calcified,True,5.0,0.0857,3.0,0.069,0.0167
192,nodule,Perifissural,True,564.0,9.6675,261.0,6.0014,3.6661
202,nodule,Solid,True,4074.0,69.832,2626.0,60.3817,9.4503


In [None]:
# LC_stage rows of the evaluation gender table, sorted by diff_M_F
# in descending and then ascending order (at most 40 rows each).
display(
    cat_gender_eval
    .query('attribute == "LC_stage"')
    .sort_values('diff_M_F', ascending=False)
    .iloc[:40]
)
(
    cat_gender_eval
    .query('attribute == "LC_stage"')
    .sort_values('diff_M_F')
    .iloc[:40]
)

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
221,lungcanc,LC_stage,400.0,125.0,2.1426,44.0,1.0117,1.1309
217,lungcanc,LC_stage,210.0,65.0,1.1142,19.0,0.4369,0.6773
220,lungcanc,LC_stage,320.0,20.0,0.3428,11.0,0.2529,0.0899
219,lungcanc,LC_stage,310.0,78.0,1.337,58.0,1.3336,0.0034
222,lungcanc,LC_stage,900.0,1.0,0.0171,3.0,0.069,-0.0519
216,lungcanc,LC_stage,120.0,61.0,1.0456,51.0,1.1727,-0.1271
218,lungcanc,LC_stage,220.0,18.0,0.3085,22.0,0.5059,-0.1974
223,lungcanc,LC_stage,,5154.0,88.3442,3857.0,88.6871,-0.3429
215,lungcanc,LC_stage,110.0,312.0,5.348,284.0,6.5302,-1.1822


Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
215,lungcanc,LC_stage,110.0,312.0,5.348,284.0,6.5302,-1.1822
223,lungcanc,LC_stage,,5154.0,88.3442,3857.0,88.6871,-0.3429
218,lungcanc,LC_stage,220.0,18.0,0.3085,22.0,0.5059,-0.1974
216,lungcanc,LC_stage,120.0,61.0,1.0456,51.0,1.1727,-0.1271
222,lungcanc,LC_stage,900.0,1.0,0.0171,3.0,0.069,-0.0519
219,lungcanc,LC_stage,310.0,78.0,1.337,58.0,1.3336,0.0034
220,lungcanc,LC_stage,320.0,20.0,0.3428,11.0,0.2529,0.0899
217,lungcanc,LC_stage,210.0,65.0,1.1142,19.0,0.4369,0.6773
221,lungcanc,LC_stage,400.0,125.0,2.1426,44.0,1.0117,1.1309


In [None]:
# Numerical summary table for the M / F evaluation subsets.
num_gender_eval = data.combine_diff_dfs(
    nlst_democols_val['num'],
    data.diffs_numerical_means,
    gender_eval_sets,
)
# Ten largest diff_M_F rows, followed by the ten smallest.
display(num_gender_eval.sort_values('diff_M_F', ascending=False).iloc[:10])
num_gender_eval.sort_values('diff_M_F').iloc[:10]

Unnamed: 0,category,attribute,value,M,F,diff_M_F
6,demo,weight,50%,193.0,157.0,36.0
7,demo,weight,mean,196.796,161.7294,35.0666
20,other,SliceCount,mean,174.9587,163.9939,10.9648
10,smoke,smokeday,50%,30.0,20.0,10.0
19,other,SliceCount,50%,165.0,155.0,10.0
15,smoke,pkyr,mean,62.0469,52.5078,9.5391
14,smoke,pkyr,50%,55.0,46.0,9.0
4,demo,height,50%,70.0,64.0,6.0
5,demo,height,mean,70.4206,64.4917,5.9289
11,smoke,smokeday,mean,30.2072,26.3831,3.8241


Unnamed: 0,category,attribute,value,M,F,diff_M_F
9,smoke,smokeage,mean,16.1409,17.2138,-1.0729
8,smoke,smokeage,50%,16.0,17.0,-1.0
16,other,NoduleCounts,mean,1.8819,1.8857,-0.0038
17,other,Diameter_mm,50%,7.3,7.2,0.1
18,other,Diameter_mm,mean,9.4266,8.9452,0.4814
1,demo,BMI,mean,27.8551,27.323,0.5321
3,demo,Age,mean,63.4344,62.8972,0.5372
0,demo,BMI,50%,27.2592,26.518,0.7412
2,demo,Age,50%,63.0,62.0,1.0
12,smoke,smokeyr,50%,41.0,40.0,1.0


In [None]:
# Evaluation-set gender differences restricted to the attributes listed in
# sybil_worse_df['col'], keeping only value == 1 rows, smallest diff_M_F first.
(
    cat_gender_eval
    .loc[cat_gender_eval['attribute'].isin(set(sybil_worse_df['col']))]
    .query('value == 1')
    .sort_values('diff_M_F')
)

Unnamed: 0,category,attribute,value,M_freq,M_norm,F_freq,F_norm,diff_M_F
102,disease,diagpneu,1.0,1086.0,18.615,1270.0,29.2021,-10.5871
126,disease,diagadas,1.0,205.0,3.5139,461.0,10.6001,-7.0862
206,other,FamilyHistoryLungCa,True,1384.0,23.723,1238.0,28.4663,-4.7433
129,disease,diagcopd,1.0,336.0,5.7593,340.0,7.8179,-2.0586
212,other,Emphysema,1,2242.0,38.4299,1455.0,33.456,4.9739
57,work,wrkfoun,1.0,412.0,7.0621,29.0,0.6668,6.3953
63,work,wrkasbe,1.0,453.0,7.7648,28.0,0.6438,7.121
210,other,wrknomask,True,2005.0,34.3675,577.0,13.2674,21.1001
13,demo,Married,True,4531.0,77.6654,2324.0,53.4376,24.2278
10,demo,Gender,1,5834.0,100.0,,,


## What about BMI?

### Sybil training set

In [None]:
# Partition the Sybil training split on the Overweight flag (1 vs. 0).
sybil_train_over, sybil_train_normal = (
    sybil_splits['train'].query('Overweight == 1'),
    sybil_splits['train'].query('Overweight == 0'),
)

# Named subsets in the order they are compared downstream ("over" first, then "normal").
overweight_train_sets = dict(over=sybil_train_over, normal=sybil_train_normal)

In [None]:
# Categorical prevalence table for the over / normal training subsets;
# rows with value == 0 are dropped.
cat_demo_overweight = (
    data.combine_diff_dfs(
        nlst_democols_sybil['cat'],
        data.diffs_category_prevalence,
        overweight_train_sets,
    )
    .query('value != 0')
)
# Up to 40 rows with the largest diff_over_normal, then up to 40 with the smallest.
display(cat_demo_overweight.sort_values('diff_over_normal', ascending=False).iloc[:40])
cat_demo_overweight.sort_values('diff_over_normal').iloc[:40]

Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
11,demo,Gender,1.0,12764.0,64.1472,3808.0,46.0905,18.0567
147,disease,diaghype,1.0,7824.0,39.3205,2032.0,24.5945,14.726
132,disease,diagdiab,1.0,2441.0,12.2676,340.0,4.1152,8.1524
14,demo,Married,1.0,14090.0,70.8111,5192.0,62.8419,7.9692
29,demo,marital,2.0,14090.0,70.8111,5192.0,62.8419,7.9692
60,smoke,pipe,1.0,4998.0,25.1181,1431.0,17.3203,7.7978
201,other,wrknomask,1.0,5338.0,26.8268,1643.0,19.8862,6.9406
52,smoke,cigar,1.0,4601.0,23.1229,1341.0,16.2309,6.892
129,disease,diaghear,1.0,2893.0,14.5391,761.0,9.2108,5.3283
81,work,wrkfarm,1.0,2405.0,12.0866,713.0,8.6299,3.4567


Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
55,smoke,cigsmok,1.0,8339.0,41.9087,5047.0,61.0869,-19.1782
12,demo,Gender,2.0,7134.0,35.8528,4454.0,53.9095,-18.0567
32,demo,marital,5.0,3347.0,16.8208,1858.0,22.4885,-5.6677
138,disease,diagcopd,1.0,925.0,4.6487,552.0,6.6812,-2.0325
117,disease,diagemph,1.0,1533.0,7.7043,784.0,9.4892,-1.7849
203,other,Emphysema,1.0,1533.0,7.7043,784.0,9.4892,-1.7849
49,smoke,smokelive,1.0,17307.0,86.9786,7318.0,88.5742,-1.5956
8,demo,educat,7.0,2732.0,13.73,1244.0,15.0569,-1.3269
28,demo,marital,1.0,835.0,4.1964,439.0,5.3135,-1.1171
223,lungcanc,Adenocarcinoma,1.0,333.0,1.6735,225.0,2.7233,-1.0498


In [None]:
# Lung-cancer rows only ("lungcanc" category), sorted by diff_over_normal in both
# directions. The sort runs before the filter, as in the original ordering of operations.
display(
    cat_demo_overweight
    .sort_values('diff_over_normal', ascending=False)
    .query('category == "lungcanc"')
)
cat_demo_overweight.sort_values('diff_over_normal').query('category == "lungcanc"')

Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
213,lungcanc,LC_stage,,18989.0,95.4317,7736.0,93.6335,1.7982
215,lungcanc,Adenosquamous_carcinoma,1.0,20.0,0.1005,4.0,0.0484,0.0521
210,lungcanc,LC_stage,320.0,46.0,0.2312,18.0,0.2179,0.0133
221,lungcanc,Carcinoid_tumor,1.0,4.0,0.0201,1.0,0.0121,0.008
212,lungcanc,LC_stage,900.0,3.0,0.0151,3.0,0.0363,-0.0212
208,lungcanc,LC_stage,220.0,27.0,0.1357,20.0,0.2421,-0.1064
207,lungcanc,LC_stage,210.0,63.0,0.3166,35.0,0.4236,-0.107
229,lungcanc,Large_cell_carcinoma,1.0,18.0,0.0905,17.0,0.2058,-0.1153
209,lungcanc,LC_stage,310.0,117.0,0.588,59.0,0.7141,-0.1261
206,lungcanc,LC_stage,120.0,70.0,0.3518,41.0,0.4962,-0.1444


Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
223,lungcanc,Adenocarcinoma,1.0,333.0,1.6735,225.0,2.7233,-1.0498
205,lungcanc,LC_stage,110.0,342.0,1.7188,207.0,2.5054,-0.7866
211,lungcanc,LC_stage,400.0,241.0,1.2112,143.0,1.7308,-0.5196
227,lungcanc,Unclassified_carcinoma,1.0,98.0,0.4925,60.0,0.7262,-0.2337
219,lungcanc,Bronchiolo-alveolar_carcinoma,1.0,75.0,0.3769,48.0,0.581,-0.2041
217,lungcanc,Small_cell_carcinoma,1.0,131.0,0.6584,68.0,0.823,-0.1646
225,lungcanc,Squamous_cell_carcinoma,1.0,243.0,1.2212,113.0,1.3677,-0.1465
206,lungcanc,LC_stage,120.0,70.0,0.3518,41.0,0.4962,-0.1444
209,lungcanc,LC_stage,310.0,117.0,0.588,59.0,0.7141,-0.1261
229,lungcanc,Large_cell_carcinoma,1.0,18.0,0.0905,17.0,0.2058,-0.1153


In [None]:
# Numerical summary table for the over / normal training subsets.
num_demo_overweight = data.combine_diff_dfs(
    nlst_democols_sybil['num'],
    data.diffs_numerical_means,
    overweight_train_sets,
)
# Ten largest diff_over_normal rows, followed by the ten smallest.
display(num_demo_overweight.sort_values('diff_over_normal', ascending=False).iloc[:10])
num_demo_overweight.sort_values('diff_over_normal').iloc[:10]

Unnamed: 0,category,attribute,value,over,normal,diff_over_normal
7,demo,weight,mean,198.7192,146.2254,52.4938
6,demo,weight,50%,195.0,145.0,50.0
10,smoke,smokeday,50%,30.0,20.0,10.0
1,demo,BMI,mean,30.0004,22.6737,7.3267
0,demo,BMI,50%,28.941,23.026,5.915
15,smoke,pkyr,mean,56.9631,54.1872,2.7759
11,smoke,smokeday,mean,29.1976,26.6405,2.5571
14,smoke,pkyr,50%,49.5,47.5,2.0
5,demo,height,mean,68.1749,67.1694,1.0055
4,demo,height,50%,68.0,67.0,1.0


Unnamed: 0,category,attribute,value,over,normal,diff_over_normal
13,smoke,smokeyr,mean,39.4865,41.0742,-1.5877
8,smoke,smokeage,50%,16.0,17.0,-1.0
12,smoke,smokeyr,50%,40.0,41.0,-1.0
9,smoke,smokeage,mean,16.6196,16.9614,-0.3418
3,demo,Age,mean,61.5771,61.8241,-0.247
2,demo,Age,50%,61.0,61.0,0.0
4,demo,height,50%,68.0,67.0,1.0
5,demo,height,mean,68.1749,67.1694,1.0055
14,smoke,pkyr,50%,49.5,47.5,2.0
11,smoke,smokeday,mean,29.1976,26.6405,2.5571


### Evaluation set (Kiran data)

In [None]:
# Partition the Kiran evaluation data on the Overweight flag (1 vs. 0).
kiran_data_over, kiran_data_normal = (
    kiran_data.query('Overweight == 1'),
    kiran_data.query('Overweight == 0'),
)

# Named subsets in the order they are compared downstream ("over" first, then "normal").
overweight_eval_sets = dict(over=kiran_data_over, normal=kiran_data_normal)

In [None]:
# Categorical prevalence table for the over / normal evaluation subsets;
# rows with value == 0 are dropped.
cat_eval_overweight = (
    data.combine_diff_dfs(
        nlst_democols_val['cat'],
        data.diffs_category_prevalence,
        overweight_eval_sets,
    )
    .query('value != 0')
)
# Up to 40 rows with the largest diff_over_normal, then up to 40 with the smallest.
display(cat_eval_overweight.sort_values('diff_over_normal', ascending=False).iloc[:40])
cat_eval_overweight.sort_values('diff_over_normal').iloc[:40]

Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
10,demo,Gender,1,4318.0,62.0135,1516.0,47.0807,14.9328
138,disease,diaghype,1.0,2749.0,39.4801,813.0,25.2484,14.2317
24,demo,marital,2.0,4878.0,70.056,1977.0,61.3975,8.6585
13,demo,Married,True,4878.0,70.056,1977.0,61.3975,8.6585
123,disease,diagdiab,1.0,867.0,12.4515,124.0,3.8509,8.6006
51,smoke,pipe,1.0,1679.0,24.1132,542.0,16.8323,7.2809
43,smoke,cigar,1.0,1473.0,21.1547,461.0,14.3168,6.8379
120,disease,diaghear,1.0,1051.0,15.0941,281.0,8.7267,6.3674
210,other,wrknomask,True,1835.0,26.3536,747.0,23.1988,3.1548
192,nodule,Perifissural,True,625.0,8.976,200.0,6.2112,2.7648


Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
46,smoke,cigsmok,1,3084.0,44.2913,2004.0,62.236,-17.9447
212,other,Emphysema,1,2195.0,31.5238,1502.0,46.646,-15.1222
11,demo,Gender,2,2645.0,37.9865,1704.0,52.9193,-14.9328
27,demo,marital,5.0,1176.0,16.8893,728.0,22.6087,-5.7194
190,nodule,NoduleInUpperLung,True,3499.0,50.2513,1763.0,54.7516,-4.5003
102,disease,diagpneu,1.0,1522.0,21.8584,834.0,25.9006,-4.0422
108,disease,diagemph,1.0,629.0,9.0335,393.0,12.205,-3.1715
129,disease,diagcopd,1.0,397.0,5.7016,279.0,8.6646,-2.963
204,nodule,SemiSolid,True,831.0,11.9345,475.0,14.7516,-2.8171
214,other,label,1,768.0,11.0297,431.0,13.3851,-2.3554


In [None]:
# Nodule-category rows only, sorted by diff_over_normal in both directions.
# The sort runs before the filter, as in the original ordering of operations.
display(
    cat_eval_overweight
    .sort_values('diff_over_normal', ascending=False)
    .query('category == "nodule"')
)
cat_eval_overweight.sort_values('diff_over_normal').query('category == "nodule"')

Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
192,nodule,Perifissural,True,625.0,8.976,200.0,6.2112,2.7648
188,nodule,GroundGlassOpacity,True,1989.0,28.5653,870.0,27.0186,1.5467
194,nodule,NonSolid,True,89.0,1.2782,35.0,1.087,0.1912
196,nodule,Calcified,True,3.0,0.0431,5.0,0.1553,-0.1122
202,nodule,Solid,True,4577.0,65.7332,2123.0,65.9317,-0.1985
198,nodule,Spiculation,True,7.0,0.1005,13.0,0.4037,-0.3032
200,nodule,PartSolid,True,138.0,1.9819,102.0,3.1677,-1.1858
204,nodule,SemiSolid,True,831.0,11.9345,475.0,14.7516,-2.8171
190,nodule,NoduleInUpperLung,True,3499.0,50.2513,1763.0,54.7516,-4.5003


Unnamed: 0,category,attribute,value,over_freq,over_norm,normal_freq,normal_norm,diff_over_normal
190,nodule,NoduleInUpperLung,True,3499.0,50.2513,1763.0,54.7516,-4.5003
204,nodule,SemiSolid,True,831.0,11.9345,475.0,14.7516,-2.8171
200,nodule,PartSolid,True,138.0,1.9819,102.0,3.1677,-1.1858
198,nodule,Spiculation,True,7.0,0.1005,13.0,0.4037,-0.3032
202,nodule,Solid,True,4577.0,65.7332,2123.0,65.9317,-0.1985
196,nodule,Calcified,True,3.0,0.0431,5.0,0.1553,-0.1122
194,nodule,NonSolid,True,89.0,1.2782,35.0,1.087,0.1912
188,nodule,GroundGlassOpacity,True,1989.0,28.5653,870.0,27.0186,1.5467
192,nodule,Perifissural,True,625.0,8.976,200.0,6.2112,2.7648


In [None]:
# Numerical summary table for the over / normal evaluation subsets.
num_eval_overweight = data.combine_diff_dfs(
    nlst_democols_val['num'],
    data.diffs_numerical_means,
    overweight_eval_sets,
)
# Ten largest diff_over_normal rows, followed by the ten smallest.
display(num_eval_overweight.sort_values('diff_over_normal', ascending=False).iloc[:10])
num_eval_overweight.sort_values('diff_over_normal').iloc[:10]

Unnamed: 0,category,attribute,value,over,normal,diff_over_normal
7,demo,weight,mean,198.3737,145.7502,52.6235
6,demo,weight,50%,195.0,145.0,50.0
10,smoke,smokeday,50%,28.0,20.0,8.0
1,demo,BMI,mean,29.9508,22.5585,7.3923
0,demo,BMI,50%,28.9719,22.9619,6.01
11,smoke,smokeday,mean,29.2309,27.1534,2.0775
15,smoke,pkyr,mean,58.4821,56.8718,1.6103
4,demo,height,50%,68.0,67.0,1.0
14,smoke,pkyr,50%,51.0,50.0,1.0
5,demo,height,mean,68.1811,67.2473,0.9338


Unnamed: 0,category,attribute,value,over,normal,diff_over_normal
20,other,SliceCount,mean,168.8502,173.6024,-4.7522
19,other,SliceCount,50%,160.0,163.0,-3.0
12,smoke,smokeyr,50%,40.0,42.0,-2.0
13,smoke,smokeyr,mean,40.4237,42.2637,-1.84
2,demo,Age,50%,62.0,63.0,-1.0
18,other,Diameter_mm,mean,9.012,9.6729,-0.6609
3,demo,Age,mean,63.0955,63.4416,-0.3461
9,smoke,smokeage,mean,16.4968,16.8205,-0.3237
17,other,Diameter_mm,50%,7.2,7.4,-0.2
16,other,NoduleCounts,mean,1.8827,1.8854,-0.0027


## What about race?

### Kiran data

In [None]:
# Select the two race groups compared below: race == 1 and race == 2.
kiran_data_white, kiran_data_black = (
    kiran_data.query('race == 1'),
    kiran_data.query('race == 2'),
)

# Named subsets in comparison order ("white" first, then "black").
race_kiran_sets = dict(white=kiran_data_white, black=kiran_data_black)

In [None]:
# Categorical prevalence table for the white / black evaluation subsets, with the
# statistic columns requested (include_stat=True); rows with value == 0 are dropped.
# NOTE(review): this cell reads nlst_democols, while the other Kiran evaluation cells
# in this notebook read nlst_democols_val — confirm that is intended.
cat_race_kiran = (
    data.combine_diff_dfs(
        nlst_democols['cat'],
        data.diffs_category_prevalence,
        race_kiran_sets,
        include_stat=True,
    )
    .query('value != 0')
)
# Up to 40 rows with the largest diff_white_black, then up to 40 with the smallest.
display(cat_race_kiran.sort_values('diff_white_black', ascending=False).iloc[:40])
cat_race_kiran.sort_values('diff_white_black').iloc[:40]

  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)
  odds_ratio = p1 / (1 - p1) / p2 * (1 - p2)


Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
13,demo,Married,True,6464.0,68.3226,127.0,37.574,30.7486,11.484578,1.577044e-30
23,demo,marital,2.0,6464.0,68.3226,127.0,37.574,30.7486,11.484578,1.577044e-30
45,smoke,pipe,1.0,2147.0,22.6932,27.0,7.9882,14.705,9.572134,1.047217e-21
10,demo,Gender,1,5432.0,57.4146,149.0,44.0828,13.3318,4.851527,1.225147e-06
62,work,wrkfarm,1.0,1125.0,11.8909,9.0,2.6627,9.2282,9.851018,6.785323000000001e-23
4,demo,educat,3.0,2410.0,25.473,61.0,18.0473,7.4257,3.471116,0.0005182993
37,smoke,cigar,1.0,1838.0,19.4271,41.0,12.1302,7.2969,4.005365,6.192181e-05
5,demo,educat,4.0,1345.0,14.2163,28.0,8.284,5.9323,3.847914,0.000119128
239,scanner,ManufacturersModelName,Mx8000,599.0,6.3313,3.0,0.8876,5.4437,9.579139,9.785698e-22
7,demo,educat,6.0,1493.0,15.7806,35.0,10.355,5.4256,3.193234,0.001406887


Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
121,disease,diaghype,1.0,3203.0,33.8548,195.0,57.6923,-23.8375,-8.728667,2.5769140000000002e-18
40,smoke,cigsmok,1,4665.0,49.3077,232.0,68.6391,-19.3314,-7.506103,6.091341e-14
19,demo,Unfinished_ed,True,2547.0,26.921,152.0,44.9704,-18.0494,-6.577752,4.776144e-11
26,demo,marital,5.0,1715.0,18.127,114.0,33.7278,-15.6008,-5.995891,2.023732e-09
11,demo,Gender,2,4029.0,42.5854,189.0,55.9172,-13.3318,-4.851527,1.225147e-06
223,scanner,Manufacturer,SIEMENS,3528.0,37.2899,166.0,49.1124,-11.8225,-4.27687,1.895395e-05
106,disease,diagdiab,1.0,831.0,8.7834,67.0,19.8225,-11.0391,-5.045557,4.522031e-07
243,scanner,ManufacturersModelName,Sensation 16,1196.0,12.6414,77.0,22.7811,-10.1397,-4.395599,1.104677e-05
3,demo,educat,2.0,454.0,4.7986,48.0,14.2012,-9.4026,-4.919388,8.681501e-07
6,demo,educat,5.0,2093.0,22.1224,104.0,30.7692,-8.6468,-3.395645,0.0006846721


In [None]:
# Same white-vs-black comparison, restricted to rows in the "nodule" category.
# Rows with a missing difference sort to the end in both views.
display(
    cat_race_kiran
    .query('category == "nodule"')
    .sort_values('diff_white_black', ascending=False)
    .head(40)
)
(
    cat_race_kiran
    .query('category == "nodule"')
    .sort_values('diff_white_black')
    .head(40)
)

Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
175,nodule,Perifissural,True,779.0,8.2338,18.0,5.3254,2.9084,2.319991,0.020341
185,nodule,Solid,True,6275.0,66.3249,218.0,64.497,1.8279,0.690341,0.48998
183,nodule,PartSolid,True,226.0,2.3888,5.0,1.4793,0.9095,1.34705,0.177964
177,nodule,NonSolid,True,120.0,1.2684,3.0,0.8876,0.3808,0.728127,0.466536
173,nodule,NoduleInUpperLung,True,4879.0,51.5696,179.0,52.9586,-1.389,-0.502695,0.615179
171,nodule,GroundGlassOpacity,True,2632.0,27.8195,99.0,29.2899,-1.4704,-0.584011,0.559213
187,nodule,SemiSolid,True,1194.0,12.6202,50.0,14.7929,-2.1727,-1.107909,0.267901
179,nodule,Calcified,True,8.0,0.0846,,,,,
181,nodule,Spiculation,True,19.0,0.2008,,,,,


Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
187,nodule,SemiSolid,True,1194.0,12.6202,50.0,14.7929,-2.1727,-1.107909,0.267901
171,nodule,GroundGlassOpacity,True,2632.0,27.8195,99.0,29.2899,-1.4704,-0.584011,0.559213
173,nodule,NoduleInUpperLung,True,4879.0,51.5696,179.0,52.9586,-1.389,-0.502695,0.615179
177,nodule,NonSolid,True,120.0,1.2684,3.0,0.8876,0.3808,0.728127,0.466536
183,nodule,PartSolid,True,226.0,2.3888,5.0,1.4793,0.9095,1.34705,0.177964
185,nodule,Solid,True,6275.0,66.3249,218.0,64.497,1.8279,0.690341,0.48998
175,nodule,Perifissural,True,779.0,8.2338,18.0,5.3254,2.9084,2.319991,0.020341
179,nodule,Calcified,True,8.0,0.0846,,,,,
181,nodule,Spiculation,True,19.0,0.2008,,,,,


In [None]:
# Same white-vs-black comparison, restricted to the `LC_stage` attribute.
# Rows with a missing difference sort to the end in both views.
display(
    cat_race_kiran
    .query('attribute == "LC_stage"')
    .sort_values('diff_white_black', ascending=False)
    .head(40)
)
(
    cat_race_kiran
    .query('attribute == "LC_stage"')
    .sort_values('diff_white_black')
    .head(40)
)

Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
204,lungcanc,LC_stage,,8387.0,88.6481,282.0,83.432,5.2161,2.546442,0.010883
197,lungcanc,LC_stage,120.0,106.0,1.1204,4.0,1.1834,-0.063,-0.10541,0.916051
198,lungcanc,LC_stage,210.0,77.0,0.8139,3.0,0.8876,-0.0737,-0.142165,0.88695
199,lungcanc,LC_stage,220.0,36.0,0.3805,2.0,0.5917,-0.2112,-0.500559,0.616682
202,lungcanc,LC_stage,400.0,157.0,1.6594,7.0,2.071,-0.4116,-0.523833,0.600395
200,lungcanc,LC_stage,310.0,119.0,1.2578,6.0,1.7751,-0.5173,-0.711313,0.47689
201,lungcanc,LC_stage,320.0,26.0,0.2748,5.0,1.4793,-1.2045,-1.828154,0.067526
196,lungcanc,LC_stage,110.0,549.0,5.8028,29.0,8.5799,-2.7771,-1.800737,0.071744
203,lungcanc,LC_stage,900.0,4.0,0.0423,,,,,


Unnamed: 0,category,attribute,value,white_freq,white_norm,black_freq,black_norm,diff_white_black,stat_white_black,p_white_black
196,lungcanc,LC_stage,110.0,549.0,5.8028,29.0,8.5799,-2.7771,-1.800737,0.071744
201,lungcanc,LC_stage,320.0,26.0,0.2748,5.0,1.4793,-1.2045,-1.828154,0.067526
200,lungcanc,LC_stage,310.0,119.0,1.2578,6.0,1.7751,-0.5173,-0.711313,0.47689
202,lungcanc,LC_stage,400.0,157.0,1.6594,7.0,2.071,-0.4116,-0.523833,0.600395
199,lungcanc,LC_stage,220.0,36.0,0.3805,2.0,0.5917,-0.2112,-0.500559,0.616682
198,lungcanc,LC_stage,210.0,77.0,0.8139,3.0,0.8876,-0.0737,-0.142165,0.88695
197,lungcanc,LC_stage,120.0,106.0,1.1204,4.0,1.1834,-0.063,-0.10541,0.916051
204,lungcanc,LC_stage,,8387.0,88.6481,282.0,83.432,5.2161,2.546442,0.010883
203,lungcanc,LC_stage,900.0,4.0,0.0423,,,,,


In [None]:
# Compare numerical attributes (mean and median rows) between the white and
# black subsets. The stat / p-value columns are repeated across an attribute's
# `mean` and `50%` rows in the output.
# NOTE(review): this cell reads `nlst_democols_val` while the categorical
# comparison reads `nlst_democols` -- confirm the different source is intended.
num_race_kiran = data.combine_diff_dfs(
    nlst_democols_val['num'],
    data.diffs_numerical_means,
    race_kiran_sets,
    include_stat=True,
)

# Ten largest differences in each direction.
display(
    num_race_kiran
    .sort_values('diff_white_black', ascending=False)
    .head(10)
)
(
    num_race_kiran
    .sort_values('diff_white_black')
    .head(10)
)

Unnamed: 0,category,attribute,value,white,black,diff_white_black,stat_white_black,p_white_black
15,smoke,pkyr,mean,58.2982,49.4726,8.8256,6.577601,5.02514e-11
14,smoke,pkyr,50%,51.0,44.0,7.0,6.577601,5.02514e-11
11,smoke,smokeday,mean,28.7498,23.3994,5.3504,8.534595,1.6152390000000002e-17
10,smoke,smokeday,50%,25.0,20.0,5.0,8.534595,1.6152390000000002e-17
6,demo,weight,50%,180.0,178.0,2.0,0.562112,0.5740526
7,demo,weight,mean,182.0639,180.8542,1.2097,0.562112,0.5740526
2,demo,Age,50%,63.0,62.0,1.0,2.191158,0.02846383
4,demo,height,50%,68.0,67.0,1.0,3.412383,0.0006465818
5,demo,height,mean,67.9504,67.2047,0.7457,3.412383,0.0006465818
3,demo,Age,mean,63.2459,62.6095,0.6364,2.191158,0.02846383


Unnamed: 0,category,attribute,value,white,black,diff_white_black,stat_white_black,p_white_black
20,other,SliceCount,mean,170.4577,180.2394,-9.7817,-2.632583,0.008496641
13,smoke,smokeyr,mean,40.9904,42.3077,-1.3173,-3.247603,0.001167731
12,smoke,smokeyr,50%,41.0,42.0,-1.0,-3.247603,0.001167731
8,smoke,smokeage,50%,16.0,17.0,-1.0,-1.577698,0.1146672
1,demo,BMI,mean,27.6099,28.3226,-0.7127,-2.577897,0.009954865
18,other,Diameter_mm,mean,9.2075,9.8432,-0.6357,-1.900713,0.05736896
17,other,Diameter_mm,50%,7.2,7.65,-0.45,-1.900713,0.05736896
9,smoke,smokeage,mean,16.5805,16.8876,-0.3071,-1.577698,0.1146672
19,other,SliceCount,50%,161.0,161.0,0.0,-2.632583,0.008496641
16,other,NoduleCounts,mean,1.8937,1.7929,0.1008,1.442862,0.1490914
