In [1]:
import json
import os

import pandas as pd
from IPython.display import display, Markdown

## Directories where results are stored.
## Both roots are machine-specific, so each can be overridden through an
## environment variable of the same name; the defaults are the original paths.
CHANSEY_ROOT = os.environ.get("CHANSEY_ROOT", "V:")
CHANSEY_DIR = f"{CHANSEY_ROOT}/experiments/lung-malignancy-fairness-shaurya"
TEAMS_DIR = os.environ.get(
    "TEAMS_DIR",
    "C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results",
)

## Retraining subsets: one NLST_<percent>_Bogdan folder per subset, each holding
## dataset.csv plus train<i>.csv / valid<i>.csv fold files.
RETRAIN_SUBSETS_DIR = f"{CHANSEY_DIR}/kiran-retrain-subsets"

What to do in this notebook:

- See the racial and malignant:benign (M:B) splits for each dataset and its 10 folds.
- See if we can enrich the data by plugging in more Black patient data.

In [2]:
## Load the full NLST results table; every later cell reads from `kiran_full`.
nlst_results_csv = f"{TEAMS_DIR}/nlst/nlst_demov4_allmodels_cal.csv"
kiran_full = pd.read_csv(nlst_results_csv)
kiran_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Columns: 128 entries, PatientID to NoduleType
dtypes: bool(27), float64(86), int64(11), object(4)
memory usage: 12.8+ MB


In [9]:
## Row count per race code (value_counts defaults: NaN dropped, raw counts).
kiran_full['race'].value_counts()

race
1.0    15040
2.0      481
3.0      228
6.0      177
4.0       68
5.0       44
Name: count, dtype: int64

In [4]:
## Mean of `label` within each race code.
kiran_full.groupby('race')['label'].agg('mean')

race
1.0    0.076130
2.0    0.118503
3.0    0.092105
4.0    0.176471
5.0    0.045455
6.0    0.056497
Name: label, dtype: float64

In [5]:
## One row per statistic, one column per race code.
race_proportions = kiran_full['race'].value_counts(dropna=True, normalize=True)
label_mean_by_race = kiran_full.groupby('race')['label'].mean()
pd.DataFrame([race_proportions, label_mean_by_race])

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.937773,0.029991,0.014216,0.011036,0.00424,0.002743
label,0.07613,0.118503,0.092105,0.056497,0.176471,0.045455


In [10]:
def analyze_race_and_malignancy(subset_percent='70', n_folds=10):
    """Display per-race statistics for one retraining subset and each of its folds.

    Reads ``dataset.csv`` and the ``train{i}.csv`` / ``valid{i}.csv`` fold files
    from ``{RETRAIN_SUBSETS_DIR}/NLST_{subset_percent}_Bogdan``. Each split is
    matched against the module-level ``kiran_full`` frame on ``PatientID``, and a
    table is displayed with one column per race code and four rows: proportion,
    count, mean ``label`` and mean ``diaghype``.

    Parameters
    ----------
    subset_percent : str
        Percentage tag used in the subset folder name (e.g. ``'70'``, ``'80'``).
    n_folds : int
        Number of train/valid fold files to read, numbered from 0. Defaults to
        10, the fold count that was previously hard-coded.

    Returns
    -------
    None
        Everything is emitted through ``print`` / ``display``.
    """
    # Race code that the printed "black proportion" refers to; in `kiran_full`
    # this is the code whose row count was previously hard-coded as 481.
    black_race_code = 2

    subset_dir = f"{RETRAIN_SUBSETS_DIR}/NLST_{subset_percent}_Bogdan"

    def demographics_of(split_df):
        # Rows of the full table whose patient appears in the given split.
        return kiran_full[kiran_full['PatientID'].isin(split_df['PatientID'])]

    def getinfo(df):
        # Rows are labelled by the Series names: proportion, count, label, diaghype.
        return pd.DataFrame([
            df['race'].value_counts(dropna=True, normalize=True),
            df['race'].value_counts(dropna=True, normalize=False),
            df.groupby('race')['label'].mean(),
            df.groupby('race')['diaghype'].mean(),
        ])

    subsetdf = pd.read_csv(f"{subset_dir}/dataset.csv")

    # Upper bound if every row with this race code in the full table were placed
    # in the subset. Derived from the data so it cannot drift from the CSV.
    n_black_total = int((kiran_full['race'] == black_race_code).sum())
    print(f"Theoretical max black proportion = {n_black_total / len(subsetdf)}")

    display(Markdown("### Overall dataset:"))
    display(getinfo(demographics_of(subsetdf)))

    for i in range(n_folds):
        display(Markdown(f"#### Fold {i}"))

        # The heading's malignancy rate comes from the fold file itself, while
        # the table underneath is computed from the matching rows of kiran_full.
        for split in ("train", "valid"):
            fold_df = pd.read_csv(f"{subset_dir}/{split}{i}.csv")
            display(Markdown(f"##### {split} - overall malignancy = {fold_df['label'].mean()}"))
            display(getinfo(demographics_of(fold_df)))

## 70% subset

In [11]:
## Per-race statistics for the 70% subset and its folds.
analyze_race_and_malignancy(subset_percent='70')

Theoretical max black proportion = 0.042740358983472546


### Overall dataset:

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.941224,0.024757,0.016475,0.010865,0.00472,0.001959
count,10569.0,278.0,185.0,122.0,53.0,22.0
label,0.07645,0.115108,0.097297,0.040984,0.169811,0.090909
diaghype,0.343291,0.564748,0.383784,0.327869,0.150943,0.272727


#### Fold 0

##### train - overall malignancy = 0.07717391304347826

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.941543,0.025816,0.015727,0.010781,0.004649,0.001484
count,9519.0,261.0,159.0,109.0,47.0,15.0
label,0.076479,0.10728,0.09434,0.045872,0.06383,0.133333
diaghype,0.340811,0.563218,0.36478,0.275229,0.170213,0.4


##### valid - overall malignancy = 0.082010582010582

race,1.0,3.0,2.0,6.0,5.0,4.0
proportion,0.938338,0.023235,0.015192,0.011618,0.006256,0.005362
count,1050.0,26.0,17.0,13.0,7.0,6.0
label,0.07619,0.115385,0.235294,0.0,0.0,1.0
diaghype,0.365714,0.5,0.588235,0.769231,0.0,0.0


#### Fold 1

##### train - overall malignancy = 0.07837114832061826

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.941124,0.024027,0.016779,0.011021,0.004964,0.002085
count,9479.0,242.0,169.0,111.0,50.0,21.0
label,0.076907,0.132231,0.088757,0.045045,0.18,0.047619
diaghype,0.348281,0.528926,0.390533,0.36036,0.16,0.238095


##### valid - overall malignancy = 0.0714900947459087

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.942092,0.031115,0.013829,0.009507,0.002593,0.000864
count,1090.0,36.0,16.0,11.0,3.0,1.0
label,0.072477,0.0,0.1875,0.0,0.0,1.0
diaghype,0.3,0.805556,0.3125,0.0,0.0,1.0


#### Fold 2

##### train - overall malignancy = 0.07772174428952833

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.940932,0.025867,0.016551,0.010406,0.004063,0.00218
count,9494.0,261.0,167.0,105.0,41.0,22.0
label,0.07668,0.114943,0.08982,0.038095,0.170732,0.090909
diaghype,0.345512,0.555556,0.401198,0.333333,0.146341,0.272727


##### valid - overall malignancy = 0.07712532865907099

race,1.0,3.0,2.0,6.0,4.0
proportion,0.94381,0.015803,0.014925,0.014925,0.010536
count,1075.0,18.0,17.0,17.0,12.0
label,0.074419,0.166667,0.117647,0.058824,0.166667
diaghype,0.323721,0.222222,0.705882,0.294118,0.166667


#### Fold 3

##### train - overall malignancy = 0.07837145110410094

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.94081,0.025099,0.015514,0.011462,0.004941,0.002174
count,9521.0,254.0,157.0,116.0,50.0,22.0
label,0.077618,0.106299,0.089172,0.034483,0.18,0.090909
diaghype,0.336949,0.566929,0.33758,0.336207,0.1,0.272727


##### valid - overall malignancy = 0.07117117117117117

race,1.0,3.0,2.0,6.0,4.0
proportion,0.944995,0.025248,0.021641,0.00541,0.002705
count,1048.0,28.0,24.0,6.0,3.0
label,0.06584,0.142857,0.208333,0.166667,0.0
diaghype,0.400763,0.642857,0.541667,0.166667,1.0


#### Fold 4

##### train - overall malignancy = 0.07874093933075166

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.943062,0.025284,0.015628,0.009357,0.004977,0.001692
count,9474.0,254.0,157.0,94.0,50.0,17.0
label,0.077159,0.125984,0.11465,0.021277,0.18,0.058824
diaghype,0.342963,0.566929,0.426752,0.329787,0.16,0.176471


##### valid - overall malignancy = 0.06846999154691462

race,1.0,6.0,3.0,2.0,5.0,4.0
proportion,0.925613,0.023669,0.023669,0.020287,0.004227,0.002536
count,1095.0,28.0,28.0,24.0,5.0,3.0
label,0.07032,0.107143,0.0,0.0,0.2,0.0
diaghype,0.346119,0.321429,0.142857,0.541667,0.6,0.0


#### Fold 5

##### train - overall malignancy = 0.07769869513641756

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.94173,0.025666,0.015459,0.010009,0.004955,0.00218
count,9503.0,259.0,156.0,101.0,50.0,22.0
label,0.076081,0.119691,0.108974,0.049505,0.16,0.090909
diaghype,0.343285,0.579151,0.410256,0.316832,0.1,0.272727


##### valid - overall malignancy = 0.0773286467486819

race,1.0,3.0,6.0,2.0,4.0
proportion,0.936731,0.025483,0.018453,0.016696,0.002636
count,1066.0,29.0,21.0,19.0,3.0
label,0.079737,0.034483,0.0,0.052632,0.333333
diaghype,0.34334,0.241379,0.380952,0.368421,1.0


#### Fold 6

##### train - overall malignancy = 0.07765467202666929

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.944073,0.025162,0.015038,0.010615,0.003833,0.001278
count,9605.0,256.0,153.0,108.0,39.0,13.0
label,0.076314,0.113281,0.091503,0.046296,0.230769,0.153846
diaghype,0.349364,0.546875,0.352941,0.361111,0.205128,0.307692


##### valid - overall malignancy = 0.07772511848341232

race,1.0,3.0,2.0,6.0,4.0,5.0
proportion,0.913744,0.030332,0.020853,0.01327,0.01327,0.008531
count,964.0,32.0,22.0,14.0,14.0,9.0
label,0.077801,0.125,0.136364,0.0,0.0,0.0
diaghype,0.282586,0.53125,0.772727,0.071429,0.0,0.222222


#### Fold 7

##### train - overall malignancy = 0.077500988533017

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.940046,0.023387,0.017838,0.01209,0.004459,0.00218
count,9486.0,236.0,180.0,122.0,45.0,22.0
label,0.076218,0.114407,0.1,0.040984,0.2,0.090909
diaghype,0.340095,0.59322,0.394444,0.327869,0.177778,0.272727


##### valid - overall malignancy = 0.07908611599297012

race,1.0,2.0,4.0,3.0
proportion,0.95167,0.036907,0.00703,0.004394
count,1083.0,42.0,8.0,5.0
label,0.078486,0.119048,0.0,0.0
diaghype,0.371296,0.404762,0.0,0.0


#### Fold 8

##### train - overall malignancy = 0.07644567037730272

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.939672,0.023993,0.01797,0.01096,0.005233,0.002172
count,9517.0,243.0,182.0,111.0,53.0,22.0
label,0.075234,0.106996,0.098901,0.045045,0.169811,0.090909
diaghype,0.34228,0.572016,0.373626,0.306306,0.150943,0.272727


##### valid - overall malignancy = 0.08884859474161379

race,1.0,2.0,6.0,3.0
proportion,0.955495,0.031789,0.009991,0.002725
count,1052.0,35.0,11.0,3.0
label,0.087452,0.171429,0.0,0.0
diaghype,0.35249,0.514286,0.545455,1.0


#### Fold 9

##### train - overall malignancy = 0.07694578372527797

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.939245,0.023276,0.018246,0.011934,0.005129,0.00217
count,9523.0,236.0,185.0,121.0,52.0,22.0
label,0.075816,0.110169,0.097297,0.041322,0.173077,0.090909
diaghype,0.343326,0.576271,0.383784,0.330579,0.153846,0.272727


##### valid - overall malignancy = 0.0843263061411549

race,1.0,2.0,6.0,4.0
proportion,0.959633,0.038532,0.000917,0.000917
count,1046.0,42.0,1.0,1.0
label,0.082218,0.142857,0.0,0.0
diaghype,0.342967,0.5,0.0,0.0


## 80% subset

In [12]:
## Per-race statistics for the 80% subset and its folds.
analyze_race_and_malignancy(subset_percent='80')

Theoretical max black proportion = 0.03739989114376798


### Overall dataset:

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.936701,0.029545,0.015201,0.012239,0.004365,0.001949
count,12016.0,379.0,195.0,157.0,56.0,25.0
label,0.076814,0.102902,0.097436,0.038217,0.160714,0.08
diaghype,0.339643,0.540897,0.4,0.318471,0.196429,0.24


#### Fold 0

##### train - overall malignancy = 0.07686328698505916

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.936879,0.030264,0.014613,0.012365,0.004323,0.001556
count,10835.0,350.0,169.0,143.0,50.0,18.0
label,0.076327,0.1,0.094675,0.041958,0.06,0.111111
diaghype,0.338452,0.54,0.384615,0.27972,0.22,0.333333


##### valid - overall malignancy = 0.08502340093603744

race,1.0,2.0,3.0,6.0,5.0,4.0
proportion,0.935075,0.022961,0.020586,0.011085,0.005542,0.004751
count,1181.0,29.0,26.0,14.0,7.0,6.0
label,0.081287,0.137931,0.115385,0.0,0.0,1.0
diaghype,0.35055,0.551724,0.5,0.714286,0.0,0.0


#### Fold 1

##### train - overall malignancy = 0.07810466024472794

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.937359,0.028276,0.015486,0.012441,0.004611,0.001827
count,10774.0,325.0,178.0,143.0,53.0,21.0
label,0.07713,0.110769,0.089888,0.041958,0.169811,0.047619
diaghype,0.343442,0.510769,0.404494,0.328671,0.207547,0.238095


##### valid - overall malignancy = 0.07399103139013453

race,1.0,2.0,3.0,6.0,5.0,4.0
proportion,0.931034,0.04048,0.012744,0.010495,0.002999,0.002249
count,1242.0,54.0,17.0,14.0,4.0,3.0
label,0.074074,0.055556,0.176471,0.0,0.25,0.0
diaghype,0.306763,0.722222,0.352941,0.214286,0.25,0.0


#### Fold 2

##### train - overall malignancy = 0.07769510295898945

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.935629,0.031318,0.015355,0.011712,0.003817,0.002169
count,10785.0,361.0,177.0,135.0,44.0,25.0
label,0.077051,0.102493,0.084746,0.037037,0.159091,0.08
diaghype,0.341139,0.531856,0.412429,0.296296,0.204545,0.24


##### valid - overall malignancy = 0.07751343054489639

race,1.0,6.0,2.0,3.0,4.0
proportion,0.946195,0.01691,0.013836,0.013836,0.009224
count,1231.0,22.0,18.0,18.0,12.0
label,0.074736,0.045455,0.111111,0.222222,0.166667
diaghype,0.326564,0.454545,0.722222,0.277778,0.166667


#### Fold 3

##### train - overall malignancy = 0.07858563153799603

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.939055,0.028695,0.014478,0.01101,0.004595,0.002167
count,10832.0,331.0,167.0,127.0,53.0,25.0
label,0.077917,0.099698,0.095808,0.031496,0.169811,0.08
diaghype,0.334382,0.574018,0.365269,0.385827,0.150943,0.24


##### valid - overall malignancy = 0.0695517774343122

race,1.0,2.0,6.0,3.0,4.0
proportion,0.9157,0.037123,0.023202,0.021655,0.00232
count,1184.0,48.0,30.0,28.0,3.0
label,0.066723,0.125,0.066667,0.107143,0.0
diaghype,0.387669,0.3125,0.033333,0.607143,1.0


#### Fold 4

##### train - overall malignancy = 0.07857453281182095

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.938197,0.029899,0.014296,0.011245,0.00462,0.001743
count,10763.0,343.0,164.0,129.0,53.0,20.0
label,0.077395,0.113703,0.109756,0.023256,0.169811,0.05
diaghype,0.340255,0.524781,0.439024,0.317829,0.207547,0.15


##### valid - overall malignancy = 0.0700589970501475

race,1.0,2.0,3.0,6.0,5.0,4.0
proportion,0.924041,0.026549,0.022861,0.020649,0.003687,0.002212
count,1253.0,36.0,31.0,28.0,5.0,3.0
label,0.071828,0.0,0.032258,0.107143,0.2,0.0
diaghype,0.334397,0.694444,0.193548,0.321429,0.6,0.0


#### Fold 5

##### train - overall malignancy = 0.07736093143596377

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.936949,0.030531,0.014184,0.01159,0.004584,0.002162
count,10833.0,353.0,164.0,134.0,53.0,25.0
label,0.076156,0.101983,0.115854,0.044776,0.150943,0.08
diaghype,0.339254,0.546742,0.414634,0.298507,0.150943,0.24


##### valid - overall malignancy = 0.08056872037914692

race,1.0,3.0,2.0,6.0,4.0
proportion,0.934439,0.024487,0.020537,0.018167,0.00237
count,1183.0,31.0,26.0,23.0,3.0
label,0.08284,0.0,0.115385,0.0,0.333333
diaghype,0.343195,0.322581,0.461538,0.434783,1.0


#### Fold 6

##### train - overall malignancy = 0.07764321910160611

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.938932,0.029888,0.013867,0.012317,0.003618,0.001378
count,10901.0,347.0,161.0,143.0,42.0,16.0
label,0.07669,0.103746,0.086957,0.041958,0.214286,0.125
diaghype,0.344238,0.512968,0.372671,0.342657,0.261905,0.25


##### valid - overall malignancy = 0.07799671592775041

race,1.0,3.0,2.0,6.0,4.0,5.0
proportion,0.915435,0.027915,0.026273,0.011494,0.011494,0.007389
count,1115.0,34.0,32.0,14.0,14.0,9.0
label,0.078027,0.147059,0.09375,0.0,0.0,0.0
diaghype,0.294595,0.529412,0.84375,0.071429,0.0,0.222222


#### Fold 7

##### train - overall malignancy = 0.07764136261455992

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.935923,0.028267,0.016301,0.01318,0.004162,0.002168
count,10794.0,326.0,188.0,152.0,48.0,25.0
label,0.076709,0.101227,0.101064,0.039474,0.1875,0.08
diaghype,0.336644,0.567485,0.414894,0.328947,0.229167,0.24


##### valid - overall malignancy = 0.077992277992278

race,1.0,2.0,4.0,3.0,6.0
proportion,0.943629,0.040927,0.006178,0.005405,0.003861
count,1222.0,53.0,8.0,7.0,5.0
label,0.077741,0.113208,0.0,0.0,0.0
diaghype,0.366174,0.377358,0.0,0.0,0.0


#### Fold 8

##### train - overall malignancy = 0.07689655172413794

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.933961,0.029389,0.016596,0.013052,0.004841,0.002161
count,10805.0,340.0,192.0,151.0,56.0,25.0
label,0.076168,0.094118,0.098958,0.039735,0.160714,0.08
diaghype,0.338617,0.552941,0.390625,0.291391,0.196429,0.24


##### valid - overall malignancy = 0.08485329103885805

race,1.0,2.0,6.0,3.0
proportion,0.961875,0.030977,0.004766,0.002383
count,1211.0,39.0,6.0,3.0
label,0.082576,0.179487,0.0,0.0
diaghype,0.348837,0.435897,1.0,1.0


#### Fold 9

##### train - overall malignancy = 0.07741324377852407

race,1.0,2.0,3.0,6.0,4.0,5.0
proportion,0.934139,0.028917,0.016832,0.013466,0.004489,0.002158
count,10822.0,335.0,195.0,156.0,52.0,25.0
label,0.076603,0.101493,0.097436,0.038462,0.173077,0.08
diaghype,0.339996,0.549254,0.4,0.320513,0.153846,0.24


##### valid - overall malignancy = 0.08012820512820513

race,1.0,2.0,4.0,6.0
proportion,0.960579,0.035398,0.003218,0.000805
count,1194.0,44.0,4.0,1.0
label,0.078727,0.113636,0.0,0.0
diaghype,0.336425,0.477273,0.75,0.0
