# NLST Predictions Merge

Match and combine the NLST predictions from Kiran's model, Thijmen's model, PanCan, and Sybil (after those predictions have been run).

In [1]:
import pandas as pd
import os
import numpy as np

import seaborn as sns
sns.set_style("white")
from evalutils.roc import get_bootstrapped_roc_ci_curves
import matplotlib.pyplot as plt

%matplotlib inline
import sklearn.metrics as skl_metrics

## Directory where results are.
# LOCAL_PC selects the mount point used as root: "/mnt/w" when True,
# "/data/bodyct" otherwise.
LOCAL_PC = False
root_dir = "/mnt/w" if LOCAL_PC else "/data/bodyct"
EXPERIMENT_DIR = f"{root_dir}/experiments/lung-malignancy-fairness-shaurya"

# Location of the Teams backup copy of the NLST predictions.
NLST_PREDS_LOCAL = "/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst"

# Explicit switch instead of commenting a line in/out: set to False when not
# using the Teams backup (aka Chansey is up :), which reads from EXPERIMENT_DIR.
USE_TEAMS_BACKUP = True
NLST_PREDS = NLST_PREDS_LOCAL if USE_TEAMS_BACKUP else f"{EXPERIMENT_DIR}/nlst-preds"

## Load Predictions

Kiran's model and PanCan2b.

In [2]:
# Read the sheet with Kiran's model and PanCan2b predictions from the
# predictions folder, then print its column/dtype summary.
kiran_pancan_csv = f"{NLST_PREDS}/NLST_DL_vs_PanCan_Venk21.csv"
nlst_kiran_pancan = pd.read_csv(kiran_pancan_csv)
nlst_kiran_pancan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            16077 non-null  int64  
 1   StudyDate            16077 non-null  int64  
 2   SeriesInstanceUID    16077 non-null  object 
 3   CoordX               16077 non-null  float64
 4   CoordY               16077 non-null  float64
 5   CoordZ               16077 non-null  float64
 6   LesionID             16077 non-null  int64  
 7   NoduleType           16077 non-null  object 
 8   Spiculation          16077 non-null  bool   
 9   Diameter [mm]        16077 non-null  float64
 10  Age                  16077 non-null  int64  
 11  Gender               16077 non-null  int64  
 12  FamilyHistoryLungCa  16077 non-null  bool   
 13  Emphysema            16077 non-null  bool   
 14  NoduleInUpperLung    16077 non-null  bool   
 15  NoduleCounts         16077 non-null 

Thijmen's model. Note that it only has 3240 annotation predictions and not 16077.

In [4]:
# Read the merged output of Thijmen's model and print its column/dtype summary.
thijmen_csv = f"{NLST_PREDS}/NLST_Tijmen_results/merged_model_output.csv"
nlst_thijmen = pd.read_csv(thijmen_csv)
nlst_thijmen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3240 entries, 0 to 3239
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   AnnotationID  3240 non-null   object 
 1   label         3240 non-null   float64
 2   Thijmen_mean  3240 non-null   float64
dtypes: float64(2), object(1)
memory usage: 76.1+ KB


Sanity check to make sure the labels match for the `AnnotationID` between Thijmen's CSV and the Kiran PanCan sheet. If the output DataFrame below is empty, you're all good :)

In [17]:
# Label column of each source, indexed by AnnotationID and sorted the same way
# so the two Series line up row-for-row.
kiran_labels = nlst_kiran_pancan.sort_values(by='AnnotationID', ascending=True).set_index("AnnotationID")['label']
thijmen_labels = nlst_thijmen.sort_values(by='AnnotationID', ascending=True).set_index("AnnotationID")['label']

# Restrict BOTH sides to the AnnotationIDs they share. Series.compare only
# accepts identically-labeled Series, so filtering just one side would raise a
# ValueError whenever Thijmen's CSV holds an ID missing from the Kiran sheet.
shared_kiran_labels = kiran_labels[kiran_labels.index.isin(thijmen_labels.index)]
shared_thijmen_labels = thijmen_labels[thijmen_labels.index.isin(kiran_labels.index)]

# Rows where the labels disagree ('self' = Kiran/PanCan sheet, 'other' = Thijmen).
# An empty frame means every shared annotation has the same label in both files.
res = shared_kiran_labels.compare(shared_thijmen_labels)
res

Unnamed: 0_level_0,self,other
AnnotationID,Unnamed: 1_level_1,Unnamed: 2_level_1


Merge the predictions!

In [23]:
# Attach Thijmen's predictions to the Kiran/PanCan sheet, matching rows on
# both the annotation ID and the label.
nlst_preds = nlst_kiran_pancan.merge(
    nlst_thijmen,
    how="left",  ## 'inner' for only ones Thijmen has preds for
    on=['AnnotationID', 'label'],
    suffixes=(None, None),
    # Fail loudly if Thijmen's frame repeats a (AnnotationID, label) key;
    # otherwise the left merge would silently duplicate rows of the sheet.
    validate="many_to_one",
)
nlst_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            16077 non-null  int64  
 1   StudyDate            16077 non-null  int64  
 2   SeriesInstanceUID    16077 non-null  object 
 3   CoordX               16077 non-null  float64
 4   CoordY               16077 non-null  float64
 5   CoordZ               16077 non-null  float64
 6   LesionID             16077 non-null  int64  
 7   NoduleType           16077 non-null  object 
 8   Spiculation          16077 non-null  bool   
 9   Diameter [mm]        16077 non-null  float64
 10  Age                  16077 non-null  int64  
 11  Gender               16077 non-null  int64  
 12  FamilyHistoryLungCa  16077 non-null  bool   
 13  Emphysema            16077 non-null  bool   
 14  NoduleInUpperLung    16077 non-null  bool   
 15  NoduleCounts         16077 non-null 

Thijmen's subset doesn't appear to be a size-matched subset with a higher percentage of malignant nodules.

In [26]:
# Rows of the merged frame that actually carry a Thijmen prediction.
has_thijmen_pred = nlst_preds['Thijmen_mean'].notna()
nlst_thijmen_nonnull = nlst_preds[has_thijmen_pred]

# Number of distinct values per column, split by label.
nlst_thijmen_nonnull.groupby('label').nunique()

Unnamed: 0_level_0,PatientID,StudyDate,SeriesInstanceUID,CoordX,CoordY,CoordZ,LesionID,NoduleType,Spiculation,Diameter [mm],...,NoduleCounts,SCT_EPI_LOC,xie_gc_gclobe150,loclup,locrup,PanCan2b,DL,NoduleID,AnnotationID,Thijmen_mean
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,907,3,1797,2794,2834,2845,18,4,1,202,...,10,7,0,0,0,2722,2958,1825,2959,2959
1,154,3,265,279,280,280,2,3,2,159,...,8,6,6,2,2,280,281,164,281,281


## Load NLST Demographic Info

In [139]:
# Load the NLST participant file (one row per participant).
# NOTE(review): the saved output echoes this source line, which is how Python
# prints a warning -- presumably pandas' mixed-type DtypeWarning; confirm.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is the remedy that warning suggests.
nlst_dict_full = pd.read_csv(f"{NLST_PREDS}/participant_d040722.csv", low_memory=False)
nlst_dict_full.info()

  nlst_dict_full = pd.read_csv(f"{NLST_PREDS}/participant_d040722.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53452 entries, 0 to 53451
Columns: 356 entries, cen to evpsent
dtypes: float64(283), int64(64), object(9)
memory usage: 145.2+ MB


In [140]:
# Keep only the participants whose pid appears as a PatientID in the merged
# predictions, and renumber the rows from 0.
pid_in_preds = nlst_dict_full['pid'].isin(nlst_preds['PatientID'])
nlst_dict_subset = nlst_dict_full[pid_in_preds].reset_index(drop=True)
nlst_dict_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5282 entries, 0 to 5281
Columns: 356 entries, cen to evpsent
dtypes: float64(283), int64(64), object(9)
memory usage: 14.3+ MB


In [141]:
# Proportion of each anyscr_has_nodule value (missing values included) within
# each gender code.
nodule_flag_by_gender = nlst_dict_subset.groupby(['gender'])['anyscr_has_nodule']
nodule_flag_by_gender.value_counts(normalize=True, dropna=False)

gender  anyscr_has_nodule
1       1.0                  0.986711
        0.0                  0.013289
2       1.0                  0.994278
        0.0                  0.005722
Name: proportion, dtype: float64

In [147]:
# Several column groups are named <facet prefix><type suffix>
# (e.g. 'res' + 'asbe' -> 'resasbe'), so they are generated from the two parts.
workfacets = ['res', 'wrk', 'yrs']
worktypes = [
    'asbe', 'baki', 'butc', 'chem', 'coal', 'cott', 'farm',
    'fire', 'flou', 'foun', 'hard', 'pain', 'sand', 'weld',
]

diseasefacets = ['age', 'diag']
diseasetypes = [
    'adas', 'asbe', 'bron', 'chas', 'chro', 'copd', 'diab', 'emph',
    'fibr', 'hear', 'hype', 'pneu', 'sarc', 'sili', 'stro', 'tube',
]

pcancerfacets = ['age', 'canc']
pcancertypes = [
    'blad', 'brea', 'cerv', 'colo', 'esop', 'kidn', 'lary', 'lung',
    'nasa', 'oral', 'panc', 'phar', 'stom', 'thyr', 'tran',
]

alcohol_acrin = [
    'acrin_alc_curr', 'acrin_alc_ever', 'acrin_drink24hr',
    'acrin_drinknum_curr', 'acrin_drinknum_form',
    'acrin_drinkyrs_curr', 'acrin_drinkyrs_form', 'acrin_lastdrink',
]
alcohol_lss = ['lss_alcohol_freq', 'lss_alcohol_num']

# Column names of interest, keyed by group. Generated groups iterate facets in
# the outer loop and types in the inner loop, so all names of one facet are
# adjacent.
cols = {
    "info": ['pid', 'study'],
    "demo": ['age', 'educat', 'ethnic', 'gender', 'height', 'marital', 'race', 'weight'],
    "smoking": ['age_quit', 'cigar', 'cigsmok', 'pipe', 'pkyr', 'smokeage', 'smokeday', 'smokelive', 'smokework', 'smokeyr'],
    "screen": ['scr_group'],
    "work": [f"{prefix}{suffix}" for prefix in workfacets for suffix in worktypes],
    "disease": [f"{prefix}{suffix}" for prefix in diseasefacets for suffix in diseasetypes],
    "personal_cancer": [f"{prefix}{suffix}" for prefix in pcancerfacets for suffix in pcancertypes],
    "family": [f"fam{relative}" for relative in ['brother', 'sister', 'child', 'father', 'mother']],
    "alcohol": [*alcohol_lss, *alcohol_acrin],
}

# Flatten every group into one list, keeping the dict's insertion order.
allcols = [name for group in cols.values() for name in group]
len(allcols)

140

In [152]:
# For every column group: print its name and size, then show summary
# statistics of those columns in the participant subset.
for group_name, group_cols in cols.items():
    print(group_name, len(group_cols))
    nlst_colinfo = nlst_dict_subset[group_cols]
    display(nlst_colinfo.describe())

info 2


Unnamed: 0,pid,study
count,5282.0,5282.0
mean,148184.307081,1.474252
std,44395.945951,0.72802
min,100012.0,1.0
25%,113113.5,1.0
50%,125989.0,1.0
75%,205225.25,2.0
max,218887.0,3.0


demo 8


Unnamed: 0,age,educat,ethnic,gender,height,marital,race,weight
count,5282.0,5282.0,5282.0,5282.0,5273.0,5281.0,5282.0,5270.0
mean,62.183453,4.83529,2.538054,1.43014,67.846008,2.654043,1.307459,181.264706
std,5.184738,4.579593,7.283255,0.495142,3.921241,1.247796,3.831637,38.917809
min,54.0,1.0,1.0,1.0,49.0,1.0,1.0,75.0
25%,58.0,3.0,2.0,1.0,65.0,2.0,1.0,154.0
50%,61.0,5.0,2.0,1.0,68.0,2.0,1.0,179.0
75%,66.0,6.0,2.0,2.0,71.0,3.0,1.0,205.0
max,74.0,99.0,99.0,2.0,83.0,9.0,99.0,368.0


smoking 10


Unnamed: 0,age_quit,cigar,cigsmok,pipe,pkyr,smokeage,smokeday,smokelive,smokework,smokeyr
count,2618.0,5264.0,5282.0,5247.0,5282.0,5282.0,5282.0,5259.0,5247.0,5282.0
mean,55.744843,0.194149,0.503786,0.219173,58.086814,16.585384,28.639909,0.883248,0.871355,40.993942
std,6.603271,0.395582,0.500033,0.413725,24.839586,3.551498,11.54246,0.321155,0.334838,7.403968
min,11.0,0.0,0.0,0.0,30.0,4.0,12.0,0.0,0.0,13.0
25%,51.0,0.0,0.0,0.0,40.5,15.0,20.0,1.0,1.0,36.0
50%,56.0,0.0,1.0,0.0,50.0,16.0,25.0,1.0,1.0,41.0
75%,60.0,0.0,1.0,0.0,69.0,18.0,35.0,1.0,1.0,46.0
max,74.0,1.0,1.0,1.0,260.0,40.0,135.0,1.0,1.0,67.0


screen 1


Unnamed: 0,scr_group
count,5282.0
mean,1.962514
std,0.559739
min,1.0
25%,2.0
50%,2.0
75%,2.0
max,5.0


work 42


Unnamed: 0,resasbe,resbaki,resbutc,reschem,rescoal,rescott,resfarm,resfire,resflou,resfoun,...,yrscoal,yrscott,yrsfarm,yrsfire,yrsflou,yrsfoun,yrshard,yrspain,yrssand,yrsweld
count,246.0,132.0,109.0,333.0,26.0,41.0,566.0,93.0,53.0,219.0,...,26.0,42.0,578.0,93.0,54.0,222.0,36.0,272.0,88.0,290.0
mean,0.186992,0.007576,0.018349,0.183183,0.230769,0.04878,0.033569,0.548387,0.132075,0.146119,...,16.076923,7.571429,13.043253,14.752688,9.148148,13.90991,10.277778,10.448529,10.920455,14.42069
std,0.439844,0.087039,0.134829,0.424498,0.58704,0.218085,0.198945,0.561754,0.394078,0.390977,...,13.962587,9.625025,10.54543,10.4784,11.296029,13.636556,10.18854,11.427822,10.875705,12.957834
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.25,1.0,5.0,5.0,2.0,3.0,2.0,2.0,2.0,3.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,16.5,4.0,10.0,12.0,4.0,8.0,6.5,5.0,7.5,10.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,24.5,7.75,18.0,24.0,9.75,23.75,15.25,15.0,18.0,25.0
max,2.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,...,55.0,40.0,60.0,38.0,47.0,47.0,38.0,50.0,50.0,50.0


disease 32


Unnamed: 0,ageadas,ageasbe,agebron,agechas,agechro,agecopd,agediab,ageemph,agefibr,agehear,...,diagdiab,diagemph,diagfibr,diaghear,diaghype,diagpneu,diagsarc,diagsili,diagstro,diagtube
count,344.0,56.0,151.0,164.0,511.0,339.0,504.0,513.0,15.0,677.0,...,5276.0,5269.0,5273.0,5262.0,5272.0,5271.0,5269.0,5269.0,5276.0,5276.0
mean,49.976744,54.642857,38.357616,6.646341,40.927593,56.982301,54.438492,56.116959,50.866667,54.355982,...,0.09818,0.103625,0.003034,0.13417,0.355842,0.235249,0.002467,0.001708,0.02881,0.011372
std,12.838741,7.354476,19.412829,3.939331,17.123446,7.469443,9.963561,8.926455,9.804275,10.431178,...,0.297587,0.304802,0.055006,0.340867,0.478813,0.424195,0.049615,0.041298,0.167287,0.106043
min,13.0,27.0,1.0,0.0,1.0,30.0,1.0,13.0,25.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,42.75,50.0,21.0,4.0,30.0,52.5,50.0,51.0,49.0,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,53.0,56.5,40.0,6.0,45.0,57.0,56.0,57.0,51.0,56.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,60.0,60.0,55.0,9.0,55.0,62.0,60.0,62.0,56.0,61.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,72.0,65.0,73.0,17.0,72.0,73.0,73.0,73.0,65.0,74.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


personal_cancer 30


Unnamed: 0,ageblad,agebrea,agecerv,agecolo,ageesop,agekidn,agelary,agelung,agenasa,ageoral,...,canckidn,canclary,canclung,cancnasa,cancoral,cancpanc,cancphar,cancstom,cancthyr,canctran
count,35.0,91.0,78.0,25.0,5.0,8.0,5.0,4.0,3.0,14.0,...,5276.0,5276.0,5276.0,5274.0,5275.0,5276.0,5276.0,5276.0,5273.0,5272.0
mean,55.885714,49.626374,36.538462,53.84,50.8,54.625,55.0,65.25,62.0,53.928571,...,0.001516,0.000948,0.000758,0.000569,0.002654,0.0,0.00019,0.001137,0.001517,0.00019
std,10.303544,8.452019,10.079801,8.979978,6.978539,7.424621,12.629331,4.5,6.244998,9.98818,...,0.038914,0.030773,0.027527,0.023846,0.051454,0.0,0.013767,0.033707,0.038925,0.013772
min,17.0,25.0,21.0,32.0,45.0,48.0,40.0,59.0,55.0,36.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,52.0,43.5,28.0,50.0,46.0,49.5,43.0,63.5,59.5,50.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,50.0,34.5,55.0,48.0,52.0,60.0,66.5,64.0,54.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,63.5,55.5,41.75,59.0,53.0,57.5,65.0,68.25,65.5,60.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,68.0,68.0,63.0,67.0,62.0,67.0,67.0,69.0,67.0,70.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0


family 5


Unnamed: 0,fambrother,famsister,famchild,famfather,fammother
count,5140.0,5141.0,5138.0,5114.0,5152.0
mean,0.057977,0.042988,0.004866,0.109894,0.0658
std,0.233722,0.202849,0.069591,0.312789,0.247956
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


alcohol 10


Unnamed: 0,lss_alcohol_freq,lss_alcohol_num,acrin_alc_curr,acrin_alc_ever,acrin_drink24hr,acrin_drinknum_curr,acrin_drinknum_form,acrin_drinkyrs_curr,acrin_drinkyrs_form,acrin_lastdrink
count,3495.0,2744.0,1662.0,1757.0,1185.0,1181.0,404.0,1174.0,405.0,435.0
mean,2.931903,1.91035,1.737064,1.948776,0.972152,6.938188,13.975248,39.326235,23.711111,2.372414
std,1.459018,0.912094,0.440361,0.220516,1.622811,9.341564,20.147949,10.454303,13.555825,0.860914
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2.0,1.0,1.0,2.0,0.0,0.0,0.0,35.0,12.0,1.0
50%,3.0,2.0,2.0,2.0,0.0,4.0,5.0,40.0,23.0,3.0
75%,4.0,2.0,2.0,2.0,2.0,10.0,20.0,45.0,34.0,3.0
max,5.0,5.0,2.0,2.0,26.0,89.0,100.0,70.0,70.0,3.0


In [146]:
# Participant subset narrowed to the selected columns (all groups, flattened),
# followed by its summary statistics.
nlst_demos = nlst_dict_subset[allcols]
nlst_demos_summary = nlst_demos.describe()
nlst_demos_summary

Unnamed: 0,pid,study,age,educat,ethnic,gender,height,marital,race,weight,...,lss_alcohol_freq,lss_alcohol_num,acrin_alc_curr,acrin_alc_ever,acrin_drink24hr,acrin_drinknum_curr,acrin_drinknum_form,acrin_drinkyrs_curr,acrin_drinkyrs_form,acrin_lastdrink
count,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,5273.0,5281.0,5282.0,5270.0,...,3495.0,2744.0,1662.0,1757.0,1185.0,1181.0,404.0,1174.0,405.0,435.0
mean,148184.307081,1.474252,62.183453,4.83529,2.538054,1.43014,67.846008,2.654043,1.307459,181.264706,...,2.931903,1.91035,1.737064,1.948776,0.972152,6.938188,13.975248,39.326235,23.711111,2.372414
std,44395.945951,0.72802,5.184738,4.579593,7.283255,0.495142,3.921241,1.247796,3.831637,38.917809,...,1.459018,0.912094,0.440361,0.220516,1.622811,9.341564,20.147949,10.454303,13.555825,0.860914
min,100012.0,1.0,54.0,1.0,1.0,1.0,49.0,1.0,1.0,75.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,113113.5,1.0,58.0,3.0,2.0,1.0,65.0,2.0,1.0,154.0,...,2.0,1.0,1.0,2.0,0.0,0.0,0.0,35.0,12.0,1.0
50%,125989.0,1.0,61.0,5.0,2.0,1.0,68.0,2.0,1.0,179.0,...,3.0,2.0,2.0,2.0,0.0,4.0,5.0,40.0,23.0,3.0
75%,205225.25,2.0,66.0,6.0,2.0,2.0,71.0,3.0,1.0,205.0,...,4.0,2.0,2.0,2.0,2.0,10.0,20.0,45.0,34.0,3.0
max,218887.0,3.0,74.0,99.0,99.0,2.0,83.0,9.0,99.0,368.0,...,5.0,5.0,2.0,2.0,26.0,89.0,100.0,70.0,70.0,3.0
