# NLST Predictions Merge

Match and combine the NLST predictions from Kiran's model, Thijmen's model, PanCan and Sybil (after those predictions have been run).

In [3]:
import pandas as pd
import os
import numpy as np

import seaborn as sns
sns.set_style("white")
from evalutils.roc import get_bootstrapped_roc_ci_curves
import matplotlib.pyplot as plt

%matplotlib inline
import sklearn.metrics as skl_metrics

## Directory where results are.
LOCAL_PC = False
root_dir = "/mnt/w" if LOCAL_PC else "/data/bodyct"
EXPERIMENT_DIR = f"{root_dir}/experiments/lung-malignancy-fairness-shaurya"

## Local copy of the predictions (Teams/OneDrive backup).
NLST_PREDS_LOCAL = "/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst"

## Explicit switch instead of commenting a line in/out:
## set to False if not using the Teams backup (aka Chansey is up :)
USE_TEAMS_BACKUP = True
NLST_PREDS = NLST_PREDS_LOCAL if USE_TEAMS_BACKUP else f"{EXPERIMENT_DIR}/nlst-preds"

## Load Predictions

Kiran's model and PanCan2b.

In [4]:
## Read the sheet holding Kiran's DL scores alongside the PanCan2b scores,
## then print its column summary.
kiran_pancan_csv = f"{NLST_PREDS}/NLST_DL_vs_PanCan_Venk21.csv"
nlst_kiran_pancan = pd.read_csv(kiran_pancan_csv)
nlst_kiran_pancan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            16077 non-null  int64  
 1   StudyDate            16077 non-null  int64  
 2   SeriesInstanceUID    16077 non-null  object 
 3   CoordX               16077 non-null  float64
 4   CoordY               16077 non-null  float64
 5   CoordZ               16077 non-null  float64
 6   LesionID             16077 non-null  int64  
 7   NoduleType           16077 non-null  object 
 8   Spiculation          16077 non-null  bool   
 9   Diameter [mm]        16077 non-null  float64
 10  Age                  16077 non-null  int64  
 11  Gender               16077 non-null  int64  
 12  FamilyHistoryLungCa  16077 non-null  bool   
 13  Emphysema            16077 non-null  bool   
 14  NoduleInUpperLung    16077 non-null  bool   
 15  NoduleCounts         16077 non-null 

Thijmen's model. Note that it only has 3240 annotation predictions and not 16077.

In [5]:
## Read Thijmen's merged model output and print its column summary.
thijmen_csv = f"{NLST_PREDS}/NLST_Tijmen_results/merged_model_output.csv"
nlst_thijmen = pd.read_csv(thijmen_csv)
nlst_thijmen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3240 entries, 0 to 3239
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   AnnotationID  3240 non-null   object 
 1   label         3240 non-null   float64
 2   Thijmen_mean  3240 non-null   float64
dtypes: float64(2), object(1)
memory usage: 76.1+ KB


Sanity check: verify that the `label` recorded for each `AnnotationID` is the same in Thijmen's CSV and in the Kiran/PanCan sheet. If the output DataFrame below is empty, you're all good :)

In [6]:
## Pull the `label` column of each sheet, keyed by AnnotationID and sorted by it
## so the two Series line up row-for-row for compare().
kiran_labels = (
    nlst_kiran_pancan
    .sort_values(by='AnnotationID', ascending=True)
    .set_index("AnnotationID")['label']
)
thijmen_labels = (
    nlst_thijmen
    .sort_values(by='AnnotationID', ascending=True)
    .set_index("AnnotationID")['label']
)

## Keep only the Kiran/PanCan annotations that also appear in Thijmen's sheet,
## then list every AnnotationID whose labels disagree (empty result = all match).
in_thijmen = kiran_labels.index.isin(thijmen_labels.index)
res = kiran_labels[in_thijmen].compare(thijmen_labels)

## Drop the temporaries so they don't linger in the kernel namespace.
del kiran_labels, thijmen_labels, in_thijmen
res

Unnamed: 0_level_0,self,other
AnnotationID,Unnamed: 1_level_1,Unnamed: 2_level_1


Merge the predictions!

In [7]:
## Left-join Thijmen's predictions onto the Kiran/PanCan sheet: every row of the
## Kiran/PanCan sheet is kept, and rows without a Thijmen prediction get NaN in
## `Thijmen_mean`.
## suffixes=(None, None) means any unexpected overlapping non-key column raises
## instead of being silently renamed.
## validate="many_to_one" makes pandas raise a MergeError if an
## (AnnotationID, label) pair occurs more than once in Thijmen's sheet, which
## would otherwise silently duplicate rows in the merged table.
nlst_preds = nlst_kiran_pancan.merge(nlst_thijmen,
                                     how="left", ## 'inner' for only ones Thijmen has preds for
                                     on=['AnnotationID', 'label'], suffixes=(None,None),
                                     validate="many_to_one")
nlst_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            16077 non-null  int64  
 1   StudyDate            16077 non-null  int64  
 2   SeriesInstanceUID    16077 non-null  object 
 3   CoordX               16077 non-null  float64
 4   CoordY               16077 non-null  float64
 5   CoordZ               16077 non-null  float64
 6   LesionID             16077 non-null  int64  
 7   NoduleType           16077 non-null  object 
 8   Spiculation          16077 non-null  bool   
 9   Diameter [mm]        16077 non-null  float64
 10  Age                  16077 non-null  int64  
 11  Gender               16077 non-null  int64  
 12  FamilyHistoryLungCa  16077 non-null  bool   
 13  Emphysema            16077 non-null  bool   
 14  NoduleInUpperLung    16077 non-null  bool   
 15  NoduleCounts         16077 non-null 

Thijmen's subset does not appear to be a size-matched subset with a higher malignant percentage (2,959 annotations with label 0 vs. 281 with label 1 in the table below).

In [8]:
## Rows of the merged table that actually received a Thijmen prediction.
has_thijmen_pred = nlst_preds['Thijmen_mean'].notnull()
nlst_thijmen_nonnull = nlst_preds[has_thijmen_pred]

## Number of distinct values in every column, split by label.
nlst_thijmen_nonnull.groupby('label').nunique()

Unnamed: 0_level_0,PatientID,StudyDate,SeriesInstanceUID,CoordX,CoordY,CoordZ,LesionID,NoduleType,Spiculation,Diameter [mm],...,NoduleCounts,SCT_EPI_LOC,xie_gc_gclobe150,loclup,locrup,PanCan2b,DL,NoduleID,AnnotationID,Thijmen_mean
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,907,3,1797,2794,2834,2845,18,4,1,202,...,10,7,0,0,0,2722,2958,1825,2959,2959
1,154,3,265,279,280,280,2,3,2,159,...,8,6,6,2,2,280,281,164,281,281


In [9]:
## Write the full merged table to disk without the DataFrame index.
merged_csv_path = f"{NLST_PREDS}/nlst_kiran_thijmen_pancan_16077.csv"
nlst_preds.to_csv(merged_csv_path, index=False)

In [None]:
## TODO: this assignment was left without a right-hand side (a SyntaxError).
## Presumably it is meant to hold a locally stored copy of Thijmen's
## predictions - confirm and fill in the real source. None is a placeholder.
nlst_thijmen_local = None

In [None]:
## Merging Thijmen's sheet into a frame that already carries `Thijmen_mean`
## raises ValueError (overlapping column with suffixes=(None, None)), so only
## merge while the column is still missing. This keeps the cell safe to re-run.
if 'Thijmen_mean' not in nlst_preds.columns:
    nlst_preds = nlst_preds.merge(nlst_thijmen,
                    how="left", ## 'inner' for only ones Thijmen has preds for
                    on=['AnnotationID', 'label'], suffixes=(None,None))

In [10]:
## Write only the rows with a Thijmen prediction to disk, without the DataFrame index.
thijmen_subset_csv_path = f"{NLST_PREDS}/nlst_kiran_thijmen_pancan_3240.csv"
nlst_thijmen_nonnull.to_csv(thijmen_subset_csv_path, index=False)