In [1]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json

import sys
sys.path.append('../')

import utils
from utils import ax_rocs, plot_rocs, rocs_models
from evalutils.roc import get_bootstrapped_roc_ci_curves
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

## Directory where the results are.
## LOCAL_PC selects which root directory the experiment path is built from.
LOCAL_PC = False
if LOCAL_PC:
    root_dir = "/mnt/w"
else:
    root_dir = "/data/bodyct"
EXPERIMENT_DIR = root_dir + "/experiments/lung-malignancy-fairness-shaurya"
NLST_PREDS = EXPERIMENT_DIR + "/nlst-preds"

## Teams backup copy of the NLST predictions.
NLST_PREDS_LOCAL = "/mnt/c/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results/nlst"
## This assignment overrides the path built above.
## Comment it out when not using the Teams backup (i.e. when Chansey is up).
NLST_PREDS = NLST_PREDS_LOCAL

In [19]:
# Load the nodule-level table; in the recorded run it has one row per AnnotationID.
nodule_csv_path = f'{NLST_PREDS}/nlst_demo_v1_w4preds.csv'
nlst_preds_nodule = pd.read_csv(nodule_csv_path)
nlst_preds_nodule.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16077 entries, 0 to 16076
Data columns (total 89 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            16077 non-null  int64  
 1   StudyDate            16077 non-null  int64  
 2   SeriesInstanceUID    16077 non-null  object 
 3   CoordX               16077 non-null  float64
 4   CoordY               16077 non-null  float64
 5   CoordZ               16077 non-null  float64
 6   LesionID             16077 non-null  int64  
 7   NoduleType           16077 non-null  object 
 8   Spiculation          16077 non-null  bool   
 9   Diameter [mm]        16077 non-null  float64
 10  Age                  16077 non-null  int64  
 11  Gender               16077 non-null  int64  
 12  FamilyHistoryLungCa  16077 non-null  bool   
 13  Emphysema            16077 non-null  bool   
 14  NoduleInUpperLung    16077 non-null  bool   
 15  NoduleCounts         16077 non-null 

In [3]:
# Column-group specification; later cells index it as
# nlst_democols['cat'][<group>] and nlst_democols['num'][<group>].
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{NLST_PREDS}/nlst_demo_v1_cols.json') as json_data:
    nlst_democols = json.load(json_data)

In [4]:
# Display name of each model -> column of the predictions table that holds its output.
# The values of this mapping are the columns aggregated per scan further down.
MODEL_TO_COL = {
    "Venkadesh": "DL",
    "de Haas": "Thijmen_mean",
    "de Haas local": "Thijmen_local",
    # "Sybil": "sybil_year1",  # disabled entry, kept for reference
    "PanCan2b": "PanCan2b",
}

### Convert to scan-level

In [20]:
# Distinct-value count for the first 30 columns of the nodule-level table.
nlst_preds_nodule.nunique().iloc[:30]

PatientID               5282
StudyDate                  3
SeriesInstanceUID      10183
CoordX                 11867
CoordY                 13071
CoordZ                 13528
LesionID                  18
NoduleType                 7
Spiculation                2
Diameter [mm]            368
Age                       23
Gender                     2
FamilyHistoryLungCa        2
Emphysema                  2
NoduleInUpperLung          2
NoduleCounts              12
SCT_EPI_LOC                7
xie_gc_gclobe150           6
loclup                     2
locrup                     2
PanCan2b               14187
label                      2
DL                     16068
NoduleID               10024
AnnotationID           16077
Thijmen_mean            3240
height                    28
weight                   216
pkyr                     284
smokeage                  33
dtype: int64

In [23]:
# Columns treated as nodule-specific; they are removed before collapsing to one row per scan.
# Note that 'Diameter [mm]' is removed here, so it is not available on nlst_preds afterwards.
nodule_specific_cols = [
    'NoduleType', 'Spiculation',
    'CoordX', 'CoordY', 'CoordZ',
    'Diameter [mm]',
    'NoduleID', 'AnnotationID',
]
nlst_preds = nlst_preds_nodule.drop(columns=nodule_specific_cols)
nlst_preds.nunique().iloc[:30]

PatientID               5282
StudyDate                  3
SeriesInstanceUID      10183
LesionID                  18
Age                       23
Gender                     2
FamilyHistoryLungCa        2
Emphysema                  2
NoduleInUpperLung          2
NoduleCounts              12
SCT_EPI_LOC                7
xie_gc_gclobe150           6
loclup                     2
locrup                     2
PanCan2b               14187
label                      2
DL                     16068
Thijmen_mean            3240
height                    28
weight                   216
pkyr                     284
smokeage                  33
smokeday                  42
smokeyr                   52
marital                    5
educat                     7
race                       6
ethnic                     2
cigar                      2
cigsmok                    2
dtype: int64

In [33]:
# Replace each model column by its maximum over all rows that share a SeriesInstanceUID,
# so every row of a scan carries the same (scan-level) value.
models = list(MODEL_TO_COL.values())
# Pass the aggregation by name: handing the builtin `max` to transform() is deprecated
# in recent pandas (FutureWarning). One grouped transform covers all model columns.
nlst_preds[models] = nlst_preds.groupby('SeriesInstanceUID')[models].transform('max')

nlst_preds.nunique().sort_values(ascending=False)

Thijmen_local        10183
SeriesInstanceUID    10183
DL                   10181
PanCan2b              9379
PatientID             5282
                     ...  
wrkweld                  2
wrknomask                2
diagadas                 2
wrkfire                  2
cancpanc                 1
Length: 81, dtype: int64

In [35]:
# Keep a single row per scan: the first row of each SeriesInstanceUID survives.
# The model columns were already set to the per-scan maximum, so they are identical on every row.
# NOTE(review): all other columns (e.g. label, LesionID, NoduleInUpperLung) take the value of
# that first row; presumably they are constant within a scan -- TODO confirm.
nlst_preds = nlst_preds.drop_duplicates(
    subset=['SeriesInstanceUID'],
    keep='first',
    ignore_index=True,
)
nlst_preds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10183 entries, 0 to 10182
Data columns (total 81 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   PatientID            10183 non-null  int64  
 1   StudyDate            10183 non-null  int64  
 2   SeriesInstanceUID    10183 non-null  object 
 3   LesionID             10183 non-null  int64  
 4   Age                  10183 non-null  int64  
 5   Gender               10183 non-null  int64  
 6   FamilyHistoryLungCa  10183 non-null  bool   
 7   Emphysema            10183 non-null  bool   
 8   NoduleInUpperLung    10183 non-null  bool   
 9   NoduleCounts         10183 non-null  int64  
 10  SCT_EPI_LOC          9615 non-null   float64
 11  xie_gc_gclobe150     568 non-null    float64
 12  loclup               1199 non-null   float64
 13  locrup               1199 non-null   float64
 14  PanCan2b             10183 non-null  float64
 15  label                10183 non-null 

In [None]:
# Redefines MODEL_TO_COL for the evaluation cells below: same mapping as before,
# but without the "de Haas" (Thijmen_mean) entry.
# NOTE(review): this rebinds the name to a new dict; anything that captured the old
# dict (or its .keys()) before this cell ran still sees the old entries.
MODEL_TO_COL = {
    "Venkadesh": "DL",
    # "de Haas": "Thijmen_mean",  # excluded from evaluation
    "de Haas local": "Thijmen_local",
    # "Sybil": "sybil_year1",  # disabled entry, kept for reference
    "PanCan2b": "PanCan2b",
}

Sanity check - let's see internal NLST validation results.

In [36]:
# ROC curves for every model in MODEL_TO_COL on the scan-level table.
# NOTE(review): the recorded output is "ValueError: Input contains NaN."; presumably a
# prediction column in the mapping used for that run contained missing values --
# TODO confirm by re-running top to bottom on a fresh kernel.
utils.rocs_models(nlst_preds, models=MODEL_TO_COL, dataset_label="NLST")

ValueError: Input contains NaN.

In [None]:
# Compute the thresholded confusion-matrix outputs first, then feed them to the stats helper.
cm_outputs = utils.cm_with_thres(nlst_preds)
utils.stats_from_cm(*cm_outputs)

## Plot results by category

In [10]:
def plot_by_category(df, cat, models=None):
    """Show per-group information and ROC curves for `df` split by column `cat`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing `cat` and the prediction columns named in MODEL_TO_COL.
    cat : str
        Column to group by.
    models : iterable of str, optional
        Model display names (keys of MODEL_TO_COL) to evaluate. Defaults to all
        keys of MODEL_TO_COL as it is defined at call time.

    Returns
    -------
    The first value returned by ``utils.info_by_splits`` for the groups
    (the per-category table that is also displayed).
    """
    # Resolve the default when called rather than when defined: a default of
    # MODEL_TO_COL.keys() stays bound to the dict that existed at definition time,
    # so it goes stale (and can raise KeyError below) once MODEL_TO_COL is reassigned.
    model_names = list(MODEL_TO_COL.keys()) if models is None else list(models)

    groups = df.groupby(cat)
    df_catinfo, plot_roc = utils.info_by_splits(groups)
    display(df_catinfo)

    if not plot_roc:
        print("Not plotting ROC since there are zero values for malignant or benign nodules :(")
        return df_catinfo

    rocs = {}
    for m in model_names:
        rocs[m], df_perf = utils.perf_by_splits(groups, pred_col=MODEL_TO_COL[m])
        print(m)
        display(df_perf)

    n_models = len(model_names)
    if n_models == 0:
        # Nothing to draw; plt.subplots cannot create a grid with zero columns.
        return df_catinfo

    # squeeze=False always yields a 2-D array of axes, so indexing also works
    # when only one model is requested.
    fig, axes = plt.subplots(1, n_models, figsize=(6.5 * n_models - 0.5, 6), squeeze=False)
    fig.suptitle(f"Model Performance Split By {cat}")
    for ax, m in zip(axes[0], model_names):
        utils.ax_rocs(ax, rocs[m], title=m)
    plt.show()

    return df_catinfo

### Included columns (from predictions)

In [None]:
# Disabled cell. NOTE(review): "Spiculation" and "NoduleType" are in nodule_specific_cols and
# are dropped from nlst_preds, so this loop would fail on the scan-level table as written;
# presumably that is why it is commented out -- TODO confirm or delete.
# pred_only_cols = ["NoduleInUpperLung", "Spiculation", "NoduleType"]
# for c in pred_only_cols:
#     print(c)
#     plot_by_category(nlst_preds, c)

### Demographics

In [None]:
# Per-category breakdown for each categorical column in the 'demo' group.
demo_cat_cols = nlst_democols['cat']['demo']
for cat in demo_cat_cols:
    print(cat)
    plot_by_category(nlst_preds, cat)

### Smoking

In [None]:
# Per-category breakdown for each categorical column in the 'smoke' group.
smoke_cat_cols = nlst_democols['cat']['smoke']
for cat in smoke_cat_cols:
    print(cat)
    plot_by_category(nlst_preds, cat)

### Work history

In [None]:
# Per-category breakdown for each categorical column in the 'work' group.
work_cat_cols = nlst_democols['cat']['work']
for cat in work_cat_cols:
    print(cat)
    plot_by_category(nlst_preds, cat)

### Disease history

In [None]:
# 'Emphysema' is a column of the predictions table itself, so it is plotted
# separately from the 'disease' group handled in the next cell.
plot_by_category(nlst_preds, 'Emphysema')

In [None]:
# Per-category breakdown for each categorical column in the 'disease' group.
disease_cat_cols = nlst_democols['cat']['disease']
for cat in disease_cat_cols:
    print(cat)
    plot_by_category(nlst_preds, cat)

### Personal cancer history

In [None]:
# Per-category breakdown for each categorical column in the 'canchist' group.
canchist_cat_cols = nlst_democols['cat']['canchist']
for cat in canchist_cat_cols:
    print(cat)
    plot_by_category(nlst_preds, cat)

## Binning for numerical columns

### Demographic columns

In [None]:
# For each numeric 'demo' column: summary statistics per label value,
# followed by a 25-bin histogram stacked by label.
for c in nlst_democols['num']['demo']:
    print(c)
    per_label_stats = nlst_preds.groupby('label')[c].describe()
    display(per_label_stats)
    sns.histplot(data=nlst_preds, x=c, hue="label", bins=25, multiple='stack')
    plt.show()

In [None]:
# Left-closed age bins: [55, 60), [60, 65), [65, 70), [70, 78).
# Ages outside 55 <= Age < 78 get no bin (NaN).
age_edges = [55, 60, 65, 70, 78]
nlst_preds['bin_Age'] = pd.cut(nlst_preds['Age'], bins=age_edges, right=False)
age_label_stats = nlst_preds.groupby('bin_Age')['label'].describe()
display(age_label_stats)
plot_by_category(nlst_preds, 'bin_Age')

In [None]:
# Right-closed height bins: (48, 68] and (68, 88].
# A height of exactly 48, or anything outside the range, gets no bin (NaN).
height_edges = [48, 68, 88]
nlst_preds['bin_height'] = pd.cut(nlst_preds['height'], bins=height_edges, right=True)
height_stats = nlst_preds.groupby('bin_height')[['label', 'height']].describe()
display(height_stats)
plot_by_category(nlst_preds, 'bin_height')

In [None]:
# Right-closed weight bins: (79, 179] and (179, 279].
# A weight of exactly 79, or anything outside the range, gets no bin (NaN).
weight_edges = [79, 179, 279]
nlst_preds['bin_weight'] = pd.cut(nlst_preds['weight'], bins=weight_edges, right=True)
weight_stats = nlst_preds.groupby('bin_weight')[['label', 'weight']].describe()
display(weight_stats)
plot_by_category(nlst_preds, 'bin_weight')

### Smoking columns

In [None]:
# For each numeric 'smoke' column: summary statistics per label value,
# followed by a 25-bin histogram stacked by label.
for c in nlst_democols['num']['smoke']:
    print(c)
    per_label_stats = nlst_preds.groupby('label')[c].describe()
    display(per_label_stats)
    sns.histplot(data=nlst_preds, x=c, hue="label", bins=25, multiple='stack')
    plt.show()

In [None]:
# Split every numeric 'smoke' column into two quantile bins (below / above the median)
# and plot model performance for each half.
for c in nlst_democols['num']['smoke']:
    bin_col = f'bin_{c}'
    nlst_preds[bin_col] = pd.qcut(nlst_preds[c], q=2)
    half_stats = nlst_preds.groupby(bin_col)[['label', c]].describe()
    display(half_stats)
    plot_by_category(nlst_preds, bin_col)

### Included columns from predictions

In [None]:
# Numeric columns that ship with the predictions file.
# 'Diameter [mm]' is listed in nodule_specific_cols and is dropped when the scan-level
# table is built, so indexing nlst_preds with it raises KeyError. Keep only the columns
# that actually exist and report the ones that were skipped.
requested_num_cols = ["Diameter [mm]", "NoduleCounts"]
included_num_cols = [c for c in requested_num_cols if c in nlst_preds.columns]
skipped_num_cols = [c for c in requested_num_cols if c not in nlst_preds.columns]
if skipped_num_cols:
    print(f"Skipping columns not present in nlst_preds: {skipped_num_cols}")

for c in included_num_cols:
    print(c)
    display(nlst_preds.groupby('label')[c].describe())
    sns.histplot(nlst_preds, x=c, bins=25, hue="label", multiple='stack')
    plt.show()

In [None]:
# Split each included numeric column into (up to) two quantile bins and plot per bin.
# duplicates='drop' merges bins whose edges coincide instead of raising.
for c in included_num_cols:
    if c not in nlst_preds.columns:
        # e.g. 'Diameter [mm]', which is dropped with the other nodule-specific columns;
        # indexing nlst_preds with it would raise KeyError.
        print(f"Skipping {c}: not a column of nlst_preds")
        continue
    nlst_preds[f'bin_{c}'] = pd.qcut(nlst_preds[c], q=2, duplicates='drop')
    display(nlst_preds.groupby(f'bin_{c}')[['label', f'{c}']].describe())
    plot_by_category(nlst_preds, f'bin_{c}')

## Further binning categorical columns

In [None]:
# Number of scans for every (race, ethnic) combination; rows with a missing value are not counted.
nlst_preds.value_counts(subset=['race', 'ethnic'])

In [None]:
# True only where race is coded 1.0 AND ethnic is coded 2.0; rows with a missing value are False.
# NOTE(review): presumably race 1 = White and ethnic 2 = not Hispanic -- confirm against the data dictionary.
is_race_1 = nlst_preds['race'].eq(1.0)
is_ethnic_2 = nlst_preds['ethnic'].eq(2.0)
non_hispanic_white = is_race_1 & is_ethnic_2
nlst_preds['NonHispanicWhite'] = non_hispanic_white
plot_by_category(nlst_preds, 'NonHispanicWhite')

In [None]:
# True for every row whose educat code is not 1.0; rows with a missing educat also
# compare unequal and are therefore True.
# NOTE(review): confirm against the data dictionary that code 1 is the only level
# that should count as below high school.
high_school_plus = nlst_preds['educat'].ne(1.0)
nlst_preds['HighSchoolPlus'] = high_school_plus
plot_by_category(nlst_preds, 'HighSchoolPlus')

In [None]:
# True for every row whose marital code is NOT 2.0 (rows with a missing value included).
# NOTE(review): if code 2 means "married" in the NLST data dictionary, this flag is inverted
# relative to its name -- TODO confirm the coding before interpreting the 'Married' split.
married = nlst_preds['marital'].ne(2.0)
nlst_preds['Married'] = married
plot_by_category(nlst_preds, 'Married')

## Investigate links