In [None]:
import pandas as pd
import pickle
import re

from utilities import predictions
from utilities.info import *

In [None]:
# Show the output directory used by the `to_csv` calls below.
# NOTE(review): FILE_DIR is not defined in this notebook; presumably it comes from `utilities.info` — confirm.
FILE_DIR

NOTE: This includes the De Haas model (unused for MELBA paper). To load predictions, use these directories: `NLST_Tijmen_results`, `Tijmen_Local_NLST`, `Tijmen-Global-Hidden-NLST`, `Tijmen-Global-ShowNodule-NLST`.

## Load and Calibrate DLCST Predictions

In [None]:
# Read the all-models DLCST prediction table and print a column/dtype summary.
dlcst_csv_path = f"{INPUT_DIR}/dlcst_allmodels.csv"
dlcst_preds = pd.read_csv(dlcst_csv_path)
dlcst_preds.info()

In [None]:
# Run the score-distribution check on the uncalibrated DLCST model columns.
# The result is bound to `_` so the cell does not echo it.
dlcst_score_columns = ['Ensemble_Kiran', 'PanCan2b', 'thijmen_mean', 'sybil_year1']
_ = predictions.check_scoredists(dlcst_preds, dlcst_score_columns)

In [None]:
# For each listed model: add a `<model>_cal` column from `calibrate_preds`,
# then plot calibration and check score distributions for raw vs. calibrated.
for model in ('Ensemble_Kiran', 'thijmen_mean'):
    cal_col = f"{model}_cal"
    dlcst_labels = dlcst_preds['label']
    dlcst_preds[cal_col] = predictions.calibrate_preds(dlcst_preds[model], dlcst_labels)
    predictions.make_calibration_plots(
        dlcst_labels,
        dlcst_preds[model],
        dlcst_preds[cal_col],
        title=f'\n{model}',
    )
    predictions.check_scoredists(dlcst_preds, [model, cal_col])

In [None]:
# Persist the DLCST table (now including the `*_cal` columns) without the index.
dlcst_cal_csv_path = f"{FILE_DIR}/dlcst_allmodels_cal.csv"
dlcst_preds.to_csv(dlcst_cal_csv_path, index=False)
dlcst_preds.info()

## Load Venk21 and PanCan Predictions (NLST)

In [None]:
# Load the NLST Venk21 (DL) vs. PanCan predictions and normalise the diameter
# column name so it can be used as a plain identifier (e.g. in `query`).
# The rename is chained rather than done in place, so the frame is built in a
# single expression and nothing is mutated after creation.
venk21_pancan = (
    pd.read_csv(f"{INPUT_DIR}/NLST_DL_vs_PanCan_Venk21.csv")
    .rename(columns={'Diameter [mm]': 'Diameter_mm'})
)
venk21_pancan.info()

In [None]:
# Add a calibrated copy of the Venk21 `DL` score, then plot raw vs. calibrated.
venk21_labels = venk21_pancan['label']
venk21_pancan['DL_cal'] = predictions.calibrate_preds(venk21_pancan['DL'], venk21_labels)
_ = predictions.make_calibration_plots(
    venk21_labels,
    venk21_pancan['DL'],
    venk21_pancan['DL_cal'],
    title='\nVenkadesh 2021',
)

In [None]:
# Run the score-distribution check on the raw and calibrated DL scores and PanCan2b.
venk21_score_columns = ['DL', 'DL_cal', 'PanCan2b']
_ = predictions.check_scoredists(venk21_pancan, venk21_score_columns)

## Load De Haas Models

In [None]:
# One row per De Haas model variant: (model name, results directory, `local`, `valid`).
# `local` and `valid` are forwarded unchanged to `predictions.load_tijmen_results`.
# NOTE(review): the "validtion" spelling in the first path is kept as-is —
# presumably it matches the directory name on disk; confirm before correcting.
_tijmen_sources = [
    ("Thijmen_mean", rf"{INPUT_DIR}/NLST_Tijmen_results/final_layer_nlst_validtion_20240626", False, False),
    ("Thijmen_local", rf"{INPUT_DIR}/Tijmen Local NLST", True, True),
    ("Thijmen_global_hidden", rf"{INPUT_DIR}/Tijmen-Global-Hidden-NLST/clip_hidden_nod_global_nlst_logits_20240422", False, True),
    ("Thijmen_global_show", rf"{INPUT_DIR}/Tijmen-Global-ShowNodule-NLST/clip_show_nod_global_nlst_20240501", False, True),
]

tijmen_load_info = {
    name: {"base_path": base_path, "local": is_local, "valid": is_valid}
    for name, base_path, is_local, is_valid in _tijmen_sources
}

In [None]:
# Load every De Haas model variant, add a calibrated score column, plot it,
# write the per-model table next to its inputs, and keep it for merging later.
tijmen_dfs = {}
for model, load_cfg in tijmen_load_info.items():
    keep_cols = ['AnnotationID', 'label', model]
    # Column selection is applied directly to the loader's return value.
    df = predictions.load_tijmen_results(
        model_name=model,
        base_path=load_cfg['base_path'],
        local=load_cfg['local'],
        valid=load_cfg['valid'],
    )[keep_cols]

    cal_col = f"{model}_cal"
    df[cal_col] = predictions.calibrate_preds(df[model], df['label'])
    predictions.make_calibration_plots(df['label'], df[model], df[cal_col], title=f'\n{model}')
    predictions.check_scoredists(df, [model, cal_col])

    # File name is `combined_valid_output.csv` when `valid` is truthy, else `combined_output.csv`.
    # NOTE(review): unlike the other `to_csv` calls in this notebook, this one
    # writes the index column — confirm whether that is intended.
    valid_prefix = 'valid_' if load_cfg['valid'] else ''
    df.to_csv(f"{load_cfg['base_path']}/combined_{valid_prefix}output.csv")
    tijmen_dfs[model] = df

## Merge Venk21 and De Haas Predictions

In [None]:
# Left-join each De Haas table onto the Venk21/PanCan table by annotation and label.
# `suffixes=(None, None)` makes pandas raise if any non-key column name collides.
preds = venk21_pancan
merge_keys = ['AnnotationID', 'label']
for m, tijmen_df in tijmen_dfs.items():
    preds = preds.merge(tijmen_df, on=merge_keys, how='left', suffixes=(None, None))

preds.info()

## Get Series for Sybil Predictions

This selects the series from Venk21's cross-validation set that are NOT in Sybil's training set.

In [None]:
# Build (or reload) a table mapping each Sybil NLST series to its data split.
# When LOAD_SYBIL_PICKLE_FILE is True the table is rebuilt from the pickle and
# cached as CSV; otherwise the cached CSV is read back.
LOAD_SYBIL_PICKLE_FILE = True

if LOAD_SYBIL_PICKLE_FILE:
    # Despite the name, this path points at a pickle file, not JSON.
    SYBIL_SERIES_JSON_PATH = f"{INPUT_DIR}/sybil-nlst-info/sybil-nlst-pid_tp_series2split.p"
    # NOTE: unpickling can execute arbitrary code — only load this file from a trusted source.
    # The context manager closes the file handle once loading is done.
    with open(SYBIL_SERIES_JSON_PATH, "rb") as split_file:
        sybil_split_dict = pickle.load(split_file)

    ids = list(sybil_split_dict.keys())
    splits = list(sybil_split_dict.values())

    # Split each key once; keys are presumably of the form
    # "PID-<pid>__TimePoint-<tp>__Series-<uid>", which yields ['', pid, tp, uid].
    id_parts = [re.split('PID-|__TimePoint-|__Series-', i) for i in ids]
    pids = [parts[1] for parts in id_parts]
    timepoints = [parts[2] for parts in id_parts]
    siuids = [parts[3] for parts in id_parts]

    sybil_split_df = pd.DataFrame({'id': ids, 'split': splits, 'PatientID': pids, 'timepoint': timepoints, 'SeriesInstanceUID': siuids})

    sybil_split_df.to_csv(f"{INPUT_DIR}/sybil-nlst-info/sybil-nlst-splitinfo.csv", index=False)

else:
    sybil_split_df = pd.read_csv(f"{INPUT_DIR}/sybil-nlst-info/sybil-nlst-splitinfo.csv")

In [None]:
# Series in Sybil's "train" split, series in the merged predictions, and the
# merged-prediction series that are not in Sybil's "train" split.
is_sybil_train = sybil_split_df['split'] == "train"
sybil_train_ids = set(sybil_split_df.loc[is_sybil_train, 'SeriesInstanceUID'])
venk21_series_ids = set(preds['SeriesInstanceUID'])
unique_series_ids = venk21_series_ids.difference(sybil_train_ids)

print("Sybil training scans:", len(sybil_train_ids))
print("Venk21 scans (cross-val.):", len(venk21_series_ids))
print("Validation set scans:", len(unique_series_ids))

In [None]:
# Flag every row whose series is in Sybil's "train" split.
in_sybil_train = preds['SeriesInstanceUID'].isin(sybil_train_ids)
preds['InSybilTrain'] = in_sybil_train
preds.info()

In [None]:
# Keep only rows whose series is in `unique_series_ids`, save them as the
# Sybil inference input list, and show how many distinct series that covers.
needs_sybil_inference = preds['SeriesInstanceUID'].isin(unique_series_ids)
sybil_infer_input = preds[needs_sybil_inference]
sybil_infer_csv_path = f"{INPUT_DIR}/sybil-nlst-info/sybil_val_infer_series.csv"
sybil_infer_input.to_csv(sybil_infer_csv_path, index=False)
sybil_infer_input[['SeriesInstanceUID']].nunique()

After this, convert MHA to DICOM and run Sybil's inference on `DIAGNijmegen/bodyct-sybil-lung-cancer-risk`.

## Merge Sybil Predictions

NOTE: Sybil inference was run as two jobs: one for the 1172 scans in the intersection with the De Haas combined model's validation set, and a second for the remaining 4739 scans (1 scan failed). Ideally it would be run all at once; the two result files are merged here.

In [None]:
# Read the two Sybil inference result files and show their row counts.
sybil_infer_1172, sybil_infer_4739 = (
    pd.read_csv(f"{INPUT_DIR}/sybil-inference-1172.csv"),
    pd.read_csv(f"{INPUT_DIR}/sybil-inference-4739.csv"),
)
(len(sybil_infer_1172), len(sybil_infer_4739))

In [None]:
# Stack both result files, keep the first row per series, and prefix the
# `year1`..`year6` columns with `sybil_`.
sybil_year_renames = {f'year{year}': f'sybil_year{year}' for year in range(1, 7)}
sybil_infer = (
    pd.concat([sybil_infer_1172, sybil_infer_4739], axis=0, ignore_index=True)
    .drop_duplicates(subset='SeriesInstanceUID')
    .rename(columns=sybil_year_renames)
)
sybil_infer.info()

In [None]:
# Attach the Sybil scores to every prediction row by series.
# `validate='m:1'` makes pandas raise if `sybil_infer` has duplicate series;
# `suffixes=(None, None)` makes it raise on any colliding non-key column.
allpreds = preds.merge(
    sybil_infer,
    on=['SeriesInstanceUID'],
    how="left",
    validate='m:1',
    suffixes=(None, None),
)
allpreds.info()

## Split up Nodule Type Columns

This is to make the analysis a bit easier later on.

In [None]:
# One-hot encode the nodule type into columns named after each type value.
# The encoding consumes a duplicate column (`NoduleType2`) so the original
# `NoduleType` column is kept alongside the indicator columns.
allpreds = pd.get_dummies(
    allpreds.assign(NoduleType2=allpreds['NoduleType']),
    columns=['NoduleType2'],
    prefix='',
    prefix_sep='',
)
allpreds.info()

## Save Everything

In [None]:
# Write the fully merged NLST prediction table (all models) without the index.
nlst_allmodels_csv_path = f"{FILE_DIR}/nlst_allmodels.csv"
allpreds.to_csv(nlst_allmodels_csv_path, index=False)