In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from IPython.display import display, Markdown
from scipy.stats import pearsonr, spearmanr, ks_2samp, mannwhitneyu, ttest_ind

import sys
sys.path.append('../')
from utilities import data

## Directories where results are stored.
## Primary location on the shared experiment storage:
EXPERIMENT_DIR = "/data/bodyct/experiments/lung-malignancy-fairness-shaurya"
## Teams/OneDrive backup copy of the same results:
TEAMS_DIR = "C:/Users/shaur/OneDrive - Radboudumc/Documents - Master - Shaurya Gaur/General/Malignancy-Estimation Results"

## Prefer the primary directory and fall back to the Teams backup only when the
## primary one is not reachable, so no line has to be commented in or out by hand.
NLST_PREDS = f"{EXPERIMENT_DIR}/nlst"
if not os.path.isdir(NLST_PREDS):
    NLST_PREDS = f"{TEAMS_DIR}/nlst"

In [None]:
# Load the prediction CSV from NLST_PREDS into `nlst_preds_nodule`
# and print its column names, dtypes and non-null counts.
nlst_preds_nodule = pd.read_csv(f"{NLST_PREDS}/nlst_demov4_allmodels_cal.csv")
nlst_preds_nodule.info()

In [None]:
# Load the JSON file describing the column groups; later cells index it as
# nlst_democols['cat'][<category>] and nlst_democols['num'][<category>].
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(f'{NLST_PREDS}/nlst_demo_v4_cols.json') as json_data:
    nlst_democols = json.load(json_data)

In [None]:
# nlst_democols['num'].pop('nodule')

In [None]:
# Register the entropy column as a numerical attribute in the 'other' group.
# Guarded so that re-running this cell does not add the column a second time.
if 'Mean_Entropy_Kiran' not in nlst_democols['num']['other']:
    nlst_democols['num']['other'].append('Mean_Entropy_Kiran')

In [None]:
# Display the column-group dictionary for inspection.
nlst_democols

In [None]:
# Run the project helper on the raw table. It returns three values: the prepared
# frame, a column dictionary (rebinding `nlst_democols`) and MODELS.
# NOTE(review): the meaning of scanlevel / tijmen / sybil is defined in
# utilities.data and is not visible in this notebook.
nlst_preds, nlst_democols, MODELS = data.prep_nlst_preds(nlst_preds_nodule, nlst_democols, scanlevel=False, tijmen=False, sybil=False)
nlst_preds.info()

In [None]:
# nlst_policy_thresholds = pd.read_csv(f"{NLST_PREDS}/policy-thresholds-{len(nlst_preds)}.csv", index_col=0)
# nlst_policy_thresholds

In [None]:
# Name of the policy-threshold column. In this notebook it is only referenced by
# the commented-out threshold lookup; the active code uses a fixed cut-off instead.
THRESHOLD = 'Brock'

In [None]:
# Keep only race codes 1 and 2 (the codes the "Racial differences" cells query
# as white and black); every other value becomes NaN. A whitelist is used so that
# codes outside 3-6 cannot leak into a column named WhiteOrBlack.
# The column is added to the raw nodule-level table as well, because the
# FP/FN/TP/TN subsets are cut from `nlst_preds_nodule` and are later grouped by it.
for _frame in (nlst_preds, nlst_preds_nodule):
    _frame['WhiteOrBlack'] = _frame['race'].where(_frame['race'].isin([1, 2]))

# Guarded so that re-running this cell does not register the column twice.
if 'WhiteOrBlack' not in nlst_democols['cat']['demo']:
    nlst_democols['cat']['demo'].append('WhiteOrBlack')

In [None]:
# Display name of each model -> name of the column holding its score.
MODEL_TO_COL = dict([
    ("Venkadesh", "DL_cal"),
    # ("de Haas Combined", "Thijmen_mean_cal"),
    ("de Haas Local", "Thijmen_local_cal"),
    ("de Haas Global (hidden nodule)", "Thijmen_global_hidden"),
    ("de Haas Global (w/nodule)", "Thijmen_global_show_cal"),
    ("Sybil", "sybil_year1"),
    ("PanCan2b", "PanCan2b"),
])

In [None]:
# nlst_preds['Kiran_pred_label'] = (nlst_preds[MODEL_TO_COL['Venkadesh']] > nlst_policy_thresholds.loc['Venkadesh', THRESHOLD]).astype(int).to_numpy()

# Fixed cut-off applied to the Venkadesh score (replaces the policy-threshold lookup above).
VENKADESH_THRESHOLD = 0.06

# Derive the same two columns on BOTH tables:
#   Kiran_pred_label  - 1 when the Venkadesh score exceeds the cut-off, else 0
#   Kiran_PanCan_diff - Venkadesh score minus PanCan2b score
# The FP/FN/TP/TN subsets are cut from `nlst_preds_nodule` and are later sorted and
# plotted by `Kiran_PanCan_diff`, so the difference must exist there as well.
for _frame in (nlst_preds, nlst_preds_nodule):
    _frame['Kiran_pred_label'] = (_frame[MODEL_TO_COL['Venkadesh']] > VENKADESH_THRESHOLD).astype(int).to_numpy()
    _frame['Kiran_PanCan_diff'] = _frame[MODEL_TO_COL['Venkadesh']] - _frame['PanCan2b']

In [None]:
# Split the nodule-level table into the four confusion-matrix cells of the
# thresholded Venkadesh prediction (`Kiran_pred_label`) against `label`.
false_positives, false_negatives, true_positives, true_negatives = (
    nlst_preds_nodule.query(condition)
    for condition in (
        "label == 0 and Kiran_pred_label == 1",
        "label == 1 and Kiran_pred_label == 0",
        "label == 1 and Kiran_pred_label == 1",
        "label == 0 and Kiran_pred_label == 0",
    )
)

## Difference between Training Info Splits

In [None]:
# Named confusion-matrix subsets compared by the summary helpers below.
result_sets = dict(
    FP=false_positives,
    FN=false_negatives,
    TP=true_positives,
    TN=true_negatives,
)

### utility code

In [None]:
def combine_col_dfs(cols=None, df_func=pd.DataFrame, dfsets=None, dispdf=False):
    """Apply ``df_func`` to every attribute and stack the results into one long table.

    Parameters
    ----------
    cols : dict[str, list[str]], optional
        Mapping of category name -> list of attribute (column) names.
        Defaults to ``nlst_democols['cat']``, looked up when the function is called
        (not when it is defined), so later changes to the notebook global are seen.
    df_func : callable
        Called as ``df_func(attribute, dfsets)``; must return a DataFrame whose index
        holds the attribute values (or statistic names). Every call in this notebook
        passes ``cat_dist_df`` or ``num_dist_df``; the ``pd.DataFrame`` default is kept
        only for backward compatibility and is unlikely to be useful on its own.
    dfsets : dict[str, pandas.DataFrame], optional
        Named subsets forwarded to ``df_func``. Defaults to the notebook-level
        ``result_sets``, looked up at call time.
    dispdf : bool
        When True, display a markdown heading per category and each per-attribute table.

    Returns
    -------
    pandas.DataFrame
        One row per (category, attribute, value), with the identifying columns
        ``category``, ``attribute`` and ``value`` first, followed by the columns
        produced by ``df_func``. Rows are sorted by ``value`` within each attribute.
        An empty frame with only the three identifying columns is returned when
        there is nothing to combine.
    """
    if cols is None:
        cols = nlst_democols['cat']
    if dfsets is None:
        dfsets = result_sets

    lead_cols = ['category', 'attribute', 'value']
    splitdfs = []
    for cat in cols:
        if dispdf:
            display(Markdown(f"### {cat}"))

        for c in cols[cat]:
            df = df_func(c, dfsets)
            if dispdf:
                display(df)

            # Work on a new frame so the table returned by df_func is left untouched.
            tagged = df.assign(category=cat, attribute=c, value=df.index.values)
            other_cols = [col for col in tagged.columns if col not in lead_cols]
            tagged = (
                tagged[lead_cols + other_cols]
                .reset_index(drop=True)
                .sort_values(by='value', ascending=True)
            )
            splitdfs.append(tagged)

    # pd.concat raises on an empty list; return an empty, correctly shaped table instead.
    if not splitdfs:
        return pd.DataFrame(columns=lead_cols)
    return pd.concat(splitdfs, axis=0, ignore_index=True)

In [None]:
def cat_dist_df(c='Gender', dfsets=None):
    """Compare the distribution of a categorical column across named subsets.

    Parameters
    ----------
    c : str
        Name of the categorical column to summarise.
    dfsets : dict[str, pandas.DataFrame], optional
        Named subsets to compare. Defaults to the notebook-level ``result_sets``,
        looked up when the function is called.

    Returns
    -------
    pandas.DataFrame
        Indexed by the values of ``c`` (missing values are counted as their own
        level). For every subset ``m`` there is ``{m}_freq`` (row count) and
        ``{m}_norm`` (share of the subset in percent); a value absent from a subset
        gets 0 in both. For every pair of subsets (in dict order) there is
        ``diff_norm_{m1}_{m2}``, the difference in percentage points rounded to 4
        decimals.
    """
    if dfsets is None:
        dfsets = result_sets

    dfdict = {}
    for m in dfsets:
        dfdict[f"{m}_freq"] = dfsets[m][c].value_counts(normalize=False, dropna=False).astype(int)
        dfdict[f"{m}_norm"] = 100 * dfsets[m][c].value_counts(normalize=True, dropna=False).round(6)

    # Building the frame aligns all subsets on the union of observed values; a value
    # that never occurs in a subset shows up as NaN there and is a genuine zero.
    # Rows are NOT de-duplicated: two different values with identical statistics are
    # still two different values and must both be reported.
    df = pd.DataFrame(dfdict).fillna(0.0)

    names = list(dfsets)
    for i, m1 in enumerate(names):
        for m2 in names[i + 1:]:
            df[f"diff_norm_{m1}_{m2}"] = (df[f"{m1}_norm"] - df[f"{m2}_norm"]).round(4)

    return df

In [None]:
def num_dist_df(c='Gender', dfsets=None):
    """Compare the mean and median of a numerical column across named subsets.

    Parameters
    ----------
    c : str
        Name of the numerical column to summarise.
    dfsets : dict[str, pandas.DataFrame], optional
        Named subsets to compare. Defaults to the notebook-level ``result_sets``,
        looked up when the function is called.

    Returns
    -------
    pandas.DataFrame
        Two rows, ``mean`` and ``50%`` (median). One column per subset holding the
        statistic rounded to 4 decimals, plus ``diff_{m1}_{m2}`` for every pair of
        subsets (in dict order), also rounded to 4 decimals.
    """
    if dfsets is None:
        dfsets = result_sets

    dfdict = {}
    for m in dfsets:
        dfdict[f"{m}"] = dfsets[m][c].describe(percentiles=[0.5]).round(4)

    names = list(dfsets)
    for i, m1 in enumerate(names):
        for m2 in names[i + 1:]:
            # Rounded again so floating-point noise from the subtraction is not shown.
            dfdict[f"diff_{m1}_{m2}"] = (dfdict[f"{m1}"] - dfdict[f"{m2}"]).round(4)

    # Select the two wanted statistics by label. De-duplicating rows and then dropping
    # count/max/min/std by name breaks when statistics coincide (e.g. constant data,
    # where min, median and max are identical rows).
    return pd.DataFrame(dfdict).loc[['mean', '50%']]

### differences

In [None]:
# Categorical attribute distributions across FP/FN/TP/TN.
# Rows whose `value` equals 0 are removed by the query
# (presumably the "absent" level of 0/1-coded attributes - TODO confirm).
cat_demo_splits = combine_col_dfs(nlst_democols['cat'], cat_dist_df, result_sets).query('value != 0')
# 30 rows with the largest FP-minus-FN share, then the 30 with the smallest.
display(cat_demo_splits.sort_values(by='diff_norm_FP_FN', ascending=False).head(30))
cat_demo_splits.sort_values(by='diff_norm_FP_FN', ascending=True).head(30)

In [None]:
# Numerical attribute summaries (per num_dist_df) across FP/FN/TP/TN.
num_demo_splits = combine_col_dfs(nlst_democols['num'], num_dist_df, result_sets)
# 30 rows with the largest FP-minus-FN difference, then the 30 with the smallest.
display(num_demo_splits.sort_values(by='diff_FP_FN', ascending=False).head(30))
num_demo_splits.sort_values(by='diff_FP_FN', ascending=True).head(30)

### Now with the 100 cases whose scores differed most from PanCan

In [None]:
# The 100 cases per error type where the Venkadesh score departs most from PanCan2b.
# Kiran_PanCan_diff is (Venkadesh score - PanCan2b score), so:
#   - false positives: largest positive differences -> sort descending
#   - false negatives: most negative differences    -> sort ascending
# The ascending order for false negatives matches how the "False Negatives"
# section of this notebook ranks the same column.
result_top_100_diff = {
    "FP": false_positives.sort_values(by=['Kiran_PanCan_diff'], ascending=False).head(100),
    "FN": false_negatives.sort_values(by=['Kiran_PanCan_diff'], ascending=True).head(100),
}

In [None]:
# Categorical attribute distributions for the top-100 FP and FN subsets.
# NOTE: rebinds `cat_demo_splits` from the earlier all-cases table.
cat_demo_splits = combine_col_dfs(nlst_democols['cat'], cat_dist_df, result_top_100_diff).query('value != 0')
# 30 rows with the largest FP-minus-FN share, then the 30 with the smallest.
display(cat_demo_splits.sort_values(by='diff_norm_FP_FN', ascending=False).head(30))
cat_demo_splits.sort_values(by='diff_norm_FP_FN', ascending=True).head(30)

In [None]:
# Numerical attribute summaries for the top-100 FP and FN subsets.
# NOTE: rebinds `num_demo_splits` from the earlier all-cases table.
num_demo_splits = combine_col_dfs(nlst_democols['num'], num_dist_df, result_top_100_diff)
# 30 rows with the largest FP-minus-FN difference, then the 30 with the smallest.
display(num_demo_splits.sort_values(by='diff_FP_FN', ascending=False).head(30))
num_demo_splits.sort_values(by='diff_FP_FN', ascending=True).head(30)

## False Positives

In [None]:
# Same selection as the `false_positives` defined earlier in the notebook,
# recomputed from the nodule-level table.
false_positives = nlst_preds_nodule.query("label == 0 and Kiran_pred_label == 1")
# Show label, prediction and both scores, ordered by score difference (largest first).
# NOTE(review): requires a `Kiran_PanCan_diff` column on `nlst_preds_nodule` - confirm it exists.
false_positives.sort_values(by=['Kiran_PanCan_diff'], ascending=False)[['label', 'Kiran_pred_label', 'PanCan2b', 'DL_cal', 'Kiran_PanCan_diff']]

In [None]:
# Stacked histogram of the score difference among false positives, coloured by `race`.
sns.histplot(false_positives, x='Kiran_PanCan_diff', hue='race', multiple='stack')

### Racial differences

In [None]:
# False positives split into the two race subgroups compared below
# (race code 1 is labelled "white", race code 2 "black").
race_fps = dict(
    white=false_positives.query("race == 1"),
    black=false_positives.query("race == 2"),
)

In [None]:
# Categorical attribute distributions among false positives, white vs. black.
# NOTE: rebinds `cat_demo_splits`.
cat_demo_splits = combine_col_dfs(nlst_democols['cat'], cat_dist_df, race_fps).query('value != 0')
# 30 rows with the largest white-minus-black share, then the 30 with the smallest.
display(cat_demo_splits.sort_values(by='diff_norm_white_black', ascending=False).head(30))
cat_demo_splits.sort_values(by='diff_norm_white_black', ascending=True).head(30)

In [None]:
# Numerical attribute summaries among false positives, white vs. black.
# NOTE: rebinds `num_demo_splits`.
num_demo_splits = combine_col_dfs(nlst_democols['num'], num_dist_df, race_fps)
# 30 rows with the largest white-minus-black difference, then the 30 with the smallest.
display(num_demo_splits.sort_values(by='diff_white_black', ascending=False).head(30))
num_demo_splits.sort_values(by='diff_white_black', ascending=True).head(30)

## False Negatives

In [None]:
# Show label, prediction and both scores for false negatives, ordered by score
# difference (most negative first).
# NOTE(review): requires a `Kiran_PanCan_diff` column on the nodule-level subsets - confirm it exists.
false_negatives.sort_values(by=['Kiran_PanCan_diff'], ascending=True)[['label', 'Kiran_pred_label', 'PanCan2b', 'DL_cal', 'Kiran_PanCan_diff']]

In [None]:
# Stacked histogram of the score difference among false negatives, coloured by `race`.
sns.histplot(false_negatives, x='Kiran_PanCan_diff', hue='race', multiple='stack')

### Racial differences

In [None]:
# False negatives split into the two race subgroups compared below
# (race code 1 is labelled "white", race code 2 "black").
race_fns = dict(
    white=false_negatives.query("race == 1"),
    black=false_negatives.query("race == 2"),
)

In [None]:
# Print how many false negatives fall in each race subgroup.
for group_name, group_df in race_fns.items():
    print(group_name, len(group_df))

In [None]:
# Categorical attribute distributions among false negatives, white vs. black.
# NOTE: rebinds `cat_demo_splits`; the next two cells filter this table.
cat_demo_splits = combine_col_dfs(nlst_democols['cat'], cat_dist_df, race_fns).query('value != 0')
# 30 rows with the largest white-minus-black share, then the 30 with the smallest.
display(cat_demo_splits.sort_values(by='diff_norm_white_black', ascending=False).head(30))
cat_demo_splits.sort_values(by='diff_norm_white_black', ascending=True).head(30)

In [None]:
# Same table restricted to attributes in the "nodule" category, shown in both sort orders.
display(cat_demo_splits.sort_values(by='diff_norm_white_black', ascending=False).query('category == "nodule"'))
cat_demo_splits.sort_values(by='diff_norm_white_black', ascending=True).query('category == "nodule"')   

In [None]:
# Same table restricted to the `LC_stage` attribute, shown in both sort orders.
display(cat_demo_splits.sort_values(by='diff_norm_white_black', ascending=False).query('attribute == "LC_stage"'))
cat_demo_splits.sort_values(by='diff_norm_white_black', ascending=True).query('attribute == "LC_stage"')   

In [None]:
# Numerical attribute summaries among false negatives, white vs. black.
# NOTE: rebinds `num_demo_splits`.
num_demo_splits = combine_col_dfs(nlst_democols['num'], num_dist_df, race_fns)
# 30 rows with the largest white-minus-black difference, then the 30 with the smallest.
display(num_demo_splits.sort_values(by='diff_white_black', ascending=False).head(30))
num_demo_splits.sort_values(by='diff_white_black', ascending=True).head(30)

In [None]:
# Distribution of `pkyr` per WhiteOrBlack group; common_norm=False normalises each
# group separately so groups of different size can be compared.
sns.histplot(data=nlst_preds, x='pkyr', hue='WhiteOrBlack', common_norm=False, element='bars', kde=True, stat='probability')

## Uncertainty

In [None]:
# Entropy distribution per NoduleType among rows with label == 0
# (each group normalised separately), followed by per-group summary statistics.
sns.histplot(data=nlst_preds.query('label == 0'), x='Mean_Entropy_Kiran', hue='NoduleType', common_norm=False, element='bars', kde=True, stat='probability')
nlst_preds.query('label == 0').groupby('NoduleType')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy distribution per NoduleType among rows with label == 1
# (each group normalised separately), followed by per-group summary statistics.
sns.histplot(data=nlst_preds.query('label == 1'), x='Mean_Entropy_Kiran', hue='NoduleType', common_norm=False, element='bars', kde=True, stat='probability')
nlst_preds.query('label == 1').groupby('NoduleType')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy distribution per WhiteOrBlack group among rows with label == 0
# (each group normalised separately), followed by per-group summary statistics.
sns.histplot(data=nlst_preds.query('label == 0'), x='Mean_Entropy_Kiran', hue='WhiteOrBlack', common_norm=False, element='bars', kde=True, stat='probability')
nlst_preds.query('label == 0').groupby('WhiteOrBlack')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy distribution per WhiteOrBlack group among rows with label == 1
# (each group normalised separately), followed by per-group summary statistics.
sns.histplot(data=nlst_preds.query('label == 1'), x='Mean_Entropy_Kiran', hue='WhiteOrBlack', common_norm=False, element='bars', kde=True, stat='probability')
nlst_preds.query('label == 1').groupby('WhiteOrBlack')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy density per WhiteOrBlack group among false negatives, then summary statistics.
# NOTE(review): `false_negatives` is cut from `nlst_preds_nodule`; confirm that table
# carries a `WhiteOrBlack` column.
sns.histplot(data=false_negatives, x='Mean_Entropy_Kiran', hue='WhiteOrBlack', common_norm=False, element='bars', kde=True, stat='density')
false_negatives.groupby('WhiteOrBlack')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy density per WhiteOrBlack group among false positives, then summary statistics.
# NOTE(review): `false_positives` is cut from `nlst_preds_nodule`; confirm that table
# carries a `WhiteOrBlack` column.
sns.histplot(data=false_positives, x='Mean_Entropy_Kiran', hue='WhiteOrBlack', common_norm=False, element='bars', kde=True, stat='density')
false_positives.groupby('WhiteOrBlack')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy density per level of the `diaghype` column over all rows of `nlst_preds`,
# then per-level summary statistics.
sns.histplot(data=nlst_preds, x='Mean_Entropy_Kiran', hue='diaghype', common_norm=False, element='bars', kde=True, stat='density')
nlst_preds.groupby('diaghype')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy density per level of `diaghype` among false negatives, then summary statistics.
sns.histplot(data=false_negatives, x='Mean_Entropy_Kiran', hue='diaghype', common_norm=False, element='bars', kde=True, stat='density')
false_negatives.groupby('diaghype')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy density per level of `diaghype` among false positives, then summary statistics.
sns.histplot(data=false_positives, x='Mean_Entropy_Kiran', hue='diaghype', common_norm=False, element='bars', kde=True, stat='density')
false_positives.groupby('diaghype')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy density per level of `diaghype` among rows with label == 1, then summary statistics.
sns.histplot(data=nlst_preds.query('label == 1'), x='Mean_Entropy_Kiran', hue='diaghype', common_norm=False, element='bars', kde=True, stat='density')
nlst_preds.query('label == 1').groupby('diaghype')['Mean_Entropy_Kiran'].describe()

In [None]:
# Entropy density per level of `diaghype` among rows with label == 0, then summary statistics.
sns.histplot(data=nlst_preds.query('label == 0'), x='Mean_Entropy_Kiran', hue='diaghype', common_norm=False, element='bars', kde=True, stat='density')
nlst_preds.query('label == 0').groupby('diaghype')['Mean_Entropy_Kiran'].describe()

### Uncertainty by all factors

In [None]:
# Number of LC_stage groups present among rows with label == 0.
# The same "how many groups" expression is used as a guard in get_uncertainty_by_attributes.
len(nlst_preds_nodule.query('label == 0').groupby('LC_stage')['Mean_Entropy_Kiran'])

In [None]:
def get_uncertainty_by_attributes(df, democols=None, value_col='Mean_Entropy_Kiran'):
    """Relate an uncertainty column to every categorical and numerical attribute.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``value_col`` and all attribute columns named in ``democols``.
    democols : dict, optional
        Column dictionary with ``democols['cat'][category]`` and
        ``democols['num'][category]`` lists of column names. Defaults to the
        notebook-level ``nlst_democols``, looked up when the function is called.
    value_col : str
        Name of the uncertainty column; defaults to ``'Mean_Entropy_Kiran'``.

    Returns
    -------
    (categorical_df, numerical_df) : tuple of pandas.DataFrame
        ``categorical_df`` has one row per categorical attribute with at least two
        subgroups that contain non-missing values. It compares the two largest
        subgroups (size, mean, std, median, IQR, differences group1 - group2) and
        reports two-sided Mann-Whitney U and Kolmogorov-Smirnov p-values. Sorted by
        ``p-kstest``, ``p-mannwhitney``, ``diff_avg`` (ascending).
        ``numerical_df`` has one row per numerical attribute with the Spearman
        correlation against ``value_col`` and its p-value, sorted by ``p``.
        Either frame is empty (no columns) when there was nothing to report.
    """
    if democols is None:
        democols = nlst_democols

    infodicts = []
    for category in democols['cat']:
        for attribute in democols['cat'][category]:
            grouped = df.groupby(attribute)[value_col]
            # Guard kept from the original: skip attributes with no groups at all
            # before asking for summary statistics.
            if len(grouped) == 0:
                continue

            uncertainty_df = grouped.describe()
            # A subgroup whose values are all missing has count 0 and NaN statistics;
            # it cannot take part in a comparison.
            uncertainty_df = uncertainty_df[uncertainty_df['count'] > 0]
            if len(uncertainty_df) < 2:
                continue

            ## Get top 2 subgroups.
            top_two = uncertainty_df.sort_values(by='count', ascending=False).head(2)

            # Placeholders first so the column order of the result stays
            # category, attribute, diff_*, p-*, group1*, group2*.
            info = {
                "category": category,
                "attribute": attribute,
                "diff_avg": 0,
                "diff_med": 0,
                "p-mannwhitney": 1,
                "p-kstest": 1,
            }
            for i, (subgroup, stats) in enumerate(top_two.iterrows(), start=1):
                info[f"group{i}"] = subgroup
                info[f"group{i}_num"] = stats['count']
                info[f"group{i}_avg"] = stats['mean']
                info[f"group{i}_std"] = stats['std']
                info[f"group{i}_med"] = stats['50%']
                info[f"group{i}_iqr"] = stats['75%'] - stats['25%']

            info["diff_avg"] = info["group1_avg"] - info["group2_avg"]
            info["diff_med"] = info["group1_med"] - info["group2_med"]

            # Extract each sample once; missing values are dropped here, which is
            # what nan_policy='omit' did, without needing that keyword in scipy.
            sample1 = df.loc[df[attribute] == info["group1"], value_col].dropna()
            sample2 = df.loc[df[attribute] == info["group2"], value_col].dropna()

            _, info["p-mannwhitney"] = mannwhitneyu(sample1, sample2, alternative='two-sided')
            _, info["p-kstest"] = ks_2samp(sample1, sample2, alternative='two-sided')

            infodicts.append(info)

    categorical_df = pd.DataFrame(infodicts)
    # An empty frame has no columns to sort by.
    if not categorical_df.empty:
        categorical_df = categorical_df.sort_values(
            by=['p-kstest', 'p-mannwhitney', 'diff_avg'], ascending=[True, True, True])

    numinfo = []
    for category in democols['num']:
        for attribute in democols['num'][category]:
            df2 = df.dropna(axis=0, subset=[attribute, value_col])
            # A rank correlation needs at least two complete observations.
            if len(df2) >= 2:
                src, pval = spearmanr(df2[value_col], df2[attribute])
            else:
                src, pval = float('nan'), float('nan')
            numinfo.append({
                "category": category,
                "attribute": attribute,
                "correlation": src,
                "p": pval,
            })

    numerical_df = pd.DataFrame(numinfo)
    if not numerical_df.empty:
        numerical_df = numerical_df.sort_values(by='p', ascending=True)
    return categorical_df, numerical_df


In [None]:
# Uncertainty vs. attributes over all rows of the raw nodule-level table:
# subgroup comparison table first, correlation table second.
nodule_uncertainty_diffs, nodule_uncertainty_corrs = get_uncertainty_by_attributes(nlst_preds_nodule)
display(nodule_uncertainty_diffs)
display(nodule_uncertainty_corrs)

In [None]:
# Uncertainty vs. attributes over all rows of the prepared table `nlst_preds`.
# NOTE(review): the results are named `scan_*`, but `nlst_preds` was prepared with
# scanlevel=False - confirm the intended granularity.
scan_uncertainty_diffs, scan_uncertainty_corrs = get_uncertainty_by_attributes(nlst_preds)
display(scan_uncertainty_diffs)
display(scan_uncertainty_corrs)

In [None]:
# Uncertainty vs. attributes restricted to nodule-level rows with label == 1.
malignant_uncertainty_diffs, malignant_uncertainty_corrs = get_uncertainty_by_attributes(nlst_preds_nodule.query('label == 1'))
display(malignant_uncertainty_diffs)
display(malignant_uncertainty_corrs)

In [None]:
# Uncertainty vs. attributes restricted to nodule-level rows with label == 0.
benign_uncertainty_diffs, benign_uncertainty_corrs = get_uncertainty_by_attributes(nlst_preds_nodule.query('label == 0'))
display(benign_uncertainty_diffs)
display(benign_uncertainty_corrs)

In [None]:
# Uncertainty vs. attributes among false positives.
fp_uncertainty_diffs, fp_uncertainty_corrs = get_uncertainty_by_attributes(false_positives)
display(fp_uncertainty_diffs)
display(fp_uncertainty_corrs)

In [None]:
# Uncertainty vs. attributes among false negatives.
fn_uncertainty_diffs, fn_uncertainty_corrs = get_uncertainty_by_attributes(false_negatives)
display(fn_uncertainty_diffs)
display(fn_uncertainty_corrs)

In [None]:
# Uncertainty vs. attributes among true positives.
tp_uncertainty_diffs, tp_uncertainty_corrs = get_uncertainty_by_attributes(true_positives)
display(tp_uncertainty_diffs)
display(tp_uncertainty_corrs)

In [None]:
# Uncertainty vs. attributes among rows predicted positive (Kiran_pred_label == 1).
# NOTE(review): this uses `nlst_preds`, whereas the FP/FN/TP cells above use subsets
# of `nlst_preds_nodule` - confirm the difference is intended.
positive_uncertainty_diffs, positive_uncertainty_corrs = get_uncertainty_by_attributes(nlst_preds.query('Kiran_pred_label == 1'))
display(positive_uncertainty_diffs)
display(positive_uncertainty_corrs)

In [None]:
# Uncertainty vs. attributes among rows predicted negative (Kiran_pred_label == 0).
# NOTE(review): this uses `nlst_preds`, whereas the FP/FN/TP cells above use subsets
# of `nlst_preds_nodule` - confirm the difference is intended.
negative_uncertainty_diffs, negative_uncertainty_corrs = get_uncertainty_by_attributes(nlst_preds.query('Kiran_pred_label == 0'))
display(negative_uncertainty_diffs)
display(negative_uncertainty_corrs)