# Imports

In [1]:
from mpl_toolkits.axes_grid1 import ImageGrid
from matplotlib.colors       import LinearSegmentedColormap
from skimage.transform       import resize
from plottify                import autosize
from sklearn                 import metrics
from PIL                     import Image
from adjustText              import adjust_text
from scipy.cluster           import hierarchy
import statsmodels.api   as sm
import matplotlib.pyplot as plt
import numpy             as np
import seaborn           as sns
import pandas            as pd
import scanpy            as sc
import matplotlib
import anndata
import random
import fastcluster
import copy
import umap
import h5py
import sys
import os

# Modify to include your local path.
sys.path.append('/media/adalberto/Disk2/PhD_Workspace')
from models.clustering.cox_proportional_hazard_regression_leiden_clusters import *
from models.visualization.attention_maps import *
from models.clustering.data_processing import *
from data_manipulation.data import Data

  warn("Tensorflow not installed; ParametricUMAP will be unavailable")


# Paper Figure - Clustermap Slides vs Clusters

In [14]:
# Workspace path.
main_path = '/media/adalberto/Disk2/PhD_Workspace'

# Image dataset variables.
dataset            = 'TCGAFFPE_LUADLUSC_5x_60pc'

############# Lungsubtype
meta_field     = 'luad'
matching_field = 'slides'
resolution     = 2.0
fold_number    = 4
groupby        = 'leiden_%s' % resolution
meta_folder    = 'lungtype_nn250_clusterfold4'
folds_pickle   = '/media/adalberto/Disk2/PhD_Workspace/utilities/files/LUADLUSC/lungsubtype_Institutions.pkl'

# Institutions.
inst_csv   = '/media/adalberto/Disk2/PhD_Workspace/utilities/files/TCGA/TCGA_Institutions.csv'
inst_frame = pd.read_csv(inst_csv)
inst_frame = inst_frame[inst_frame['Study Name'].isin(['Lung adenocarcinoma', 'Lung squamous cell carcinoma'])]

# Representations.
h5_complete_path   = '%s/results/BarlowTwins_3/TCGAFFPE_LUADLUSC_5x_60pc_250K/h224_w224_n3_zdim128_filtered/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_complete_lungsubtype_survival_filtered.h5' % main_path
h5_additional_path = None


In [15]:
frames = build_cohort_representations(meta_folder, meta_field, matching_field, groupby, fold_number, folds_pickle, h5_complete_path, h5_additional_path, 'percent', 100)
complete_df, additional_complete_df, frame_clusters, frame_samples, features = frames

In [None]:
def clustermap_representations(features, complete_df, frame_clusters, method_slides, metric_slides, figsize, fontsize_labels, fontsize_ticks, dendrogram_ratio):
    slide_rep_df = complete_df.iloc[1:].copy(deep=True)

    # Row and Columns colors
    row_lut = dict(zip(np.unique(slide_rep_df[meta_field]), ['blue', 'orange']))
    row_colors = pd.Series(slide_rep_df[meta_field].map(row_lut), name='LUSC/LUAD\nWSI\n')

    purity_color_map = LinearSegmentedColormap.from_list('cluster_purity', ['blue','orange'])
    purities = [purity if flag else 100-purity for purity, flag in zip(frame_clusters['Subtype Purity(%)'], frame_clusters[meta_field])]
    col_colors = pd.Series([matplotlib.colors.to_hex(purity_color_map(perc/100)) for perc in purities], name='HPC\nLUSC/LUAD\nPurity\n')

    g = sns.clustermap(slide_rep_df[features].astype(float)*100, vmin=0, vmax=100, row_colors=row_colors, col_colors=col_colors, col_linkage=None, row_linkage=None, method=method_slides, metric=metric_slides, cmap='rocket_r', figsize=figsize, dendrogram_ratio=dendrogram_ratio, tree_kws=dict(linewidths=3.0))

    # X ticks and labels
    g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xmajorticklabels(), fontsize=fontsize_ticks)
    g.ax_heatmap.set_xlabel('HPC', fontsize=fontsize_labels)
    # Y ticks and labels
    g.ax_heatmap.set_ylabel('Whole Slide Image (WSI)',   fontsize=fontsize_labels)
    g.ax_heatmap.set_yticks([])
    # Row color labels
    g.ax_row_colors.tick_params(axis='both', length=0)
    g.ax_row_colors.tick_params(axis='x', which='major', labelsize=fontsize_labels)
    # Column color labels
    g.ax_col_colors.tick_params(axis='both', length=0)
    g.ax_col_colors.tick_params(axis='y', which='major', labelsize=fontsize_labels)
    g.ax_cbar.tick_params(labelsize=fontsize_ticks)

    [label.set_fontweight('bold') for label in g.ax_col_colors.get_yticklabels()]
    [label.set_fontweight('bold') for label in g.ax_row_colors.get_xticklabels()]
    [label.set_fontweight('bold') for label in g.ax_cbar.get_yticklabels()]

    for sel_ax in [g.ax_heatmap]:
        for ticks in [sel_ax.xaxis.get_major_ticks(), sel_ax.yaxis.get_major_ticks()]:
            for tick in ticks:
                tick.label1.set_fontsize(fontsize_ticks)
                tick.label1.set_fontweight('bold')

    g.ax_heatmap.set_xlabel('Histomorphological Phenotype Cluster (HPC)', fontsize=fontsize_labels, fontweight='bold')
    g.ax_heatmap.set_ylabel('Whole Slide Image (WSI)',   fontsize=fontsize_labels, fontweight='bold')

    plt.show()

method_slides = 'ward'
metric_slides = 'correlation'
sns.set_theme(style='white')
clustermap_representations(features, complete_df, frame_clusters, method_slides, metric_slides, figsize=(35,25), fontsize_labels=50, fontsize_ticks=27, dendrogram_ratio=(0.1,0.2))


# Paper Figure - UMAP Slide representations

In [None]:
frames = build_cohort_representations(meta_folder, meta_field, matching_field, groupby, fold_number, folds_pickle, h5_complete_path, h5_additional_path, 'clr', 100)
complete_df, additional_complete_df, frame_clusters, frame_samples, features = frames

columns = [col for col in complete_df.columns if col != 'luad' and col != 'samples' and col != 'slides']

labels = complete_df.to_numpy()[1:,-1]
data   = complete_df.to_numpy()[1:,2:-1]
df     = pd.DataFrame(data, columns=columns)
df['Lung Type'] = labels
df['Cohort']       = 'TCGA'

df_all = df


In [None]:
scatter_size    = 2000

figsize         = (20,20)
fontsize_labels = 60
fontsize_legend = 60
l_markerscale   = 10
l_box_w         = 3
lw              = 5

min_dist     = 0.0
n_components = 2
nn           = [25]
metrics      = ['euclidean']

for metric in metrics:
    for n_neighbors in nn:
        print(metric, n_neighbors)
        # UMAP
        fit = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, metric=metric)
        u   = fit.fit_transform(df_all[columns])
        df_all['UMAP Dim. 0'] = u[:, 0]
        df_all['UMAP Dim. 1'] = u[:, 1]

        fig   = plt.figure(figsize=figsize)
        ax    = fig.add_subplot(1, 1, 1)

        # Scatter plot.
        sns.scatterplot(data=df_all, x='UMAP Dim. 0', y='UMAP Dim. 1', hue='Lung Type', style='Cohort', markers={'TCGA':'v', 'NYU':'s'}, s=scatter_size, ax=ax)
        ax.set_xlabel('UMAP Dim. 0', fontsize=fontsize_labels)
        ax.set_ylabel('UMAP Dim. 1', fontsize=fontsize_labels)
        ax.set_title('Whole Slide Image\nVector Representations',  fontsize=fontsize_labels, fontweight='bold')
        ax.tick_params(axis='both', which='major', labelsize=fontsize_labels)
        # legend = ax.legend(loc='lower left', markerscale=markerscale, prop={'size': fontsize_legend-5}, ncol=2)
        legend = ax.legend(loc='upper left', markerscale=l_markerscale, prop={'size': fontsize_legend-5}, ncol=2)
        legend.get_texts()[1].set_text('LUSC')
        legend.get_texts()[2].set_text('LUAD')
        legend.get_texts()[0].set_size(fontsize_legend)
        legend.get_texts()[3].set_size(fontsize_legend)
        legend.get_frame().set_linewidth(l_box_w)

        for tick in ax.xaxis.get_major_ticks():
            tick.label1.set_fontsize(fontsize_labels)
            tick.label1.set_fontweight('bold')
        for tick in ax.yaxis.get_major_ticks():
            tick.label1.set_fontsize(fontsize_labels)
            tick.label1.set_fontweight('bold')

        ax.set_xlabel('UMAP Dim. 0', fontsize=fontsize_labels, fontweight='bold')
        ax.set_ylabel('UMAP Dim. 1', fontsize=fontsize_labels, fontweight='bold')
        for axis in ['top','bottom','left','right']:
            ax.spines[axis].set_linewidth(4)

        plt.tight_layout()
        plt.show()


# Paper Figure - ROC curve - Institutions & Folds

In [113]:
# Workspace path.
main_path = '/media/adalberto/Disk2/PhD_Workspace'

# Image dataset variables.
dataset            = 'TCGAFFPE_LUADLUSC_5x_60pc'

############# Lungsubtype
meta_field        = 'luad'
matching_field    = 'slides'
type_composition = 'clr'
resolution       = 2.0
fold_number      = 4
groupby          = 'leiden_%s' % resolution
meta_folder      = 'lungtype_nn250_clusterfold4'
folds_pickle     = '/media/adalberto/Disk2/PhD_Workspace/utilities/files/LUADLUSC/lungsubtype_Institutions.pkl'
min_tiles        = 100
label            = 1
alpha           = 10.0

# Institutions.
inst_csv   = '/media/adalberto/Disk2/PhD_Workspace/utilities/files/TCGA/TCGA_Institutions.csv'
inst_frame = pd.read_csv(inst_csv)
inst_frame = inst_frame[inst_frame['Study Name'].isin(['Lung adenocarcinoma', 'Lung squamous cell carcinoma'])]
# Institutions.
replace_dict = {'Ontario Institute for Cancer Research (OICR)':'Ontario Institute for Cancer Research',
                'Ontario Institute for Cancer Research (OICR)/Ottawa':'Ontario Institute for Cancer Research',
                'St Joseph\'s Medical Center (MD)': 'St. Joseph\'s Medical Center (MD)',
                'Fox Chase':'Fox Chase Cancer Center'}

inst_csv   = '/media/adalberto/Disk2/PhD_Workspace/utilities/files/TCGA/TCGA_Institutions.csv'
inst_frame = pd.read_csv(inst_csv)
inst_frame = inst_frame[inst_frame['Study Name'].isin(['Lung adenocarcinoma', 'Lung squamous cell carcinoma'])][['TSS Code', 'Source Site']]
inst_frame['Source Site'] = inst_frame['Source Site'].replace(replace_dict)
inst_frame['TSS Code']    = inst_frame['TSS Code'].replace({'1':'01','2':'02','3':'03','4':'04','5':'05', '6':'06','7':'07','8':'08','9':'09'})

# Representations.
h5_complete_path   = '%s/results/BarlowTwins_3/TCGAFFPE_LUADLUSC_5x_60pc_250K/h224_w224_n3_zdim128_filtered/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_complete_lungsubtype_survival_filtered.h5' % main_path
h5_additional_path = None


In [None]:
# Get folds from existing split.
folds = load_existing_split(folds_pickle)

# Path for alpha Logistic Regression results.
main_cluster_path = h5_complete_path.split('hdf5_')[0]
main_cluster_path = os.path.join(main_cluster_path, meta_folder)
adatas_path       = os.path.join(main_cluster_path, 'adatas')

data_res_folds = dict()
data_res_folds[resolution] = dict()
for i, fold in enumerate(folds):
    # Read CSV files for train, validation, test, and additional sets.
    dataframes, complete_df, leiden_clusters = read_csvs(adatas_path, matching_field, groupby, i, fold, h5_complete_path, h5_additional_path, additional_as_fold=False, force_fold=fold_number)
    train_df, valid_df, test_df, additional_df = dataframes

    # Check clusters and diversity within.
    frame_clusters, frame_samples = create_frames(train_df, groupby, meta_field, diversity_key=matching_field, reduction=2)

    # Create representations per sample: cluster % of total sample.
    data, data_df, features = prepare_data_classes(dataframes, matching_field, meta_field, groupby, leiden_clusters, type_composition, min_tiles, use_conn=False, use_ratio=False, top_variance_feat=0)

    # Insert institutions.
    data_dfs = list()
    for dataframe in data_df:
        if dataframe is not None:
            sample_slide = dataframe['slides'].values[0]
            if dataframe is not None and 'TCGA' in str(sample_slide):
                dataframe.insert(0, 'TSS Code', dataframe['slides'].apply(lambda x: x.split('-')[1]))
                dataframe = pd.merge(dataframe, inst_frame, on='TSS Code', how='left')
        else:
            dataframe = None
        data_dfs.append(dataframe)

    # Include features that are not the regular leiden clusters.
    frame_clusters = include_features_frame_clusters(frame_clusters, leiden_clusters, features, groupby)

    # Store representations.
    data_res_folds[resolution][i] = {'data':data, 'data_df':data_dfs, 'complete_df':complete_df, 'features':features, 'frame_clusters':frame_clusters, 'leiden_clusters':leiden_clusters}

    # Information.
    print('\t\tFold', i, 'Features:', len(features), 'Clusters:', len(leiden_clusters))


In [None]:
from sklearn                 import metrics

folds_roc = dict()
folds_roc['test'] = dict()
if h5_additional_path is not None:
    folds_roc['additional'] = dict()
for i, fold in enumerate(folds):
    # Load data for classification.
    data            = data_res_folds[resolution][i]['data']
    data_df         = data_res_folds[resolution][i]['data_df']
    features        = data_res_folds[resolution][i]['features']
    frame_clusters  = data_res_folds[resolution][i]['frame_clusters']
    leiden_clusters = data_res_folds[resolution][i]['leiden_clusters']

    train,    valid,    test,    additional    = data
    train_df, valid_df, test_df, additional_df = data_df
    train_data, train_labels = train

    # One-vs-rest for Logistic Regression.
    model                  = sm.Logit(endog=train_labels[:,label], exog=train_data).fit_regularized(method='l1', alpha=alpha, disp=0)

    train, valid, test, additional = data
    train_data, train_labels = train
    valid_data, valid_labels = valid
    test_data,  test_labels  = test
    if additional is not None:
        additional_data, additional_labels = additional

    # Predictions.
    train_pred = model.predict(exog=train_data)
    valid_pred = model.predict(exog=valid_data)
    test_pred  = model.predict(exog=test_data)
    train_df['predictions'] = train_pred
    valid_df['predictions'] = valid_pred
    test_df['predictions']  = test_pred
    if additional is not None:
        additional_pred = model.predict(exog=additional_data)
        additional_df['predictions']  = additional_pred
    data_res_folds[resolution][i]['data_df'] = [train_df, valid_df, test_df, additional_df]

    folds_roc['test'][i] = dict()
    fpr, tpr, thresholds = metrics.roc_curve(list(test_labels[:,label]), list(test_pred))
    folds_roc['test'][i]['fpr'] = fpr
    folds_roc['test'][i]['tpr'] = tpr

    if additional is not None:
        folds_roc['additional'][i] = dict()
        fpr, tpr, thresholds = metrics.roc_curve(list(additional_labels[:,label]), list(additional_pred))
        folds_roc['additional'][i]['fpr'] = fpr
        folds_roc['additional'][i]['tpr'] = tpr


In [None]:
if 'TSS Code' not in complete_df.columns:
    complete_df.insert(0, 'TSS Code', complete_df['slides'].apply(lambda x: x.split('-')[1]))
    complete_df = pd.merge(complete_df, inst_frame, on='TSS Code', how='left')
complete_df['Source Site'] = complete_df['Source Site'].replace({'Mary Bird Perkins Cancer Center - Our Lady of the Lake':'Mary Bird Perkins Cancer Center', 'Thoraxklinik at University Hospital Heidelberg':'University Hospital Heidelberg'})
a, frame_samples = cluster_diversity(complete_df, frame_clusters, groupby, diversity_key='Source Site')
frame_samples    = frame_samples[[groupby, 'Source Site', 'Purity (%)', 'Counts']]
frame_samples['Purity (%)'] = frame_samples['Purity (%)']/100
frame_samples

## Paper Figure - Insitutions per cluster

In [None]:
work_df = complete_df.copy(deep=True)
work_df['Weight'] = 1
site_distribution           = work_df[['Weight', 'Source Site']].groupby('Source Site').count()
site_distribution['Weight'] = site_distribution['Weight']/site_distribution['Weight'].sum()
site_distribution['Group'] = 'All Institutions'
site_distribution = site_distribution.reset_index()
site_distribution = site_distribution.sort_values(by='Source Site', ascending=False)


figsize = (50, 20)
fontsize_labels = 38
fontsize_ticks  = 33
rotation        = 45

subsampled = np.random.choice(leiden_clusters, size=12, replace=False)

plotted = 0
while plotted < len(subsampled):
    f, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(1, 5, sharey=True, sharex=True, figsize=figsize)

    frame_samples = frame_samples.sort_values(by='Source Site')
    for i, ax in  enumerate((ax2, ax3, ax4, ax5)):
        if plotted >= len(leiden_clusters.tolist()):
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.spines['bottom'].set_visible(False)
            ax.spines['left'].set_visible(False)
            ax.yaxis.label.set_visible(False)
            ax.xaxis.label.set_visible(False)
            ax.get_xaxis().set_ticks([])
        else:
            ratio = complete_df[complete_df[groupby]==subsampled[plotted]].shape[0]/complete_df.shape[0]*100
            ax.set_title('HPC %s\n%s%s of entire\npopulation' % (subsampled[plotted],np.round(ratio,1), '%'),  fontsize=fontsize_labels*1.2, fontweight='bold')
            work_samples_df = frame_samples[frame_samples[groupby]==subsampled[plotted]]
            sns.barplot(y='Source Site', x='Purity (%)', data=work_samples_df, ax=ax, palette='tab20')
            ax.yaxis.label.set_visible(False)
            plotted += 1

    for ax in (ax1, ax2, ax3, ax4, ax5):
        ax.set_xlim([0.0, frame_samples['Purity (%)'].max()+0.05])
        for tick in ax.xaxis.get_major_ticks():
            tick.label1.set_fontsize(fontsize_ticks)
            tick.label1.set_fontweight('bold')
            tick.label1.set_rotation(rotation)
        for tick in ax1.yaxis.get_major_ticks():
            tick.label1.set_fontsize(fontsize_ticks)
            tick.label1.set_fontweight('bold')

        ax.set_ylabel('Institution',            fontsize=fontsize_labels*1.1, fontweight='bold')
        ax.set_xlabel('Institution percentage', fontsize=fontsize_labels*1.1, fontweight='bold')
        for axis in ['top','bottom','left','right']:
            ax.spines[axis].set_linewidth(4)


    sns.barplot(y='Source Site', x='Weight', data=site_distribution, ax=ax1, palette='tab20')
    ax1.set_title('Entire\npopulation',      fontsize=fontsize_labels*1.2, fontweight='bold')
    ax1.set_ylabel('Institution',            fontsize=fontsize_labels*1.1, fontweight='bold')
    ax1.set_xlabel('Institution percentage', fontsize=fontsize_labels*1.1, fontweight='bold')

    plt.tight_layout()
    plt.show()
    # break


## Paper Figure - ROC Insitutions

In [123]:
naming_replacements = {'Mary Bird Perkins Cancer Center - Our Lady of the Lake':'Mary Bird Perkins Cancer Center', 'Thoraxklinik at University Hospital Heidelberg':'University Hospital Heidelberg'}

data = list()
institutions_roc = dict()
for i, fold in enumerate(folds):
    train_df, valid_df, test_df, additional_df = data_res_folds[resolution][i]['data_df']
    test_df['Source Site'] = test_df['Source Site'].replace(naming_replacements)
    for institution in np.unique(test_df['Source Site']):
        test_inst_df = test_df[test_df['Source Site']==institution].copy(deep=True)

        test_labels = test_inst_df[meta_field].values.tolist()
        test_pred   = test_inst_df['predictions'].values.tolist()

        samples = len(test_labels)

        fpr = None
        tpr = None
        thresholds = None
        roc_auc = None

        if len(np.unique(test_labels)) != 1:
            fpr, tpr, thresholds = metrics.roc_curve(test_labels, test_pred)
            roc_auc = auc(fpr, tpr)

        institutions_roc[institution] = [fpr, tpr, thresholds, roc_auc, samples]
        data.append((institution, roc_auc, samples, i))

data = pd.DataFrame(data, columns=['Institution', 'AUC', 'Sample Size', 'Fold'])
data = data.sort_values(by='Sample Size', ascending=False)

In [None]:
data['Institution'] = data['Institution'].replace(naming_replacements)
data = data.sort_values(by='AUC', ascending=False)
# data = data[~data['AUC'].isna()]
figsize = (50, 27)
fontsize_labels = 45

f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=figsize)
sns.barplot(y='Institution', x='AUC', data=data, ax=ax1, palette='tab20')
ax1.xaxis.set_ticks(np.arange(0, 1.05, 0.05))
ax1.set_xlim([0.0, 1.005])
ax1.axvline(0.93, linestyle='--', color='black')
for tick in ax1.xaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_labels)
    tick.label1.set_fontweight('bold')
    tick.label1.set_rotation(90)
for tick in ax1.yaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_labels)
    tick.label1.set_fontweight('bold')

ax1.set_ylabel('Institution',  fontsize=fontsize_labels*1.1, fontweight='bold')
ax1.set_xlabel('AUC', fontsize=fontsize_labels*1.1, fontweight='bold')
for axis in ['top','bottom','left','right']:
    ax1.spines[axis].set_linewidth(4)

sns.barplot(y='Institution', x='Sample Size', data=data, ax=ax2, palette='tab20')
for tick in ax2.xaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_labels)
    tick.label1.set_fontweight('bold')
for tick in ax2.yaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_labels)
    tick.label1.set_fontweight('bold')

ax2.yaxis.label.set_visible(False)
ax2.set_xlabel('Sample Size', fontsize=fontsize_labels*1.1, fontweight='bold')
for axis in ['top','bottom','left','right']:
    ax2.spines[axis].set_linewidth(4)

plt.tight_layout()
plt.show()

In [None]:

figsize    = (40,20)
fontsize_labels = 40
fontsize_legend = 32
l_box_w         = 3
lw              = 5

fig   = plt.figure(figsize=figsize)
ax    = fig.add_subplot(1, 1, 1)

data = data.sort_values(by='Sample Size', ascending=False)
for institution in data['Institution'].values:

    fpr, tpr, thresholds, roc_auc, samples = institutions_roc[institution]
    if roc_auc is None: continue
    ax.plot(fpr, tpr, lw=lw, label="%s (%s) = %0.2f" % (institution, samples, roc_auc))


legend = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title=r'$\bf{Per\ Institution\ AUC\ and\ Sample\ Size}$', prop={'size': fontsize_legend-4, 'weight':'bold'})
legend.get_title().set_fontsize(fontsize_legend)
# set the linewidth of each legend object
for line in legend.get_lines():
    line.set_linewidth(lw)
legend.get_frame().set_linewidth(l_box_w)

for tick in ax.xaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_labels)
    tick.label1.set_fontweight('bold')
for tick in ax.yaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_labels)
    tick.label1.set_fontweight('bold')

ax.set_title('TCGA Cohort',  fontsize=fontsize_labels*1.2, fontweight='bold')
ax.set_ylabel('True Positive Rate',  fontsize=fontsize_labels, fontweight='bold')
ax.set_xlabel('False Positive Rate', fontsize=fontsize_labels, fontweight='bold')
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(4)

plt.tight_layout()
plt.show()


## Paper Figure - ROC Folds

In [None]:
def plot_auc(ax, fold, title, lw, fontsize_labels, fontsize_legend, l_box_w):
    aucs = list()
    for i in range(5):
        roc_auc = auc(fold[i]['fpr'], fold[i]['tpr'])
        aucs.append(roc_auc)
        ax.plot(fold[i]['fpr'], fold[i]['tpr'], lw=lw, label=" Fold %s AUC = %0.3f" % (i, roc_auc))

    mean, minus, plus = mean_confidence_interval(aucs, confidence=0.95)
    legend = ax.legend(loc='lower right', title=r'$\bf{Mean (CI): %s (%s-%s)}$' % (np.round(mean, 3), np.round(minus, 3), np.round(plus, 3)), prop={'size': fontsize_legend-4, 'weight':'bold'})
    legend.get_title().set_fontsize(fontsize_legend)
    # set the linewidth of each legend object
    for line in legend.get_lines():
        line.set_linewidth(lw)
    legend.get_frame().set_linewidth(l_box_w)

    for tick in ax.xaxis.get_major_ticks():
        tick.label1.set_fontsize(fontsize_labels)
        tick.label1.set_fontweight('bold')
    for tick in ax.yaxis.get_major_ticks():
        tick.label1.set_fontsize(fontsize_labels)
        tick.label1.set_fontweight('bold')

    ax.set_title(title,  fontsize=fontsize_labels*1.2, fontweight='bold')
    ax.set_ylabel('True Positive Rate',  fontsize=fontsize_labels, fontweight='bold')
    ax.set_xlabel('False Positive Rate', fontsize=fontsize_labels, fontweight='bold')
    for axis in ['top','bottom','left','right']:
        ax.spines[axis].set_linewidth(4)


figsize    = (20,20)
fontsize_labels = 60
fontsize_legend = 60
l_box_w         = 3
lw              = 5

fig   = plt.figure(figsize=figsize)
ax    = fig.add_subplot(1, 1, 1)
title = 'TCGA Cohort'
plot_auc(ax, folds_roc['test'], title, lw, fontsize_labels, fontsize_legend, l_box_w)
plt.show()


# Paper Figure - Force Cluster Fold Comparison

## LUAD vs LUSC

In [129]:
resolution = 2.0
force_fold = 4

alpha              = 10.0
meta_folder        = 'lungtype_nn250'
meta_folder_folder = 'lungtype_nn250_clusterfold%s' % force_fold

h5_complete_path = '%s/results/BarlowTwins_3/TCGAFFPE_LUADLUSC_5x_60pc_250K/h224_w224_n3_zdim128_filtered/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_complete_lungsubtype_survival_filtered.h5' % main_path
main_cluster_path  = h5_complete_path.split('hdf5_')[0]
cluster_path       = os.path.join(main_cluster_path, meta_folder)
cluster_path_force = os.path.join(main_cluster_path, meta_folder_folder)
csv_all_path       = os.path.join(cluster_path,       'alpha_%s_mintiles_100/luad_auc_results_mintiles_100.csv' % str(alpha).replace('.', 'p'))
csv_all_path_force = os.path.join(cluster_path_force, 'alpha_%s_mintiles_100/luad_auc_results_mintiles_100.csv' % str(alpha).replace('.', 'p'))

results_df       = pd.read_csv(csv_all_path)
results_df       = results_df[results_df['Leiden Resolution']=='leiden_%s'%resolution].reset_index()

results_force_df = pd.read_csv(csv_all_path_force)
results_force_df = results_force_df[results_force_df['Leiden Resolution']=='leiden_%s'%resolution].reset_index()


In [130]:
plot_data = list()
for x in results_df.index:
    values = results_df.iloc[x].values
    plot_data.append((values[2], 'Train',      values[3]))
    plot_data.append((values[2], 'Validation', values[4]))
    plot_data.append((values[2], 'Test',       values[5]))

plot_data = pd.DataFrame(plot_data, columns=['Fold', 'Set', 'AUC'])
plot_data['Force Fold'] = 'Clusterig on each fold train set'

plot_data_force = list()
for x in results_force_df.index:
    values = results_force_df.iloc[x].values
    plot_data_force.append((values[2], 'Train',      values[3]))
    plot_data_force.append((values[2], 'Validation', values[4]))
    plot_data_force.append((values[2], 'Test',       values[5]))


plot_data_force = pd.DataFrame(plot_data_force, columns=['Fold', 'Set', 'AUC'])
plot_data_force['Force Fold'] = 'Common clusters across folds'

plot_data = pd.concat([plot_data_force, plot_data], axis=0)

In [None]:

figsize         = (15,10)
fontsize_labels = 35
fontsize_ticks  = 25
fontsize_legend = 20
l_markerscale   = 1
l_box_w         = 2

sns.set_theme(style='white')
fig   = plt.figure(figsize=figsize)
f, ax = plt.subplots(1, 1, sharey=True, sharex=True, figsize=figsize)

sns.pointplot(data=plot_data, y='AUC', x='Set', hue='Force Fold', linewidth=2.5, dodge=.4, join=False, capsize=.00, markers='o', ax=ax)

for tick in ax.xaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_ticks)
    tick.label1.set_fontweight('bold')
for tick in ax.yaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_ticks)
    tick.label1.set_fontweight('bold')
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(4)
ax.set_ylim([0.85, 1.0])

ax.set_title('Logistic Regression\nLUAD vs LUSC', fontsize=fontsize_labels*1.3, fontweight='bold')
ax.set_xlabel('Set', fontsize=fontsize_labels, fontweight='bold')
ax.set_ylabel('AUC', fontsize=fontsize_labels, fontweight='bold')

legend = ax.legend(loc='lower left', title=r'$\bf{Clustering}$', markerscale=l_markerscale, prop={'size': fontsize_legend, 'weight':'bold'})
legend.get_title().set_fontsize(fontsize_legend*1.3)
legend.get_frame().set_linewidth(l_box_w)

plt.show()


## LUAD Overall Survival

In [132]:
resolution = 2.0
force_fold = 0

alpha              = 0.828643
meta_folder        = 'luad_overall_survival_nn250'
meta_folder_folder = 'luad_overall_survival_nn250_clusterfold%s' % force_fold

h5_complete_path = '%s/results/BarlowTwins_3/TCGAFFPE_LUADLUSC_5x_60pc_250K/h224_w224_n3_zdim128_filtered/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_complete_lungsubtype_survival_filtered.h5' % main_path
main_cluster_path  = h5_complete_path.split('hdf5_')[0]
cluster_path       = os.path.join(main_cluster_path, meta_folder)
cluster_path_force = os.path.join(main_cluster_path, meta_folder_folder)
csv_all_path       = os.path.join(cluster_path,       'c_index_luad_overall_survival_nn250_l1_ratio_0.0_mintiles_100_summary.csv')
csv_all_path_force = os.path.join(cluster_path_force, 'c_index_luad_overall_survival_nn250_clusterfold0_l1_ratio_0.0_mintiles_100.csv')

results_df       = pd.read_csv(csv_all_path)
results_force_df = pd.read_csv(csv_all_path_force)


In [133]:
results_resolution  = results_df[results_df.Resolution==resolution]
alphas              = np.unique(results_df.Alpha.values)
results_force_df = results_force_df[(results_force_df.resolution==resolution)&(results_force_df.alpha==alphas[6])].reset_index()
plot_data = list()
for x in results_force_df.index:

    values = results_force_df.iloc[x].values
    plot_data.append((values[2], 'Train',      values[4]))
    plot_data.append((values[2], 'Test',       values[5]))

plot_data = pd.DataFrame(plot_data, columns=['Fold', 'Set', 'C-Index'])
plot_data['Force Fold'] = 'Clusterig on each fold train set'

results_df = results_df[results_df.Resolution==resolution]
results_df['Set'] = results_df['Set'].replace({'C-Index train':'Train', 'C-Index test':'Test', 'C-Index additional':'NYU Cohort'})
plot_data_force = results_df.copy(deep=True)
plot_data_force['Force Fold'] = 'Common clusters across folds'

plot_data = pd.concat([plot_data_force, plot_data], axis=0)
plot_data = plot_data[['Fold', 'Set', 'C-Index', 'Force Fold']]
plot_data

Unnamed: 0,Fold,Set,C-Index,Force Fold
20,0.0,Train,0.69,Common clusters across folds
21,0.0,Test,0.66,Common clusters across folds
22,1.0,Train,0.73,Common clusters across folds
23,1.0,Test,0.61,Common clusters across folds
24,2.0,Train,0.65,Common clusters across folds
25,2.0,Test,0.64,Common clusters across folds
26,3.0,Train,0.7,Common clusters across folds
27,3.0,Test,0.56,Common clusters across folds
28,4.0,Train,0.67,Common clusters across folds
29,4.0,Test,0.62,Common clusters across folds


In [None]:

figsize         = (15,10)
fontsize_labels = 35
fontsize_ticks  = 25
fontsize_legend = 20
l_markerscale   = 1
l_box_w         = 2

sns.set_theme(style='white')
fig   = plt.figure(figsize=figsize)
f, ax = plt.subplots(1, 1, sharey=True, sharex=True, figsize=figsize)

sns.pointplot(data=plot_data, y='C-Index', x='Set', hue='Force Fold', linewidth=2.5, dodge=.4, join=False, capsize=.00, markers='o', ax=ax)

for tick in ax.xaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_ticks)
    tick.label1.set_fontweight('bold')
for tick in ax.yaxis.get_major_ticks():
    tick.label1.set_fontsize(fontsize_ticks)
    tick.label1.set_fontweight('bold')
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(4)
ax.set_ylim([0.5, 0.85])

ax.set_title('Cox Proportional Hazard\nLUAD Overall Survival', fontsize=fontsize_labels*1.3, fontweight='bold')
ax.set_xlabel('Set', fontsize=fontsize_labels, fontweight='bold')
ax.set_ylabel('C-Index', fontsize=fontsize_labels, fontweight='bold')

legend = ax.legend(loc='lower left', title=r'$\bf{Clustering}$', markerscale=l_markerscale, prop={'size': fontsize_legend, 'weight':'bold'})
legend.get_title().set_fontsize(fontsize_legend*1.3)
legend.get_frame().set_linewidth(l_box_w)

plt.show()