# Imports

In [13]:
from matplotlib        import collections             as matcoll
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.colors import TwoSlopeNorm
from matplotlib.pyplot import rc_context
from scipy.cluster     import hierarchy
from adjustText        import adjust_text

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import scanpy as sc
import pandas as pd
import numpy as np
import math
import glob
import h5py
import sys
import os

main_path = '/media/adalberto/Disk2/PhD_Workspace'
sys.path.append(main_path)
from models.clustering.logistic_regression_leiden_clusters import *
from models.evaluation.folds import load_existing_split
from models.clustering.correlations import *
from models.clustering.data_processing import *
from models.clustering.leiden_representations import include_tile_connections_frame
from data_manipulation.utils import store_data


# Methods

In [29]:
def create_histo_annotation_df(h5_hist_anno_path, additional_df):
    """Attach manual histological-subtype labels to the tiles of `additional_df`.

    Args:
        h5_hist_anno_path: Path of an HDF5 file holding the byte-string datasets
            'combined_slides', 'combined_tiles' and 'combined_hist_subtype'.
        additional_df: DataFrame with at least 'slides' and 'tiles' columns.
            NOTE: its 'slides' column is cast to str in place.

    Returns:
        The inner join of `additional_df` with the annotations on
        ('slides', 'tiles'), restricted to the five growth patterns listed in
        `kept_subtypes`.
    """
    kept_subtypes = ['acinar', 'lepidic', 'micropapillary', 'papillary', 'solid']

    with h5py.File(h5_hist_anno_path, 'r') as h5_file:
        # Slide ids keep the text before the first '_', tile ids the text before the first '.'.
        annotation_df = pd.DataFrame({
            'slides':   [raw.decode("utf-8").split('_')[0] for raw in h5_file['combined_slides']],
            'tiles':    [raw.decode("utf-8").split('.')[0] for raw in h5_file['combined_tiles']],
            'histtype': [raw.decode("utf-8") for raw in h5_file['combined_hist_subtype']],
        })

    # Join keys must be strings on both sides; this updates the caller's frame as well.
    additional_df['slides'] = additional_df['slides'].astype(str)
    joined_df = additional_df.merge(annotation_df, how='inner', on=['slides', 'tiles'])

    is_kept = joined_df['histtype'].isin(kept_subtypes)
    return joined_df.loc[is_kept]


def _cox_color_series(cox_df, group_col, p_th, name, name_masked):
    """Map Cox coefficients of one survival table to PiYG_r colors indexed by cluster id.

    Returns a pair (colors, colors_masked). In the masked series, clusters whose
    p-value is not below `p_th` receive the color of a zero coefficient.
    """
    ordered = cox_df.sort_values(by=group_col)
    coefs   = ordered['coef'].astype(float)
    cmap    = plt.cm.PiYG_r
    # Diverging norm centered at 0. NOTE(review): TwoSlopeNorm is documented to
    # require vmin < vcenter < vmax, so coefficients all of one sign would
    # presumably raise ValueError here - confirm against the matplotlib version in use.
    norm    = TwoSlopeNorm(vmin=coefs.min(), vcenter=0, vmax=coefs.max())
    neutral = cmap(norm(0))[:3]

    plain  = pd.Series([cmap(norm(coef)) for coef in coefs], name=name)
    masked = pd.Series([cmap(norm(coef)) if p < p_th else neutral for p, coef in zip(ordered['p'], coefs)], name=name_masked)

    # Index by THIS table's own (sorted) cluster ids so each color stays attached to its cluster.
    cluster_ids  = ordered[group_col].astype(str)
    plain.index  = cluster_ids
    masked.index = cluster_ids
    return plain, masked


def get_col_colors(cox_os_clusters, cox_pfs_clusters, p_th):
    """Build the column-color annotations derived from Cox regression coefficients.

    Reads the module-level `groupby` to find the cluster-id column of both tables.

    Args:
        cox_os_clusters: DataFrame with columns [groupby, 'coef', 'p'] for overall
            survival, or None.
        cox_pfs_clusters: Same layout for the second survival endpoint, or None.
            Ignored when `cox_os_clusters` is None.
        p_th: p-value threshold used for the masked variant.

    Returns:
        (colors, colors_masked): both None when `cox_os_clusters` is None; a Series
        each when only the OS table is given; otherwise two-column DataFrames
        (OS, PFS) aligned on the cluster id.
    """
    if cox_os_clusters is None:
        return None, None

    colors, colors_masked = _cox_color_series(cox_os_clusters, groupby, p_th,
                                              name='Cox Coefficient Overall Survival',
                                              name_masked='Cox Coefficient Overall Survival')

    if cox_pfs_clusters is not None:
        # NOTE(review): the plain and masked PFS labels differ ('Recurrence' vs
        # 'Progression'); kept as in the original - confirm which one is intended.
        pfs_colors, pfs_colors_masked = _cox_color_series(cox_pfs_clusters, groupby, p_th,
                                                          name='Cox Coefficient\nRecurrence Free Survival',
                                                          name_masked='Cox Coefficient\nProgression Free Survival')
        # concat aligns on the cluster-id index, so OS/PFS tables listing clusters in a
        # different order (or a different subset) no longer get mislabeled colors.
        colors        = pd.concat([colors, pfs_colors], axis=1)
        colors_masked = pd.concat([colors_masked, pfs_colors_masked], axis=1)

    return colors, colors_masked


def plot_clustermap(all_data_rho, mask, x_label, y_label, directory, file_name, figsize, vcenter=0, annot=True, fmt='.2f', cox_os_clusters=None, cox_pfs_clusters=None, col_linkage=None, row_linkage=None, fontsize_ticks=28, fontsize_labels=30, fontsize_annot=20, dendrogram_ratio=0.2, row_colors_same=False, show=False, not_masked=False, p_th=0.01):
    """Draw a seaborn clustermap of `all_data_rho`, optionally annotated with Cox colors.

    Up to two figures are produced: one with the plain Cox column colors and one
    ('_masked' inserted before '.jpg' in `file_name`) where clusters with p >= `p_th`
    are shown in the neutral color. Only one figure is drawn when no Cox table is
    given; the masked one is skipped when `not_masked` is True.

    Args:
        all_data_rho: DataFrame of values to plot (columns are clustered with `col_linkage`).
        mask: Boolean array passed to seaborn; masked cells are not drawn.
        x_label, y_label: Axis labels of the heatmap.
        directory, file_name: Output location, used only when `show` is False.
        figsize: (width, height); the final figure is 1.1x this size.
        vcenter: Center of the diverging color norm. 0 gives a symmetric range.
        cox_os_clusters, cox_pfs_clusters, p_th: Forwarded to `get_col_colors`.
        col_linkage, row_linkage: Precomputed linkages; the column linkage defaults
            to Ward/euclidean on the transposed data.
        row_colors_same: Reuse the column colors as row colors.
        show: Display the figure instead of saving and closing it.

    Returns:
        The seaborn ClusterGrid of the last figure drawn.
    """
    if col_linkage is None:
        col_linkage = hierarchy.linkage(y=all_data_rho.T, method='ward', metric='euclidean', optimal_ordering=False)

    colors, colors_masked = get_col_colors(cox_os_clusters, cox_pfs_clusters, p_th)

    # Largest absolute value; gives the symmetric color range used when vcenter == 0.
    vref = np.max(np.abs(all_data_rho.values))

    with rc_context({'figure.figsize': figsize}):

        for name, col_colors in [('', colors), ('_masked', colors_masked)]:
            if 'masked' in name and not_masked:
                continue
            sns.set_theme(style='white')

            # A fresh norm per figure so no normalization state is shared between figures.
            if vcenter == 0:
                norm = TwoSlopeNorm(vmin=-vref, vcenter=vcenter, vmax=vref)
            else:
                norm = TwoSlopeNorm(vmin=all_data_rho.values.min(), vcenter=vcenter, vmax=all_data_rho.values.max())

            row_colors = col_colors if row_colors_same else None

            g = sns.clustermap(all_data_rho, vmin=-vref, vmax=vref, method='ward', metric='euclidean', annot=annot, mask=mask, col_colors=col_colors, row_colors=row_colors, col_linkage=col_linkage, row_linkage=row_linkage, fmt=fmt, norm=norm, cmap=sns.diverging_palette(250, 20, as_cmap=True), dendrogram_ratio=dendrogram_ratio, annot_kws={"size": fontsize_annot},  yticklabels=True,  xticklabels=True)

            if col_colors is not None:
                g.ax_col_colors.set_yticklabels(g.ax_col_colors.get_ymajorticklabels(), fontsize=fontsize_ticks)
            # The row-color axis only exists when row colors were actually supplied;
            # row_colors_same=True without any Cox table used to crash here.
            if row_colors is not None:
                g.ax_row_colors.set_xticklabels(g.ax_row_colors.get_xmajorticklabels(), fontsize=fontsize_ticks)

            g.ax_heatmap.set_ylabel('\n%s' % y_label, fontsize=fontsize_labels)
            g.ax_heatmap.set_xlabel('\n%s' % x_label, fontsize=fontsize_labels)
            g._figure.set_size_inches(figsize[0]*1.1, figsize[1]*1.1)
            g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xmajorticklabels(), fontsize=fontsize_ticks)
            g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_ymajorticklabels(), fontsize=fontsize_ticks)
            g.ax_cbar.tick_params(labelsize=fontsize_ticks)
            if show:
                plt.show()
            else:
                plt.savefig('%s/%s' % (directory, file_name.replace('.jpg', '%s.jpg' % name)))
                plt.close(g._figure)

            # Without Cox colors the masked figure would be identical: draw only once.
            if col_colors is None:
                break
    return g

def plot_dendrogram(adata, groupby, directory=None, file_name=None, show=False):
    """Draw the scanpy dendrogram of `groupby` on a 15x5 figure.

    The figure is written to `directory/file_name` and closed only when both are
    given; otherwise it is left open.
    """
    figure = plt.figure(figsize=(15,5))
    figure.suptitle('Leiden Cluster Dendrogram')
    axis = figure.add_subplot(1, 1, 1)
    sc.pl.dendrogram(adata, groupby=groupby, ax=axis, show=show)

    # Nothing to persist unless a complete output location was supplied.
    if directory is None or file_name is None:
        return

    plt.savefig(os.path.join(directory, file_name))
    plt.close(figure)


# Variables for run comparison

In [15]:
# Workspace path.
main_path = '/media/adalberto/Disk2/PhD_Workspace'

# Settings previously used for the LUAD survival runs (kept for reference):
#   resolution = 2.0, fold_number = 0, alpha = 1.0,  alpha_2 = 1.0
#   resolution = 1.0, fold_number = 0, alpha = None, alpha_2 = None

# Resolution and fold for the tile clustering and slide representations.
resolution     = 2.0
fold_number    = 0
groupby        = 'leiden_%s' % resolution

# Folder run.
meta_folder     = 'luad_overall_survival_nn250_clusterfold%s' % fold_number
matching_field  = 'samples'
meta_field      = 'os_event_ind'
# meta_field      = 'luad'

# HoverNet dataset annotations.
cell_names             = ['cell neoplastic', 'cell inflammatory', 'cell connective', 'cell dead']
dataset                = 'TCGAFFPE_LUADLUSC_5x_10pc'
magnification          = '20x'
annotation_restriction = 1

# Penalties for Cox regression and flag for usage.
use_cox        = True
alpha          = 1.0
alpha_2        = None

# Pickle files.
# folds_pickle = '%s/utilities/files/LUADLUSC/lungsubtype_Institutions.pkl' % main_path
folds_pickle = '%s/utilities/files/LUAD/overall_survival_TCGA_folds.pkl'  % main_path

# Tile representation files.
h5_complete_path   = '%s/results/BarlowTwins_3/TCGAFFPE_LUADLUSC_5x_60pc_250K/h224_w224_n3_zdim128_filtered/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_complete_lungsubtype_survival_filtered.h5' % main_path
h5_additional_path = None

# Annotation files.
hovernet_csv     = '%s/datasets/HoverNet/%s/%s/%s_hovernet_annotations_5x.csv' % (main_path, dataset, magnification, dataset)
tcga_immune_csv  = '%s/utilities/files/TCGA/TCGA_immune_landscape.csv' % main_path

# Run path: everything lives next to the representation file, under the run folder.
main_cluster_path = os.path.join(h5_complete_path.split('hdf5_')[0], meta_folder)
adatas_path       = os.path.join(main_cluster_path, 'adatas')
figure_path       = os.path.join(main_cluster_path, 'leiden_%s_fold%s' % (str(resolution).replace('.','p'),fold_number), 'figures')
# exist_ok avoids the check-then-create race of a separate isdir() test.
os.makedirs(figure_path, exist_ok=True)


### Cox Regression runs

In [16]:
# Cox coefficient tables handed to the clustermaps as column colors; they stay
# None (no color bar) when Cox results are not used.
coef_os_df = coef_pfs_df = None
if use_cox:
    # <run folder>/<leiden stats file> under the cluster run directory.
    cox_run_folder = '%s_leiden_%s_alpha_%s_l1ratio_0p0_mintiles_100' % (meta_folder, resolution, str(alpha).replace('.','p'))
    cox_stats_file = 'leiden_%s_stats_all_folds.csv' % (str(resolution).replace('.','p'))
    csv_cox        = os.path.join(main_cluster_path, cox_run_folder, cox_stats_file)

    # Keep only the cluster id, coefficient and p-value columns.
    cox_df     = pd.read_csv(csv_cox)
    coef_os_df = cox_df.loc[:, [groupby, 'coef', 'p']].copy(deep=True)

### Annotation files

In [17]:
# Immune landscape sample annotations.
immune_landscape_df = pd.read_csv(tcga_immune_csv)

# HoverNet Annotations.
hovernet_df = pd.read_csv(hovernet_csv)
# Slide ids may carry a '.<suffix>'; keep only the part before the first dot.
# The whole column is transformed (ids without a dot are left as they are), so
# the result always has one entry per row; the previous filtered comprehension
# produced a shorter list and failed on assignment when some ids had no dot.
slide_ids = hovernet_df['slides'].astype(str)
if not slide_ids.empty and '.' in slide_ids.iloc[0]:
    hovernet_df['slides'] = slide_ids.str.split('.').str[0]

### Representations: Slides and Tiles.

In [20]:
''' Get representations for slide representation correlations. '''
# Fold
# Select the train/valid/test split that matches `fold_number`.
folds = load_existing_split(folds_pickle)
fold = folds[fold_number]

# Read cohort CSVs.
# NOTE(review): presumably tile-level frames carrying the cluster assignment in `groupby` - confirm in read_csvs.
dataframes, complete_df, leiden_clusters   = read_csvs(adatas_path, matching_field, groupby, fold_number, fold, h5_complete_path, h5_additional_path, additional_as_fold=False, force_fold=None)
train_df, valid_df, test_df, additional_df = dataframes
# Drop the '.jpeg' suffix from tile ids before they are used as join keys below.
complete_df['tiles']   = complete_df['tiles'].apply(lambda x: x.split('.jpeg')[0])
if additional_df is not None:
    additional_df['tiles'] = additional_df['tiles'].apply(lambda x: x.split('.jpeg')[0])

# Tiles that have HoverNet annotations (inner join on slide and tile id), then the
# subset with at least `annotation_restriction` annotated 20x tiles.
annotated_df           = complete_df.merge(hovernet_df, how='inner', left_on=['slides', 'tiles'], right_on=['slides', 'tiles'])
cluster_anno_df        = annotated_df[annotated_df['annotated_20x_tile_count']>=annotation_restriction]

''' Get representations for slide representation correlations. '''
# NOTE: `complete_df` is rebound here; from this point on it holds the frame returned by
# build_cohort_representations (called with 'clr' and 100), not the frame from read_csvs.
frames = build_cohort_representations(meta_folder, meta_field, matching_field, groupby, fold_number, folds_pickle, h5_complete_path, h5_additional_path, 'clr', 100)
complete_df, additional_complete_df, frame_clusters, frame_samples, features = frames
# Column labels are cast to str so clusters can be addressed by string id.
complete_df.columns            = complete_df.columns.astype(str)
if additional_complete_df is not None:
    # Fall back to the slide id when the additional cohort has no 'samples' column.
    if matching_field == 'samples' and matching_field not in additional_complete_df.columns:
        additional_complete_df[matching_field] = additional_complete_df['slides']
    additional_complete_df.columns = additional_complete_df.columns.astype(str)
# # Check for duplicates, ways of handling them:
# # 1. Drop duplicates.
# # 2. Combine slides into representations: This may include coming back from CLR and back after merged.
# if len(np.unique(complete_df.samples)) != complete_df.shape[0]:
#     # Easy option 1.
#     complete_df = complete_df.drop_duplicates(subset='samples', keep="last")

''' Read clustering file '''
adata_train, h5ad_path = read_h5ad_reference(h5_complete_path, meta_folder, groupby, fold_number)

# Leiden clusters dendrogram.
# The linkage is captured right away because a later cell recomputes the dendrogram
# with different settings, which replaces the entry stored in adata_train.uns.
leiden_linkage_method = 'average'
leiden_cor_method     = 'spearman'
sc.tl.dendrogram(adata_train, groupby, use_rep='X', linkage_method=leiden_linkage_method, cor_method=leiden_cor_method)
leiden_linkage = adata_train.uns['dendrogram_%s' % groupby]['linkage']

''' Prepare manual annotations for histological subtypes.'''
# NOTE(review): `h5_hist_anno_path` is not assigned anywhere in the visible part of this
# file; this branch would raise NameError unless it is defined elsewhere - confirm.
if additional_df is not None:
    histo_complete_df  = create_histo_annotation_df(h5_hist_anno_path, additional_df)


### Correlations

In [21]:
corr_method   = 'spearman'   # Correlation method.
corr_matching = 'samples'
pval_th       = 0.01

# Results of every analysis, keyed by analysis name: output file stem, data, and
# (added by the plotting cells) the column linkage.
correlations_dict = dict()

# Shared pieces of every output file stem.
complete_stem = h5_complete_path.split('/hdf5_')[1].split('.h5')[0]
run_tag       = '_%s__fold%s_%s' % (groupby.replace('.', 'p'), fold_number, meta_folder)

# Leiden Cluster Dendrogram
file_name = complete_stem + run_tag + '_leiden_dendrogram'
sc.tl.dendrogram(adata_train, groupby=groupby, cor_method='pearson', linkage_method='average', optimal_ordering=True)
correlations_dict[groupby] = dict()
correlations_dict[groupby]['file_name'] = file_name
# Look the dendrogram up through `groupby`; a literal 'dendrogram_leiden_2.0' key
# only worked for resolution 2.0.
correlations_dict[groupby]['linkage']   = adata_train.uns['dendrogram_%s' % groupby]['linkage']

# Cluster Purity
if 'NYU' in hovernet_csv:
    file_name = h5_additional_path.split('/hdf5_')[1].split('.h5')[0] + run_tag + '_hovernet'
else:
    file_name = complete_stem + run_tag + '_hovernet'
# NOTE(review): the unfiltered `annotated_df` is passed here, not the
# `cluster_anno_df` subset restricted by `annotation_restriction` - confirm intent.
critical_coef, critical_ref, p_values, mask = ks_test_cluster_purities(cluster_anno_df=annotated_df, fields=cell_names, groupby=groupby, fold_number=fold_number,
                                                                       directory=main_cluster_path, file_name=file_name, p_th=pval_th, critical_values_flag=False)
correlations_dict['hovernet'] = dict()
correlations_dict['hovernet']['file_name'] = file_name
correlations_dict['hovernet']['data']      = critical_coef, critical_ref, p_values, mask

# Immune Landscape
file_name = complete_stem + run_tag + '_immunelandscape'
all_data_rho, all_data_pval, mask, _ = correlate_clusters_annotation(slide_rep_df=complete_df, annotations_df=immune_landscape_df, purity_field=meta_field,
                                                                     matching_field=corr_matching, corr_method=corr_method, pval_th=pval_th, field_th=0.05*len(features),
                                                                     groupby=groupby, fold_number=fold_number, directory=main_cluster_path, file_name=file_name)
correlations_dict['immunelandscape'] = dict()
correlations_dict['immunelandscape']['file_name'] = file_name
correlations_dict['immunelandscape']['data']      = all_data_rho, all_data_pval, mask

# Tile Histological Subtype Annotation
if additional_complete_df is not None:
    file_name = h5_additional_path.split('/hdf5_')[1].split('.h5')[0] + run_tag + '_histsubtypes_anno'
    p_values, strength, mask = cluster_purity_hypergeom(histo_complete_df, frame_clusters, groupby, 'histtype', pval_th=pval_th, pvalue_as_strengh=False)
    strength.index = strength.index.astype(str)
    correlations_dict['tile_histsubtypes'] = dict()
    correlations_dict['tile_histsubtypes']['file_name'] = file_name
    correlations_dict['tile_histsubtypes']['data']     = p_values, strength, mask

# WSI Rep. Cluster Correlations
file_name     = complete_stem + run_tag + '_contentcorr'
all_data_rho, all_data_pval, mask = correlate_clusters_occurrance_annotation(complete_df, meta_field, groupby, fold_number, main_cluster_path, file_name,
                                                                             corr_method=corr_method, pval_th=pval_th)

correlations_dict['content_corr'] = dict()
correlations_dict['content_corr']['file_name'] = file_name
correlations_dict['content_corr']['data']     = all_data_rho, all_data_pval, mask

# Paper figures - Correlations

In [30]:
# NOTE(review): `p_th` is assigned here but not passed to any plot_clustermap call in
# this cell, so those calls use the function default (p_th=0.01) - confirm intent.
p_th = 0.05

''' Leiden Dendrogram '''
file_name = correlations_dict[groupby]['file_name']
plot_dendrogram(adata_train, groupby, directory=figure_path, file_name=file_name+'.png', show=False)

''' Cluster Purity '''
file_name = correlations_dict['hovernet']['file_name']
critical_coef, critical_ref, p_values, mask = correlations_dict['hovernet']['data']
g = plot_clustermap(all_data_rho=np.round(critical_coef,2), mask=mask.values, x_label='Cluster', y_label='Cell Annotations', cox_os_clusters=coef_os_df, cox_pfs_clusters=coef_pfs_df,
                directory=figure_path, file_name=file_name+'.jpg', figsize=(60,30))
# Keep the column linkage of this figure so other heatmaps can reuse its cluster order.
correlations_dict['hovernet']['linkage'] = g.dendrogram_col.linkage

''' Immune Landscape '''
file_name = correlations_dict['immunelandscape']['file_name']
all_data_rho, all_data_pval, mask = correlations_dict['immunelandscape']['data']
g = plot_clustermap(all_data_rho=all_data_rho, mask=mask.values, x_label='Cluster', y_label='Immune feature', cox_os_clusters=coef_os_df, cox_pfs_clusters=coef_pfs_df,
                    directory=figure_path, file_name=file_name+'.jpg', figsize=(65,35))
# Same as above: stored for the cross-dendrogram figures.
correlations_dict['immunelandscape']['linkage'] = g.dendrogram_col.linkage

'''Tile Histological Subtype Annotations'''
# Only available when an additional cohort was loaded; data is transposed so that
# clusters run along the columns, and the color norm is centered at 1 (vcenter=1).
if additional_complete_df is not None:
    file_name = correlations_dict['tile_histsubtypes']['file_name']
    p_values, strength, mask = correlations_dict['tile_histsubtypes']['data']
    g = plot_clustermap(all_data_rho=strength.transpose(), mask=mask.values.transpose(), vcenter=1, x_label='Cluster', y_label='Histological subtype', cox_os_clusters=coef_os_df, cox_pfs_clusters=coef_pfs_df,
                        fmt='.1f', directory=figure_path, file_name=file_name+'.jpg', figsize=(60,30))


executing
executing


In [24]:
# Each figure below redraws one heatmap with its columns ordered by a linkage taken
# from a different source (another heatmap or the Leiden dendrogram).
# This cell needs the 'linkage' entries that the correlation-figure cell stores in
# correlations_dict, so that cell has to run first.
'''Cross Dendrogram - Immune/HoverNet & HoverNet/Immune '''
# HoverNet purity values, columns ordered by the immune-landscape linkage.
file_name = correlations_dict['hovernet']['file_name'] + '_cross_immune'
critical_coef, critical_ref, p_values, mask = correlations_dict['hovernet']['data']
g = plot_clustermap(all_data_rho=np.round(critical_coef,2), mask=mask.values, x_label='Cluster', y_label='Cell Annotations', cox_os_clusters=coef_os_df, cox_pfs_clusters=coef_pfs_df,
                    col_linkage=correlations_dict['immunelandscape']['linkage'], directory=figure_path, file_name=file_name+'.jpg', figsize=(60,30))

# Immune-landscape correlations, columns ordered by the HoverNet linkage.
file_name = correlations_dict['immunelandscape']['file_name'] + '_cross_hovernet'
all_data_rho, all_data_pval, mask = correlations_dict['immunelandscape']['data']
g = plot_clustermap(all_data_rho=all_data_rho, mask=mask.values, x_label='Cluster', y_label='Immune feature', cox_os_clusters=coef_os_df, cox_pfs_clusters=coef_pfs_df,
                    col_linkage=correlations_dict['hovernet']['linkage'], directory=figure_path, file_name=file_name+'.jpg', figsize=(65,35))

'''Cross Dendrogram - Immune/Leiden & HoverNet/Leiden '''
# Both heatmaps again, this time with columns ordered by `leiden_linkage`.
file_name = correlations_dict['hovernet']['file_name'] + '_cross_leiden'
critical_coef, critical_ref, p_values, mask = correlations_dict['hovernet']['data']
g = plot_clustermap(all_data_rho=np.round(critical_coef,2), mask=mask.values, x_label='Cluster', y_label='Cell Annotations', cox_os_clusters=coef_os_df, cox_pfs_clusters=coef_pfs_df,
                    col_linkage=leiden_linkage, directory=figure_path, file_name=file_name+'.jpg', figsize=(60,30))

file_name = correlations_dict['immunelandscape']['file_name'] + '_cross_leiden'
all_data_rho, all_data_pval, mask = correlations_dict['immunelandscape']['data']
g = plot_clustermap(all_data_rho=all_data_rho, mask=mask.values, x_label='Cluster', y_label='Immune feature', cox_os_clusters=coef_os_df, cox_pfs_clusters=coef_pfs_df,
                    col_linkage=leiden_linkage, directory=figure_path, file_name=file_name+'.jpg', figsize=(65,35))

'''Cross Dendrogram - Tile Annotation/Immune & Tile Annotation/HoverNet/Leiden '''
# Histological-subtype figures exist only when an additional cohort was loaded.
if additional_complete_df is not None:
    file_name = correlations_dict['tile_histsubtypes']['file_name'] + '_cross_immune'
    p_values, strength, mask = correlations_dict['tile_histsubtypes']['data']
    g = plot_clustermap(all_data_rho=strength.transpose(), mask=mask.values.transpose(), vcenter=1, x_label='Cluster', y_label='Histological subtype', cox_os_clusters=coef_os_df, cox_pfs_clusters=coef_pfs_df, fmt='.1f', col_linkage=correlations_dict['immunelandscape']['linkage'], directory=figure_path, file_name=file_name+'.jpg', figsize=(60,30))

    file_name = correlations_dict['tile_histsubtypes']['file_name'] + '_cross_leiden'
    g = plot_clustermap(all_data_rho=strength.transpose(), mask=mask.values.transpose(), vcenter=1, x_label='Cluster', y_label='Histological subtype', cox_os_clusters=coef_os_df, cox_pfs_clusters=coef_pfs_df, fmt='.1f', col_linkage=leiden_linkage, directory=figure_path, file_name=file_name+'.jpg', figsize=(60,30))


# Paper figures - Cell Type Enrichment Hover-Net

In [None]:
def plot_hist(ax, data, cell_name, title, fontsize, fontsize_title, color, stat='density', kde=True, fill=True, cumulative=False, bins=75):
    """Draw a step histogram of `data` on `ax` with bold titles, ticks and thick spines.

    Args:
        ax: Matplotlib axes to draw on.
        data: Values to histogram.
        cell_name: Used in the x-axis label ('Number <cell_name>\\nper tile').
        title: Axes title.
        fontsize: Tick label size.
        fontsize_title: Title and axis label size.
        color: Histogram color.
        stat, kde, fill, cumulative: Forwarded to `sns.histplot`.
        bins: Number of bins, forwarded to `sns.histplot`.
    """
    # `bins` is now forwarded; it used to be accepted but silently ignored, so the
    # histogram always fell back to seaborn's automatic binning.
    ax = sns.histplot(data, stat=stat, kde=kde, element='step', cumulative=cumulative, fill=fill, bins=bins, ax=ax, color=color)
    ax.set_title(title,           fontweight='bold', fontsize=fontsize_title)
    ax.set_xlabel('Number %s\nper tile' % cell_name, fontweight='bold', fontsize=fontsize_title)
    ax.set_ylabel('Density',      fontweight='bold', fontsize=fontsize_title)

    # Bold tick labels on both axes.
    for axis_obj in (ax.xaxis, ax.yaxis):
        for tick in axis_obj.get_major_ticks():
            tick.label1.set_fontsize(fontsize)
            tick.label1.set_fontweight('bold')
    # Thicker frame around the plot.
    for side in ['top','bottom','left','right']:
        ax.spines[side].set_linewidth(4)

def plot_cumulative_comparison(data_1, data_2, cell_name, title, fontsize, fontsize_title, ax, lw=4, markersize=4):
    """Plot the empirical cumulative distributions of two samples and mark their gap.

    Both samples are histogrammed on the same 75 bins spanning their joint range.
    `data_1` is drawn in blue, `data_2` in red, and a dashed vertical segment
    labelled +D or -D joins the two curves at the marked position (+ when the
    red curve is the upper one there).
    """
    lowest    = min(data_1.min(), data_2.min())
    max_value = max(data_1.max(), data_2.max())
    joint_range = [lowest, max_value]

    density_1, edges = np.histogram(data_1, bins=75, range=joint_range, density=True)
    density_2, _     = np.histogram(data_2, bins=75, range=joint_range, density=True)
    widths = np.diff(edges)
    cum_1  = np.cumsum(density_1 * widths)
    cum_2  = np.cumsum(density_2 * widths)

    # NOTE(review): the marker is placed one bin AFTER the bin of largest absolute
    # difference (argmax + 1), for both the x position and the curve values - confirm
    # this offset is intended.
    index  = np.argmax(np.abs(cum_1 - cum_2)) + 1
    x_mark = edges[index]
    y_blue = cum_1[index]
    y_red  = cum_2[index]

    upper = max(y_blue, y_red)
    lower = min(y_blue, y_red)
    label = r'$\bf{+D_{n,m}}$' if upper == y_red else r'$\bf{-D_{n,m}}$'
    ax.annotate(label, (x_mark+0.02*max_value, lower-0.01), fontweight='bold', fontsize=fontsize_title)

    # Curves are drawn against the left bin edges, then the two end markers.
    ax.plot(edges[:-1], cum_1, color='blue', lw=lw)
    ax.plot(edges[:-1], cum_2, color='red', lw=lw)
    ax.plot([x_mark], [y_blue], 'bs', markersize=markersize)
    ax.plot([x_mark], [y_red],  'ro', markersize=markersize)

    # Dashed segment joining the two markers.
    gap_segment = [[(x_mark, y_blue), (x_mark, y_red)]]
    ax.add_collection(matcoll.LineCollection(gap_segment, colors='k', linestyle='--', lw=lw))

    ax.set_title(title,           fontweight='bold', fontsize=fontsize_title)
    ax.set_xlabel('Number %s\nper tile' % cell_name, fontweight='bold', fontsize=fontsize_title)
    ax.set_ylabel('Density',      fontweight='bold', fontsize=fontsize_title)

    for axis_obj in (ax.xaxis, ax.yaxis):
        for tick in axis_obj.get_major_ticks():
            tick.label1.set_fontsize(fontsize)
            tick.label1.set_fontweight('bold')
    for side in ['top','bottom','left','right']:
        ax.spines[side].set_linewidth(4)

def plot_comparison(cluster_anno_df, cell_name, cluster_id_1, fontsize, fontsize_title, stat='density', fill=True, cumulative=False, cluster_id_2=None, lw=6, markersize=6,
                    bins=75, figsize=(10,4)):
    """Show a three-panel comparison of per-tile `cell_name` counts.

    Panel 1 is cluster `cluster_id_1`; panel 2 is `cluster_id_2`, or every tile of
    `cluster_anno_df` when it is None; panel 3 overlays both cumulative
    distributions. Clusters are selected through the module-level `groupby` column.
    """
    def counts_for(cluster_id):
        # Per-tile counts of the requested cell type within one cluster.
        in_cluster = cluster_anno_df[groupby]==cluster_id
        return cluster_anno_df[in_cluster][cell_name].values

    _, panels = plt.subplots(ncols=3, nrows=1, sharex=True, sharey=False, figsize=figsize)
    first_ax, second_ax, cumulative_ax = panels

    title_1 = 'HPC %s\nProbability Distribution' % cluster_id_1
    data_1  = counts_for(cluster_id_1)

    if cluster_id_2 is not None:
        title_2 = 'HPC %s\nProbability Distribution' % cluster_id_2
        data_2  = counts_for(cluster_id_2)
    else:
        title_2 = 'All HPCs\nProbability Distribution'
        data_2  = cluster_anno_df[cell_name].values

    plot_hist(first_ax,  data_1, cell_name, title_1, fontsize, fontsize_title, color='blue', stat=stat, cumulative=cumulative, fill=fill, bins=bins)
    plot_hist(second_ax, data_2, cell_name, title_2, fontsize, fontsize_title, color='red',  stat=stat, cumulative=cumulative, fill=fill, bins=bins)

    # Give both histograms the y-limits of whichever reaches higher.
    ylim_first  = first_ax.get_ylim()
    ylim_second = second_ax.get_ylim()
    shared_ylim = ylim_first if ylim_first[1] > ylim_second[1] else ylim_second
    first_ax.set_ylim(shared_ylim)
    second_ax.set_ylim(shared_ylim)

    # Cumulative
    plot_cumulative_comparison(data_1, data_2, cell_name, 'Cumulative\nDistribution', fontsize, fontsize_title, ax=cumulative_ax, lw=lw, markersize=markersize)

    plt.tight_layout()
    plt.show()

# Figure settings shared by the comparisons below.
figsize        = (20,6)
fontsize_title = 20
fontsize       = 15
bins           = 75

# Compare the first HoverNet cell type of clusters 10 and 15 against all clusters.
cell_name = cell_names[0]
for cluster_id_1 in (10, 15):
    plot_comparison(cluster_anno_df, cell_name, cluster_id_1, fontsize, fontsize_title, lw=3, markersize=7, stat='density', cumulative=False, cluster_id_2=None, bins=bins, figsize=figsize)


# Paper figures - Scatter Plot correlations

In [32]:
# Arguments common to both representation builds; only the representation type differs.
_cohort_args = (meta_folder, meta_field, matching_field, groupby, fold_number, folds_pickle, h5_complete_path, h5_additional_path)

# 'percent' representations.
frames_perc = build_cohort_representations(*_cohort_args, 'percent', 100)
complete_df_perc, additional_complete_df, frame_clusters, frame_samples, features = frames_perc

# 'clr' representations.
frames_clr = build_cohort_representations(*_cohort_args, 'clr', 100)
complete_df_clr,  additional_complete_df, frame_clusters, frame_samples, features = frames_clr

# Attach the immune-landscape annotations to each representation, matching on sample id.
cross_df_perc = complete_df_perc.merge(immune_landscape_df, on='samples', how='inner')
cross_df_clr  = complete_df_clr.merge(immune_landscape_df,  on='samples', how='inner')


In [None]:
def show_correlation_scatter(cross_df, cluster, annotations, all_data_rho, all_data_pval, fontsize_labels=22, fontsize_title=30):
    """Show one regression joint plot per annotation against a cluster's contribution.

    Args:
        cross_df: DataFrame holding one column per cluster and one per annotation.
            It is no longer modified: column labels are cast to str on a copy.
        cluster: Cluster id; addressed as `str(cluster)` in all three frames.
        annotations: Annotation column names to plot on the x axis.
        all_data_rho, all_data_pval: Frames indexed by annotation with one column per
            cluster (as str), providing the statistics printed in the title.
        fontsize_labels: Axis and tick label size.
        fontsize_title: Title size.
    """
    from decimal import Decimal

    # Work on a copy so the caller's column labels are left untouched.
    cross_df = cross_df.copy()
    cross_df.columns = cross_df.columns.astype(str)
    cluster_col = str(cluster)

    for annotation in annotations:
        rho_annotation  = all_data_rho.loc[annotation, cluster_col]
        pval_annotation = all_data_pval.loc[annotation, cluster_col]
        g = sns.jointplot(data=cross_df, x=annotation, y=cluster_col, kind='reg', ci=None, height=10, ratio=2)
        g.ax_joint.set_ylabel('HPC %s\nContribution' % cluster, fontsize=fontsize_labels, fontweight='bold')
        g.ax_joint.set_xlabel(annotation, fontsize=fontsize_labels, fontweight='bold')

        # Bold tick labels on the joint axes.
        for axis_obj in (g.ax_joint.xaxis, g.ax_joint.yaxis):
            for tick in axis_obj.get_major_ticks():
                tick.label1.set_fontsize(fontsize_labels)
                tick.label1.set_fontweight('bold')

        # Thicker frames on the joint and both marginal axes.
        for side in ['top','bottom','left','right']:
            g.ax_joint.spines[side].set_linewidth(4)
            g.ax_marg_x.spines[side].set_linewidth(4)
            g.ax_marg_y.spines[side].set_linewidth(4)

        plt.suptitle('Spearman %s=%s\np-value=%s' % (r'$\mathbf{\rho}$', np.round(rho_annotation, 1), '%.1E' % Decimal(pval_annotation)), fontsize=fontsize_title, fontweight='bold')
        g.fig.tight_layout()
        plt.show()


sns.set_theme(style='white')

# Load the immune-landscape statistics BEFORE deriving the renamed copies below.
# They used to be renamed first, from whatever `all_data_rho` / `all_data_pval`
# the previously executed cell had left behind.
all_data_rho, all_data_pval, mask = correlations_dict['immunelandscape']['data']

# Plot 'Th2 Cells' in thousands; the label is renamed consistently in the data
# frame and in both statistics frames so lookups by annotation name still match.
cross_df_plt = cross_df_clr.copy(deep=True)
cross_df_plt['Th2 Cells'] /= 1000
cross_df_plt = cross_df_plt.rename(columns={'Th2 Cells':'Th2 Cells (Thousands)'})

all_data_rho_plt  = all_data_rho.rename(index={'Th2 Cells':'Th2 Cells (Thousands)'})
all_data_pval_plt = all_data_pval.rename(index={'Th2 Cells':'Th2 Cells (Thousands)'})

annotations = ['TIL Regional Fraction', 'Lymphocyte Infiltration Signature Score', 'Leukocyte Fraction']
show_correlation_scatter(cross_df_plt, cluster=1, annotations=annotations, all_data_rho=all_data_rho_plt, all_data_pval=all_data_pval_plt)
# show_correlation_scatter(cross_df_clr, cluster=31, annotations=annotations, all_data_rho=all_data_rho_plt, all_data_pval=all_data_pval_plt)

annotations = ['Proliferation', 'Wound Healing', 'Th2 Cells (Thousands)']
# show_correlation_scatter(cross_df_plt, cluster=14, annotations=annotations, all_data_rho=all_data_rho_plt, all_data_pval=all_data_pval_plt)
# show_correlation_scatter(cross_df_plt, cluster=15, annotations=annotations, all_data_rho=all_data_rho_plt, all_data_pval=all_data_pval_plt)
# show_correlation_scatter(cross_df_clr, cluster=11, annotations=annotations, all_data_rho=all_data_rho_plt, all_data_pval=all_data_pval_plt)
