In [21]:
# Widen the notebook container to 90% of the page width.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

from mpl_toolkits.axes_grid1 import ImageGrid
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import anndata
import random
import umap

# Own libraries
import sys
sys.path.append('/media/adalberto/Disk2/PhD_Workspace')
from data_manipulation.data import Data
from data_manipulation.utils import store_data
from models.evaluation.folds import load_existing_split
from models.clustering.data_processing import *
from models.clustering.leiden_representations import assign_clusters


In [58]:
# Project root and the dataset whose training tiles are reviewed in this notebook.
dbs_path = '/media/adalberto/Disk2/PhD_Workspace'
dataset  = 'NYUFFPE_survival_5x'

# Build the project Data wrapper for marker 'he' with 224x224 patches and 3 channels.
data = Data(
    dataset=dataset,
    marker='he',
    patch_h=224,
    patch_w=224,
    n_channels=3,
    batch_size=64,
    project_path=dbs_path,
)
n_training_samples = data.training.images.shape[0]
print('Number of samples:', n_training_samples)

Train Set: /media/adalberto/Disk2/PhD_Workspace/datasets/NYUFFPE_survival_5x/he/patches_h224_w224/hdf5_NYUFFPE_survival_5x_he_train.h5
Validation Set: /media/adalberto/Disk2/PhD_Workspace/datasets/NYUFFPE_survival_5x/he/patches_h224_w224/hdf5_NYUFFPE_survival_5x_he_validation.h5
Test Set: /media/adalberto/Disk2/PhD_Workspace/datasets/NYUFFPE_survival_5x/he/patches_h224_w224/hdf5_NYUFFPE_survival_5x_he_test.h5

Number of samples: 57423


In [59]:
# Representations of the tiles to be clustered. The index of the resulting frame
# is used further down to index `data.training.images`, so both sources must
# describe the same tiles.
h5_additional_rep_path = '/media/adalberto/Disk2/PhD_Workspace/results/ContrastivePathology_BarlowTwins_3/NYU_LUADall_5x/h224_w224_n3_zdim128/hdf5_NYU_LUADall_5x_he_combined.h5'
meta_field             = 'survival'
rep_key                = 'z_latent'

additional_frame, additional_dims, additional_rest = representations_to_frame(h5_additional_rep_path, meta_field=meta_field, rep_key=rep_key)
print('Number of samples:', additional_frame.shape[0])

# Guard against silently pairing cluster assignments with the wrong images.
# NOTE(review): equal length does not prove equal ordering - confirm that the
# representation file was generated from the same training HDF5.
assert additional_frame.shape[0] == data.training.images.shape[0], (
    'Representation rows (%s) do not match training images (%s).'
    % (additional_frame.shape[0], data.training.images.shape[0])
)

Loading representations: /media/adalberto/Disk2/PhD_Workspace/results/ContrastivePathology_BarlowTwins_3/NYU_LUADall_5x/h224_w224_n3_zdim128/hdf5_NYU_LUADall_5x_he_combined.h5
Number of samples: 57423


In [11]:
# Reference cluster for background/unfocused tiles.
anndata_path = '/media/adalberto/Disk2/PhD_Workspace/results/ContrastivePathology_BarlowTwins_3/TCGAFFPE_5x_perP/h224_w224_n3_zdim128/luad_250NN/adatas/TCGAFFPE_5x_perP_he_complete_lung_subtype_leiden_4p0__fold3_train_subsample.h5ad'
adata = sc.read_h5ad(anndata_path)


In [12]:
# Cluster label column: the first obs column of the reference whose name contains 'leiden'.
leiden_columns = [name for name in adata.obs.columns if 'leiden' in name]
groupby = leiden_columns[0]
# NOTE(review): presumably matches the 'fold3' in anndata_path; not used in the cells shown here.
fold    = 3

In [23]:
# Wrap the new representations as an AnnData: the columns listed in
# `additional_dims` become X, the remaining columns become categorical obs.
additional_X     = additional_frame[additional_dims].to_numpy()
additional_obs   = additional_frame[additional_rest].astype('category')
additional_adata = anndata.AnnData(X=additional_X, obs=additional_obs)

# Transfer the reference cluster labels (column `groupby`) onto the new tiles.
sc.tl.ingest(
    additional_adata,
    adata,
    obs=groupby,
    embedding_method='pca',
    neighbors_key='nn_leiden',
)




In [24]:
# Clusters still under manual review; they are removed together with the rest.
clusters_to_review = [75, 71, 66, 49, 12]

# Full removal list: the fixed clusters first, followed by the ones under review.
clusters_to_remove = [86, 83, 82, 80, 77, 76, 74, 73, 70, 68]
clusters_to_remove.extend(clusters_to_review)


In [55]:
# Keep only the tiles whose assigned cluster is in the removal list.
in_removed_cluster = additional_adata.obs[groupby].astype(int).isin(clusters_to_remove)
data_clusters      = additional_adata.obs.loc[in_removed_cluster]
data_clusters


Unnamed: 0,combined_hist_subtype,combined_labels,combined_patterns,combined_slides,combined_tiles,slides,leiden_4.0
3,,0.0,,9064_files_None,29_41.jpeg,9064,75
8,,0.0,,9064_files_None,42_36.jpeg,9064,75
17,,0.0,,9064_files_None,16_31.jpeg,9064,75
22,,0.0,,9064_files_None,27_32.jpeg,9064,75
23,,0.0,,9064_files_None,29_30.jpeg,9064,75
...,...,...,...,...,...,...,...
57351,,0.0,,13080_files_None,15_14.jpeg,13080,75
57368,,0.0,,13080_files_None,12_10.jpeg,13080,75
57393,,0.0,,13080_files_None,35_16.jpeg,13080,75
57395,,0.0,,13080_files_None,11_9.jpeg,13080,75


In [None]:
def cluster_set_images(frame, images, cluster_id, groupby, batches=1, seed=None):
    """Display random tiles of one cluster as 5 x 20 image grids.

    Up to ``100 * batches`` tiles of the cluster are sampled at random and
    shown 100 per figure. Batches with no tiles left are skipped, so a small
    cluster no longer produces empty figures.

    Args:
        frame: DataFrame with the cluster label in column ``groupby`` (values
            convertible to int). Its index values must be convertible to int
            and are used to index ``images``.
        images: Integer-indexable collection of tiles. Each tile is divided by
            255 before display (assumes 0-255 pixel intensities - TODO confirm).
        cluster_id: Integer id of the cluster to show.
        groupby: Name of the cluster label column in ``frame``.
        batches: Maximum number of figures (100 tiles each) to draw.
        seed: Optional seed for the tile sampling. When None the global
            ``random`` state is used, as before.

    Returns:
        None. Figures are shown with ``plt.show()`` and then closed.

    Note:
        The seaborn theme is set to 'white' while plotting and left at
        'darkgrid' afterwards, as in the original implementation.
    """
    tiles_per_batch = 100  # one 5 x 20 grid per figure

    cluster_frame = frame[frame[groupby].astype(int)==cluster_id]
    combined      = cluster_frame.index.tolist()
    if seed is None:
        random.shuffle(combined)
    else:
        # Private generator: reproducible without touching the global state.
        random.Random(seed).shuffle(combined)
    # Convert before sorting so string indexes are ordered numerically
    # ('9' < '10') rather than lexicographically.
    combined_plot = sorted(int(index) for index in combined[:tiles_per_batch*batches])

    for batch in range(batches):
        batch_indexes = combined_plot[tiles_per_batch*batch:tiles_per_batch*(batch+1)]
        if not batch_indexes:
            # Indexes are consumed in order, so later batches are empty too.
            break
        images_cluster = [images[index]/255. for index in batch_indexes]

        sns.set_theme(style='white')
        try:
            fig = plt.figure(figsize=(40, 8))
            fig.suptitle('Cluster %s' % (cluster_id), fontsize=18, fontweight='bold')
            grid = ImageGrid(fig, 111, nrows_ncols=(5, 20), axes_pad=0.1,)

            for ax, im in zip(grid, images_cluster):
                ax.imshow(im)
            # Hide the frames of grid cells that received no tile.
            for ax in list(grid)[len(images_cluster):]:
                ax.axis('off')

            plt.show()
            # Release the figure; many are created when looping over clusters.
            plt.close(fig)
        finally:
            # Restore the plotting theme even if drawing fails.
            sns.set_theme(style='darkgrid')

# Show sample tiles for every cluster in the removal list; clusters with fewer
# than 20 tiles are only reported, not plotted.
for cluster_id in clusters_to_remove:
    n_samples = data_clusters[data_clusters[groupby].astype(int)==cluster_id].shape[0]
    print('Cluster', cluster_id, 'Number of samples:', n_samples)
    if n_samples >= 20:
        cluster_set_images(data_clusters, data.training.images, cluster_id, groupby, batches=5)
        # Three blank lines separate consecutive clusters in the output.
        print('\n\n')

In [60]:
# Folder holding, per dataset, the lists of tile indexes to drop.
# NOTE(review): `os` is not imported in the visible import cell; presumably it
# arrives through the star import from models.clustering.data_processing - confirm.
indexes_remove_path = '/media/adalberto/Disk2/PhD_Workspace/utilities/files/indexes_to_remove'
data_remove_path    = os.path.join(indexes_remove_path, dataset)
# exist_ok=True replaces the check-then-create pair: no race between the check
# and the creation, and a no-op when the directory is already there.
os.makedirs(data_remove_path, exist_ok=True)

In [67]:
# Indexes of the flagged tiles as a plain list of Python ints.
indexes_set = data_clusters.index.astype(int).values.tolist()

# Name the pickle after the training HDF5 file: the text following '/hdf5_'
# with '.h5' removed.
hdf5_suffix = data.training.hdf5_path.split('/hdf5_')[1]
set_name    = hdf5_suffix.replace('.h5', '')
pickle_path = os.path.join(data_remove_path, '%s.pkl' % set_name)

# Nothing is written when no tile was flagged.
if indexes_set:
    store_data(indexes_set, pickle_path)