In [1]:
# Widen the notebook container to 90% of the page so wide outputs fit.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import os
import random

import anndata
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import umap
from mpl_toolkits.axes_grid1 import ImageGrid

# Own libraries
import sys
sys.path.append('/mnt/cephfs/sharedscratch/users/fshahi/Projects/Histomorphological-Phenotype-Learning')
from data_manipulation.data import Data
from data_manipulation.utils import store_data
from models.evaluation.folds import load_existing_split
from models.clustering.data_processing import *
from models.clustering.leiden_representations import assign_clusters


  from IPython.core.display import display, HTML


  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()
2023-11-24 22:49:04.717003: I tensorflow/stream_executor/platform/default/dso_loader.cc:50] Successfully opened dynamic library libcudart.so.12






In [2]:
# Project root that holds the datasets and the results tree.
dbs_path = '/mnt/cephfs/sharedscratch/users/fshahi/Projects/Histomorphological-Phenotype-Learning'
# Dataset whose tiles and representations are processed in this notebook.
dataset  = 'Meso_500'
# Dataset whose precomputed Leiden AnnData is read as the clustering reference.
Main_dataset = 'Meso_400_subsampled'
# Sub-folder of the results tree used for the AnnData files and cluster outputs.
meta_folder = 'removal'
fold    = 4

data = Data(
    dataset=dataset,
    marker='he',
    patch_h=224,
    patch_w=224,
    n_channels=3,
    batch_size=64,
    project_path=dbs_path,
)
# Total tile count across the training, validation and test sets.
n_samples_total = sum(split.images.shape[0] for split in (data.training, data.validation, data.test))
print('Number of samples:', n_samples_total)

Meso_500
Number of samples: 3103767


## Add H5ad for additional dataset

In [3]:
# Metadata field and representation key handed to representations_to_frame.
meta_field = 'Meso_type'
rep_key    = 'z_latent'
# HDF5 file with the tile representations of `dataset`.
h5_additional_rep_path = '{}/results/BarlowTwins_3/{}/h224_w224_n3_zdim128/hdf5_{}_he_complete_metadata.h5'.format(dbs_path, dataset, dataset)

# The helper returns the frame plus two column lists; below they select the
# AnnData X matrix (additional_dims) and obs table (additional_rest).
frame_parts = representations_to_frame(h5_additional_rep_path, meta_field=meta_field, rep_key=rep_key)
additional_frame, additional_dims, additional_rest = frame_parts
print('Number of samples:', additional_frame.shape[0])

Loading representations: /mnt/cephfs/sharedscratch/users/fshahi/Projects/Histomorphological-Phenotype-Learning/results/BarlowTwins_3/Meso_500/h224_w224_n3_zdim128/hdf5_Meso_500_he_complete_metadata.h5
Keys: <KeysViewHDF5 ['Meso_type', 'Sex', 'case_Id', 'desmoplastic_component', 'hist_subtype', 'img_h_latent', 'img_z_latent', 'indexes', 'labels', 'original_set', 'os_event_data', 'os_event_ind', 'patterns', 'recurrence', 'samples', 'slides', 'smoking_history', 'stage', 'tiles', 'time_to_recurrence', 'type', 'wcc_score']>
Number of samples: 3103767


In [11]:
# Reference AnnData of Main_dataset for the chosen fold; its obs table carries
# the Leiden cluster column that the additional samples are mapped onto.
anndata_path_template = '{}/results/BarlowTwins_3/{}/h224_w224_n3_zdim128/{}/adatas/{}_he_complete_combined_metadata_leiden_5p0__fold{}_subsample.h5ad'
anndata_path = anndata_path_template.format(dbs_path, Main_dataset, meta_folder, Main_dataset, fold)
adata = sc.read_h5ad(anndata_path)
adata


AnnData object with n_obs × n_vars = 200000 × 128
    obs: 'case_number', 'hist_subtype', 'indexes', 'labels', 'original_set', 'os_event_data', 'os_event_ind', 'patterns', 'samples', 'slides', 'stage', 'tiles', 'type', 'leiden_5.0'
    uns: 'leiden', 'nn_leiden', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
    obsp: 'nn_leiden_connectivities', 'nn_leiden_distances'

In [12]:
# Name of the cluster column: the first obs column whose name contains 'leiden'.
leiden_columns = [column for column in adata.obs.columns if 'leiden' in column]
groupby = leiden_columns[0]


In [13]:
# Destination of the annotated AnnData. The folder is created up front so a
# missing directory cannot make the write fail after the expensive ingest step.
additional_adata_path = '{}/results/BarlowTwins_3/{}/h224_w224_n3_zdim128/{}/adatas/{}_he_complete_combined_metadata_{}__fold{}_subsample.h5ad'.format(dbs_path, dataset, meta_folder, dataset, groupby.replace('.','p') ,fold)
os.makedirs(os.path.dirname(additional_adata_path), exist_ok=True)

# Representation columns become X; the remaining columns become categorical obs.
additional_adata  = anndata.AnnData(X=additional_frame[additional_dims].to_numpy(), obs=additional_frame[additional_rest].astype('category'))
# Map the reference `groupby` labels onto the additional samples, using the
# reference PCA embedding and its 'nn_leiden' neighbors graph.
sc.tl.ingest(additional_adata, adata, obs=groupby, embedding_method='pca', neighbors_key='nn_leiden')
additional_adata.write_h5ad(additional_adata_path)
additional_adata




In [19]:
import csv
import os
import seaborn as sns
import numpy as np

def cluster_set_images(frame, images, cluster_id, groupby, batches=1):
    """Save an image grid and a tracking CSV for a random sample of one cluster.

    Up to 100 tiles of the cluster are drawn into a 5x20 grid saved as
    ``images/cluster_<id>.jpg``; the metadata rows of those same tiles are
    written to ``backtrack/cluster_<id>.csv``. Both folders live under a path
    built from the notebook globals ``dbs_path``, ``dataset``, ``meta_folder``
    and ``fold``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Metadata table containing the column ``groupby``. Its index labels are
        converted with ``int()`` and used as positions into ``images``.
    images : indexable
        Image container indexed by integer position. Each image is divided by
        255 before plotting (assumes 0-255 pixel values - TODO confirm).
    cluster_id : int
        Cluster to export; compared against ``frame[groupby]`` cast to int.
    groupby : str
        Name of the cluster column; '.' is replaced by 'p' in the folder name.
    batches : int, default 1
        The random candidate pool holds ``100 * batches`` tiles; only the
        first 100 of the sorted pool are exported.

    Notes
    -----
    Sampling uses the global ``random`` state; seed it beforehand for a
    reproducible selection.
    """
    leiden_path    = '{}/results/BarlowTwins_3/{}/h224_w224_n3_zdim128/{}/{}_fold{}/'.format(dbs_path, dataset, meta_folder, groupby.replace('.','p'), fold)
    images_path    = os.path.join(leiden_path, 'images')
    backtrack_path = os.path.join(leiden_path, 'backtrack')
    # Create each folder independently: previously 'backtrack' was only created
    # when 'images' was missing, so a partially created tree broke the export.
    os.makedirs(images_path, exist_ok=True)
    os.makedirs(backtrack_path, exist_ok=True)

    cluster_frame = frame[frame[groupby].astype(int) == cluster_id]
    candidates    = cluster_frame.index.tolist()
    random.shuffle(candidates)
    # Sort the random pool, then keep at most 100 entries (one per grid cell).
    # NOTE(review): taking the head of the sorted pool favors the smallest
    # labels when batches > 1 - confirm this is intended.
    selected = sorted(candidates[:100*batches])[:100]

    images_cluster  = [images[int(index)]/255. for index in selected]
    csv_information = [frame.loc[index].to_dict() for index in selected]

    sns.set_theme(style='white')
    fig = plt.figure(figsize=(40, 8))
    try:
        fig.suptitle('Cluster %s' % (cluster_id), fontsize=18, fontweight='bold')
        grid = ImageGrid(fig, 111, nrows_ncols=(5, 20), axes_pad=0.1,)

        # Hide every axes first so unused cells (clusters with fewer than 100
        # tiles) do not show empty frames; axis('off') already removes spines
        # and ticks, so no per-spine calls are needed.
        for ax in grid:
            ax.axis('off')
        for ax, im in zip(grid, images_cluster):
            ax.imshow(im)
        fig.savefig(os.path.join(images_path,'cluster_%s.jpg' % (cluster_id)), dpi=1000)
    finally:
        # Always release the figure and restore the notebook theme, even if
        # plotting or saving fails.
        plt.close(fig)
        sns.set_theme(style='darkgrid')

    # Tracking file for selected images. newline='' lets the csv module control
    # line endings, as its documentation requires.
    with open(os.path.join(backtrack_path, 'cluster_%s.csv' % (cluster_id)), 'w', newline='') as content:
        w = csv.DictWriter(content, frame.columns.to_list())
        w.writeheader()
        w.writerows(csv_information)


# NOTE(review): `X` (the image container indexed by tile position) is not
# defined in any visible cell, so this cell raises NameError on a fresh kernel.
# The original construction is kept below for reference; confirm how `X` should
# be built (concatenating all sets may not fit in memory) before running.
# X = data.training.images
# X = np.concatenate((X, data.validation.images), axis=0)
# X = np.concatenate((X, data.test.images), axis=0)

# clusters_to_review = [1]
# clusters_to_remove = [] + clusters_to_review

# Count the tiles per observed cluster once, ordered by cluster id. Iterating
# over the observed ids (not range(n_observed)) keeps the highest clusters
# from being skipped when some ids have no tiles.
cluster_sizes  = additional_adata.obs[groupby].astype(int).value_counts().sort_index()
range_clusters = cluster_sizes.shape[0]
data_clusters  = additional_adata.obs

for cluster_id, n_samples in cluster_sizes.items():
    print('Cluster', cluster_id, 'Number of samples:', n_samples)
    # if n_samples<20: continue
    cluster_set_images(data_clusters, X, cluster_id, groupby, batches=5)
    # Three blank lines between clusters, as before.
    print('\n\n')

Cluster 0 Number of samples: 1770



Cluster 1 Number of samples: 3311



Cluster 2 Number of samples: 910



Cluster 3 Number of samples: 1431



Cluster 4 Number of samples: 117



Cluster 5 Number of samples: 1937



Cluster 6 Number of samples: 1284



Cluster 7 Number of samples: 1279



Cluster 8 Number of samples: 1803



Cluster 9 Number of samples: 992



Cluster 10 Number of samples: 1176



Cluster 11 Number of samples: 2016



Cluster 12 Number of samples: 1749



Cluster 13 Number of samples: 749



Cluster 14 Number of samples: 2564



Cluster 15 Number of samples: 1191



Cluster 16 Number of samples: 749



Cluster 17 Number of samples: 1035



Cluster 18 Number of samples: 1193



Cluster 19 Number of samples: 2553



Cluster 20 Number of samples: 1218



Cluster 21 Number of samples: 597



Cluster 22 Number of samples: 584



Cluster 23 Number of samples: 1363



Cluster 24 Number of samples: 1210



Cluster 25 Number of samples: 2271



Cluster 26 Number of samples:

In [23]:
# Cluster ids chosen for removal.
clusters_to_remove = [4,15,23,26,27,34,48,51,54,56,65,66,68,73,75,78,81,84,85,88,90,91,92]
fold = 4

# Keep only the obs rows whose cluster id is in the removal list.
cluster_ids   = additional_adata.obs[groupby].astype(int)
removal_mask  = cluster_ids.isin(clusters_to_remove)
data_clusters = additional_adata.obs[removal_mask]
print('Number of samples:', data_clusters.shape[0])

Number of samples: 6178


In [24]:
# Per-dataset folder under <dbs_path>/files/indexes_to_remove; created on
# demand, left untouched when it already exists.
indexes_remove_path = '{}/files/indexes_to_remove'.format(dbs_path)
data_remove_path    = os.path.join(indexes_remove_path, dataset)
os.makedirs(data_remove_path, exist_ok=True)

In [25]:
# Index labels of the selected rows, converted to a list of ints.
indexes_set = data_clusters.index.astype(int).values.tolist()

# File name: the text after '/hdf5_' in the training HDF5 path, without '.h5'.
hdf5_suffix = data.training.hdf5_path.split('/hdf5_')[1]
set_name    = hdf5_suffix.replace('.h5', '')
pickle_path = os.path.join(data_remove_path, '%s.pkl' % set_name)
# Nothing is stored when no index was selected.
if indexes_set:
    store_data(indexes_set, pickle_path)