In [1]:
import sys
from glob import glob
import numpy as np
import pandas as pd
import umap
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from plotly.graph_objs import *
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

In [2]:
def show_cluster(xyz, filename='test.html', auto_open=True):
    """Save an interactive 3-D scatter plot of a point cloud as an HTML file.

    Parameters
    ----------
    xyz : array-like, shape (n_points, >=3)
        Point coordinates. Columns 0, 1 and 2 are used as x, y and z; any
        further columns are ignored.
    filename : str, optional
        Path of the HTML file to write. Defaults to ``'test.html'``, the path
        that used to be hard-coded, so existing calls behave as before.
    auto_open : bool, optional
        Forwarded to ``fig.write_html``. Defaults to ``True`` as before.

    Returns
    -------
    None
        The figure is only written to disk.
    """
    # Accept anything array-like (lists, DataFrames), not only ndarrays.
    xyz = np.asarray(xyz)
    x, y, z = xyz[:, 0], xyz[:, 1], xyz[:, 2]
    fig = go.Figure(data=[go.Scatter3d(
        x=x,
        y=y,
        z=z,
        mode='markers',
        marker=dict(
            size=3,
            opacity=0.75
        )
    )])
    # tight layout: remove all margins around the 3-D scene
    fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
    fig.write_html(filename, auto_open=auto_open)

def scattered_wsi(df,x,y,hue,size,opacity,auto_open,filename):
    """Write a 2-D scatter plot of `df` to an HTML file.

    `x`, `y` and `hue` are column names of `df` passed to ``px.scatter`` as
    the axes and the colour key. `size` and `opacity` style the markers.
    The figure is saved as ``filename + '.spatial_projection.FOV.html'`` and
    `auto_open` is forwarded to ``write_html``. Returns None.
    """
    marker_style = dict(size=size, opacity=opacity)
    output_path = filename + '.spatial_projection.FOV.html'

    figure = px.scatter(
        df,
        x=x,
        y=y,
        color=hue,
        width=800,
        height=800,
        color_discrete_sequence=px.colors.qualitative.Set2,
    )
    figure.update_traces(marker=marker_style)
    # 'itemsizing': 'constant' keeps legend symbols at a fixed size,
    # independent of the marker size used in the plot.
    figure.update_layout(template='simple_white',
                         legend={'itemsizing': 'constant'})
    figure.write_html(output_path, auto_open=auto_open)

def patch2covd(df, features=('area',
                             'perimeter',
                             'solidity',
                             'eccentricity',
                             'circularity',
                             'mean_intensity',
                             'cov_intensity')):
    """Compute one vectorized covariance descriptor per patch.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per nucleus. Must contain a ``'patch'`` column identifying the
        patch of each nucleus and one column per entry of `features`.
    features : sequence of str, optional
        Feature columns entering the covariance matrix. Defaults to the seven
        morphology/intensity features that were previously hard-coded.

    Returns
    -------
    data : numpy.ndarray, shape (n_kept_patches, k * (k + 1) / 2)
        Upper-triangular part (diagonal included) of each patch covariance
        matrix, one row per kept patch, with ``k = len(features)``.
    fov_covd : list of [patch_id, vector]
        Same vectors paired with their patch id, in the same order as `data`.

    Notes
    -----
    Patches are visited in order of first appearance in `df`, so the row order
    of `data` is reproducible from run to run (iterating a ``set`` of string
    ids is not). Patches with fewer than two nuclei, or whose covariance
    contains NaN/inf, are discarded: their descriptor is undefined.
    """
    features = list(features)
    n_features = len(features)
    upper = np.triu_indices(n_features)  # identical index set for every patch

    fov_covd = []  # [patch_id, descriptor] pairs of the kept patches
    # A single groupby pass replaces one full-frame boolean mask per patch.
    for patch_id, nuclei in df.groupby('patch', sort=False):
        arr = nuclei[features].to_numpy(dtype=float)
        if arr.shape[0] < 2:
            # Sample covariance needs at least two observations.
            continue
        covd = np.atleast_2d(np.cov(arr, rowvar=False))
        if covd.shape != (n_features, n_features) or not np.isfinite(covd).all():
            continue  # discard problematic patches
        fov_covd.append([patch_id, covd[upper]])

    if fov_covd:
        data = np.asarray([vec for _, vec in fov_covd])
    else:
        # Keep a well-formed 2-D array even when no patch survives.
        data = np.empty((0, len(upper[0])))
    return data, fov_covd

def clustering(embedding,min_cluster_size,min_samples,plot_tree):
    """Fit HDBSCAN on `embedding` and return the fitted clusterer.

    `min_cluster_size` and `min_samples` are forwarded to
    ``hdbscan.HDBSCAN``; the minimum spanning tree is always generated.
    When `plot_tree` is truthy the condensed tree is also plotted with the
    selected clusters highlighted.
    """
    model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        gen_min_span_tree=True,
    )
    model.fit(embedding)  # cluster positions

    if not plot_tree:
        return model

    # NOTE(review): the palette has one colour per entry of labels_, which is
    # more than one per cluster — presumably only the first few are used.
    palette = sns.color_palette("Set2", len(model.labels_))
    model.condensed_tree_.plot(select_clusters=True,
                               selection_palette=palette)
    return model

In [3]:
# Number of quantile bins per axis; cx and cy are each cut into this many
# bins, giving up to num_quantiles * num_quantiles patches per sample.
num_quantiles = 50

# Every per-sample measurement table matched by the pattern.
files = glob(
    '../data_intensity/pkl/id_*.measurements.covd.pkl.intensityANDmorphology.csv.gz'
)

# A single hard-coded sample. The name `file` is reused as the loop variable
# of the processing loop, so this value is overwritten once that loop runs.
file = '../data_intensity/pkl/id_13.measurements.covd.pkl.intensityANDmorphology.csv.gz'

In [84]:
# Density-based clustering parameters; identical for every sample, so they are
# defined once instead of being re-assigned on each iteration.
min_cluster_size = 100
min_samples = 50

# For each sample: tile into patches, describe each patch by its covariance
# descriptor, embed the descriptors in 3-D, cluster them, and project the
# cluster labels back onto the patch centroids. After the loop, `embedding`
# and `clusterer` hold the results of the LAST file only.
for file in sorted(files):
    print(file)
    df = pd.read_csv(file) # load the dataframe

    # Quantile-bin the centroid coordinates so each bin holds a similar
    # number of nuclei; the (row, col) bin pair identifies a patch.
    df['fov_col'] = pd.qcut(df['cx'], num_quantiles, labels=False)
    df['fov_row'] = pd.qcut(df['cy'], num_quantiles, labels=False)
    df['patch'] = df['fov_row'].apply(str)+'_'+df['fov_col'].apply(str) # define a patch ID

    # Get the covd representation of each patch
    data, fov_covd = patch2covd(df)

    # Find the umap embedding of the covds
    embedding = umap.UMAP(min_dist=0.0,
                          n_components=3,
                          random_state=42).fit_transform(data)

    # Density based clustering of the embedding
    clusterer = clustering(embedding,min_cluster_size,min_samples,False)

    # Spatial projection
    # Select cx/cy BEFORE aggregating: averaging every column first is wasted
    # work and raises TypeError on pandas >= 2.0 if any column is non-numeric.
    df_left = df.groupby('patch')[['cx','cy']].mean().reset_index() # centroid of each patch
    df_right = pd.DataFrame({'patch': [el[0] for el in fov_covd], # df with cluster id of each patch
                             'cluster': clusterer.labels_},
                             columns=['patch', 'cluster'])
    # Inner merge: patches dropped by patch2covd have no label and disappear here.
    df_patched = df_left.merge(df_right, on='patch') # centroid and cluster ID per patch
    df_patched['hue'] = df_patched['cluster'].apply(str) # string labels give discrete colors
    filters = df_patched['cluster'] > -1 # only assigned patches
    scattered_wsi(df_patched[filters],'cx','cy','hue',size=8,opacity=1,auto_open=True,filename=file)

../data_intensity/pkl/id_13.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_17.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_39.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_40.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_41.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_45.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_46.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_51.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_52.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_54.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_56.measurements.covd.pkl.intensityANDmorphology.csv.gz
../data_intensity/pkl/id_57.measurements.covd.pkl.intensityANDmorphology.csv.gz


In [284]:
# 3-D view of the embedding coloured by cluster. `embedding` and `clusterer`
# are whatever the processing loop left behind, i.e. its last sample.
cloud_columns = ['x', 'y', 'z', 'cluster']
df_cloud = pd.DataFrame(
    {
        'x': embedding[:, 0],
        'y': embedding[:, 1],
        'z': embedding[:, 2],
        'cluster': clusterer.labels_,
    },
    columns=cloud_columns,
)
# String copy of the label so plotly uses discrete rather than continuous colours.
df_cloud['color'] = df_cloud['cluster'].apply(str)

# Keep only points with a label above -1 (HDBSCAN marks unassigned points as -1).
filters = df_cloud['cluster'] > -1
fig = px.scatter_3d(
    df_cloud[filters],
    x="x",
    y="y",
    z="z",
    color="color",
    hover_name="color",
)
fig.update_traces(
    marker=dict(size=4, opacity=0.9),
    selector=dict(mode='markers'),
)
fig.write_html('test.html', auto_open=True)

In [285]:
# Plot the same 3-D embedding without cluster colouring. show_cluster also
# writes to 'test.html', so this replaces the file saved by the previous cell.
show_cluster(embedding)