In [16]:
'''
Load the necessary libraries
'''
from mpl_toolkits.mplot3d import Axes3D 
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.graph_objs import *
import plotly.express as px
import seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('white')
sns.set_color_codes()
plot_kwds = {'alpha' : 0.5, 's' : 80, 'linewidths':0}

import hdbscan

import os
import glob

from sklearn.neighbors import NearestNeighbors
from numpy import linalg as LA
import numpy as np
import pandas as pd

from sklearn.metrics import pairwise_distances_argmin_min

In [23]:
#def cluster_nuclei(filename,sample_size,n_neighbors,threshold_q,min_cluster_size,min_samples):
def cluster_nuclei(filename,sample_size,n_neighbors,threshold_q,auto_open):
    df = pd.read_pickle(filename)
    if sample_size > 0 and sample_size < df.shape[0]:
        df = df.sample(n=sample_size)
    embedding = df[['x','y','z']].to_numpy()
    
    '''
    Calculate the local curvature of the point cloud embedding
    '''
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='kd_tree').fit(embedding)
    distances, indices = nbrs.kneighbors(embedding)
    eigvals = [LA.eigvalsh(np.cov(embedding[indices[idx,:],:].T)) for idx in range(embedding.shape[0])] #full data

    curvatures = [min(eigvals[idx])/sum(eigvals[idx]) for idx in range(len(eigvals))]

    # Add curvature to the dataframe
    df['curvature'] = curvatures 

    # Find the minima in curvature histrogram
    q1 = np.quantile(curvatures,threshold_q)

    df1 = df[df['curvature'] <= q1] # define the low curvature sector

    min_cluster_size = round(df1.shape[0]/15) # parameter to be adjausted
    min_samples = round(min_cluster_size/15)       # parameter to be adjausted

    clusterer = hdbscan.HDBSCAN(min_samples=min_samples,min_cluster_size=min_cluster_size,gen_min_span_tree=True)
    clusterer.fit(df1.loc[:,('x','y','z')]) 

    clusterer.condensed_tree_.plot(select_clusters=True,
                                   selection_palette=sns.color_palette("Set2",len(clusterer.labels_)))
    plt.savefig(filename+'.tree.png')
    plt.close()
    
    df1['cluster'] = clusterer.labels_    # add cluster id to dataframe
    df1['cluster'] = df1['cluster'].apply(str)   # make cluster id a string
    df1_filtered = df1[df1.cluster != str(-1)] # remove unassigned points

    # expand the clusters to the entire point-cloud
    idx, dist = pairwise_distances_argmin_min(df[['x','y','z']].to_numpy(),df1_filtered[['x','y','z']].to_numpy())
    df['cluster'] = [int(df1_filtered.cluster.iloc[idx[row]])+1 for row in range(df.shape[0])] #add 1 to avoid confusion with background
    df.to_csv(filename+'.csv',index=False)
    
    fig = px.scatter(df1_filtered, x="cx", y="cy",color="cluster",
                         width=800, height=800,
                         color_discrete_sequence=px.colors.qualitative.Set2)
    fig.update_traces(marker=dict(size=5,opacity=1.0))
    fig.write_html(filename+'.spatial_decoration.html', auto_open=auto_open)
    fig.write_image(filename+'.spatial_decoration.png')

    fig = px.scatter_3d(df1_filtered, x="x", y="y", z="z", color="cluster", hover_name="cluster", 
                            color_discrete_sequence=px.colors.qualitative.Set2)
    fig.update_traces(marker=dict(size=3,opacity=0.75),selector=dict(mode='markers'))
    fig.write_html(filename+'.low_curvature_clusters.html', auto_open=auto_open)
    fig.write_image(filename+'.low_curvature_clusters.png')
    return df

In [24]:
#def cluster_nuclei(filename,sample_size,n_neighbors,threshold_q,min_cluster_size,min_samples):
def cluster_nuclei(filename,sample_size,n_neighbors,threshold_q,auto_open):
    df = pd.read_pickle(filename)
    if sample_size > 0 and sample_size < df.shape[0]:
        df = df.sample(n=sample_size)
    embedding = df[['x','y','z']].to_numpy()
    
    '''
    Calculate the local curvature of the point cloud embedding
    '''
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='kd_tree').fit(embedding)
    distances, indices = nbrs.kneighbors(embedding)
    eigvals = [LA.eigvalsh(np.cov(embedding[indices[idx,:],:].T)) for idx in range(embedding.shape[0])] #full data

    curvatures = [min(eigvals[idx])/sum(eigvals[idx]) for idx in range(len(eigvals))]

    # Add curvature to the dataframe
    df['curvature'] = curvatures 

    # Find the minima in curvature histrogram
    q1 = np.quantile(curvatures,threshold_q)

    df1 = df[df['curvature'] <= q1] # define the low curvature sector

    min_cluster_size = round(df1.shape[0]/15) # parameter to be adjausted
    min_samples = round(min_cluster_size/15)       # parameter to be adjausted

    clusterer = hdbscan.HDBSCAN(min_samples=min_samples,min_cluster_size=min_cluster_size,gen_min_span_tree=True)
    clusterer.fit(df1.loc[:,('x','y','z')]) 

    clusterer.condensed_tree_.plot(select_clusters=True,
                                   selection_palette=sns.color_palette("Set2",len(clusterer.labels_)))
    plt.savefig(filename+'.tree.png')
    plt.close()
    df1['cluster'] = clusterer.labels_    # add cluster id to dataframe
    df1['cluster'] = df1['cluster'].apply(str)   # make cluster id a string
    df1_filtered = df1[df1.cluster != str(-1)] # remove unassigned points
    
    # expand the clusters to the entire point-cloud
    idx, dist = pairwise_distances_argmin_min(df[['x','y','z']].to_numpy(),df1_filtered[['x','y','z']].to_numpy())
    df['cluster'] = [int(df1_filtered.cluster.iloc[idx[row]])+1 for row in range(df.shape[0])] #add 1 to avoid confusion with background
    df.to_csv(filename+'.csv',index=False)
    return df

In [27]:
sample_size = 0
n_neighbors = 100
threshold_q = 0.1
auto_open = True
#min_cluster_size = 1000
#min_samples = 500

for filename in glob.glob(r'../pkl/id_13.*.pkl'):
    #df_out = cluster_nuclei(filename,sample_size,n_neighbors,threshold_q,min_cluster_size,min_samples)
    df_out = cluster_nuclei(filename,sample_size,n_neighbors,threshold_q,auto_open=auto_open)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [28]:
df_out.shape

(504854, 14)