In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

import matplotlib.pyplot as plt
%matplotlib inline

algo = "HC"
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = f"{base_path}Pictures/"
dataset_path = f"{base_path}Survey+dataset/"
code_path = f"{base_path}Code/"
results_path = f"{base_path}Code/Data/"
HC_base_path = f"{results_path}{algo}/"
numpy_file_type = ".npy"
image_file_type = ".html"

import sys
sys.path.append(code_path)
np.set_printoptions(precision=5, suppress=True)

# Silohuette analisys

In [None]:
import metrics.silhouette 
import importlib
importlib.reload(metrics.silhouette) 

<module 'metrics.silhouette' from '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/metrics/silhouette.py'>

In [3]:
from sklearn.metrics import silhouette_score#, silhouette_samples
from metrics.silhouette import getSilhouette

# Define plot silhouette

In [4]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import tools
#import matplotlib.cm as cm

def plotSilhouette(fig, n_clusters, silhouette_avg, sample_silhouette_values, cluster_labels, silhouette_row=1, silhouette_col=1):

  x_lower = 10

  for i in np.unique(cluster_labels):
      # Aggregate the silhouette scores for samples belonging to
      # cluster i, and sort them
      ith_cluster_silhouette_values = \
          sample_silhouette_values[cluster_labels == i]

      ith_cluster_silhouette_values.sort()

      size_cluster_i = ith_cluster_silhouette_values.shape[0]
      x_upper = x_lower + size_cluster_i

      #colors = plt.cm.Spectral(cluster_labels.astype(float) / n_clusters)
      
      filled_area = go.Scatter(x=np.arange(x_lower, x_upper),
                                y=ith_cluster_silhouette_values,
                                mode='lines',
                                name=str(i),
                                #showlegend=True,
                                line=dict(width=0.5,
                                        #color=colors
                                        ),
                                fill='tozeroy')
      fig.add_trace(filled_area, silhouette_row, silhouette_col)
      
      # Compute the new y_lower for next plot
      x_lower = x_upper + 10  # 10 for the 0 samples

  # The 1st subplot is the silhouette plot
  # The silhouette coefficient can range from -1, 1 
  fig.update_yaxes(title_text='The silhouette coefficient values',
                   row=silhouette_row, col=silhouette_col,
                   range=[-1, 1])

  # The (n_clusters+1)*10 is for inserting blank space between silhouette
  # plots of individual clusters, to demarcate them clearly.
  fig.update_xaxes(title_text='Cluster label',
                   row=silhouette_row, col=silhouette_col,
                   #showticklabels=False,
                   range=[0, len(df_DNA) + (n_clusters + 1) * 10])

  # The vertical line for average silhouette score of all the values
  axis_line = go.Scatter(y=[silhouette_avg]*100,
                         x=np.linspace(0, len(df_DNA), 100,),
                         #showlegend=True,
                         name='silhouette avg',
                         mode='lines',
                         line=dict(color="red", dash='dash',
                                   width =1) )

  fig.add_trace(axis_line, silhouette_row, silhouette_col)

  return fig
    

In [5]:
import plotly.express as px # for colors
def plot_silhouette_from_k_values(distance_matrix, metric, link, k_values, postprocessing, save_path=None, 
                    row_height=300, showFig=False, columnNameCharLimit=27):
  # Customization options
  marker_colors = ['#eb4034', '#346beb', '#32a838', '#ff99ff', '#f5a742']
  fontsize = 8

  #
  titles = []
  for k in k_values:
    titles.append(f"Silhouette for k={k} clusters with {metric}{vars} and {link}")

  rows = len(k_values)
  fig = make_subplots(rows=rows, cols=1,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )
  
  for i in range(rows):
    k = k_values[i]

    HC_path = f"{HC_base_path}{metric}{vars}/{algo}_{metric}_{link}{numpy_file_type}"
    Z = {}
    Z[metric] = np.load(HC_path)
    cluster_labels = fcluster(Z[metric], k, criterion='maxclust')

    silhouette_avg, sample_silhouette_values, cluster_labels = getSilhouette(distance_matrix, cluster_labels, postprocessing)
    print("sample_silhouette_values:", sample_silhouette_values.shape[0])
    titles[i] = f"{titles[i]}, avg sil:{silhouette_avg:.02f}"
    k = np.unique(cluster_labels).shape[0]
    # add silhouette row
    fig = plotSilhouette(fig, 
                        k, 
                        silhouette_avg, sample_silhouette_values, cluster_labels,
                        silhouette_row=i+1) # plotly starts from 1!
    fig.layout.annotations[i].update(text=titles[i])

  # Update layout
  fig.update_layout(
      #title=f"silhouette_avg: {silhouette_avg}",
      height=row_height*rows,
      showlegend=False,
      bargap=0.05,
      font=dict(size=fontsize)
  )
      

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

# Run silhouette

In [6]:
from metrics.elbow import getElbows

In [8]:
algo = "HC"
metric = "VDM"
vars_values = [6]

postprocessing_values = [False]


range_min = 2
range_max = 600
range_n_clusters = range(range_min, range_max)


linkages = ["single"]#, "complete", "average", "weighted"]

for vars in vars_values:
  print(f"Vars: {vars} ...")
  ds_path = f'{dataset_path}4_DNA_{vars}values_normalized.csv'
  df_DNA = pd.read_csv(ds_path)
  
  try:
    del distance_matrix
  except:
    distance_matrix = []
  distance_matrix = np.load(f"{results_path}distance_matrix_{metric}{vars}{numpy_file_type}")

  for link in linkages:
    print(f"Link: {link} ...")
    for postprocessing in postprocessing_values:
      print(f"Postprocessing: {postprocessing} ...")  
      is_postprocessing = "_fix" if postprocessing else ""
      
      WSS_path = f"{HC_base_path}{metric}{vars}/WSS/WSS_{metric}_{link}_range({range_min},{range_max}){numpy_file_type}"
      wss = np.load(WSS_path)

      k_values = getElbows(wss, range_min, add_first_k=True)

      img_path = f"{pictures_path}HC/Silhouette/{metric}{vars}/Silhouette_{link}_{str(k_values)}{is_postprocessing}{image_file_type}"
      plot_silhouette_from_k_values(distance_matrix, metric, link, k_values, postprocessing, img_path, showFig=False)

Vars: 6 ...
Link: single ...
Postprocessing: False ...



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6781566428354224
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.35924298160189577
sample_silhouette_values: 26605
For n_clusters = 6 The average silhouette_score is : 0.37037372514994626
sample_silhouette_values: 26605
For n_clusters = 15 The average silhouette_score is : 0.1958254683462254
sample_silhouette_values: 26605
For n_clusters = 19 The average silhouette_score is : 0.19949662889128839
sample_silhouette_values: 26605
For n_clusters = 25 The average silhouette_score is : 0.2014177625353179
sample_silhouette_values: 26605
For n_clusters = 29 The average silhouette_score is : 0.20166060194835128
sample_silhouette_values: 26605
For n_clusters = 31 The average silhouette_score is : 0.2017420854704587
sample_silhouette_values: 26605
For n_clusters = 41 The average silhouette_score is : 0.19852862355281273
sample_silhouette_values: 26605
For n_clusters = 59 The average silhouette_score is :

In [None]:
metric = "VDM"
vars = 5
link = "average"
postprocessing = False
is_postprocessing = "_fix" if postprocessing else ""
      
WSS_path = f"{HC_base_path}{metric}{vars}/WSS/WSS_{metric}_{link}_range({range_min},{range_max}){numpy_file_type}"
wss = np.load(WSS_path)

k_range = getElbows(wss, range_min)

img_path = f"{pictures_path}HC/Silhouette/{metric}{vars}/Silhouette_{link}_{str(k_range)}{is_postprocessing}{image_file_type}"
plot_silhouette_from_k_values(distance_matrix, metric, link, k_range, postprocessing, img_path, showFig=False)


divide by zero encountered in double_scalars



For n_clusters = 3 The average silhouette_score is : 0.4319858845525847
sample_silhouette_values: 26605
For n_clusters = 5 The average silhouette_score is : 0.369350979505518
sample_silhouette_values: 26605
For n_clusters = 7 The average silhouette_score is : 0.30984640384933915
sample_silhouette_values: 26605
For n_clusters = 9 The average silhouette_score is : 0.2859367022736226
sample_silhouette_values: 26605
For n_clusters = 12 The average silhouette_score is : 0.257256769179167
sample_silhouette_values: 26605
For n_clusters = 14 The average silhouette_score is : 0.26739220290238574
sample_silhouette_values: 26605
For n_clusters = 20 The average silhouette_score is : 0.25888112500312166
sample_silhouette_values: 26605
For n_clusters = 22 The average silhouette_score is : 0.241103305410257
sample_silhouette_values: 26605
For n_clusters = 28 The average silhouette_score is : 0.217167281129417
sample_silhouette_values: 26605
For n_clusters = 31 The average silhouette_score is : 0.2386