In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

import matplotlib.pyplot as plt
%matplotlib inline

algo = "HC"
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = f"{base_path}Pictures/"
dataset_path = f"{base_path}Survey+dataset/"
code_path = f"{base_path}Code/"
results_path = f"{base_path}Code/Data/"
HC_base_path = f"{results_path}{algo}/"
numpy_file_type = ".npy"
image_file_type = ".html"

import sys
sys.path.append(code_path)
np.set_printoptions(precision=5, suppress=True)

# Silohuette analisys

In [4]:
import metrics.silhouette 
import importlib
importlib.reload(metrics.silhouette) 

<module 'metrics.silhouette' from '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/metrics/silhouette.py'>

In [5]:
from sklearn.metrics import silhouette_score#, silhouette_samples
from metrics.silhouette import getSilhouette

# Define plot silhouette

In [6]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly import tools
#import matplotlib.cm as cm

def plotSilhouette(fig, n_clusters, silhouette_avg, sample_silhouette_values, cluster_labels, silhouette_row=1, silhouette_col=1):

  x_lower = 10

  for i in np.unique(cluster_labels):
      # Aggregate the silhouette scores for samples belonging to
      # cluster i, and sort them
      ith_cluster_silhouette_values = \
          sample_silhouette_values[cluster_labels == i]

      ith_cluster_silhouette_values.sort()

      size_cluster_i = ith_cluster_silhouette_values.shape[0]
      x_upper = x_lower + size_cluster_i

      #colors = plt.cm.Spectral(cluster_labels.astype(float) / n_clusters)
      
      filled_area = go.Scatter(x=np.arange(x_lower, x_upper),
                                y=ith_cluster_silhouette_values,
                                mode='lines',
                                name=str(i),
                                #showlegend=True,
                                line=dict(width=0.5,
                                        #color=colors
                                        ),
                                fill='tozeroy')
      fig.add_trace(filled_area, silhouette_row, silhouette_col)
      
      # Compute the new y_lower for next plot
      x_lower = x_upper + 10  # 10 for the 0 samples

  # The 1st subplot is the silhouette plot
  # The silhouette coefficient can range from -1, 1 
  fig.update_yaxes(title_text='The silhouette coefficient values',
                   row=silhouette_row, col=silhouette_col,
                   range=[-1, 1])

  # The (n_clusters+1)*10 is for inserting blank space between silhouette
  # plots of individual clusters, to demarcate them clearly.
  fig.update_xaxes(title_text='Cluster label',
                   row=silhouette_row, col=silhouette_col,
                   #showticklabels=False,
                   range=[0, len(df_DNA) + (n_clusters + 1) * 10])

  # The vertical line for average silhouette score of all the values
  axis_line = go.Scatter(y=[silhouette_avg]*100,
                         x=np.linspace(0, len(df_DNA), 100,),
                         #showlegend=True,
                         name='silhouette avg',
                         mode='lines',
                         line=dict(color="red", dash='dash',
                                   width =1) )

  fig.add_trace(axis_line, silhouette_row, silhouette_col)

  return fig
    

In [11]:
import plotly.express as px # for colors
def plot_silhouette_from_k_values(target_class, distance_matrix, metric, link, k_values, postprocessing, save_path=None, 
                    row_height=300, showFig=False, columnNameCharLimit=27):
  # Customization options
  marker_colors = ['#eb4034', '#346beb', '#32a838', '#ff99ff', '#f5a742']
  fontsize = 8

  #
  titles = []
  for k in k_values:
    titles.append(f"Silhouette for k={k} clusters with {metric}{vars} and {link}")

  rows = len(k_values)
  fig = make_subplots(rows=rows, cols=1,
                      vertical_spacing=0.4/rows,
                      subplot_titles=titles,
                      )
  
  for i in range(rows):
    k = k_values[i]

    HC_path = f"{HC_base_path}{metric}{vars}/{target_class}/{algo}_{metric}_{link}{numpy_file_type}"
    Z = {}
    Z[metric] = np.load(HC_path)
    cluster_labels = fcluster(Z[metric], k, criterion='maxclust')

    silhouette_avg, sample_silhouette_values, cluster_labels = getSilhouette(distance_matrix, cluster_labels, postprocessing)
    print("sample_silhouette_values:", sample_silhouette_values.shape[0])
    titles[i] = f"{titles[i]}, avg sil:{silhouette_avg:.02f}"
    k = np.unique(cluster_labels).shape[0]
    # add silhouette row
    fig = plotSilhouette(fig, 
                        k, 
                        silhouette_avg, sample_silhouette_values, cluster_labels,
                        silhouette_row=i+1) # plotly starts from 1!
    fig.layout.annotations[i].update(text=titles[i])

  # Update layout
  fig.update_layout(
      #title=f"silhouette_avg: {silhouette_avg}",
      height=row_height*rows,
      showlegend=False,
      bargap=0.05,
      font=dict(size=fontsize)
  )
      

  if save_path:
    fig.write_html(save_path)
  if showFig:
    fig.show()

# Run silhouette

In [8]:
from metrics.elbow import getElbows

In [13]:
import os
target_class = "binary"
algo = "HC"
metric = "VDM"
vars_values = [5,6]

postprocessing_values = [False, True]


range_min = 2
range_max = 30
range_n_clusters = range(range_min, range_max)


linkages = ["single", "complete", "average", "weighted"]

for vars in vars_values:
  print(f"Vars: {vars} ...")
  ds_path = f'{dataset_path}4_DNA_{vars}values_normalized.csv'
  df_DNA = pd.read_csv(ds_path)
  
  try:
    del distance_matrix
  except:
    distance_matrix = []
  distance_matrix = np.load(f"{results_path}distance_matrix_{metric}{vars}_{target_class}{numpy_file_type}")

  for link in linkages:
    print(f"Link: {link} ...")
    for postprocessing in postprocessing_values:
      print(f"Postprocessing: {postprocessing} ...")  
      is_postprocessing = "_fix" if postprocessing else ""
      
      
      WSS_path = f"{HC_base_path}{metric}{vars}/{target_class}/WSS/WSS_{metric}_{link}_range({range_min},{range_max}){numpy_file_type}"
      wss = np.load(WSS_path)

      k_values = getElbows(wss, range_min, add_first_k=True)

      folder_path = f"{pictures_path}HC/Silhouette/{metric}{vars}/{target_class}/"
      folder_exists = os.path.isdir(folder_path)
      # If folder doesn't exist, then create it.
      if not folder_exists:
        os.makedirs(folder_path)
        print("created folder : ", folder_path)
      img_path = f"{folder_path}/Silhouette_{link}_{str(k_values)}{is_postprocessing}{image_file_type}"
      plot_silhouette_from_k_values(target_class,distance_matrix, metric, link, k_values, postprocessing, img_path, showFig=False)

Vars: 5 ...
Link: single ...
Postprocessing: False ...



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6813266011032647
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.3631315657428829
sample_silhouette_values: 26605
For n_clusters = 6 The average silhouette_score is : 0.3745207312960272
sample_silhouette_values: 26605
For n_clusters = 14 The average silhouette_score is : 0.19828009038782587
sample_silhouette_values: 26605
For n_clusters = 19 The average silhouette_score is : 0.2019085925209002
sample_silhouette_values: 26605
For n_clusters = 24 The average silhouette_score is : 0.20381776770240165
sample_silhouette_values: 26605
For n_clusters = 28 The average silhouette_score is : 0.20415181435393145
sample_silhouette_values: 26605
Postprocessing: True ...



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6813266011032647
>>> After fix: For n_clusters = 2 (including outliers). The average silhouette_score is : 0.6813266011032647
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.3631315657428829
To be fixed len:  (1,)
While fix: For n_clusters = 4 The average silhouette_score is : 0.40446638281600455
To be fixed len:  (0,)
While fix: For n_clusters = 4 The average silhouette_score is : 0.404668482559434
>>> After fix: For n_clusters = 4 (including outliers). The average silhouette_score is : 0.404668482559434
sample_silhouette_values: 26605
For n_clusters = 6 The average silhouette_score is : 0.3745207312960272
To be fixed len:  (1882,)
While fix: For n_clusters = 6 The average silhouette_score is : 0.34130744586193773
Score lowered, keeping previous iteration labels..
>>> After fix: For n_clusters = 5 (including outliers). The average silhouette_score is : 0.3848132457404899
sample_silhouette_val


divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6813266011032647
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.296415266203789
sample_silhouette_values: 26605
For n_clusters = 6 The average silhouette_score is : 0.23381069014318095
sample_silhouette_values: 26605
For n_clusters = 8 The average silhouette_score is : 0.2240038709928677
sample_silhouette_values: 26605
For n_clusters = 16 The average silhouette_score is : 0.1569737182750499
sample_silhouette_values: 26605
For n_clusters = 19 The average silhouette_score is : 0.13988271016643355
sample_silhouette_values: 26605
For n_clusters = 25 The average silhouette_score is : 0.1932934549375236
sample_silhouette_values: 26605
Postprocessing: True ...



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6813266011032647
>>> After fix: For n_clusters = 2 (including outliers). The average silhouette_score is : 0.6813266011032647
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.296415266203789
To be fixed len:  (341,)
While fix: For n_clusters = 4 The average silhouette_score is : 0.3227645597499335
To be fixed len:  (133,)
While fix: For n_clusters = 4 The average silhouette_score is : 0.3226374829210044
Score lowered, keeping previous iteration labels..
>>> After fix: For n_clusters = 5 (including outliers). The average silhouette_score is : 0.3295702104556006
sample_silhouette_values: 26605
For n_clusters = 6 The average silhouette_score is : 0.23381069014318095
To be fixed len:  (877,)
While fix: For n_clusters = 6 The average silhouette_score is : 0.27972801700826905
To be fixed len:  (640,)
While fix: For n_clusters = 6 The average silhouette_score is : 0.28453100548265803
To be fixed len: 


divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6813266011032647
sample_silhouette_values: 26605
For n_clusters = 3 The average silhouette_score is : 0.43646404653023724
sample_silhouette_values: 26605
For n_clusters = 5 The average silhouette_score is : 0.3734842162553623
sample_silhouette_values: 26605
For n_clusters = 7 The average silhouette_score is : 0.31297782896945553
sample_silhouette_values: 26605
For n_clusters = 9 The average silhouette_score is : 0.2903483127031065
sample_silhouette_values: 26605
For n_clusters = 12 The average silhouette_score is : 0.2610462121552691
sample_silhouette_values: 26605
For n_clusters = 14 The average silhouette_score is : 0.27200204832186886
sample_silhouette_values: 26605
For n_clusters = 20 The average silhouette_score is : 0.2627903763481223
sample_silhouette_values: 26605
For n_clusters = 22 The average silhouette_score is : 0.24448846093994428
sample_silhouette_values: 26605
For n_clusters = 28 The average silhouette_score is : 0.


divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6813266011032647
>>> After fix: For n_clusters = 2 (including outliers). The average silhouette_score is : 0.6813266011032647
sample_silhouette_values: 26605
For n_clusters = 3 The average silhouette_score is : 0.43646404653023724
To be fixed len:  (2501,)
While fix: For n_clusters = 3 The average silhouette_score is : 0.33398112579525113
Score lowered, keeping previous iteration labels..
>>> After fix: For n_clusters = 4 (including outliers). The average silhouette_score is : 0.45264480921881767
sample_silhouette_values: 26605
For n_clusters = 5 The average silhouette_score is : 0.3734842162553623
To be fixed len:  (1882,)
While fix: For n_clusters = 5 The average silhouette_score is : 0.34130744586193773
Score lowered, keeping previous iteration labels..
>>> After fix: For n_clusters = 5 (including outliers). The average silhouette_score is : 0.3848132457404899
sample_silhouette_values: 26605
For n_clusters = 7 The average silhou


divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6813266011032647
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.37304874519344533
sample_silhouette_values: 26605
For n_clusters = 6 The average silhouette_score is : 0.2737191429337443
sample_silhouette_values: 26605
For n_clusters = 8 The average silhouette_score is : 0.259878191435912
sample_silhouette_values: 26605
For n_clusters = 11 The average silhouette_score is : 0.22505845683091733
sample_silhouette_values: 26605
For n_clusters = 15 The average silhouette_score is : 0.22008464061620836
sample_silhouette_values: 26605
For n_clusters = 21 The average silhouette_score is : 0.23606147778691616
sample_silhouette_values: 26605
For n_clusters = 23 The average silhouette_score is : 0.22604605757358348
sample_silhouette_values: 26605
Postprocessing: True ...



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6813266011032647
>>> After fix: For n_clusters = 2 (including outliers). The average silhouette_score is : 0.6813266011032647
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.37304874519344533
To be fixed len:  (1882,)
While fix: For n_clusters = 4 The average silhouette_score is : 0.341085855214051
Score lowered, keeping previous iteration labels..
>>> After fix: For n_clusters = 5 (including outliers). The average silhouette_score is : 0.38437570672334614
sample_silhouette_values: 26605
For n_clusters = 6 The average silhouette_score is : 0.2737191429337443
To be fixed len:  (1447,)
While fix: For n_clusters = 6 The average silhouette_score is : 0.29856515357214564
To be fixed len:  (1182,)
While fix: For n_clusters = 6 The average silhouette_score is : 0.30403794917016863
To be fixed len:  (1014,)
While fix: For n_clusters = 6 The average silhouette_score is : 0.3067965147269962
>>> After fi


divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6737692309603952
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.35437815619912677
sample_silhouette_values: 26605
For n_clusters = 6 The average silhouette_score is : 0.3655375155826871
sample_silhouette_values: 26605
For n_clusters = 15 The average silhouette_score is : 0.19282521250876652
sample_silhouette_values: 26605
For n_clusters = 19 The average silhouette_score is : 0.19643605784670315
sample_silhouette_values: 26605
For n_clusters = 25 The average silhouette_score is : 0.19832274121081458
sample_silhouette_values: 26605
Postprocessing: True ...



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6737692309603952
>>> After fix: For n_clusters = 2 (including outliers). The average silhouette_score is : 0.6737692309603952
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.35437815619912677
To be fixed len:  (1,)
While fix: For n_clusters = 4 The average silhouette_score is : 0.3954368184866342
To be fixed len:  (0,)
While fix: For n_clusters = 4 The average silhouette_score is : 0.3956345924025526
>>> After fix: For n_clusters = 4 (including outliers). The average silhouette_score is : 0.3956345924025526
sample_silhouette_values: 26605
For n_clusters = 6 The average silhouette_score is : 0.3655375155826871
To be fixed len:  (1914,)
While fix: For n_clusters = 6 The average silhouette_score is : 0.332727002419577
Score lowered, keeping previous iteration labels..
>>> After fix: For n_clusters = 5 (including outliers). The average silhouette_score is : 0.37564115495736977
sample_silhouette_va


divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6737692309603952
sample_silhouette_values: 26605
For n_clusters = 3 The average silhouette_score is : 0.28730001643956893
sample_silhouette_values: 26605
For n_clusters = 5 The average silhouette_score is : 0.2366802312815976
sample_silhouette_values: 26605
For n_clusters = 9 The average silhouette_score is : 0.21014345752745428
sample_silhouette_values: 26605
For n_clusters = 20 The average silhouette_score is : 0.1537778137239153
sample_silhouette_values: 26605
For n_clusters = 26 The average silhouette_score is : 0.16746656983300565
sample_silhouette_values: 26605
Postprocessing: True ...



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6737692309603952
>>> After fix: For n_clusters = 2 (including outliers). The average silhouette_score is : 0.6737692309603952
sample_silhouette_values: 26605
For n_clusters = 3 The average silhouette_score is : 0.28730001643956893
To be fixed len:  (198,)
While fix: For n_clusters = 3 The average silhouette_score is : 0.3130475708242065
To be fixed len:  (95,)
While fix: For n_clusters = 3 The average silhouette_score is : 0.31334348286375835
>>> After fix: For n_clusters = 4 (including outliers). The average silhouette_score is : 0.3153894900006675
sample_silhouette_values: 26605
For n_clusters = 5 The average silhouette_score is : 0.2366802312815976
To be fixed len:  (337,)
While fix: For n_clusters = 5 The average silhouette_score is : 0.29431414544788626
To be fixed len:  (170,)
While fix: For n_clusters = 5 The average silhouette_score is : 0.2947578597551766
>>> After fix: For n_clusters = 5 (including outliers). The average 


divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6737692309603952
sample_silhouette_values: 26605
For n_clusters = 3 The average silhouette_score is : 0.4279151603632042
sample_silhouette_values: 26605
For n_clusters = 5 The average silhouette_score is : 0.36452919023776353
sample_silhouette_values: 26605
For n_clusters = 7 The average silhouette_score is : 0.28851957851749765
sample_silhouette_values: 26605
For n_clusters = 9 The average silhouette_score is : 0.31483962590597997
sample_silhouette_values: 26605
For n_clusters = 11 The average silhouette_score is : 0.2958244584324967
sample_silhouette_values: 26605
For n_clusters = 16 The average silhouette_score is : 0.2666977088850617
sample_silhouette_values: 26605
For n_clusters = 24 The average silhouette_score is : 0.23787360494478313
sample_silhouette_values: 26605
Postprocessing: True ...



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6737692309603952
>>> After fix: For n_clusters = 2 (including outliers). The average silhouette_score is : 0.6737692309603952
sample_silhouette_values: 26605
For n_clusters = 3 The average silhouette_score is : 0.4279151603632042
To be fixed len:  (2576,)
While fix: For n_clusters = 3 The average silhouette_score is : 0.322295280286836
Score lowered, keeping previous iteration labels..
>>> After fix: For n_clusters = 4 (including outliers). The average silhouette_score is : 0.44417572581524
sample_silhouette_values: 26605
For n_clusters = 5 The average silhouette_score is : 0.36452919023776353
To be fixed len:  (1914,)
While fix: For n_clusters = 5 The average silhouette_score is : 0.332727002419577
Score lowered, keeping previous iteration labels..
>>> After fix: For n_clusters = 5 (including outliers). The average silhouette_score is : 0.37564115495736977
sample_silhouette_values: 26605
For n_clusters = 7 The average silhouette_s


divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6737692309603952
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.36411425750093696
sample_silhouette_values: 26605
For n_clusters = 8 The average silhouette_score is : 0.23674053949231152
sample_silhouette_values: 26605
For n_clusters = 11 The average silhouette_score is : 0.22358185256477803
sample_silhouette_values: 26605
For n_clusters = 13 The average silhouette_score is : 0.22629224214489202
sample_silhouette_values: 26605
For n_clusters = 18 The average silhouette_score is : 0.1895202392163395
sample_silhouette_values: 26605
For n_clusters = 26 The average silhouette_score is : 0.2201404161282192
sample_silhouette_values: 26605
Postprocessing: True ...



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6737692309603952
>>> After fix: For n_clusters = 2 (including outliers). The average silhouette_score is : 0.6737692309603952
sample_silhouette_values: 26605
For n_clusters = 4 The average silhouette_score is : 0.36411425750093696
To be fixed len:  (1914,)
While fix: For n_clusters = 4 The average silhouette_score is : 0.33251790886728266
Score lowered, keeping previous iteration labels..
>>> After fix: For n_clusters = 5 (including outliers). The average silhouette_score is : 0.3752184832367042
sample_silhouette_values: 26605
For n_clusters = 8 The average silhouette_score is : 0.23674053949231152
To be fixed len:  (947,)
While fix: For n_clusters = 8 The average silhouette_score is : 0.29742496438082655
To be fixed len:  (588,)
While fix: For n_clusters = 8 The average silhouette_score is : 0.30224880709224605
To be fixed len:  (288,)
While fix: For n_clusters = 8 The average silhouette_score is : 0.30379770508051995
>>> After fi

In [None]:
metric = "VDM"
vars = 5
link = "average"
postprocessing = False
is_postprocessing = "_fix" if postprocessing else ""
      
WSS_path = f"{HC_base_path}{metric}{vars}/WSS/WSS_{metric}_{link}_range({range_min},{range_max}){numpy_file_type}"
wss = np.load(WSS_path)

k_range = getElbows(wss, range_min)

img_path = f"{pictures_path}HC/Silhouette/{metric}{vars}/Silhouette_{link}_{str(k_range)}{is_postprocessing}{image_file_type}"
plot_silhouette_from_k_values(distance_matrix, metric, link, k_range, postprocessing, img_path, showFig=False)


divide by zero encountered in double_scalars



For n_clusters = 3 The average silhouette_score is : 0.4319858845525847
sample_silhouette_values: 26605
For n_clusters = 5 The average silhouette_score is : 0.369350979505518
sample_silhouette_values: 26605
For n_clusters = 7 The average silhouette_score is : 0.30984640384933915
sample_silhouette_values: 26605
For n_clusters = 9 The average silhouette_score is : 0.2859367022736226
sample_silhouette_values: 26605
For n_clusters = 12 The average silhouette_score is : 0.257256769179167
sample_silhouette_values: 26605
For n_clusters = 14 The average silhouette_score is : 0.26739220290238574
sample_silhouette_values: 26605
For n_clusters = 20 The average silhouette_score is : 0.25888112500312166
sample_silhouette_values: 26605
For n_clusters = 22 The average silhouette_score is : 0.241103305410257
sample_silhouette_values: 26605
For n_clusters = 28 The average silhouette_score is : 0.217167281129417
sample_silhouette_values: 26605
For n_clusters = 31 The average silhouette_score is : 0.2386