In [1]:
# Mount Google Drive so the project folder under /content/drive is reachable
# from this Colab runtime. force_remount=True is passed so the mount is
# redone even when the drive is already mounted in the current session.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

import matplotlib.pyplot as plt
%matplotlib inline

# Tag of the clustering algorithm; used both as a folder and a file-name prefix.
algo = "HC"

# Root of the project on the mounted Drive; every other path hangs off it.
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = base_path + "Pictures/"
dataset_path = base_path + "Survey+dataset/"
code_path = base_path + "Code/"

# Computed artefacts live inside the code folder, one sub-folder per algorithm.
results_path = code_path + "Data/"
HC_base_path = results_path + algo + "/"

# File extensions for saved arrays and saved figures.
numpy_file_type = ".npy"
image_file_type = ".html"

import sys

# Make the project-local packages stored in the Code folder importable.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same directory to sys.path again and again.
if code_path not in sys.path:
  sys.path.append(code_path)

# Print floats with 5 decimals and without scientific notation.
np.set_printoptions(precision=5, suppress=True)

# Load linkage matrix

In [3]:
# Raise the summarization threshold to the largest possible value so NumPy
# prints arrays in full instead of abbreviating them with "...".
# NOTE(review): this makes any printed large array flood the cell output.
np.set_printoptions(threshold=sys.maxsize)

In [4]:
# Which stored linkage to inspect: distance metric, dataset variant and
# linkage method, plus the number of flat clusters to cut it into.
# NOTE: `vars` shadows the Python builtin of the same name; the name is kept
# because the other cells of this notebook read it.
metric, vars, link = "VDM", 5, "complete"
n_clusters = 5
print(f"{metric} {vars} {link}")

VDM 5 complete


In [5]:
# Location of the stored linkage for the selected metric / variant / method.
HC_path = HC_base_path + f"{metric}{vars}/" + f"{algo}_{metric}_{link}" + numpy_file_type

# Linkage matrices are kept in a dict keyed by metric name.
Z = {metric: np.load(HC_path)}

# Flat cluster assignment with at most `n_clusters` clusters.
cluster_labels = fcluster(Z[metric], t=n_clusters, criterion='maxclust')

In [6]:
# Boolean mask over the rows of the loaded matrix whose third column is
# strictly positive (in a SciPy linkage matrix that column is the merge distance).
non_zero_distance_merges = Z[metric][:,2] > 0
# Only the shape is printed: the first entry is the number of rows selected by the mask.
print(Z[metric][non_zero_distance_merges].shape)

(3292, 4)


# Merge cuts

In [7]:
# Import the project-local merge_cuts module and reload it explicitly, so a
# module object already cached in this kernel is re-executed from its source file.
# NOTE(review): the name `postprocessing` bound here (the package) is later
# overwritten by a boolean flag of the same name further down the notebook.
import postprocessing.merge_cuts
import importlib
importlib.reload(postprocessing.merge_cuts)

<module 'postprocessing.merge_cuts' from '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/postprocessing/merge_cuts.py'>

In [8]:
from postprocessing.merge_cuts import merge_multiple_cuts

In [9]:
# Cut levels (numbers of clusters) to be combined into one labelling.
k_values = [4, 6, 8]
merged_cuts = merge_multiple_cuts(Z[metric], k_values)

# Show the distinct merged labels and how many samples carry each one.
merged_cut_sizes = np.unique(merged_cuts, return_counts=True)
print(merged_cut_sizes)

max_split: 14703
[[   5 5734]
 [   6 8969]]
[5 6]
k1(4): [1 2 3 4]
k2(6): [5 6]
[4] [5 6]
[4] [10 11]
max_split: 9902
[[   5 3279]
 [   6 6623]]
[5 6]
k1(4): [ 1  2  3 10 11]
k2(8): [5 6]
[3] [5 6]
[3] [17 18]
(array([ 1,  2, 10, 11, 17, 18], dtype=int32), array([1486,  514, 5734, 8969, 3279, 6623]))


# Plot silhouette

In [10]:
# Metric and dataset variant used for the silhouette section.
# NOTE: `vars` shadows the Python builtin; kept because later cells read it.
metric, vars = "VDM", 6

In [11]:
# Read the normalized dataset that matches the selected number of values.
ds_path = dataset_path + f"4_DNA_{vars}values_normalized.csv"
df_DNA = pd.read_csv(ds_path)

In [12]:
# Import the project-local silhouette modules (metric computation and plotting)
# and reload both explicitly, so module objects already cached in this kernel
# are re-executed from their source files.
import metrics.silhouette
import plots.silhouette
import importlib
importlib.reload(metrics.silhouette)
importlib.reload(plots.silhouette)

<module 'plots.silhouette' from '/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/Code/plots/silhouette.py'>

In [13]:
from metrics.silhouette import getSilhouette
from plots.silhouette import plotSilhouette
from plotly.subplots import make_subplots
from metrics.elbow import getElbows

In [14]:
# Linkage method and postprocessing switch for the silhouette plots.
link, postprocessing = "average", False

# Suffix appended to output names when postprocessing is enabled.
if postprocessing:
    is_postprocessing = "_fix"
else:
    is_postprocessing = ""

# Range of cluster counts to consider.
range_min, range_max = 2, 100

print(metric, vars, link, "postprocessing:", postprocessing)

VDM 6 average postprocessing: False


In [15]:
from plots.silhouette import plot_silhouette_merge_k_values

In [16]:
# Sweep over dataset variants, linkage methods and postprocessing modes; for
# each combination build the merged-cut silhouette plot and save it under the
# pictures folder.
# NOTE: the loop variables (`vars`, `link`, `postprocessing`) and the names
# assigned inside the loop overwrite the notebook-level values set by earlier
# cells; after this cell they hold the values of the last iteration.
vars_values = [5, 6]
linkages = ["single"]
postprocessing_values = [False, True]
range_min = 2
range_max = 600

for vars in vars_values:
  ds_path = f'{dataset_path}4_DNA_{vars}values_normalized.csv'
  df_DNA = pd.read_csv(ds_path)

  # Drop the reference to any previously loaded matrix *before* loading the
  # next one, so two distance matrices are never alive at the same time.
  # A plain rebinding does this without the former bare `except:`, which
  # silently swallowed every exception (KeyboardInterrupt included).
  distance_matrix = None
  distance_matrix = np.load(f"{results_path}distance_matrix_{metric}{vars}.npy")

  for link in linkages:
    # Cut levels are derived from the stored WSS values for this linkage.
    WSS_path = f"{HC_base_path}{metric}{vars}/WSS/WSS_{metric}_{link}_range({range_min},{range_max}){numpy_file_type}"
    wss = np.load(WSS_path)
    k_values = getElbows(wss, range_min, add_first_k=True)

    # Stored linkage for this metric / variant / method, kept in a dict keyed
    # by metric name like in the rest of the notebook.
    HC_path = f"{HC_base_path}{metric}{vars}/{algo}_{metric}_{link}{numpy_file_type}"
    Z = {metric: np.load(HC_path)}

    for postprocessing in postprocessing_values:
      # Postprocessed plots get a "_fix" suffix so both variants are kept.
      is_postprocessing = "_fix" if postprocessing else ""
      save_path = (
        f"{pictures_path}HC/Silhouette/{metric}{vars}/MergedCuts/"
        f"MergedCuts_{metric}_{link}_{str(k_values)}{is_postprocessing}{image_file_type}"
      )
      plot_silhouette_merge_k_values(df_DNA, distance_matrix, Z[metric], metric, link, k_values, postprocessing, save_path=save_path)


divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6858174526302973
max_split: 25119
[[    1   966]
 [    2 24153]]
[1 2]
k1(2): [1 2]
k2(4): [1 2]
[1] [1 2]
[1] [4 5]
For n_clusters = 3 The average silhouette_score is : 0.36768191087525387
max_split: 25119
[[    1   966]
 [    2 24153]]
[1 2]
k1(2): [1 2]
k2(4): [1 2]
[1] [1 2]
[1] [4 5]
max_split: 24153
[[    2 23639]
 [    3   514]]
[2 3]
k1(2): [2 4 5]
k2(6): [2 3]
[5] [2 3]
[5] [8 9]
For n_clusters = 4 The average silhouette_score is : 0.378107366107184
max_split: 25119
[[    1   966]
 [    2 24153]]
[1 2]
k1(2): [1 2]
k2(4): [1 2]
[1] [1 2]
[1] [4 5]
max_split: 24153
[[    2 23639]
 [    3   514]]
[2 3]
k1(2): [2 4 5]
k2(6): [2 3]
[5] [2 3]
[5] [8 9]
max_split: 23639
[[    3  1401]
 [    4 22238]]
[3 4]
k1(2): [2 4 8 9]
k2(14): [3 4]
[8] [3 4]
[8] [13 14]
For n_clusters = 5 The average silhouette_score is : 0.19915978137168913
max_split: 25119
[[    1   966]
 [    2 24153]]
[1 2]
k1(2): [1 2]
k2(4): [1 2]
[1] [1 2]
[1] [4 5]



divide by zero encountered in double_scalars



For n_clusters = 2 The average silhouette_score is : 0.6781566428354224
max_split: 25119
[[    1   966]
 [    2 24153]]
[1 2]
k1(2): [1 2]
k2(4): [1 2]
[1] [1 2]
[1] [4 5]
For n_clusters = 3 The average silhouette_score is : 0.3587142464589318
max_split: 25119
[[    1   966]
 [    2 24153]]
[1 2]
k1(2): [1 2]
k2(4): [1 2]
[1] [1 2]
[1] [4 5]
max_split: 24153
[[    2 23639]
 [    3   514]]
[2 3]
k1(2): [2 4 5]
k2(6): [2 3]
[5] [2 3]
[5] [8 9]
For n_clusters = 4 The average silhouette_score is : 0.36893235108930045
max_split: 25119
[[    1   966]
 [    2 24153]]
[1 2]
k1(2): [1 2]
k2(4): [1 2]
[1] [1 2]
[1] [4 5]
max_split: 24153
[[    2 23639]
 [    3   514]]
[2 3]
k1(2): [2 4 5]
k2(6): [2 3]
[5] [2 3]
[5] [8 9]
max_split: 23639
[[    3  1401]
 [    4 22238]]
[3 4]
k1(2): [2 4 8 9]
k2(15): [3 4]
[8] [3 4]
[8] [13 14]
For n_clusters = 5 The average silhouette_score is : 0.19361276768170615
max_split: 25119
[[    1   966]
 [    2 24153]]
[1 2]
k1(2): [1 2]
k2(4): [1 2]
[1] [1 2]
[1] [4 5]

# Good results


## VDM5

|Link|Cuts|(No postprocessing) Sil Avg|(Postprocessing) Sil Avg|Outliers|
|------|-----|-----|-----|----|
|average|[2,3,5,7,9]|  0.31| 0.36 |131|
|average|[2,3,5,7,9,12]|  0.28| 0.33 |639|
|complete|[2,4]|  0.30| 0.34 |355|
|complete|[2,4,6]|  0.26| 0.33 |685|
|complete|[2,4,6,8]|  0.26| 0.30 |151|
|single|[2,4,6,14,19,24,28,33,49,86]|0.19| 0.32 |2293|
|weighted|[2,4,6,8]|  0.26| 0.33 |1039|
|weighted|[2,4,6,8,11]|  0.24| 0.33 |1480|

## VDM6

|Link|Cuts|(No postprocessing) Sil Avg|(Postprocessing) Sil Avg|Outliers|
|------|-----|-----|-----|----|
|average|[2,3,5,7,9]|  0.31| 0.35 |160|
|complete|[2,3,5]|  0.24| 0.30 |211|
|single|[2,4,6,15,19,25,29,31,41,59,75,85,124]|0.18| 0.31 |2212|
|weighted|[2,4,6,8,11,13]|  0.25| 0.33 |532|