In [3]:
# Mount Google Drive inside the Colab runtime; every project path below lives under /content/drive.
# force_remount=True is passed so the mount is redone even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import cdist, pdist

import matplotlib.pyplot as plt
%matplotlib inline

# Algorithm tag; used as a directory name and as a file-name prefix for saved linkage matrices.
algo = "HC"
# Project root on the mounted Google Drive; every other path is derived from it.
base_path = "/content/drive/MyDrive/MIDA2/IntelligentMobilityProject/"
pictures_path = f"{base_path}Pictures/"
dataset_path = f"{base_path}Survey+dataset/"
code_path = f"{base_path}Code/"
results_path = f"{base_path}Code/Data/"
# Directory holding the results of the selected algorithm (…/Code/Data/HC/).
HC_base_path = f"{results_path}{algo}/"
# File extensions appended when building result file names.
numpy_file_type = ".npy"
image_file_type = ".html"

import sys
# Make modules stored under the project's Code/ directory importable
# (presumably where the `metrics` package imported later lives — verify).
sys.path.append(code_path)
# Print arrays with 5 decimals and without scientific notation.
np.set_printoptions(precision=5, suppress=True)

# Merge cuts

In [5]:
# Raise the summarization threshold to the largest int so printed arrays are never elided with "...".
np.set_printoptions(threshold=sys.maxsize)

In [6]:
# Distance-metric tag; part of the linkage and distance-matrix file names.
metric = "VDM"
# Numeric tag appended to the metric in paths (e.g. "VDM6") and used in the dataset file name.
# NOTE(review): this name shadows the builtin `vars`; other cells depend on it, so it is kept.
vars = 6
# Linkage-method tag selecting which saved linkage file is loaded.
link = "complete"
# Maximum number of flat clusters requested from fcluster (criterion='maxclust').
n_clusters = 5
print(metric, vars, link)

VDM 6 complete


In [7]:
# Load the saved linkage matrix for the selected metric/vars/link combination,
# keep it in a dict keyed by metric, and cut it into at most `n_clusters` flat clusters.
HC_path = f"{HC_base_path}{metric}{vars}/{algo}_{metric}_{link}{numpy_file_type}"
Z = {metric: np.load(HC_path)}
cluster_labels = fcluster(Z[metric], t=n_clusters, criterion='maxclust')

In [8]:
# Boolean mask over the linkage rows whose merge distance (column 2) is strictly positive.
non_zero_distance_merges = Z[metric][:,2] > 0
#print(Z[metric][non_zero_distance_merges])
# Only the shape is printed: (number of positive-distance merges, 4 linkage columns).
print(Z[metric][non_zero_distance_merges].shape)

(4982, 4)


# Merge cuts

In [9]:
def get_split_cluster_labels(Z, k, split_th=200):
  """Return the labels of the clusters that appear when going from k-1 to k clusters.

  The dendrogram ``Z`` is cut twice with ``fcluster(..., criterion='maxclust')``,
  once with ``k - 1`` and once with ``k``. The per-label sizes of the two cuts are
  walked side by side: wherever a size in the ``k`` cut differs from the size of
  the aligned cluster in the ``k - 1`` cut, that ``k``-cut cluster is recorded as
  a piece of a split parent. Among the recorded pieces, only those whose parent
  has the largest size are kept, and of those only the pieces with more than
  ``split_th`` members are returned.

  The walk relies on both cuts numbering their clusters in the same order, so
  that the pieces of a split parent occupy consecutive labels in the ``k`` cut.

  Args:
    Z: linkage matrix accepted by ``scipy.cluster.hierarchy.fcluster``.
    k: number of clusters of the finer cut; the coarser cut uses ``k - 1``.
    split_th: a piece is returned only if its size is strictly greater than this.

  Returns:
    1-D integer array with the labels (in the ``k`` cut) of the selected pieces.
    The array is empty when the two cuts are identical (e.g. tied merge heights
    make ``maxclust`` return the same partition) or when no piece is large enough.

  Raises:
    ValueError: if the sizes of consecutive pieces overshoot the size of the
      parent cluster they are being matched against.
  """
  cluster_labels_k1 = fcluster(Z, k - 1, criterion='maxclust')
  cluster_labels_k2 = fcluster(Z, k, criterion='maxclust')

  _, k1_sizes = np.unique(cluster_labels_k1, return_counts=True)
  k2_labels, k2_sizes = np.unique(cluster_labels_k2, return_counts=True)

  split_sizes = []
  k2_splits = []

  # `offset` counts the extra labels the finer cut has consumed so far, keeping
  # index i of the finer cut aligned with index i - offset of the coarser cut.
  offset = 0
  # Members of the current parent already accounted for by its pieces.
  accumulated = 0
  for i in range(len(k2_labels)):
    parent_size = k1_sizes[i - offset]
    if parent_size != k2_sizes[i]:
      split_sizes.append(parent_size)
      k2_splits.append((k2_labels[i], k2_sizes[i]))
      accumulated += k2_sizes[i]
      if accumulated < parent_size:
        # More pieces of the same parent follow: stay on the same coarse cluster.
        offset += 1
      elif accumulated == parent_size:
        # Parent fully covered: start fresh for a possible next split.
        accumulated = 0
      else:
        raise ValueError("split not correct")

  if not split_sizes:
    # Both cuts are the same partition; there is nothing to select and
    # np.max below would fail on an empty array.
    split_cluster_labels = np.array([], dtype=int)
    print(split_cluster_labels)
    return split_cluster_labels

  split_sizes = np.array(split_sizes)
  k2_splits = np.array(k2_splits)
  max_split = np.max(split_sizes)
  k_mask = np.flatnonzero(split_sizes == max_split)
  k2_selected = k2_splits[k_mask]
  print(f'max_split: {max_split}')
  print(k2_selected)
  # Column 0 holds the label, column 1 the size of each selected piece.
  split_cluster_labels = k2_selected[k2_selected[:,1] > split_th,0]
  print(split_cluster_labels)
  return split_cluster_labels

In [10]:
# Scratch check of the "select every entry equal to the maximum" idiom.
k = np.array([1, 2, 2, 2, 1, 0])
k[k == k.max()]


array([2, 2, 2])

In [11]:
def reassign_labels(cluster_labels_k1, cluster_labels_k2):
  """Shift ``cluster_labels_k2`` so none of its values collide with ``cluster_labels_k1``.

  Every label in ``cluster_labels_k2`` is increased by ``max(cluster_labels_k1) + 1``.
  The shift is applied to a copy, so the array passed in by the caller is left
  untouched; the copy keeps the dtype of ``cluster_labels_k2``.

  Args:
    cluster_labels_k1: array of labels already in use.
    cluster_labels_k2: array of labels to be moved above the ones in use.

  Returns:
    New array with the shifted ``cluster_labels_k2`` values.
  """
  first_unassigned_value = np.max(cluster_labels_k1) + 1
  print(f'first_unassigned_value: {first_unassigned_value}')

  shifted_labels = np.array(cluster_labels_k2, copy=True)
  shifted_labels += first_unassigned_value

  return shifted_labels

def merge_multiple_cuts(Z, k_values, split_th=200):
  """Combine several flat cuts of the dendrogram ``Z`` into one labelling.

  The labelling starts from the ``maxclust`` cut at ``k_values[0]``. For each
  following value, the clusters selected by ``get_split_cluster_labels`` in
  that finer cut overwrite the current labels of their members, after being
  shifted by ``reassign_labels`` above the labels already in use.

  Args:
    Z: linkage matrix accepted by ``fcluster``.
    k_values: sequence of cluster counts; the first one gives the base cut.
    split_th: size threshold forwarded to ``get_split_cluster_labels``.

  Returns:
    Array of merged cluster labels, one per observation.
  """
  base_k = k_values[0]
  merged_labels = fcluster(Z, base_k, criterion='maxclust')

  for finer_k in k_values[1:]:
    adopted_labels = get_split_cluster_labels(Z, k=finer_k, split_th=split_th)
    finer_labels = fcluster(Z, finer_k, criterion='maxclust')

    print(f"k1({base_k}):", np.unique(merged_labels))
    print(f"k2({finer_k}):", adopted_labels)

    # Observations that belong to one of the adopted clusters of the finer cut.
    adopt_mask = np.isin(finer_labels, adopted_labels)
    print(np.unique(merged_labels[adopt_mask]), np.unique(finer_labels[adopt_mask]))

    # Move the finer labels above every label currently used before copying them in.
    finer_labels = reassign_labels(merged_labels, finer_labels)
    print(np.unique(merged_labels[adopt_mask]), np.unique(finer_labels[adopt_mask]))

    merged_labels[adopt_mask] = finer_labels[adopt_mask]

  return merged_labels

In [12]:
k_values = [4]

merged_cuts = merge_multiple_cuts(Z[metric], k_values)
print(np.unique(merged_cuts, return_counts=True))

(array([1, 2, 3, 4], dtype=int32), array([   90,  1396, 12363, 12756]))


# Plot silhouette

In [13]:
# Load the normalized DNA dataset that matches the selected `vars` tag.
ds_path = f'{dataset_path}4_DNA_{vars}values_normalized.csv'
df_DNA = pd.read_csv(ds_path)

# Drop a matrix left over from a previous run of this cell before loading again
# (presumably so the old and new copies are not both held in memory — verify).
# On a fresh kernel the name does not exist yet, which is the only error ignored.
try:
    del distance_matrix
except NameError:
    pass
distance_matrix = np.load(f"{results_path}distance_matrix_{metric}{vars}{numpy_file_type}")

In [14]:
from metrics.silhouette import plotSilhouette, getSilhouette
from plotly.subplots import make_subplots

ImportError: ignored

In [None]:
# import metrics.silhouette
# import importlib
# importlib.reload(metrics.silhouette) 

In [None]:
from metrics.silhouette import silhouette_samples, fixSilhouette, getSilhouetteAvg, setOutliers

def _show_silhouette_iteration(iteration, silhouette_avg, sample_silhouette_values, cluster_labels):
    """Build and display a one-panel silhouette figure titled "Iteration <iteration>".

    Reads the notebook-level ``df_DNA`` dataframe, which is passed to
    ``plotSilhouette`` as ``df``.
    """
    sil_title = f"Iteration {iteration}"
    sil_fig = make_subplots(rows=1, cols=1, subplot_titles=[sil_title])
    sil_fig = plotSilhouette(df=df_DNA,
                            fig=sil_fig,
                            silhouette_avg=silhouette_avg,
                            sample_silhouette_values=sample_silhouette_values,
                            cluster_labels=cluster_labels,
                            silhouette_row=1, silhouette_col=1)
    sil_fig.show()


def getSilhouette(distance_matrix, cluster_labels, postprocessing=False, cluster_size_threshold=200):
    """Compute, plot and optionally post-process the silhouette of a clustering.

    The per-sample silhouette values are computed from the precomputed
    ``distance_matrix`` and shown as "Iteration 0". With ``postprocessing``
    enabled, ``fixSilhouette`` is applied repeatedly while some sample has a
    negative silhouette value; each accepted iteration is plotted. The loop
    stops as soon as the average improves by less than 1% of the previous
    average, and if the average actually dropped, the labels and sample values
    of the previous iteration are restored. Afterwards ``setOutliers`` is
    applied and the silhouette is recomputed on the resulting labels.

    NOTE(review): ``cluster_labels`` is never reassigned inside the loop, so
    ``fixSilhouette`` presumably updates it in place — confirm against
    ``metrics.silhouette``.
    NOTE(review): a ``getSilhouette`` is also imported from ``metrics.silhouette``
    earlier in the notebook; this definition shadows it.

    Args:
        distance_matrix: precomputed pairwise distance matrix.
        cluster_labels: cluster label of every sample.
        postprocessing: when truthy, run the fixing loop and the outlier step.
        cluster_size_threshold: forwarded to ``setOutliers``.

    Returns:
        Tuple ``(silhouette_avg, sample_silhouette_values, cluster_labels)``.
    """
    # Compute the silhouette scores for each sample
    sample_silhouette_values, clust_dists, _ = silhouette_samples(distance_matrix, cluster_labels, metric="precomputed")

    # Average over all samples: a single figure for the density and separation
    # of the formed clusters.
    silhouette_avg = np.average(sample_silhouette_values)
    print("For n_clusters =", len(np.unique(cluster_labels)),
          "The average silhouette_score is :", silhouette_avg)

    iteration = 0
    _show_silhouette_iteration(iteration, silhouette_avg, sample_silhouette_values, cluster_labels)
    iteration += 1

    if postprocessing:
        # State of the last accepted iteration, restored if a later one lowers the score.
        prev_fix_score = 0.0
        prev_cluster_labels = np.copy(cluster_labels)
        prev_sample_silhouette_values = np.copy(sample_silhouette_values)
        while np.any(sample_silhouette_values<0.0):
            sample_silhouette_values, _, small_clusters_mask = fixSilhouette(distance_matrix, 
                cluster_labels, sample_silhouette_values, clust_dists, metric="precomputed")
            silhouette_avg = getSilhouetteAvg(sample_silhouette_values, cluster_labels, small_clusters_mask)

            indices = np.where(sample_silhouette_values<0.0)[0]
            print("To be fixed len: ", indices.shape)
            print("While fix: For n_clusters =", len(np.unique(cluster_labels)),
              "The average silhouette_score is :", silhouette_avg)
            print(np.unique(cluster_labels))

            # Stop when the gain is below 1% of the previously accepted average.
            if silhouette_avg - prev_fix_score < 0.01*prev_fix_score:
                print("!!! Silhouette would have stopped here")
                if silhouette_avg < prev_fix_score:
                    print("Score lowered, keeping previous iteration labels..")
                    cluster_labels = prev_cluster_labels
                    sample_silhouette_values = prev_sample_silhouette_values
                break

            prev_fix_score = silhouette_avg
            prev_cluster_labels = np.copy(cluster_labels)
            prev_sample_silhouette_values = np.copy(sample_silhouette_values)

            _show_silhouette_iteration(iteration, silhouette_avg, sample_silhouette_values, cluster_labels)
            iteration += 1

        # Set outliers
        cluster_labels = setOutliers(cluster_labels, sample_silhouette_values, cluster_size_threshold)
        # Recompute the silhouette scores on the labels that include the outliers
        sample_silhouette_values, _, small_clusters_mask = silhouette_samples(distance_matrix, cluster_labels, metric="precomputed")
        silhouette_avg = getSilhouetteAvg(sample_silhouette_values, cluster_labels, small_clusters_mask)
        print(">>> After fix: For n_clusters =", len(np.unique(cluster_labels)), "(including outliers)",
          "The average silhouette_score is :", silhouette_avg)

    return silhouette_avg, sample_silhouette_values, cluster_labels

In [None]:
from metrics.elbow import getElbows

In [68]:
# Parameters for the next run: no silhouette post-processing, single linkage, one cut at k=200.
postprocessing = False
# File-name suffix marking post-processed results ("" when post-processing is off).
is_postprocessing = "_fix" if postprocessing else ""
link = "single"
# Cluster counts whose cuts are merged by merge_multiple_cuts; the first one is the base cut.
k_values = [200]
# Size threshold passed both to merge_multiple_cuts (split_th) and to getSilhouette.
cluster_size_threshold = 200
print(metric, vars, link, "postprocessing:", postprocessing, '; k_values:', k_values)

VDM 6 single postprocessing: False ; k_values: [200]


In [None]:
# getElbows is imported from the project module metrics.elbow.
# NOTE(review): what it returns is not visible in this notebook — confirm before relying on `elbows`.
elbows = getElbows()

In [76]:

# Load the saved linkage matrix for the currently selected metric/vars/link combination.
HC_path = f"{HC_base_path}{metric}{vars}/{algo}_{metric}_{link}{numpy_file_type}"
Z = {metric: np.load(HC_path)}

# Labels come from merging the cuts listed in k_values. The plain n_clusters cut
# that used to be computed here was overwritten immediately, so it is not computed.
cluster_labels = merge_multiple_cuts(Z[metric], k_values, split_th=cluster_size_threshold)
silhouette_avg, sample_silhouette_values, cluster_labels = getSilhouette(
    distance_matrix, cluster_labels, postprocessing, 
    cluster_size_threshold=cluster_size_threshold)

# Final silhouette plot of the labels returned by getSilhouette.
sil_title = f"Cuts with k={k_values} clusters, {metric}{vars} and link {link}, with silhouette average: {silhouette_avg}"
sil_fig = make_subplots(rows=1, cols=1, subplot_titles=[sil_title])
sil_fig = plotSilhouette(df=df_DNA,
                         fig=sil_fig,
                         silhouette_avg=silhouette_avg,
                         sample_silhouette_values=sample_silhouette_values,
                         cluster_labels=cluster_labels,
                         silhouette_row=1, silhouette_col=1)
sil_fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [83]:
# Parameters for the next run: silhouette post-processing on, single linkage, several merged cuts.
postprocessing = True
# File-name suffix marking post-processed results ("_fix" when post-processing is on).
is_postprocessing = "_fix" if postprocessing else ""
link = "single"
# Cluster counts whose cuts are merged by merge_multiple_cuts; the first one is the base cut.
k_values = [2,4,6,15,41,59,124,154,201]
print(metric, vars, link, "postprocessing:", postprocessing, '; k_values:', k_values)

VDM 6 single postprocessing: True ; k_values: [2, 4, 6, 15, 41, 59, 124, 154, 201]


In [86]:
# Load the saved linkage matrix for the currently selected metric/vars/link combination.
HC_path = f"{HC_base_path}{metric}{vars}/{algo}_{metric}_{link}{numpy_file_type}"
Z = {metric: np.load(HC_path)}

# Labels come from merging the cuts listed in k_values (default split threshold).
# The plain n_clusters cut that used to be computed here was overwritten
# immediately, so it is not computed.
cluster_labels = merge_multiple_cuts(Z[metric], k_values)
silhouette_avg, sample_silhouette_values, cluster_labels = getSilhouette(distance_matrix, cluster_labels, postprocessing)

# Final silhouette plot of the labels returned by getSilhouette.
sil_title = f"Cuts with k={k_values} clusters, {metric}{vars} and link {link}, with silhouette average: {silhouette_avg}"
sil_fig = make_subplots(rows=1, cols=1, subplot_titles=[sil_title])
sil_fig = plotSilhouette(df=df_DNA,
                         fig=sil_fig,
                         silhouette_avg=silhouette_avg,
                         sample_silhouette_values=sample_silhouette_values,
                         cluster_labels=cluster_labels,
                         silhouette_row=1, silhouette_col=1)
sil_fig.show()

Output hidden; open in https://colab.research.google.com to view.

# Good results
## VDM5

|Link|Cuts|Silhouette avg (no postprocessing)|Silhouette avg (postprocessing)|Outliers|
|------|-----|-----|-----|----|
|single|[2,4,6,33,49]|  0.367| 0.388 |3885|
|average|[2,3,5,7,9,12]|  0.28| 0.33 |639|
|complete|[2,4]|  0.376| 0.41 |0|
|complete|[2,4,6]|  0.378| 0.403 |3807|
|complete|[2,4,6,8]|  0.379| 0.403 |3807|
