<a href="https://colab.research.google.com/github/AnzorGozalishvili/AnzorGozalishvili/blob/main/notebooks/experiment_4/clusterization_experiment_(compare_different_clusterization_models_using_pycaret).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Eye color clustering

## Introduction

The objective of this analysis is identifying a suitable clustering for eye colors. We will use a dataset of eye images where for each eye we have both the average color of the collarette (eye internal part close to the pupil) and iris, both expressed in the CIELab system. We will perform four different clusterings, by considering:

- either the color of the whole eye (average between collarette and iris) or only the iris color;
- either the CIELab colors in their original values or their scaled (zero mean, unit variance) version.

# Set up

In [1]:
# !mkdir --parent notebooks/experiment4/
# !pip install pandas==1.2.1 \
# xlrd==1.2.0 \
# numpy==1.19.5 \
# opencv-python==4.5.2.54 \
# plotly==4.14.3 \
# colormath==3.0.0 \
# scikit-learn==1.0.1 \

# !pip install scikit-learn-extra
# !pip install -U pycaret
# !pip install --upgrade openpyxl

In [2]:
import sys
import os

import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import cv2
import plotly.io as pio

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import adjusted_rand_score

from colormath.color_objects import sRGBColor,LabColor
from colormath.color_conversions import convert_color

# Work from the experiment folder so the relative dataset and output paths
# used below resolve; skip the move when the current path already contains it.
if os.getcwd().find("notebooks/experiment4") < 0:
    os.chdir('notebooks/experiment4/')

In [3]:
# Turn off jedi-based tab completion for this IPython session.
%config Completer.use_jedi = False

  """Entry point for launching an IPython kernel.


In [4]:
# Output folders (relative to the working directory), one per artefact type.
FIGURES_DIR, CLUSTERS_DIR = "./Figures/", "./Clusters/"
DATASETS_DIR, METRICS_DIR = "./Datasets/", "./Metrics/"

In [5]:
# Create every output folder up front. exist_ok=True keeps the cell safe to
# re-run and avoids the check-then-create race of os.path.exists + os.mkdir.
for _output_dir in (FIGURES_DIR, CLUSTERS_DIR, DATASETS_DIR, METRICS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

In [6]:
# NOTE(review): presumably the target number of clusters; the cells visible in
# this notebook pass literal cluster counts instead of reading it — confirm usage.
NUM_CLUSTERS = 4

# Data Preparation

Let’s now load and prepare our data. We will create two datasets, one considering the whole eye (collarette + iris), the other only the iris

## Helper Functions

In [7]:
def lab_to_hex(l, a, b):
    """Convert a CIELab colour to an sRGB hex string such as ``#332a18``.

    Parameters
    ----------
    l, a, b : float
        CIELab coordinates, interpreted with the 2-degree observer and the
        D65 illuminant.

    Returns
    -------
    str
        ``#rrggbb`` hex code of the converted colour.
    """
    lab = LabColor(lab_l=l, lab_a=a, lab_b=b, observer='2', illuminant='d65')
    rgb = convert_color(color=lab, target_cs=sRGBColor)

    # Lab values outside the sRGB gamut convert to channels below 0 or above 1,
    # which do not format as a valid two-hex-digits-per-channel code. Clamp
    # every channel into [0, 1] before formatting.
    in_gamut = sRGBColor(rgb.clamped_rgb_r, rgb.clamped_rgb_g, rgb.clamped_rgb_b)

    return in_gamut.get_rgb_hex()

In [8]:
def show_lab_scatter_3d(dataframe):
    """Display the samples as a 3-D scatter in Lab space and return the trace.

    Each point sits at its (l, a, b) coordinates and is drawn in the colour
    stored in the ``hex_color`` column. The returned value is the plain trace
    dictionary that was shown.
    """
    marker_style = {'color': dataframe.hex_color.values, 'size': 7}
    trace = {
        'type': 'scatter3d',
        'x': dataframe.l.values,
        'y': dataframe.a.values,
        'z': dataframe.b.values,
        'mode': 'markers',
        'marker': marker_style,
    }

    pio.show(trace)
    return trace

In [9]:
def show_lab_scatter_3d_shapes(dataframe, labels):
    """Display clustered samples in Lab space, one marker symbol per cluster.

    Points are placed at (l, a, b), coloured with ``hex_color`` and annotated
    with the ``Colore_IRISPLEX`` value taken from ``labels`` (matched by
    position, not by index). Symbols are handed out in order of decreasing
    cluster size. Returns the trace dictionary that was shown.
    """
    shapes = ['circle', 'square',  'diamond', 'cross']
    ordered_ids = dataframe.Cluster.value_counts().index.tolist()
    # Once the symbols run out, every remaining cluster shares the last one.
    shapes_map = {
        cluster_id: shapes[min(position, len(shapes) - 1)]
        for position, cluster_id in enumerate(ordered_ids)
    }

    marker_style = {
        'color': dataframe.hex_color.values,
        'size': 7,
        'symbol': dataframe.Cluster.apply(lambda cluster_id: shapes_map[cluster_id]),
    }
    trace = {
        'type': 'scatter3d',
        'x': dataframe.l.values,
        'y': dataframe.a.values,
        'z': dataframe.b.values,
        'text': labels.Colore_IRISPLEX.values,
        'mode': 'markers',
        'marker': marker_style,
    }

    pio.show(trace)
    return trace

## Outer Iris

In [10]:
# Keep only the sample ID and the outer-iris CIELab columns of the workbook.
dataset_outer_iris = pd.read_excel(
    '../../dataset/IrisPlex_20_01_2022_filtered.xlsx'
).loc[:, ['ID', 'Outer.Iris.L', 'Outer.Iris.a', 'Outer.Iris.b']]

In [11]:
# Index the samples by their ID. Guarded and non-mutating, so re-running the
# cell no longer raises KeyError once 'ID' has already become the index.
if dataset_outer_iris.index.name != 'ID':
    dataset_outer_iris = dataset_outer_iris.set_index('ID')

In [12]:
# Shorten the Lab column names. Renaming by name (instead of overwriting the
# whole column list by position) cannot mislabel a column and stays valid when
# the cell is re-run after extra columns such as hex_color have been added.
dataset_outer_iris = dataset_outer_iris.rename(
    columns={'Outer.Iris.L': 'l', 'Outer.Iris.a': 'a', 'Outer.Iris.b': 'b'}
)

In [13]:
# Convert each row's Lab triplet to a hex colour. Only the l/a/b columns are
# passed to lab_to_hex, so the cell still works when it is re-run after
# hex_color exists (the full row would then unpack into four arguments).
dataset_outer_iris['hex_color'] = dataset_outer_iris[['l', 'a', 'b']].apply(
    lambda row: lab_to_hex(*row.values), axis=1
)

In [14]:
# Preview the first three prepared rows.
dataset_outer_iris.head(3)

Unnamed: 0_level_0,l,a,b,hex_color
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FDP001,17.671129,1.004831,13.675901,#332a18
FDP002,22.662753,2.493072,17.055058,#42341d
FDP003,44.268597,-5.288547,8.743492,#666b5a


In [15]:
# Show the outer-iris samples in Lab space and save the same trace as HTML.
pio.write_html(
    fig=show_lab_scatter_3d(dataset_outer_iris),
    file=f"{FIGURES_DIR}dataset_outer_iris.html",
)

### Get Labels

In [16]:
# Attach the manual IrisPlex colour label to every sample (matched on the
# index) and keep only that label column.
labels = pd.merge(
    dataset_outer_iris,
    pd.read_excel('../../dataset/IrisPlex_20_01_2022_filtered.xlsx', index_col=0)[['Colore_IRISPLEX']],
    left_index=True,
    right_index=True,
    how='inner',
).loc[:, ['Colore_IRISPLEX']]

In [17]:
# (rows, columns) of the label frame.
labels.shape

(238, 1)

# Clustering (outer iris)

Run pycaret experiments using different models and on 2 different datasets that we got here

In [18]:
from pycaret.clustering import *


calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.



In [19]:
# Baseline experiment on the Lab values as loaded: normalisation, power
# transformation and PCA are all switched off; hex_color is not a feature.
exp_003 = setup(
    data=dataset_outer_iris,
    ignore_features=['hex_color'],
    preprocess=True,
    normalize=False, normalize_method='zscore',
    transformation=False, transformation_method='yeo-johnson',
    pca=False, pca_method='linear',
    silent=True,
    n_jobs=-1,
    session_id=123,  # experiment seed
)

Unnamed: 0,Description,Value
0,session_id,123
1,Original Data,"(238, 4)"
2,Missing Values,False
3,Numeric Features,3
4,Categorical Features,0
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(238, 3)"
9,CPU Jobs,-1


show all outputs from cell

In [20]:
def try_model(model_name, **kwargs):
    """Fit one pycaret clustering model and return it with its labelled data.

    ``kwargs`` are forwarded to ``create_model``. The fitted estimator and the
    first two labelled rows are printed, then ``evaluate_model`` is called on
    the estimator.

    Returns
    -------
    tuple
        ``(model, results)`` where ``results`` is the output of ``assign_model``.
    """
    fitted = create_model(model_name, **kwargs)
    print(fitted)

    assigned = assign_model(fitted)
    print(assigned.head(2))

    evaluate_model(fitted)

    return fitted, assigned

In [21]:
# Fitted estimators and labelled frames of this experiment, keyed by model id.
models_003, results_003 = {}, {}

# Model ids, listed in the order the models are fitted in the following cells.
MODEL_NAMES = [
    "kmeans", "ap", "meanshift",
    "sc", "hclust", "dbscan",
    "optics", "birch", "kmodes",
]

In [22]:
# K-means with four clusters; keep the fitted model and its labelled frame.
models_003["kmeans"], results_003["kmeans"] = try_model(model_name="kmeans", num_clusters=4)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4066,226.4846,0.8729,0,0,0


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=-1, precompute_distances='deprecated',
       random_state=123, tol=0.0001, verbose=0)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 2
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [23]:
# Affinity propagation with no extra arguments.
models_003["ap"], results_003["ap"] = try_model(model_name="ap")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.309,173.6177,0.9338,0,0,0


AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
                    damping=0.5, max_iter=200, preference=None,
                    random_state='warn', verbose=False)
                l         a          b hex_color     Cluster
ID                                                          
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 16
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 14


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [24]:
# Mean shift with no extra arguments.
models_003["meanshift"], results_003["meanshift"] = try_model(model_name="meanshift")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4891,193.0746,0.836,0,0,0


MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, max_iter=300,
          min_bin_freq=1, n_jobs=-1, seeds=None)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 0
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [25]:
# Spectral clustering with no extra arguments.
models_003["sc"], results_003["sc"] = try_model(model_name="sc")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3044,7.0123,0.4815,0,0,0


SpectralClustering(affinity='rbf', assign_labels='kmeans', coef0=1, degree=3,
                   eigen_solver=None, eigen_tol=0.0, gamma=1.0,
                   kernel_params=None, n_clusters=4, n_components=None,
                   n_init=10, n_jobs=-1, n_neighbors=10, random_state=123)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 0
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [26]:
# Agglomerative (hierarchical) clustering with no extra arguments.
models_003["hclust"], results_003["hclust"] = try_model(model_name="hclust")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3511,186.9224,0.9567,0,0,0


AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='ward', memory=None, n_clusters=4)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 0
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [27]:
# DBSCAN with no extra arguments.
models_003["dbscan"], results_003["dbscan"] = try_model(model_name="dbscan")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=-1, p=None)
                l         a          b hex_color     Cluster
ID                                                          
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster -1
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster -1


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [28]:
# OPTICS with no extra arguments.
models_003["optics"], results_003["optics"] = try_model(model_name="optics")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,-0.2278,13.0414,1.8741,0,0,0


OPTICS(algorithm='auto', cluster_method='xi', eps=None, leaf_size=30,
       max_eps=inf, metric='minkowski', metric_params=None,
       min_cluster_size=None, min_samples=5, n_jobs=-1, p=2,
       predecessor_correction=True, xi=0.05)
                l         a          b hex_color     Cluster
ID                                                          
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster -1
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster -1


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [29]:
# Birch with no extra arguments.
models_003["birch"], results_003["birch"] = try_model(model_name="birch")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.389,204.0435,0.8864,0,0,0


Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=4,
      threshold=0.5)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 2
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [30]:
# K-modes with no extra arguments.
models_003["kmodes"], results_003["kmodes"] = try_model(model_name="kmodes")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,-0.3124,1.154,0.8978,0,0,0


KModes(cat_dissim=<function matching_dissim at 0x7fce8525b0e0>, init='Cao',
       max_iter=100, n_clusters=4, n_init=1, n_jobs=-1, random_state=123,
       verbose=0)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 0
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 1


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

## Evaluate Results by Eye Color Labels

In [31]:
from sklearn.metrics import (
    silhouette_score, calinski_harabasz_score, davies_bouldin_score,
    homogeneity_score, adjusted_rand_score, completeness_score
)

In [32]:
# Labels and features must describe the same samples in the same order.
# Index.equals also copes with a length mismatch, where an element-wise `==`
# raises ValueError instead of failing the assertion.
assert labels.index.equals(dataset_outer_iris.index), "labels and dataset_outer_iris are not aligned"

In [33]:
# Integer code for each manual eye-colour label.
color_map = {name: code for code, name in enumerate(("BROWN", "INTERMEDIATE", "BLUE"))}

In [34]:
def find_transformed_dataset(experiment):
    """Return the 'Transformed Data' frame stored in an experiment object.

    Scans ``experiment`` for a list whose second element is a
    ``('Transformed Data', DataFrame)`` tuple and returns that DataFrame.
    Returns ``None`` when no such entry is found.
    """
    for entry in experiment:
        if not isinstance(entry, list) or len(entry) <= 1:
            continue
        tagged = entry[1]
        if not isinstance(tagged, tuple) or len(tagged) <= 1:
            continue
        if tagged[0] == 'Transformed Data' and isinstance(tagged[1], pd.DataFrame):
            return tagged[1]
    return None

In [35]:
# Feature matrix used by the experiment, and the manual labels as integers.
exp_003_dataset = find_transformed_dataset(experiment=exp_003)
X = exp_003_dataset.values
y = labels.Colore_IRISPLEX.apply(lambda colour_name: color_map[colour_name])

In [36]:
# (rows, columns) of the prepared dataset, hex_color column included.
dataset_outer_iris.shape

(238, 4)

In [37]:
# Internal validity indices obtained when the manual labels act as the partition.
pd.DataFrame(
    {
        "silhouette": [silhouette_score(X, y)],
        "calinski_harabasz": [calinski_harabasz_score(X, y)],
        "davies_bouldin": [davies_bouldin_score(X, y)],
    }
)

Unnamed: 0,silhouette,calinski_harabasz,davies_bouldin
0,0.244573,112.552727,1.293065


## Kmedoids Clustering with Custom Distance Metric

Here we use an additional clustering method, `KMedoids`, which supports custom distance metrics. We will run it with the Euclidean distance metric and with the CIELab (CIE2000) color distance metric, both defined below.

In [38]:
from sklearn_extra.cluster import KMedoids
import colormath as cmth
from colormath.color_diff import delta_e_cie2000
from colormath.color_objects import sRGBColor,LabColor
from sklearn.metrics import pairwise_distances
import colormath.color_diff as cldf

In [39]:
def cie2000_distance(p1, p2):
    """CIE2000 colour difference between two points.

    ``p1`` and ``p2`` are unpacked positionally into ``LabColor`` before being
    passed to ``delta_e_cie2000``.
    """
    first, second = LabColor(*p1), LabColor(*p2)
    return cldf.delta_e_cie2000(color1=first, color2=second)

def euclidean_distance(p1, p2):
    """Euclidean distance between two points given as numeric arrays."""
    delta = p1 - p2
    squared_norm = np.vdot(delta, delta)
    return squared_norm ** 0.5

We set `n_clusters=4` below because that number was consistent across the three best models tried out above!

In [40]:
# K-medoids with four clusters, using the euclidean_distance callable as metric.
kmedoids_euclidean_003 = KMedoids(
    n_clusters=4, metric=euclidean_distance,
    method='alternate', init='heuristic',
    max_iter=300, random_state=0,
).fit(X)

In [41]:
# K-medoids with four clusters, using the cie2000_distance callable as metric.
kmedoids_cielab_003 = KMedoids(
    n_clusters=4, metric=cie2000_distance,
    method='alternate', init='heuristic',
    max_iter=300, random_state=0,
).fit(X)

In [42]:
# Internal validity indices of the Euclidean K-medoids partition on X.
pd.DataFrame(
    {
        "silhouette": [silhouette_score(X, kmedoids_euclidean_003.labels_)],
        "calinski_harabasz": [calinski_harabasz_score(X, kmedoids_euclidean_003.labels_)],
        "davies_bouldin": [davies_bouldin_score(X, kmedoids_euclidean_003.labels_)],
    }
)

Unnamed: 0,silhouette,calinski_harabasz,davies_bouldin
0,0.309583,193.574715,1.086957


In [43]:
# Internal validity indices of the CIE2000 K-medoids partition on X.
pd.DataFrame(
    {
        "silhouette": [silhouette_score(X, kmedoids_cielab_003.labels_)],
        "calinski_harabasz": [calinski_harabasz_score(X, kmedoids_cielab_003.labels_)],
        "davies_bouldin": [davies_bouldin_score(X, kmedoids_cielab_003.labels_)],
    }
)

Unnamed: 0,silhouette,calinski_harabasz,davies_bouldin
0,0.285652,178.279022,1.318061


<font color='red'>**As we see the CIE2000 distance metric is worsening clustering performance a little bit compared to traditional euclidean distance one!**</font>

### Add KMedoid results

In [44]:
# Any existing result frame works as a template; take the first one stored.
_sample_results = next(iter(results_003.values()))

In [45]:
# Start each K-medoids result frame from the template without its last column
# (the template's own Cluster assignment). .copy() gives every entry its own
# independent frame, so adding a Cluster column afterwards does not write into
# a slice of the template (no SettingWithCopyWarning).
results_003['kmedoids_euclidean'] = _sample_results.iloc[:, :-1].copy()
results_003['kmedoids_cielab'] = _sample_results.iloc[:, :-1].copy()

In [46]:
# Store the K-medoids assignments with the same "Cluster <id>" naming as the other models.
results_003['kmedoids_euclidean']["Cluster"] = [
    f"Cluster {cluster_id}" for cluster_id in kmedoids_euclidean_003.labels_
]
results_003['kmedoids_cielab']["Cluster"] = [
    f"Cluster {cluster_id}" for cluster_id in kmedoids_cielab_003.labels_
]

# Check consistency between manual labels and clustering results 

In [47]:
def consistency_check(clustering_result, labels):
    """Count, for every cluster, how many samples carry each manual label.

    The two frames are joined column-wise on their index; the result is the
    per-``Cluster`` value counts of ``Colore_IRISPLEX``.
    """
    combined = pd.concat([clustering_result, labels], axis=1)
    per_cluster = combined.groupby('Cluster')
    return per_cluster.Colore_IRISPLEX.value_counts()

In [48]:
# For every model: cluster-vs-label counts, then the agreement as adjusted Rand index.
for model_name, model_results in results_003.items():
    print('=' * 50, f"MODEL: {model_name}", sep='\n')
    print(consistency_check(model_results, labels))
    print(
        '+' * 50,
        f'Adjusted Rand Index: {adjusted_rand_score(labels.Colore_IRISPLEX.values, model_results.Cluster.values)}',
        '+' * 50,
        sep='\n',
    )

MODEL: kmeans
Cluster    Colore_IRISPLEX
Cluster 0  BROWN              61
           INTERMEDIATE       29
Cluster 1  BLUE               19
Cluster 2  BROWN              90
           INTERMEDIATE        3
Cluster 3  INTERMEDIATE       23
           BLUE               10
           BROWN               3
Name: Colore_IRISPLEX, dtype: int64
++++++++++++++++++++++++++++++++++++++++++++++++++
Adjusted Rand Index: 0.3315495669828409
++++++++++++++++++++++++++++++++++++++++++++++++++
MODEL: ap
Cluster     Colore_IRISPLEX
Cluster 0   BLUE                8
Cluster 1   BROWN              16
Cluster 10  BROWN              23
Cluster 11  BROWN              13
            INTERMEDIATE        1
Cluster 12  INTERMEDIATE       12
            BROWN               2
Cluster 13  BROWN              15
Cluster 14  INTERMEDIATE       10
            BROWN               9
Cluster 15  BROWN              14
            INTERMEDIATE        1
Cluster 16  BROWN              12
            INTERMEDIATE        1
Clu

## Check Clusterings visually

In [49]:
# Show each model's clustering in Lab space and save it as an HTML figure whose
# file name carries the model id and the number of clusters found.
for model_name, model_results in results_003.items():
    print('=' * 50)
    print(f"MODEL: {model_name}\tN_Clusters: {model_results.Cluster.unique().shape[0]}")
    pio.write_html(
        show_lab_scatter_3d_shapes(dataframe=model_results, labels=labels),
        FIGURES_DIR + f"outer_eye_original_{model_name}_{model_results.Cluster.unique().shape[0]}_clusters.html",
    )
    print('=' * 50)

MODEL: kmeans	N_Clusters: 4


MODEL: ap	N_Clusters: 17


MODEL: meanshift	N_Clusters: 2


MODEL: sc	N_Clusters: 4


MODEL: hclust	N_Clusters: 4


MODEL: dbscan	N_Clusters: 1


MODEL: optics	N_Clusters: 12


MODEL: birch	N_Clusters: 4


MODEL: kmodes	N_Clusters: 4


MODEL: kmedoids_euclidean	N_Clusters: 4


MODEL: kmedoids_cielab	N_Clusters: 4




## Store Clustering Results as a JSON

In [50]:
# Save every model's cluster assignment next to the manual label, one JSON file per model.
for model_name, model_results in results_003.items():
    _clustering_results = results_003[model_name][['Cluster']].merge(
        labels, left_index=True, right_index=True
    )
    _clustering_results.to_json(
        CLUSTERS_DIR + f"outer_eye_original_{model_name}_{model_results.Cluster.unique().shape[0]}_clusters.json"
    )

## Store Clustering Dataset

In [51]:
# Persist the feature matrix this experiment was run on.
exp_003_dataset.to_csv(path_or_buf=DATASETS_DIR + "outer_eye_original_dataset.csv")

## Store Clustering Metrics Results for each model

In [92]:
def get_metrics(experiment, model_names=None):
    """Collect the per-model metric tables recorded in an experiment object.

    Looks for the list entry whose first element is the setup summary table
    (columns ``Description``/``Value``) followed by metric tables, stacks the
    metric tables in order and adds a ``model`` column.

    Parameters
    ----------
    experiment : iterable
        Experiment object to scan (e.g. the value returned by ``setup``).
    model_names : sequence of str, optional
        Model id for each metric row, in row order. Defaults to the
        module-level ``MODEL_NAMES``.

    Returns
    -------
    pandas.DataFrame or None
        Metric rows with a trailing ``model`` column, or ``None`` when the
        experiment holds no matching entry. Names are attached by position, so
        they must be listed in the order the models were created.
    """
    if model_names is None:
        model_names = MODEL_NAMES

    summary_columns = ["Description", "Value"]
    metric_columns = [
        "Silhouette", "Calinski-Harabasz", "Davies-Bouldin",
        "Homogeneity", "Rand Index", "Completeness",
    ]

    for item in experiment:
        if (
            isinstance(item, list)
            and len(item) > 1
            # Both tables are inspected through .columns, so both must be
            # frames; checking item[1] alone raised AttributeError whenever
            # item[0] was something else.
            and isinstance(item[0], pd.DataFrame)
            and isinstance(item[1], pd.DataFrame)
            and list(item[0].columns) == summary_columns
            and list(item[1].columns) == metric_columns
        ):
            only_metrics = pd.concat(item[1:]).reset_index(drop=True)
            names = pd.DataFrame({"model": list(model_names)})
            return pd.concat([only_metrics, names], axis=1)

    return None

In [93]:
# Gather the metric rows recorded for this experiment and display them.
exp_003_metrics = get_metrics(experiment=exp_003)
exp_003_metrics

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness,model
0,0.4066,226.4846,0.8729,0,0,0,kmeans
1,0.309,173.6177,0.9338,0,0,0,ap
2,0.4891,193.0746,0.836,0,0,0,meanshift
3,0.3044,7.0123,0.4815,0,0,0,sc
4,0.3511,186.9224,0.9567,0,0,0,hclust
5,0.0,0.0,0.0,0,0,0,dbscan
6,-0.2278,13.0414,1.8741,0,0,0,optics
7,0.389,204.0435,0.8864,0,0,0,birch
8,-0.3124,1.154,0.8978,0,0,0,kmodes


In [94]:
# Persist the per-model metric table of this experiment.
exp_003_metrics.to_csv(path_or_buf=METRICS_DIR + "outer_eye_original_model_metrics.csv")

# Clustering (outer iris normalized)

Run pycaret experiments using different models and on 2 different datasets that we got here

In [95]:
from pycaret.clustering import *

In [96]:
# Same data as the previous experiment, but with normalisation switched on;
# power transformation and PCA stay off and hex_color is not a feature.
# NOTE(review): the notebook introduction describes the scaled variant as zero
# mean / unit variance, while this setup requests 'minmax' — confirm which is intended.
exp_004 = setup(
    data=dataset_outer_iris,
    ignore_features=['hex_color'],
    preprocess=True,
    normalize=True, normalize_method='minmax',
    transformation=False, transformation_method='yeo-johnson',
    pca=False, pca_method='linear',
    silent=True,
    n_jobs=-1,
    session_id=123,  # experiment seed
)

Unnamed: 0,Description,Value
0,session_id,123
1,Original Data,"(238, 4)"
2,Missing Values,False
3,Numeric Features,3
4,Categorical Features,0
5,Ordinal Features,False
6,High Cardinality Features,False
7,High Cardinality Method,
8,Transformed Data,"(238, 3)"
9,CPU Jobs,-1


show all outputs from cell

In [97]:
def try_model(model_name, **kwargs):
    """Fit one pycaret clustering model and return it with its labelled data.

    Re-declared here; it behaves the same as the ``try_model`` defined earlier
    in this notebook. ``kwargs`` are forwarded to ``create_model``. The fitted
    estimator and the first two labelled rows are printed, then
    ``evaluate_model`` is called on the estimator.

    Returns
    -------
    tuple
        ``(model, results)`` where ``results`` is the output of ``assign_model``.
    """
    fitted = create_model(model_name, **kwargs)
    print(fitted)

    assigned = assign_model(fitted)
    print(assigned.head(2))

    evaluate_model(fitted)

    return fitted, assigned

In [98]:
# Fitted estimators and labelled frames of this experiment, keyed by model id.
models_004, results_004 = {}, {}

# Model ids, listed in the order the models are fitted in the following cells.
MODEL_NAMES = [
    "kmeans", "ap", "meanshift",
    "sc", "hclust", "dbscan",
    "optics", "birch", "kmodes",
]

In [99]:
# K-means with three clusters; keep the fitted model and its labelled frame.
models_004["kmeans"], results_004["kmeans"] = try_model(model_name="kmeans", num_clusters=3)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3849,252.2206,0.9397,0,0,0


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=-1, precompute_distances='deprecated',
       random_state=123, tol=0.0001, verbose=0)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 2
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [100]:
# Affinity propagation with no extra arguments.
models_004["ap"], results_004["ap"] = try_model(model_name="ap")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.2952,160.465,0.9633,0,0,0


AffinityPropagation(affinity='euclidean', convergence_iter=15, copy=True,
                    damping=0.5, max_iter=200, preference=None,
                    random_state='warn', verbose=False)
                l         a          b hex_color     Cluster
ID                                                          
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 13
FDP002  22.662753  2.493072  17.055058   #42341d   Cluster 5


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [101]:
# Mean shift with no extra arguments.
models_004["meanshift"], results_004["meanshift"] = try_model(model_name="meanshift")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.5277,268.4269,0.7586,0,0,0


MeanShift(bandwidth=None, bin_seeding=False, cluster_all=True, max_iter=300,
          min_bin_freq=1, n_jobs=-1, seeds=None)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 0
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [102]:
# Spectral clustering with three clusters.
models_004["sc"], results_004["sc"] = try_model(model_name="sc", num_clusters=3)

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.371,232.5254,0.9485,0,0,0


SpectralClustering(affinity='rbf', assign_labels='kmeans', coef0=1, degree=3,
                   eigen_solver=None, eigen_tol=0.0, gamma=1.0,
                   kernel_params=None, n_clusters=3, n_components=None,
                   n_init=10, n_jobs=-1, n_neighbors=10, random_state=123)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 1
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 2


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [103]:
# Agglomerative (hierarchical) clustering with no extra arguments.
models_004["hclust"], results_004["hclust"] = try_model(model_name="hclust")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.3493,213.5912,0.9667,0,0,0


AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
                        connectivity=None, distance_threshold=None,
                        linkage='ward', memory=None, n_clusters=4)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 2
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [104]:
# DBSCAN with no extra arguments.
models_004["dbscan"], results_004["dbscan"] = try_model(model_name="dbscan")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=-1, p=None)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 0
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [105]:
# OPTICS with no extra arguments.
models_004["optics"], results_004["optics"] = try_model(model_name="optics")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,-0.256,13.8155,1.7117,0,0,0


OPTICS(algorithm='auto', cluster_method='xi', eps=None, leaf_size=30,
       max_eps=inf, metric='minkowski', metric_params=None,
       min_cluster_size=None, min_samples=5, n_jobs=-1, p=2,
       predecessor_correction=True, xi=0.05)
                l         a          b hex_color     Cluster
ID                                                          
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster -1
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster -1


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [106]:
# Birch with no extra arguments.
models_004["birch"], results_004["birch"] = try_model(model_name="birch")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0,0,0,0,0,0


Birch(branching_factor=50, compute_labels=True, copy=True, n_clusters=4,
      threshold=0.5)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 0
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 0


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

In [107]:
# K-modes with no extra arguments.
models_004["kmodes"], results_004["kmodes"] = try_model(model_name="kmodes")

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,-0.3688,0.9472,1.0108,0,0,0


KModes(cat_dissim=<function matching_dissim at 0x7fce8525b0e0>, init='Cao',
       max_iter=100, n_clusters=4, n_init=1, n_jobs=-1, random_state=123,
       verbose=0)
                l         a          b hex_color    Cluster
ID                                                         
FDP001  17.671129  1.004831  13.675901   #332a18  Cluster 0
FDP002  22.662753  2.493072  17.055058   #42341d  Cluster 1


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Cluster PCA Plot (2d)', …

## Evaluate Results by Eye Color Labels

In [108]:
from sklearn.metrics import (
    silhouette_score, calinski_harabasz_score, davies_bouldin_score,
    homogeneity_score, adjusted_rand_score, completeness_score
)

In [109]:
# Labels and features must describe the same samples in the same order.
# Index.equals also copes with a length mismatch, where an element-wise `==`
# raises ValueError instead of failing the assertion.
assert labels.index.equals(dataset_outer_iris.index), "labels and dataset_outer_iris are not aligned"

In [110]:
# Integer code for each manual eye-colour label.
color_map = {name: code for code, name in enumerate(("BROWN", "INTERMEDIATE", "BLUE"))}

In [111]:
def find_transformed_dataset(experiment):
    """Return the 'Transformed Data' frame stored in an experiment object.

    Scans ``experiment`` for a list whose second element is a
    ``('Transformed Data', DataFrame)`` tuple and returns that DataFrame.
    Returns ``None`` when no such entry is found.
    """
    for entry in experiment:
        if not isinstance(entry, list) or len(entry) <= 1:
            continue
        tagged = entry[1]
        if not isinstance(tagged, tuple) or len(tagged) <= 1:
            continue
        if tagged[0] == 'Transformed Data' and isinstance(tagged[1], pd.DataFrame):
            return tagged[1]
    return None

In [112]:
# Feature matrix used by this experiment, and the manual labels as integers.
exp_004_dataset = find_transformed_dataset(experiment=exp_004)
X = exp_004_dataset.values
y = labels.Colore_IRISPLEX.apply(lambda colour_name: color_map[colour_name])

In [113]:
# Internal validity indices obtained when the manual labels act as the partition.
pd.DataFrame(
    {
        "silhouette": [silhouette_score(X, y)],
        "calinski_harabasz": [calinski_harabasz_score(X, y)],
        "davies_bouldin": [davies_bouldin_score(X, y)],
    }
)

Unnamed: 0,silhouette,calinski_harabasz,davies_bouldin
0,0.315665,153.535066,1.107469


## Kmedoids Clustering with Custom Distance Metric

Here we use an additional clustering method, `KMedoids`, which supports custom distance metrics. We will run it with the Euclidean distance metric and with the CIELab (CIE2000) color distance metric, both defined below.

In [114]:
from sklearn_extra.cluster import KMedoids
import colormath as cmth
from colormath.color_diff import delta_e_cie2000
from colormath.color_objects import sRGBColor,LabColor
from sklearn.metrics import pairwise_distances
import colormath.color_diff as cldf

In [115]:
def cie2000_distance(p1, p2):
    """CIE2000 colour difference between two points.

    ``p1`` and ``p2`` are unpacked positionally into ``LabColor`` before being
    passed to ``delta_e_cie2000``.
    """
    first, second = LabColor(*p1), LabColor(*p2)
    return cldf.delta_e_cie2000(color1=first, color2=second)

def euclidean_distance(p1, p2):
    """Euclidean distance between two points given as numeric arrays."""
    delta = p1 - p2
    squared_norm = np.vdot(delta, delta)
    return squared_norm ** 0.5

We set `n_clusters=4` below because that number was consistent across the three best models tried out above!

In [116]:
# K-medoids with four clusters, using the euclidean_distance callable as metric.
kmedoids_euclidean_004 = KMedoids(
    n_clusters=4, metric=euclidean_distance,
    method='alternate', init='heuristic',
    max_iter=300, random_state=0,
).fit(X)

**we use previous kmedoids results (kmedoids_cielab_004 = kmedoids_cielab_003) since cie2000 distance works with original L,A,B values, but we measure clustering metrics on transformed dataset in LAB space**

In [117]:
# Internal validity indices of the Euclidean K-medoids partition on X.
pd.DataFrame(
    {
        "silhouette": [silhouette_score(X, kmedoids_euclidean_004.labels_)],
        "calinski_harabasz": [calinski_harabasz_score(X, kmedoids_euclidean_004.labels_)],
        "davies_bouldin": [davies_bouldin_score(X, kmedoids_euclidean_004.labels_)],
    }
)

Unnamed: 0,silhouette,calinski_harabasz,davies_bouldin
0,0.341955,217.772654,1.028019


In [118]:
# Scores the CIE2000 K-medoids labels from the earlier experiment
# (kmedoids_cielab_003) against the current X.
pd.DataFrame(
    {
        "silhouette": [silhouette_score(X, kmedoids_cielab_003.labels_)],
        "calinski_harabasz": [calinski_harabasz_score(X, kmedoids_cielab_003.labels_)],
        "davies_bouldin": [davies_bouldin_score(X, kmedoids_cielab_003.labels_)],
    }
)

Unnamed: 0,silhouette,calinski_harabasz,davies_bouldin
0,0.329934,213.03161,1.020328


<font color='red'>**As we see the CIE2000 distance metric is worsening clustering performance a little bit compared to traditional euclidean distance one!**</font>

### Add KMedoid results

In [119]:
# Any existing result frame works as a template; take the first one stored.
_sample_results = next(iter(results_004.values()))

In [120]:
# Start each K-medoids result frame from the template without its last column
# (the template's own Cluster assignment). .copy() gives every entry its own
# independent frame, so adding a Cluster column afterwards does not write into
# a slice of the template (no SettingWithCopyWarning).
results_004['kmedoids_euclidean'] = _sample_results.iloc[:, :-1].copy()
results_004['kmedoids_cielab'] = _sample_results.iloc[:, :-1].copy()

In [121]:
# Store the K-medoids assignments with the same "Cluster <id>" naming as the
# other models; the CIE2000 labels are those of kmedoids_cielab_003.
results_004['kmedoids_euclidean']["Cluster"] = [
    f"Cluster {cluster_id}" for cluster_id in kmedoids_euclidean_004.labels_
]
results_004['kmedoids_cielab']["Cluster"] = [
    f"Cluster {cluster_id}" for cluster_id in kmedoids_cielab_003.labels_
]

# Check consistency between manual labels and clustering results 

In [122]:
def consistency_check(clustering_result, labels, label_column="Colore_IRISPLEX"):
    """Count how often each manual label occurs inside each cluster.

    Parameters
    ----------
    clustering_result : pd.DataFrame
        Frame containing a ``Cluster`` column.
    labels : pd.DataFrame
        Frame containing the manual labels in ``label_column``; it is
        concatenated column-wise with ``clustering_result``, so both frames
        are aligned on their index.
    label_column : str, default "Colore_IRISPLEX"
        Name of the manual-label column to tabulate.

    Returns
    -------
    pd.Series
        Counts indexed by (Cluster, label value).
    """
    combined = pd.concat([clustering_result, labels], axis=1)
    return combined.groupby('Cluster')[label_column].value_counts()

In [123]:
# For every model: print the per-cluster breakdown of manual labels, followed
# by the Adjusted Rand Index between manual labels and cluster assignments.
for model_name, model_results in results_004.items():
    print('=' * 50, f"MODEL: {model_name}", sep="\n")
    print(consistency_check(model_results, labels))
    print(
        '+' * 50,
        f'Adjusted Rand Index: {adjusted_rand_score(labels.Colore_IRISPLEX.values, model_results.Cluster.values)}',
        '+' * 50,
        sep="\n",
    )

MODEL: kmeans
Cluster    Colore_IRISPLEX
Cluster 0  BROWN              62
           INTERMEDIATE       32
Cluster 1  BLUE               29
           INTERMEDIATE       22
Cluster 2  BROWN              92
           INTERMEDIATE        1
Name: Colore_IRISPLEX, dtype: int64
++++++++++++++++++++++++++++++++++++++++++++++++++
Adjusted Rand Index: 0.3433642853269452
++++++++++++++++++++++++++++++++++++++++++++++++++
MODEL: ap
Cluster     Colore_IRISPLEX
Cluster 0   BROWN              23
            INTERMEDIATE        1
Cluster 1   BLUE                6
Cluster 10  BROWN              18
Cluster 11  INTERMEDIATE        7
            BLUE                1
Cluster 12  BROWN              11
            INTERMEDIATE        6
Cluster 13  INTERMEDIATE       15
            BROWN               5
Cluster 14  BROWN               9
Cluster 2   BROWN              23
Cluster 3   BLUE                7
            INTERMEDIATE        5
Cluster 4   INTERMEDIATE        5
            BLUE                1
C

## Check Clusterings visually

In [124]:
# Write one 3D Lab scatter figure per model to FIGURES_DIR as standalone HTML;
# the number of distinct clusters is part of the file name.
for model_name, model_results in results_004.items():
    _n_clusters = model_results.Cluster.unique().shape[0]
    print('=' * 50)
    print(f"MODEL: {model_name}\tN_Clusters: {_n_clusters}")
    pio.write_html(
        fig=show_lab_scatter_3d_shapes(model_results, labels),
        file=FIGURES_DIR + f"outer_eye_normalized_{model_name}_{_n_clusters}_clusters.html",
    )
    print('=' * 50)

MODEL: kmeans	N_Clusters: 3


MODEL: ap	N_Clusters: 15


MODEL: meanshift	N_Clusters: 2


MODEL: sc	N_Clusters: 3


MODEL: hclust	N_Clusters: 4


MODEL: dbscan	N_Clusters: 1


MODEL: optics	N_Clusters: 10


MODEL: birch	N_Clusters: 1


MODEL: kmodes	N_Clusters: 4


MODEL: kmedoids_euclidean	N_Clusters: 4


MODEL: kmedoids_cielab	N_Clusters: 4




## Store Clustering Results as a JSON

In [125]:
# For every model, pair its Cluster column with the manual labels (joined on
# the index) and store the result as JSON in CLUSTERS_DIR.
for model_name, model_results in results_004.items():
    _clustering_results = model_results[['Cluster']].merge(
        labels, left_index=True, right_index=True
    )
    _clustering_results.to_json(
        CLUSTERS_DIR
        + f"outer_eye_normalized_{model_name}_{model_results.Cluster.unique().shape[0]}_clusters.json"
    )

## Store Clustering Dataset

In [126]:
# Persist the dataset used for this experiment as CSV under DATASETS_DIR.
exp_004_dataset.to_csv(
    path_or_buf=DATASETS_DIR + "outer_eye_normalized_dataset.csv"
)

## Store Clustering Metrics Results for each model

In [127]:
def get_metrics(experiment):
  """Extract the per-model metrics table from an experiment log.

  Scans ``experiment`` for the first list entry whose first element is a
  DataFrame with columns ``Description``/``Value`` and whose second element
  is a DataFrame holding the six metric columns. All elements after the
  first are concatenated row-wise and a ``model`` column built from the
  notebook-level ``MODEL_NAMES`` is appended.

  Parameters
  ----------
  experiment : iterable
      Experiment log to scan; entries that do not match the expected layout
      are skipped.

  Returns
  -------
  pd.DataFrame or None
      The metrics table with an added ``model`` column, or ``None`` when no
      entry matches.

  Notes
  -----
  The ``model`` column is attached by positional index, so the metric rows
  are assumed to be in the same order as ``MODEL_NAMES`` -- TODO confirm.
  """
  for item in experiment:
    if not (isinstance(item, list) and len(item) > 1):
      continue
    # Both frames must really be DataFrames before their columns are read;
    # a non-DataFrame first element used to raise AttributeError here.
    if not (isinstance(item[0], pd.DataFrame) and isinstance(item[1], pd.DataFrame)):
      continue
    if "".join(item[0].columns) != "DescriptionValue":
      continue
    if "".join(item[1].columns) != "SilhouetteCalinski-HarabaszDavies-BouldinHomogeneityRand IndexCompleteness":
      continue

    only_metrics = pd.concat(item[1:]).reset_index(drop=True)
    model_metrics = pd.concat([only_metrics, pd.DataFrame({"model":MODEL_NAMES})],axis=1)

    return model_metrics

  # No entry matched the expected layout.
  return None

In [128]:
# Collect the metrics of experiment 4 into a single table and display it.
exp_004_metrics = get_metrics(experiment=exp_004)
exp_004_metrics

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness,model
0,0.3849,252.2206,0.9397,0,0,0,kmeans
1,0.2952,160.465,0.9633,0,0,0,ap
2,0.5277,268.4269,0.7586,0,0,0,meanshift
3,0.371,232.5254,0.9485,0,0,0,sc
4,0.3493,213.5912,0.9667,0,0,0,hclust
5,0.0,0.0,0.0,0,0,0,dbscan
6,-0.256,13.8155,1.7117,0,0,0,optics
7,0.0,0.0,0.0,0,0,0,birch
8,-0.3688,0.9472,1.0108,0,0,0,kmodes


In [129]:
# Persist the per-model metrics table as CSV under METRICS_DIR.
exp_004_metrics.to_csv(
    path_or_buf=METRICS_DIR + "outer_eye_normalized_model_metrics.csv"
)