In [1]:
from recursive_clustering.experiment.open_ml_clustering_experiment import OpenmlClusteringExperiment
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import rgb2hex
from graphviz import Source
import pandas as pd

In [7]:
# Experiment runner used for the single run configured below.
# NOTE(review): check_if_exists=False presumably disables a lookup of earlier runs - confirm.
experiment = OpenmlClusteringExperiment(check_if_exists=False)

# Run configuration: which model, which OpenML dataset, and the seed.
model_nickname = 'RecursiveClustering'
dataset_id = 1568
seed_model = 1684
standardize = True

# Keyword arguments forwarded as `model_params` / `fit_params` to the run call.
model_params = {
    'components_size': 11,
    'repetitions': 3,
    'kmeans_n_clusters': 2,
    'representative_method': 'rbf',
}
fit_params = {}

# return_results=True so that `results` can be inspected by the following cells.
results = experiment.run_openml_experiment_combination(
    model_nickname=model_nickname,
    seed_model=seed_model,
    dataset_id=dataset_id,
    standardize=standardize,
    model_params=model_params,
    fit_params=fit_params,
    return_results=True,
    log_to_mlflow=True,
    n_jobs=4,
)

Running...
model_nickname: RecursiveClustering
seed_model: 1684
dataset_id: 1568
model_params: {'components_size': 11, 'repetitions': 3, 'kmeans_n_clusters': 2, 'representative_method': 'rbf'}
fit_params: {}
standardize: True

Finished!
total_elapsed_time: 2.8702340189993265
model_nickname: RecursiveClustering
seed_model: 1684
dataset_id: 1568
model_params: {'components_size': 11, 'repetitions': 3, 'kmeans_n_clusters': 2, 'representative_method': 'rbf'}
fit_params: {}
standardize: True



In [8]:
# Display the entry stored under 'evaluate_model_return' for the run above (shown as this cell's output).
results['evaluate_model_return']

{'n_clusters_': 3,
 'rand_score': np.float64(0.776080552717602),
 'adjusted_rand': 0.5098514149312312,
 'mutual_info': np.float64(0.6377848951318092),
 'adjusted_mutual_info': np.float64(0.5767073360366544),
 'normalized_mutual_info': np.float64(0.5767960528129983),
 'homogeneity': np.float64(0.5366486333772807),
 'completeness': np.float64(0.6234361664943219),
 'v_measure': np.float64(0.5767960528129982),
 'silhouette': np.float64(0.05220179458681342),
 'calinski_harabasz_score': np.float64(707.2474639793057),
 'davies_bouldin_score': np.float64(3.9769502100984924),
 'inertia_score': np.float64(44312.4521799226),
 'elapsed_time': 0.11954998200235423}

In [9]:
# Turn the per-iteration cluster assignments (`labels_sequence_`) into a graph:
#   * one node per (cluster, iteration) pair, keyed "cluster_<cluster>_iter_<i>";
#   * each node stores the true labels (`y`) of the samples assigned to it;
#   * for i > 0, each node lists every cluster of iteration i-1 that holds at
#     least one of its samples (these become the edges).
# The graph is kept as plain dictionaries here and serialised to DOT afterwards.
label_sequence = results['load_model_return']['model'].labels_sequence_
y = results['load_data_return']['y']
n_iterations = label_sequence.shape[1]
n_samples = label_sequence.shape[0]

# Neither limit is used by the graph construction below; both names are kept
# (sharing a single value) in case other cells still read them.
max_samples = 5
max_samples_shown = max_samples

# One hex colour per sample, derived from its class code and the tab20 palette.
# Not used by the graph construction below.
color_sequence = plt.cm.tab20.colors
y_codes = y.astype('category').cat.codes
y_colors = y_codes.map(lambda x: rgb2hex(color_sequence[x]))

graph = dict()
for i in range(n_iterations):
    # Labels of every sample at iteration i, sliced once per iteration.
    labels_at_iter = label_sequence[:, i]
    for cluster in np.unique(labels_at_iter):
        cluster_label = f"cluster_{cluster}_iter_{i}"
        samples_in_cluster_idx = np.where(labels_at_iter == cluster)[0]

        node = {
            # `samples_in_cluster_idx` holds row positions, so select with
            # .iloc: plain y[...] would look the integers up as index labels
            # and pick the wrong rows (or raise) when y has a non-default index.
            'samples': y.iloc[samples_in_cluster_idx],
            'cluster': cluster,
            'iter': i,
        }

        if i > 0:
            # Parents: clusters of the previous iteration that contain at
            # least one sample of the current cluster.
            previous_labels = label_sequence[:, i - 1]
            node['prev_cluster'] = [
                f"cluster_{prev_cluster}_iter_{i - 1}"
                for prev_cluster in np.unique(previous_labels[samples_in_cluster_idx])
            ]

        graph[cluster_label] = node

In [10]:
# Serialise the cluster graph to DOT text: first one node statement per
# cluster (labelled with its per-class sample counts), then one edge statement
# per parent -> child link between consecutive iterations.


def _dot_quote(text):
    """Return `text` as a double-quoted DOT string, escaping backslashes and double quotes."""
    escaped = str(text).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


dot_lines = [
    'digraph G {',
    # LR -> left to right
    'rankdir=LR;',
    # compound=true is kept as in the original cell; no subgraphs are emitted here
    'compound=true;',
]

# Node statements. Ids are quoted so that any cluster value (e.g. a negative
# label) still yields a valid DOT identifier.
for cluster_label, cluster_dict in graph.items():
    class_counts = cluster_dict['samples'].value_counts().sort_index()
    class_counts.index.name = 'Class count'
    label_text = class_counts.to_string()
    dot_lines.append(f"{_dot_quote(cluster_label)} [label={_dot_quote(label_text)}, fontsize=15];")

# Edge statements; nodes of the first iteration have no 'prev_cluster' entry.
for cluster_label, cluster_dict in graph.items():
    for prev_cluster in cluster_dict.get('prev_cluster', []):
        dot_lines.append(f"{_dot_quote(prev_cluster)} -> {_dot_quote(cluster_label)};")

dot_lines.append('}')
# Joined once instead of growing a string with repeated `+=`.
dot_str = '\n'.join(dot_lines)

In [11]:
# render the graph
# Wrap the DOT text in a graphviz Source; the file name and the PDF format are fixed here.
s = Source(dot_str, filename="nursery_agg", format="pdf")
# NOTE(review): view() presumably writes the source file itself before rendering,
# which would make this explicit save() redundant - confirm against the graphviz docs.
s.save()
# The value displayed for this cell ('nursery_agg.pdf') is what view() returned.
# NOTE(review): view() presumably opens an external viewer - confirm it behaves on headless kernels.
s.view()

'nursery_agg.pdf'