In [1]:
from recursive_clustering.experiment.open_ml_clustering_experiment import OpenmlClusteringExperiment
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import rgb2hex
from graphviz import Source
import pandas as pd

In [2]:
# Load the table of OpenML tasks/datasets (one row per dataset) from the
# CSV shipped with the project.
datasets_csv_path = 'recursive_clustering/openml_datasets.csv'
datasets = pd.read_csv(datasets_csv_path)

In [3]:
# Show the dataset table as the cell output (the notebook display elides the
# middle rows of the frame).
datasets

Unnamed: 0,task_id,dataset_id,dataset_name,task_type,task_name,target_name,evaluation_type,n_folds,n_instances,n_features,n_categorical_features,n_continuous_features,n_nans,n_rows_nans,n_classes,url
0,167124,40927,CIFAR_10,Supervised Classification,classification,class,crossvalidation,10,60000.0,3073.0,1.0,3072.0,0.0,0.0,10,https://www.openml.org/d/40927
1,146825,40996,Fashion-MNIST,Supervised Classification,classification,class,crossvalidation,10,70000.0,785.0,1.0,784.0,0.0,0.0,10,https://www.openml.org/d/40996
2,14969,4538,GesturePhaseSegmentationProcessed,Supervised Classification,classification,Phase,crossvalidation,10,9873.0,33.0,1.0,32.0,0.0,0.0,5,https://www.openml.org/d/4538
3,3510,375,JapaneseVowels,Supervised Classification,classification,speaker,crossvalidation,10,9961.0,15.0,1.0,14.0,0.0,0.0,9,https://www.openml.org/d/375
4,125921,40496,LED-display-domain-7digit,Supervised Classification,classification,Class,crossvalidation,10,500.0,8.0,1.0,7.0,0.0,0.0,10,https://www.openml.org/d/40496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,168331,41166,volkert,Supervised Classification,classification,class,crossvalidation,10,58310.0,181.0,1.0,180.0,0.0,0.0,10,https://www.openml.org/d/41166
74,3022,307,vowel,Supervised Classification,classification,Class,crossvalidation,10,990.0,13.0,3.0,10.0,0.0,0.0,11,https://www.openml.org/d/307
75,9945,1509,walking-activity,Supervised Classification,classification,Class,crossvalidation,10,149332.0,5.0,1.0,4.0,0.0,0.0,22,https://www.openml.org/d/1509
76,9960,1497,wall-robot-navigation,Supervised Classification,classification,Class,crossvalidation,10,5456.0,25.0,1.0,24.0,0.0,0.0,4,https://www.openml.org/d/1497


In [77]:
# Configuration of the single experiment run executed by this cell.
model_nickname = 'RecursiveClustering'
dataset_id = 61
seed_model = 4735
standardize = True
model_params = {'components_size': 29, 'repetitions': 3, 'kmeans_n_clusters': 5}
fit_params = {}

# Run the experiment and keep the returned results in memory; the following
# cells read the fitted model and the loaded data out of `results`.
experiment = OpenmlClusteringExperiment(check_if_exists=False)
results = experiment.run_openml_experiment_combination(
    model_nickname=model_nickname,
    seed_model=seed_model,
    dataset_id=dataset_id,
    standardize=standardize,
    model_params=model_params,
    fit_params=fit_params,
    return_results=True,
    log_to_mlflow=True,
)

Running...
model_nickname: RecursiveClustering
seed_model: 4735
dataset_id: 61
model_params: {'components_size': 29, 'repetitions': 3, 'kmeans_n_clusters': 5}
fit_params: {}
standardize: True

Finished!
total_elapsed_time: 0.0878590370011807
model_nickname: RecursiveClustering
seed_model: 4735
dataset_id: 61
model_params: {'components_size': 29, 'repetitions': 3, 'kmeans_n_clusters': 5}
fit_params: {}
standardize: True



In [509]:
# Turn the per-iteration cluster assignments into a graph held in plain dicts
# (it is converted to DOT text in a later cell).
#
# Each node is one cluster at one iteration, keyed "cluster_<id>_iter_<i>", and
# stores:
#   'samples'      -> the entries of y for the samples assigned to that cluster
#   'cluster'      -> the cluster id
#   'iter'         -> the iteration index
#   'prev_cluster' -> (iterations > 0 only) keys of the previous-iteration
#                     clusters that at least one of its samples came from
#
# label_sequence is indexed as [sample, iteration].
label_sequence = results['load_model_return']['model'].labels_sequence_
y = results['load_data_return']['y']
n_iterations = label_sequence.shape[1]

graph = dict()
for i in range(n_iterations):
    current_labels = label_sequence[:, i]
    # There is no previous iteration to link back to at i == 0.
    previous_labels = label_sequence[:, i - 1] if i > 0 else None

    for cluster in np.unique(current_labels):
        cluster_label = f"cluster_{cluster}_iter_{i}"
        samples_in_cluster_idx = np.where(current_labels == cluster)[0]

        node = {
            # np.where yields row *positions*, so select with .iloc: plain
            # y[...] is label-based on a Series and picks the wrong rows (or
            # raises KeyError) whenever y does not have a default RangeIndex.
            'samples': y.iloc[samples_in_cluster_idx],
            'cluster': cluster,
            'iter': i,
        }

        if previous_labels is not None:
            # Clusters of the previous iteration that contributed samples.
            node['prev_cluster'] = [
                f"cluster_{prev_cluster}_iter_{i - 1}"
                for prev_cluster in np.unique(previous_labels[samples_in_cluster_idx])
            ]

        graph[cluster_label] = node

In [592]:
# Serialise the cluster graph to Graphviz DOT source text (stored in dot_str).
def _dot_quote(text):
    """Return ``text`` as a double-quoted DOT string.

    Backslashes and double quotes are escaped so that arbitrary class names or
    cluster ids (e.g. negative or float labels, which are not valid bare DOT
    identifiers) cannot break the generated source.
    """
    escaped = str(text).replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


dot_lines = [
    'digraph G {',
    # LR -> lay the graph out left to right, so iterations read like a timeline
    'rankdir=LR;',
    # compound=true only affects edges that target cluster subgraphs; none are
    # declared here, so it is kept purely to leave the emitted header unchanged.
    'compound=true;',
]

# One node per cluster, labelled with its per-class sample counts.
for cluster_label, cluster_dict in graph.items():
    class_counts = cluster_dict['samples'].value_counts().sort_index()
    class_counts.index.name = 'Class count'
    label = class_counts.to_string()
    dot_lines.append(f"{_dot_quote(cluster_label)} [label={_dot_quote(label)}, fontsize=15];")

# One edge from every contributing previous-iteration cluster to its successor.
for cluster_label, cluster_dict in graph.items():
    for prev_cluster in cluster_dict.get('prev_cluster', []):
        dot_lines.append(f"{_dot_quote(prev_cluster)} -> {_dot_quote(cluster_label)};")

dot_lines.append('}')
# Collect lines and join once instead of repeatedly concatenating strings.
dot_str = '\n'.join(dot_lines)

In [594]:
# Write the DOT source to disk under the "iris_agg" base name, then render it
# to PDF and open the result in the system viewer.
s = Source(
    dot_str,
    filename="iris_agg",
    format="pdf",
)
s.save()
s.view()

'iris_agg.pdf'