# Imports and Functions

In [1]:
from ml_experiments.utils import unflatten_dict, update_recursively
from sklearn.metrics.cluster import adjusted_rand_score
from cohirf.models.cohirf import ModularCoHiRF, CoHiRF
from cohirf.models.batch_cohirf import BatchCoHiRF
from cohirf.models.kernel_kmeans import KernelKMeans
from cohirf.models.pseudo_kernel import PseudoKernelClustering
from cohirf.models.lazy_minibatchkmeans import LazyMiniBatchKMeans
from sklearn.kernel_approximation import Nystroem, RBFSampler
from sklearn.datasets import make_blobs
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from functools import partial
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Dataset (easy)

In [2]:
# --- Synthetic "easy" dataset: isotropic Gaussian blobs around equidistant centers ---
random_state_dataset = 42
n_samples = int(1e5)
n_features = 10
n_clusters = 5
distance = 5
cluster_std = 1
rng = np.random.default_rng(random_state_dataset)

# Draw a random (n_clusters, n_features) Gaussian matrix and orthonormalise it with QR.
# The resulting rows are mutually orthogonal unit vectors (this needs n_clusters <= n_features).
gaussian_draw = rng.standard_normal((n_clusters, n_features))
orthonormal_columns, _ = np.linalg.qr(gaussian_draw.T)

# Two orthonormal vectors are sqrt(2) apart, so scaling by distance / sqrt(2)
# puts every pair of centers exactly `distance` apart.
centers = orthonormal_columns.T * (distance / np.sqrt(2))

X, y = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=centers,
    cluster_std=cluster_std,
    random_state=random_state_dataset,
)

# Standardise every feature; `scaler` keeps the fitted statistics.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
# Project the standardised samples to 2D with t-SNE and colour each point by its true blob id.
# NOTE(review): this embeds all `n_samples` points, which may take a long time to run.
tsne = TSNE(n_components=2, random_state=random_state_dataset)
X_2d = tsne.fit_transform(X)

fig, ax = plt.subplots(figsize=(8, 6))
# X_2d has exactly two columns, so its transpose unpacks into the x and y coordinates.
ax.scatter(*X_2d.T, c=y, cmap="viridis", s=10)
ax.set(
    title="2D Projection of the Dataset",
    xlabel="TSNE 1",
    ylabel="TSNE 2",
)
plt.show()

In [3]:
# Baseline: a single CoHiRF model fitted on the whole dataset at once.
seed = 42
model = CoHiRF(
    repetitions=3,
    n_features="full",
    kmeans_n_clusters=5,
    random_state=seed,
)
labels = model.fit_predict(X)
# Agreement between the predicted partition and the generating blob ids.
ari = adjusted_rand_score(labels_true=y, labels_pred=labels)
print(f"Adjusted Rand Index: {ari:.3f}")

Adjusted Rand Index: 0.941


In [26]:
import numpy as np
import pandas as pd
from typing import Optional
from cohirf.models.cohirf import BaseCoHiRF, CoHiRF


class BatchCoHiRF:
    """Hierarchical mini-batch driver around a CoHiRF-style clustering model.

    Every epoch splits the current representative samples into consecutive
    batches of at most ``batch_size`` rows, fits a fresh ``cohirf_model`` on
    each batch and keeps only the representatives reported by the batch
    models. Epochs repeat until one epoch needed a single batch, or until
    ``max_epochs`` epochs have run.

    The wrapped model must expose, after ``fit(X_batch)``, the attributes
    ``parents_``, ``representatives_indexes_`` (both used as positions into
    the batch) and ``n_clusters_``.

    NOTE: defining this class in the notebook shadows the ``BatchCoHiRF``
    imported from ``cohirf.models.batch_cohirf`` in the imports cell.

    Parameters
    ----------
    cohirf_model : type[BaseCoHiRF]
        Class instantiated once per batch as ``cohirf_model(**cohirf_kwargs)``.
    cohirf_kwargs : dict, optional
        Keyword arguments for ``cohirf_model``; ``None`` means no arguments.
    batch_size : int
        Maximum number of rows per batch.
    max_epochs : int
        Upper bound on the number of epochs run by ``fit``.
    verbose : bool
        If true, progress messages are printed.

    Attributes (set by ``fit``)
    ---------------------------
    n_clusters_ : int
        Sum of the per-batch ``n_clusters_`` of the last epoch (or the number
        of starting representatives when no epoch ran).
    labels_ : ndarray of int
        Cluster id per sample; ``-1`` for a sample that reaches no representative.
    parents_ : ndarray
        Parent index of every sample.
    representatives_indexes_ : ndarray
        Absolute indexes (rows of ``X``) of the final representatives.
    cluster_representatives_ : ndarray
        Rows of ``X`` at ``representatives_indexes_``.
    n_epoch_ : int
        Number of epochs actually run.
    """

    def __init__(
        self,
        cohirf_model: type[BaseCoHiRF] = CoHiRF,
        cohirf_kwargs: Optional[dict] = None,
        batch_size: int = 1000,
        max_epochs: int = 10,
        verbose: bool = False,
    ):
        self.cohirf_model = cohirf_model
        self.cohirf_kwargs = cohirf_kwargs if cohirf_kwargs is not None else {}
        self.batch_size = batch_size
        self.max_epochs = max_epochs
        self.verbose = verbose

    def do_one_epoch(self, X_representatives):
        """Cluster every consecutive batch of ``X_representatives`` independently.

        Returns
        -------
        tuple
            ``(representatives_indexes, parents, n_clusters, stop)`` where both
            index arrays are positions into ``X_representatives``, ``n_clusters``
            is the sum of the per-batch cluster counts and ``stop`` is true when
            the whole input fitted in a single batch.
        """
        n_samples = X_representatives.shape[0]
        n_batches = int(np.ceil(n_samples / self.batch_size))
        # once everything fits in one batch there is nothing left to merge afterwards
        stop = n_batches == 1
        all_parents = []
        all_representatives_indexes = []
        all_n_clusters = 0
        for i in range(n_batches):
            start = i * self.batch_size
            end = min((i + 1) * self.batch_size, n_samples)
            indexes = np.arange(start, end)
            X_batch = X_representatives[indexes]
            # fit a fresh model on the batch
            cohirf_model = self.cohirf_model(**self.cohirf_kwargs)
            cohirf_model.fit(X_batch)

            # the model reports positions inside the batch; `indexes` maps them
            # back to positions inside X_representatives
            all_parents.append(indexes[cohirf_model.parents_])
            all_representatives_indexes.append(indexes[cohirf_model.representatives_indexes_])
            all_n_clusters = all_n_clusters + cohirf_model.n_clusters_

        all_parents = np.concatenate(all_parents)
        all_representatives_indexes = np.concatenate(all_representatives_indexes)
        return all_representatives_indexes, all_parents, all_n_clusters, stop

    def update_parents(self, old_parents, old_representatives_absolute_indexes, new_absolute_parents):
        """Return a copy of ``old_parents`` with the representatives' parents overwritten."""
        # np.array copies and also accepts list input, so the caller's data is never mutated
        new_parents = np.array(old_parents)
        new_parents[old_representatives_absolute_indexes] = new_absolute_parents
        return new_parents

    def get_all_parents_indexes(self, parents, representative_index):
        """List the representative and every sample that reaches it by following ``parents``.

        The search walks the parent links backwards level by level. Samples
        already collected are never revisited, so the walk terminates even if
        ``parents`` contains a cycle.
        """
        parents = np.asarray(parents)
        reached = np.zeros(parents.shape[0], dtype=bool)
        reached[representative_index] = True
        frontier = np.array([representative_index])
        while frontier.size > 0:
            # samples whose parent is in the current frontier and that are not collected yet
            children = np.flatnonzero(np.isin(parents, frontier) & ~reached)
            reached[children] = True
            frontier = children
        return np.flatnonzero(reached).tolist()

    def get_labels_from_parents(self, parents, representative_indexes):
        """Label every sample with the position of the representative it descends from.

        Samples that reach none of ``representative_indexes`` are labelled ``-1``.
        """
        if self.verbose:
            print("Getting labels from parents")
        parents = np.asarray(parents)
        # -1 instead of uninitialised memory for samples no representative reaches
        labels = np.full(parents.shape[0], -1, dtype=int)
        for i, representative_index in enumerate(representative_indexes):
            all_indexes = self.get_all_parents_indexes(parents, representative_index)
            labels[all_indexes] = i
        return labels

    def fit(self, X: pd.DataFrame | np.ndarray, y=None, sample_weight=None, representatives_indexes=None, parents=None):
        """Run batched epochs on ``X`` and store the resulting clustering on ``self``.

        Parameters
        ----------
        X : DataFrame or ndarray
            Samples in rows; a DataFrame is converted with ``to_numpy()``.
        y, sample_weight
            Accepted but not used.
        representatives_indexes, parents : array-like, optional
            State of a previous run to continue from. Either both or neither
            must be given; ``parents`` needs one entry per row of ``X``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If only one of ``representatives_indexes`` / ``parents`` is given,
            or if ``parents`` does not have one entry per sample.
        """
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()

        n_samples = X.shape[0]

        if representatives_indexes is None and parents is None:
            # every sample starts as its own representative and its own parent
            representatives_absolute_indexes = np.arange(n_samples)
            # separate array so parents_ never aliases representatives_indexes_
            parents = representatives_absolute_indexes.copy()
        else:
            # continuing from a previous run: both pieces of state are required
            if representatives_indexes is None or parents is None:
                raise ValueError("If you provide representatives_indexes, you must also provide parents.")
            representatives_absolute_indexes = np.array(representatives_indexes)
            # array copy: supports list input and keeps the caller's object untouched
            parents = np.array(parents)
            if parents.ndim != 1 or parents.shape[0] != n_samples:
                raise ValueError(
                    f"parents must be one-dimensional with one entry per sample ({n_samples}), "
                    f"got shape {parents.shape}."
                )

        # defined up front so that max_epochs <= 0 still yields a valid fitted state
        n_clusters = representatives_absolute_indexes.shape[0]

        i = 0
        stop = False
        # stop after the first epoch that needed a single batch
        while not stop and i < self.max_epochs:
            if self.verbose:
                print(f"Starting epoch {i}")

            X_representatives = X[representatives_absolute_indexes]

            (
                new_representatives_local_indexes,
                new_local_parents,
                n_clusters,
                stop,
            ) = self.do_one_epoch(X_representatives)

            # translate positions among the representatives into rows of X
            new_absolute_parents = representatives_absolute_indexes[new_local_parents]

            parents = self.update_parents(parents, representatives_absolute_indexes, new_absolute_parents)

            representatives_absolute_indexes = representatives_absolute_indexes[new_representatives_local_indexes]

            i += 1

        self.n_clusters_ = n_clusters
        self.labels_ = self.get_labels_from_parents(parents, representatives_absolute_indexes)
        self.parents_ = parents
        self.representatives_indexes_ = representatives_absolute_indexes
        self.cluster_representatives_ = X[representatives_absolute_indexes]
        self.n_epoch_ = i
        return self

    def fit_predict(
        self, X: pd.DataFrame | np.ndarray, y=None, sample_weight=None, representatives_indexes=None, parents=None
    ):
        """Fit on ``X`` (see ``fit``) and return the resulting ``labels_``."""
        return self.fit(X, y, sample_weight, representatives_indexes, parents).labels_

In [12]:
# Batched run with small batches (100 rows each), same per-batch CoHiRF settings as the baseline.
model = BatchCoHiRF(
    batch_size=100,
    verbose=True,
    cohirf_kwargs={
        "repetitions": 3,
        "n_features": "full",
        "kmeans_n_clusters": 5,
        "random_state": seed,
    },
)
labels = model.fit_predict(X)
ari = adjusted_rand_score(labels_true=y, labels_pred=labels)
print(f"Adjusted Rand Index: {ari:.3f}")

Starting epoch 0
Starting epoch 1
Getting labels from parents
Adjusted Rand Index: 0.857


In [7]:
# Same configuration as the preceding cell (batch_size=100).
# NOTE(review): the two cells record different outputs (2 epochs / ARI 0.857 vs
# 5 epochs / ARI 0.911); presumably they were executed against different revisions
# of the class. Re-run both from a fresh kernel to confirm.
small_batch_cohirf_kwargs = dict(repetitions=3, n_features="full", kmeans_n_clusters=5, random_state=seed)
model = BatchCoHiRF(batch_size=100, verbose=True, cohirf_kwargs=small_batch_cohirf_kwargs)
labels = model.fit_predict(X)
ari = adjusted_rand_score(labels_true=y, labels_pred=labels)
print(f"Adjusted Rand Index: {ari:.3f}")

Starting epoch 0
Starting epoch 1
Starting epoch 2
Starting epoch 3
Starting epoch 4
Getting labels from parents
Adjusted Rand Index: 0.911


In [27]:
# Batched run with large batches: 1e4 rows per batch, i.e. 10 batches in the first epoch.
model = BatchCoHiRF(
    batch_size=int(1e4),
    verbose=True,
    cohirf_kwargs={
        "repetitions": 3,
        "n_features": "full",
        "kmeans_n_clusters": 5,
        "random_state": seed,
    },
)
labels = model.fit_predict(X)
ari = adjusted_rand_score(labels_true=y, labels_pred=labels)
print(f"Adjusted Rand Index: {ari:.3f}")

Starting epoch 0
Starting epoch 1
Getting labels from parents
Adjusted Rand Index: 0.940


In [28]:
# Warm start: run a single epoch with model1, then let model2 continue from model1's state.

# Stage 1: exactly one epoch.
model1 = BatchCoHiRF(
    batch_size=int(1e4),
    verbose=True,
    cohirf_kwargs={"repetitions": 3, "n_features": "full", "kmeans_n_clusters": 5, "random_state": seed},
    max_epochs=1,
)
model1.fit(X)
representatives_indexes, parents = model1.representatives_indexes_, model1.parents_

# Stage 2: resume from the representatives and parents found in stage 1.
model2 = BatchCoHiRF(
    batch_size=int(1e4),
    verbose=True,
    cohirf_kwargs={"repetitions": 3, "n_features": "full", "kmeans_n_clusters": 5, "random_state": seed},
    max_epochs=10,
)
labels = model2.fit_predict(
    X,
    representatives_indexes=representatives_indexes,
    parents=parents,
)
ari = adjusted_rand_score(labels_true=y, labels_pred=labels)
print(f"Adjusted Rand Index: {ari:.3f}")

Starting epoch 0
Getting labels from parents
Starting epoch 0
Getting labels from parents
Adjusted Rand Index: 0.940


In [11]:
# Distinct cluster ids in the current `labels` array (whichever cell assigned it last;
# the execution counts in this notebook are out of order).
np.unique(labels)

array([0, 1, 2, 3, 4])

In [14]:
# Single-epoch run: with max_epochs=1 the per-batch clusters are never merged across batches,
# so the number of predicted clusters is the sum over the batches.
single_epoch_cohirf_kwargs = dict(repetitions=3, n_features="full", kmeans_n_clusters=5, random_state=seed)
model = BatchCoHiRF(
    batch_size=int(1e4),
    verbose=True,
    cohirf_kwargs=single_epoch_cohirf_kwargs,
    max_epochs=1,
)
labels = model.fit_predict(X)
ari = adjusted_rand_score(labels_true=y, labels_pred=labels)
print(f"Adjusted Rand Index: {ari:.3f}")

Starting epoch 0
Getting labels from parents
Adjusted Rand Index: 0.140


In [18]:
# Absolute sample indexes (rows of X) of the representatives kept by the last fitted `model`.
model.representatives_indexes_

array([ 4454,  8155,  1319,  5786,  4345,  9208,  4520, 18948, 16856,
       14054, 15850, 13536, 19827, 10760, 24089, 28736, 27064, 20993,
       26825, 21190, 26363, 30020, 36488, 38093, 32367, 37499, 36266,
       34995, 33285, 48802, 46824, 40562, 41756, 43830, 48583, 49298,
       56716, 51998, 52416, 54558, 56296, 56646, 51177, 57272, 66626,
       68126, 64008, 64608, 61422, 68127, 61759, 68686, 66784, 68325,
       60677, 60330, 64582, 62871, 64859, 68320, 68447, 67373, 63041,
       68166, 61140, 62192, 66565, 76307, 73294, 75226, 73120, 79526,
       74780, 78344, 76163, 88460, 87522, 80434, 84246, 83843, 81000,
       96397, 91836, 97007, 95245, 93106, 91025])