# Imports

In [1]:
from cohirf.experiment.open_ml_clustering_experiment import OpenmlClusteringExperiment
from cohirf.experiment.hpo_open_ml_clustering_experiment import HPOOpenmlClusteringExperiment
from pathlib import Path
import pandas as pd
from IPython.display import clear_output
import optuna
from cohirf.models.batch_cohirf import BatchCoHiRF
from cohirf.models.cohirf import BaseCoHiRF
from sklearn.cluster import DBSCAN

In [2]:
# Location where this notebook stores its results.
# Earlier location, kept for reference:
# results_dir = Path('/home/belucci/code/cohirf/results') / 'real'
results_dir = Path("/home/users/belucci/cohirf/results").joinpath("real")
# Create the folder (and any missing parents); a pre-existing folder is fine.
results_dir.mkdir(exist_ok=True, parents=True)
# SQLite database file inside the results folder, used as the MLflow tracking URI.
mlflow_tracking_uri = f"sqlite:///{results_dir}/mlflow.db"

In [3]:
# Keyword arguments unpacked into every OpenmlClusteringExperiment created below.
experiment_params = {
    "profile_memory": False,
    "mlflow_tracking_uri": mlflow_tracking_uri,
    "check_if_exists": False,
    "verbose": 1,
}

# KDD 99

## KMeans

In [5]:
# KMeans baseline on OpenML dataset 1110 (the "KDD 99" section): one run per seed,
# then a summary of ARI, fit time and memory over all runs.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        model="KMeans",
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=1110,
        standardize=True,
        raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())

            ARI      Time   Memory
count  1.000000  1.000000     1.00
mean   0.359913  2.222778  4797.82
std         NaN       NaN      NaN
min    0.359913  2.222778  4797.82
25%    0.359913  2.222778  4797.82
50%    0.359913  2.222778  4797.82
75%    0.359913  2.222778  4797.82
max    0.359913  2.222778  4797.82


In [4]:
# CoHiRF on OpenML dataset 1110 with n_samples_representative=10000: one run per
# seed, then a summary of ARI, fit time and memory over all runs.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        model="CoHiRF",
        model_params=dict(n_samples_representative=10000),
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=1110,
        standardize=True,
        raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())

            ARI       Time   Memory
count  1.000000   1.000000     1.00
mean  -0.003163  25.386191  4274.88
std         NaN        NaN      NaN
min   -0.003163  25.386191  4274.88
25%   -0.003163  25.386191  4274.88
50%   -0.003163  25.386191  4274.88
75%   -0.003163  25.386191  4274.88
max   -0.003163  25.386191  4274.88


In [18]:
import numpy as np

# Rough size of one dense float64 array of the given shape.
# NOTE(review): 10000 presumably mirrors n_samples_representative=10000 used in
# the CoHiRF run — confirm.
shape = (10000, 10000)
n_rows, n_cols = shape
itemsize = np.dtype(np.float64).itemsize  # bytes per float64 element
total_bytes = n_rows * n_cols * itemsize
# Bytes -> binary gigabytes (2**30 bytes each).
total_gb = total_bytes / 2**30
print(f"Memory required: {total_gb:.2f} GB")

Memory required: 0.75 GB


In [4]:
# BatchCoHiRF-1iter on OpenML dataset 1110 with n_jobs=10: one run per seed, then
# a summary of ARI, fit time and memory over all runs.
# Unlike the neighbouring cells, raise_on_error is left disabled here.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        model="BatchCoHiRF-1iter",
        model_params=dict(verbose=1),
        n_jobs=10,
        # model_params=dict(n_samples_representative=10000),
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=1110,
        standardize=True,
        # raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())

            ARI       Time  Memory
count  1.000000   1.000000     1.0
mean   0.164858  69.262438  4101.2
std         NaN        NaN     NaN
min    0.164858  69.262438  4101.2
25%    0.164858  69.262438  4101.2
50%    0.164858  69.262438  4101.2
75%    0.164858  69.262438  4101.2
max    0.164858  69.262438  4101.2


In [4]:
# Explicit model specification: the BatchCoHiRF class together with its keyword
# arguments, configured with BaseCoHiRF and a DBSCAN base model.
model_cls = BatchCoHiRF
model_params = {
    "cohirf_model": BaseCoHiRF,
    "cohirf_kwargs": {"base_model": DBSCAN, "max_iter": 1},
    "n_batches": 1000,
    "n_jobs": 10,
    "verbose": 1,
}

# Optuna distributions for the tunable values nested under cohirf_kwargs.
search_space = {
    "cohirf_kwargs": {
        "n_features": optuna.distributions.FloatDistribution(0.1, 1),
        "repetitions": optuna.distributions.IntDistribution(1, 10),
        "base_model_kwargs": {
            "eps": optuna.distributions.FloatDistribution(1e-1, 10),
            "min_samples": optuna.distributions.IntDistribution(2, 50),
        },
    }
}

# One concrete configuration using the same nesting as search_space.
default_values = [
    {
        "cohirf_kwargs": {
            "n_features": 0.3,
            "repetitions": 5,
            "base_model_kwargs": {
                "eps": 0.5,
                "min_samples": 5,
            },
        }
    },
]

In [5]:
# BatchCoHiRF given as a class (model_cls) plus keyword arguments (model_params)
# on OpenML dataset 1110: one run per seed, then a summary of ARI, fit time and
# memory over all runs.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        # model="BatchCoHiRF-DBSCAN-1iter",
        model=model_cls,
        model_params=model_params,
        n_jobs=10,
        # search_space=search_space,
        # default_values=default_values,
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=1110,
        standardize=True,
        raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())

            ARI       Time    Memory
count  1.000000   1.000000     1.000
mean   0.065867  83.848245  4100.824
std         NaN        NaN       NaN
min    0.065867  83.848245  4100.824
25%    0.065867  83.848245  4100.824
50%    0.065867  83.848245  4100.824
75%    0.065867  83.848245  4100.824
max    0.065867  83.848245  4100.824


In [6]:
# BatchCoHiRF-KernelRBF-1iter on OpenML dataset 1110 with n_jobs=10: one run per
# seed, then a summary of ARI, fit time and memory over all runs.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        model="BatchCoHiRF-KernelRBF-1iter",
        n_jobs=10,
        # search_space=search_space,
        # default_values=default_values,
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=1110,
        standardize=True,
        raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())

           ARI        Time    Memory
count  1.00000    1.000000     1.000
mean   0.04139  198.484997  4896.528
std        NaN         NaN       NaN
min    0.04139  198.484997  4896.528
25%    0.04139  198.484997  4896.528
50%    0.04139  198.484997  4896.528
75%    0.04139  198.484997  4896.528
max    0.04139  198.484997  4896.528


In [None]:
# BatchCoHiRF-SC-SRGF on OpenML dataset 1110 (100 batches, 10000 representative
# samples, a single job): one run per seed, then a summary of ARI, fit time and
# memory over all runs.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        model="BatchCoHiRF-SC-SRGF",
        model_params=dict(n_batches=100, cohirf_kwargs=dict(n_samples_representative=10000)),
        n_jobs=1,
        # search_space=search_space,
        # default_values=default_values,
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=1110,
        standardize=True,
        raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())

Combinations completed:   0%|          | 0/1 [00:00<?, ?it/s]

fatal: not a git repository (or any parent up to mount point /mnt/nfs)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


: 

# MNIST

## KMeans

In [12]:
# KMeans baseline on OpenML dataset 554 (the "MNIST" section): one run per seed,
# then a summary of ARI, fit time and memory over all runs.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        model="KMeans",
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=554,
        standardize=True,
        raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())

           ARI      Time    Memory
count  1.00000  1.000000     1.000
mean   0.28394  5.946795  2598.252
std        NaN       NaN       NaN
min    0.28394  5.946795  2598.252
25%    0.28394  5.946795  2598.252
50%    0.28394  5.946795  2598.252
75%    0.28394  5.946795  2598.252
max    0.28394  5.946795  2598.252


In [13]:
# DBSCAN on OpenML dataset 554 (the "MNIST" section): one run per seed, then a
# summary of ARI, fit time and memory over all runs.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        model="DBSCAN",
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=554,
        standardize=True,
        raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())

       ARI       Time    Memory
count  1.0   1.000000     1.000
mean   0.0  47.488333  2974.928
std    NaN        NaN       NaN
min    0.0  47.488333  2974.928
25%    0.0  47.488333  2974.928
50%    0.0  47.488333  2974.928
75%    0.0  47.488333  2974.928
max    0.0  47.488333  2974.928


In [14]:
# SpectralSubspaceRandomization on OpenML dataset 554 (the "MNIST" section): one
# run per seed, then a summary of ARI, fit time and memory over all runs.
# NOTE: the recorded run of this cell ended with
# "MemoryError: Unable to allocate 36.5 GiB for an array with shape (70000, 70000)
# and data type float64", so no summary was produced.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        model="SpectralSubspaceRandomization",
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=554,
        standardize=True,
        raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())

Combinations completed:   0%|          | 0/1 [00:00<?, ?it/s]

MemoryError: Unable to allocate 36.5 GiB for an array with shape (70000, 70000) and data type float64

In [None]:
# SpectralSubspaceRandomization on OpenML dataset 554 (the "MNIST" section): one
# run per seed, then a summary of ARI, fit time and memory over all runs.
# NOTE(review): this configuration is identical to the cell directly above, whose
# recorded output is a MemoryError for a (70000, 70000) float64 array; this cell
# has no recorded output. Consider keeping only one of the two.
seeds = list(range(1))
aris, times, memories = [], [], []
for seed in seeds:
    experiment = OpenmlClusteringExperiment(
        # model
        experiment_name="test",
        model="SpectralSubspaceRandomization",
        # Seed the model from the loop variable so that extending `seeds` gives
        # distinct repetitions instead of repeating seed 0 on every iteration.
        seed_model=seed,
        # dataset
        dataset_id=554,
        standardize=True,
        raise_on_error=True,
        **experiment_params,
    )
    # Only the first entry of the returned results is used.
    result = experiment.run(return_results=True)[0]
    aris.append(result["evaluate_model_return"]["adjusted_rand"])
    times.append(result["fit_model_return"]["elapsed_time"])
    memories.append(result["max_memory_used_after_fit"])
# Clear whatever the runs printed so that only the summary table stays visible.
clear_output(wait=True)
df = pd.DataFrame({"ARI": aris, "Time": times, "Memory": memories})
print(df.describe())