# MiniBatchKMeans Demo
This demo illustrates how SIMBSIG can be used for MiniBatchKMeans, and how the use compares to scikit-learn. 

In [36]:
# DELETE
# Mount your google drive in google colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
# DELETE
# Insert the directory
import sys
sys.path.insert(0,'/content/drive/MyDrive/simbsig/')
# Import worker classes
# from src.TestModule import TestClass
from simbsig.cluster import MiniBatchKMeans

In [38]:
# from simbsig.cluster import MiniBatchKMeans
from sklearn.cluster import MiniBatchKMeans as MiniBatchKMeans_sk
import h5py as h5py
import numpy as np
from sklearn.datasets import make_blobs
import os

## Set Parameters

In [39]:
n_clusters = 3
trn_pts_per_cluster = 10
query_pts_per_cluster = 5
n_dim = 2
rng = np.random.RandomState(42)
centers = rng.uniform(low=-20,high=20,size=(n_clusters,n_dim))
centers_init = rng.uniform(low=-20,high=20,size=(n_clusters,n_dim))

## Create Toy Data
### numpy arrays

In [41]:
X_train, y_train = make_blobs(n_samples=n_clusters * trn_pts_per_cluster, centers=centers, n_features=n_dim,
                              random_state=42)

X_query, y_query = make_blobs(n_samples=n_clusters * query_pts_per_cluster, centers=centers, n_features=n_dim,
                              random_state=43)

X_train = np.array(X_train)
X_query = np.array(X_query)

X_train = np.random.uniform(low=0, high=5, size=(30,2))
X_query = np.random.uniform(low=0, high=5, size=(15,2))


### hdf5 files

In [42]:
# when working on colab, google drive can be used to save and read data
from google.colab import drive
drive.mount('/content/drive')

# depending on the structure of your google drive, you might want to choose a different dataset path
dataset_path = '/content/drive/MyDrive/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
# hdf5 files using h5py
train_file = f'train.hdf5'
query_file = f'query.hdf5'

train_data.close()
query_data.close()
with h5py.File(os.path.join(dataset_path, f"{train_file}"), 'w') as f:
    f.create_dataset("X", data=X_train)

with h5py.File(os.path.join(dataset_path, f"{query_file}"), 'w') as f:
    f.create_dataset("X", data=X_query)

# open the hdf5 file for use
train_data = h5py.File(os.path.join(dataset_path, train_file), 'r')
query_data = h5py.File(os.path.join(dataset_path, query_file))
a = train_data['X']
len(a)

30

## Scikit-Learn


In [44]:
mbkm_sk = MiniBatchKMeans_sk(n_clusters=3, init=centers_init)

mbkm_sk.fit(X_train)
mbkm_sk.predict(X_query)

  super()._check_params(X)


array([2, 0, 1, 0, 1, 0, 1, 2, 2, 0, 0, 1, 0, 1, 1], dtype=int32)

## SIMBSIG
### Using numpy arrays and CPU only
SIMBSIG can be used very similar to scikit-learn. In an existing workflow using scikit-learn, which may be on the verge of exceeding runtime or memory requirements, this allows a seamless transition to SIMBSIG. Notice that when using different methods for KMeans clustering, the labels assigned to the clusters may be of different order.


In [45]:
mbkm = MiniBatchKMeans(n_clusters=3, init=centers_init)

mbkm.fit(X_train)
mbkm.predict(X_query)

30
30
[[1.75301537 2.74224   ]
 [2.87866859 2.92583461]
 [0.67943025 0.25190101]
 [1.44707225 2.4795751 ]
 [2.63554612 2.59275228]
 [0.30016169 0.98933987]
 [2.9817255  4.2730066 ]
 [3.32022373 4.39239379]
 [1.12020171 4.20809347]
 [2.44162769 2.97034718]
 [2.75304759 2.92197316]
 [1.71175546 1.1315079 ]
 [0.35563929 4.80950369]
 [4.46041419 2.62490706]
 [3.15991216 1.83338047]
 [2.24264762 2.91461374]
 [3.14840631 3.05225147]
 [2.53191357 3.70320035]
 [4.0324448  1.84200755]
 [1.30622026 0.57575946]
 [0.52815336 2.53817984]
 [0.5609386  1.62152762]
 [0.5859996  2.67519745]
 [2.84887705 0.1606158 ]
 [0.7623426  1.00685724]
 [3.96765312 3.08023   ]
 [3.64135359 4.44460153]
 [0.92844268 4.35097879]
 [3.13119109 3.98271301]
 [0.48799849 0.70331443]]




array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### Using hdf5 files and CPU only
If saving the entire data at once in the computer memory using numpy arrays is not reasonable anymore, the hdf5 file format can help. SIMBSIG can use data in hdf5 files, by setting the `mode` argument to `cpu`

In [46]:
mbkm_hdf5 = MiniBatchKMeans(n_clusters=3, init=centers_init)

mbkm_hdf5.fit(train_data)
#mbkm_hdf5.predict(query_data)

1
1
['X']


ValueError: ignored

### Using GPU acceleration
If data gets big, the execution time becomes an issue. SIMBSIG features GPU acceleration, by setting the `device` argument to `gpu`. This works with both inputs, numpy arrays and hdf5 files.

In [None]:
mbkm_hdf5 = MiniBatchKMeans(n_neighbors=2, mode='hdf5', device='gpu', init=centers_init)

mbkm_hdf5.fit(train_data)
mbkm_hdf5.predict(query_data)