In [None]:
# Runs the external script hubert/learn_kmeans.py in a subshell.
# NOTE(review): the script is not part of this notebook; the positional
# arguments presumably mean (feat_dir, split, nshard, km_path, n_clusters) as in
# fairseq's HuBERT simple_kmeans example — confirm against the script's argparse.
!python hubert/learn_kmeans.py /content/km 0 1 /content/km/kmeans_model.ext 100 --percent 0.1

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Location of the parquet feature files and the single column read from them.
feat_dir_path = "/content/mfcc_features"
columns_to_load = ["features"]

# Read only the requested column and convert it to a NumPy array; the code
# below indexes it as df[row, 0], i.e. one cell per parquet row.
dataset = ds.dataset(feat_dir_path, format="parquet")
df = (
    dataset
    .to_table(columns=columns_to_load)
    .to_pandas()
    .to_numpy()
)

# Reshape every stored cell, in place, into rows of 39 values.
for i in range(len(df)):
    df[i, 0] = df[i, 0].reshape(-1, 39)

# Stack all reshaped cells into one 2-D matrix with 39 columns.
training_data = np.vstack(list(df[:, 0]))

In [None]:
# Hyperparameters for the mini-batch k-means model fitted below.
n_clusters = 100
batch_size = 2  # experiment with ones that fit better
init = "k-means++"
max_iter = 100
max_no_improvement = 100
reassignment_ratio = 0
tol = 0.0
n_init = 20
# Fixed seed: initialisation and mini-batch sampling are random, so without it
# every Restart & Run All would produce a different model.
random_state = 0

knn = MiniBatchKMeans(
    n_clusters=n_clusters,
    init=init,
    max_iter=max_iter,
    batch_size=batch_size,
    verbose=1,
    compute_labels=False,
    tol=tol,
    max_no_improvement=max_no_improvement,
    init_size=None,
    n_init=n_init,
    reassignment_ratio=reassignment_ratio,
    random_state=random_state,
)

print(training_data.shape)

knn.fit(training_data)

# Persist the fitted estimator so the labelling cell can reload it.
km_path = "/content/knn_100.model"
joblib.dump(knn, km_path)

# score() returns the negated k-means objective, so negate it back and divide
# by the number of training rows to get a per-row value.
inertia = -knn.score(training_data) / len(training_data)
print(f"total intertia: {inertia}")

In [None]:
# Runs the external script hubert/dump_km_label.py in a subshell.
# NOTE(review): the script is not part of this notebook; the positional
# arguments presumably mean (feat_dir, split, km_path, nshard, rank, lab_dir) as
# in fairseq's HuBERT simple_kmeans example — confirm against its argparse.
# NOTE(review): the model path given here (/content/km/dm.ext) differs from the
# one passed to learn_kmeans.py (/content/km/kmeans_model.ext) — verify.
!python hubert/dump_km_label.py /content/km 0 /content/km/dm.ext 1 0 /content/km

In [None]:
class ApplyKmeans(object):
    """Assign feature vectors to the nearest centroid of a saved k-means model.

    The model is loaded once with joblib; its centroids are stored transposed
    (``C_np``, shape ``(feature_dim, n_clusters)``) together with their squared
    norms (``Cnorm_np``) so that each call is a single matrix product.
    """

    def __init__(self, km_path):
        """Load the fitted model stored at ``km_path`` and cache its centroids.

        Args:
            km_path: Path to a file written by ``joblib.dump``; the loaded
                object must expose ``cluster_centers_``.
        """
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code — only load model files from a trusted source.
        self.km_model = joblib.load(km_path)
        self.C_np = self.km_model.cluster_centers_.transpose()
        self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True)

    def __call__(self, x):
        """Return the index of the closest centroid for each feature vector.

        Args:
            x: NumPy array holding one feature vector (``(feature_dim,)``) or a
                batch of them (``(n, feature_dim)``). Any array whose size is a
                multiple of ``feature_dim`` is viewed as rows of that width.

        Returns:
            1-D integer array with one centroid index per row (length 1 for a
            single vector, length 0 for empty input).

        Raises:
            ValueError: If the input size is not a multiple of ``feature_dim``.
        """
        # Reshape to the centroid width instead of forcing a single row, so a
        # batch of frames is labelled in one call; a single frame still becomes
        # shape (1, feature_dim) exactly as before.
        x = x.reshape(-1, self.C_np.shape[0])
        # Squared Euclidean distance expanded as ||x||^2 - 2 x.C + ||C||^2.
        dist = (
            (x ** 2).sum(1, keepdims=True)
            - 2 * np.matmul(x, self.C_np)
            + self.Cnorm_np
        )
        return np.argmin(dist, axis=1)

# Paths: the fitted model to load, the label file to write, and the parquet
# feature directory to read.
km_path = "/content/knn_100.model"
lab_dir = "/content/labels"
lab_path = f"{lab_dir}/knn100.km"
feat_dir = "/content/mfcc_features"
columns_to_load = ["features"]

apply_kmeans = ApplyKmeans(km_path)
dataset = ds.dataset(feat_dir, format="parquet")
df = dataset.to_table(columns=columns_to_load).to_pandas().to_numpy()

# Reshape every stored cell, in place, into rows of 39 values.
for i in range(df.shape[0]):
    df[i, 0] = df[i, 0].reshape(-1, 39)

mfcc_features_np = np.vstack([df[i, 0] for i in range(df.shape[0])])

# The label directory is not created anywhere else in this notebook; without
# this, open() raises FileNotFoundError on a fresh runtime.
os.makedirs(lab_dir, exist_ok=True)

# NOTE(review): rows from all parquet entries are stacked above, so this writes
# one label per line with no boundary between entries. If the consumer expects
# one space-separated line per utterance, label each df[i, 0] separately —
# confirm the required format before changing it.
with open(lab_path, mode="w", newline="") as file:
    for row in mfcc_features_np:
        labels = apply_kmeans(row).tolist()
        file.write(" ".join(map(str, labels)) + "\n")