
# Scikit Learn

Apply scikit-learn's k-means algorithm to a grid and density from Octopus, as a means of testing the choice of centroids / the Fortran centroid algorithm.

In [2]:
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

# Number of grid points both input files are expected to describe
n_points = 97013

# Density values: the file must hold exactly one value per grid point
rho = np.loadtxt("../fortran_data/rho.dat")
assert np.size(rho) == n_points

# Grid: one row of three columns per grid point
grid = np.loadtxt("../fortran_data/grid.dat")
assert np.shape(grid) == (n_points, 3)

# Fixed seed so the k-means++ initialisation, and therefore the centroid files
# written below, are reproducible from one run of the notebook to the next
RANDOM_SEED = 0

for n_clusters in [5, 10, 15, 20, 30, 50, 100, 150, 200]:
    # Weighted k-means: the density rho is passed as the per-point sample weight
    k_means = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init='auto',
        max_iter=300,
        tol=0.0001,
        verbose=0,
        random_state=RANDOM_SEED,
        algorithm='lloyd',
    )
    k_means.fit(grid, sample_weight=rho)
    clusters = k_means.cluster_centers_
    assert clusters.shape == (n_clusters, 3)

    # Discretise the continuous cluster points to the grid: for every centroid,
    # pick the grid point at the smallest Euclidean distance (cdist's default metric).
    # Each centroid is snapped independently, so two centroids may share a grid point.
    distances = cdist(clusters, grid)
    indices = np.argmin(distances, axis=1)
    discrete_clusters = grid[indices]

    # Output to file, one row per centroid: grid index followed by its three coordinates.
    # column_stack promotes the integer indices to float; '%d' writes them back as integers.
    # NOTE(review): indices are 0-based NumPy row numbers - confirm what the consumer
    # of these files expects.
    output = np.column_stack([indices, discrete_clusters])
    np.savetxt(f'sklearn_centroids_{n_clusters}.txt', output, fmt='%d %20.16f %20.16f %20.16f')





In [28]:
# Alternative implementation
# indices2 = np.empty(shape=n_clusters, dtype=int)
# for i in range(n_clusters):
#     # Calculate Euclidean distance to all grid points
#     distances = np.sqrt(np.sum((grid - clusters[i, :]) ** 2, axis=1))
#     # Find index of minimum distance
#     indices2[i] = np.argmin(distances)


