In [None]:
# Import libraries
# Dask
from dask.distributed import Client, SSHCluster, progress
import dask.array as da
import dask.dataframe as dd
# Others
import numpy as np
import pandas as pd

In [None]:
# Machines that make up the SSH cluster. Per the Dask SSHCluster docs the
# first address hosts the scheduler and the remaining ones host workers.
CLUSTER_HOSTS = ["10.67.22.240", "10.67.22.17", "10.67.22.100", "10.67.22.126"]

# NOTE(review): known_hosts=None is forwarded to the SSH connection and
# presumably disables host-key verification -- confirm this is acceptable
# on this network.
SSH_CONNECT_OPTIONS = {"known_hosts": None}

# Each worker process runs with two threads.
WORKER_OPTIONS = {"nthreads": 2}

# port 0 presumably lets the scheduler bind to any free port (TODO confirm);
# the dashboard is served on port 8787.
SCHEDULER_OPTIONS = {"port": 0, "dashboard_address": ":8787"}

cluster = SSHCluster(
    CLUSTER_HOSTS,
    connect_options=SSH_CONNECT_OPTIONS,
    worker_options=WORKER_OPTIONS,
    scheduler_options=SCHEDULER_OPTIONS,
)

# Connect this notebook session to the cluster's scheduler
client = Client(cluster)

In [None]:
# Display the client object as the cell output
client

## Developing the K-means|| algorithm

### Aux functions

One important function here is the cost:

$$\phi_X(C) = \sum_{x\in X} d^2(x, C) = \sum_{x\in X} \min_{i=1,..., k}||x-c_i||^2$$

Notice that:

$$||x-y||^2 = \sum_j (x_j - y_j)^2 = \sum_j \left(x_j^2 - 2x_j y_j + y_j^2\right)$$

Then, for $x_n\in\{x_1, x_2, ..., x_{|X|}\}$ and $c_m\in\{c_1, c_2, ..., c_{|C|}\}$, we can define the squared distance matrix $D^2\in\mathbb{R}^{|X|\times |C|}$ as:

$$D^2_{nm} = ||x_n-c_m||^2 = \sum_j \left(x_{nj}^2 - 2x_{nj} c_{mj} + c_{mj}^2\right)$$

$$D^2_{nm} = \sum_j x_{nj}x^T_{jn} - 2\sum_j x_{nj} c^T_{jm} + \sum_j c_{mj}c^T_{jm} = (XX^T)_{nn} - 2 (XC^T)_{nm} + (CC^T)_{mm}$$

Furthermore, notice that if the data $X$ is fixed, then the matrix $(XX^T)$ is constant and does not depend on the choice of centroids $C$. Therefore, we can calculate the vector $(XX^T)_{nn}$ once at the beginning of the process and store it, so that we don't need to recalculate it every time we want to evaluate the cost function.

In [None]:
def get_first_sample(data_path):
    '''
    Load only the first data row of a CSV file.

    Params:
        data_path : str
            Location of the CSV file to read.
    Output:
        pandas DataFrame with a single row: the first
        data row of the file.
    '''
    # nrows=1 makes pandas stop after one data row instead of
    # parsing the whole file
    first_row = pd.read_csv(data_path, nrows=1)
    return first_row


def get_XXT_term(X):
    '''
    Compute the per-point squared norm, i.e. the diagonal
    of X*X^T that appears in the squared distance matrix.

    Params:
        X : Dask array or dataframe
            Array containing data points, one point per row.
    Output:
        Returns a 1-D Dask array with one entry per row of X,
        holding sum_j X[n, j]^2 (the self multiplication term
        from the squared distance matrix formula).
    '''
    # Turn into array
    X_da = da.array(X)
    # diag(X X^T)_n equals the sum of squares of row n. Computing it
    # row-wise avoids building the full (n_points x n_points) product
    # only to discard everything except its diagonal.
    XXT = da.sum(X_da * X_da, axis=1)
    return XXT


def partial_squared_dist_matrix(C, X):
    '''
    Compute the centroid-dependent part of the squared
    distance matrix.

    Params:
        C : Dask array or dataframe
            Array containing centroid locations, one
            centroid per row.
        X : Dask array or dataframe
            Array containing data points, one point per row.
    Output:
        Returns the partial squared distance matrix,
        evaluated over the set of points X with respect
        to the centroids C. The partial matrix is defined
        as:
         D'^2 = -2*XC^T + CC^T
        with shape (rows of X, rows of C). Adding the
        per-point term diag(XX^T) to every row yields the
        full squared distances ||x_n - c_m||^2.
    '''
    # Turn both inputs into arrays
    X_da = da.array(X)
    C_da = da.array(C)

    # Calculate XC term: inner products between every point and centroid
    XC_term = da.matmul(X_da, da.transpose(C_da))

    # Calculate CC term: squared norm of each centroid (diagonal of CC^T)
    CC_term = da.sum(C_da * C_da, axis=1)

    # The cross term enters with a MINUS sign:
    # ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2
    # CC_term broadcasts along the centroid (column) axis.
    return CC_term - 2 * XC_term
    

def cost_function(C, X, XXT):
    '''
    Evaluate the K-means cost of the points X with respect
    to the centroids C.

    Params:
        C : Dask array or dataframe
            Array containing centroid locations
        X : Dask array or dataframe
            Array containing data points.
        XXT : Dask array or dataframe
            Array with same number of rows as X,
            containing the X*X^T term of the
            squared distance matrix.
    Output:
        Returns the cost as a lazy Dask scalar, assembled as
        sum(XXT) plus the sum over points of the row-wise
        minimum of partial_squared_dist_matrix(C, X).
    '''
    # Centroid-dependent part of the squared distances
    partial_d2 = partial_squared_dist_matrix(C, X)

    # For each point keep the smallest entry across centroids (axis=1).
    # The XXT term is constant per row, so it is added after the min.
    nearest_partial = da.min(partial_d2, axis=1)

    # Total cost: constant term plus the minimized partial term
    return da.sum(XXT) + da.sum(nearest_partial)

### Main pipeline

In [None]:
# Script parameters
RANDOM_SEED = 42
INPUT_DATA = 'testing_data.csv'
LABEL_COLUMN = 'label'  # set to None when the input has no label column
NPARTITIONS = 1

# Initialize random number generator from Dask and numpy seed
rng = da.random.default_rng(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Read input data locally with pandas
data = pd.read_csv(INPUT_DATA)
data_shape = data.shape

# Separate labels from input
if LABEL_COLUMN is not None:
    # Labels
    future = client.scatter(data[LABEL_COLUMN])  # send labels to one worker
    y_true = dd.from_delayed([future], meta=data[LABEL_COLUMN])  # build dask.dataframe on remote data
    y_true = y_true.repartition(npartitions=NPARTITIONS).persist()  # split
    client.rebalance(y_true)  # spread around all of your workers

    # Input: every column except the label
    X_local = data.drop(columns=[LABEL_COLUMN])
else:
    # Only input: the whole table is feature data
    X_local = data

# Distribute the feature table. This is shared by both branches above.
# dd.from_delayed accepts no `shape` argument, so only `meta` is passed.
X_width = X_local.shape[1]
future = client.scatter(X_local)  # send data to one worker
X = dd.from_delayed([future], meta=X_local)  # build dask.dataframe on remote data
X = X.repartition(npartitions=NPARTITIONS).persist()  # split
client.rebalance(X)  # spread around all of your workers

# Run the K-means algorithm:
# Get first sample as initial centroid
first_sample = get_first_sample(INPUT_DATA)
if LABEL_COLUMN is not None:
    first_sample = first_sample.drop(columns=[LABEL_COLUMN])
# Wrap the flattened row in a list so C has shape (1, n_features)
C = da.array([np.array(first_sample).flatten()])

# Calculate constant XXT term,
# also persist since we are going to reuse it
XXT = get_XXT_term(X).persist()

# Get initial cost function
phi_init = cost_function(C, X, XXT)
phi_init = phi_init.compute()
phi_init

## Closing the client

In [None]:
# Disconnect this session from the scheduler, then shut down the
# SSH cluster itself. The cluster was created explicitly, so closing
# the client alone would leave the remote scheduler and workers running.
client.close()
cluster.close()