# Dask client setup

In [1]:
from dask.distributed import LocalCluster
cluster = LocalCluster(n_workers=4,processes=True,
    threads_per_worker=1)          # Fully-featured local Dask cluster
client = cluster.get_client()
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 14.86 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41797,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 14.86 GiB

0,1
Comm: tcp://127.0.0.1:36787,Total threads: 1
Dashboard: http://127.0.0.1:42309/status,Memory: 3.71 GiB
Nanny: tcp://127.0.0.1:40767,
Local directory: /tmp/dask-scratch-space/worker-z43j90ar,Local directory: /tmp/dask-scratch-space/worker-z43j90ar

0,1
Comm: tcp://127.0.0.1:40465,Total threads: 1
Dashboard: http://127.0.0.1:42775/status,Memory: 3.71 GiB
Nanny: tcp://127.0.0.1:42549,
Local directory: /tmp/dask-scratch-space/worker-4f1833z1,Local directory: /tmp/dask-scratch-space/worker-4f1833z1

0,1
Comm: tcp://127.0.0.1:37235,Total threads: 1
Dashboard: http://127.0.0.1:33441/status,Memory: 3.71 GiB
Nanny: tcp://127.0.0.1:44593,
Local directory: /tmp/dask-scratch-space/worker-btqu0s9x,Local directory: /tmp/dask-scratch-space/worker-btqu0s9x

0,1
Comm: tcp://127.0.0.1:43769,Total threads: 1
Dashboard: http://127.0.0.1:37655/status,Memory: 3.71 GiB
Nanny: tcp://127.0.0.1:35669,
Local directory: /tmp/dask-scratch-space/worker-ilfml5wb,Local directory: /tmp/dask-scratch-space/worker-ilfml5wb


In [2]:
import sys
sys.path.append('../')

import algorithms.lloyd_clustering as lloyd

# Generate data

In [18]:
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import numpy as np

# Generate data for different center amounts
for center_amount in range(2, 11):
    X, y = make_blobs(n_samples=1000, centers=center_amount, n_features=2, random_state=42)
    X = StandardScaler().fit_transform(X)
    np.savetxt(f'../data/data_center_{center_amount}.txt', X, fmt='%.8f')

# Generate data for different dimension amounts
for dimension_amount in range(2, 11):
    X, y = make_blobs(n_samples=1000, centers=5, n_features=dimension_amount, random_state=42)
    X = StandardScaler().fit_transform(X)
    np.savetxt(f'../data/data_dimension_{dimension_amount}.txt', X, fmt='%.8f')

# Generate data for different sample amounts
for sample_amount in range(1000, 10001, 1000):
    X, y = make_blobs(n_samples=sample_amount, centers=5, n_features=2, random_state=42)
    X = StandardScaler().fit_transform(X)
    np.savetxt(f'../data/data_sample_{sample_amount}.txt', X, fmt='%.8f')


## Test the dask version with the dashboard to see if the code is running in parallel

In [3]:
lloyd.lloyd_algorithm('../data/data_05.txt',5,True,3)

{(np.float64(24.253505300698617),
  np.float64(77.24744735942157),
  np.float64(57.60686892373834),
  np.float64(41.833895158532414),
  np.float64(50.9036591919488)),
 (np.float64(32.794065774457934),
  np.float64(27.666926029671128),
  np.float64(23.206297333748314),
  np.float64(51.28078638862953),
  np.float64(49.50612096690528)),
 (np.float64(46.36948501541416),
  np.float64(25.585434881487846),
  np.float64(78.56031738009804),
  np.float64(60.84580785364128),
  np.float64(51.28473240006065)),
 (np.float64(71.65840429845905),
  np.float64(67.70377128953771),
  np.float64(42.65845498783455),
  np.float64(77.68881792376318),
  np.float64(49.4102798053528)),
 (np.float64(77.4585302343524),
  np.float64(52.477625614813384),
  np.float64(48.97892757257209),
  np.float64(22.298968077924584),
  np.float64(51.6286044941653))}

it's visibly parallel but the data is quite small

# Run all versions of the algorithm.
Use memray and timeit to measure performance.

## Import all required extra packages for benchmarking

In [None]:
import logging
import timeit
import memray
import ast
import dask.array as da

## Time benchmarks using timeit.

In [1]:
log = logging.getLogger(__name__)

def time_benchmark():
    log.info('Starting time benchmark')
    
    for center_amount in range(2,11):
        # Read the data from the file
        with open(f'../data/data_center_{center_amount}.txt', encoding="utf-8") as file:
            filecontent = file.read()
            points: list[tuple[float, ...]] = list(ast.literal_eval(filecontent))
        
        # Iterate 100 times to check means and error margin.
        for i in range(0,100):
            initial_centers= lloyd.k_means_plus_plus(points, center_amount)            
            # Time the implementations
            time_base = timeit.timeit(lambda: lloyd.k_means_base(points, initial_centers), number=1)
            time_numpy = timeit.timeit(lambda: lloyd.k_means_numpy(np.array(points), initial_centers), number=1)
            time_dask = timeit.timeit(lambda: lloyd.k_means_dask(da.array(points), initial_centers), number=1)
            
            log.info(f'saving results for center amount: {center_amount}, iteration: {i}')
            np.savetxt(f'../output/output_center_{center_amount}.csv', f'{time_base},', fmt='%.8f')
            np.savetxt(f'../output/output_center_{center_amount}.csv', f'{time_numpy},', fmt='%.8f')
            np.savetxt(f'../output/output_center_{center_amount}.csv', f'{time_dask},', fmt='%.8f')

    for dimensions in range(2,11):
        #Read the data from the file
        with open(f'../data/data_dimension_{dimensions}.txt', encoding="utf-8") as file:
            filecontent = file.read()
            points: list[tuple[float, ...]] = list(ast.literal_eval(filecontent))
        
        # Iterate 100 times to check means and error margin.
        for i in range(0,100):
            initial_centers= lloyd.k_means_plus_plus(points, 5)
            
            # Time the implementations
            time_base = timeit.timeit(lambda: lloyd.k_means_base(points, initial_centers), number=1)
            time_numpy = timeit.timeit(lambda: lloyd.k_means_numpy(np.array(points), initial_centers), number=1)
            time_dask = timeit.timeit(lambda: lloyd.k_means_dask(da.array(points), initial_centers), number=1)
            
            log.info(f'saving results for dimension amount: {dimensions}, iteration: {i}')
            with open(f'../output/output_dimension_{dimensions}.csv', 'a') as file:
                file.write(f'{time_base},')
                file.write(f'{time_numpy},')
                file.write(f'{time_dask}\n')
        
    for sample_amount in range(1000, 10001, 1000):
        # Read the data from the file
        with open(f'../data/data_sample_{sample_amount}.txt', encoding="utf-8") as file:
            filecontent = file.read()
            points: list[tuple[float, ...]] = list(ast.literal_eval(filecontent))
        
        # Iterate 100 times to check means and error margin.
        for i in range(0,100):
            initial_centers= lloyd.k_means_plus_plus(points, 5)
            
            # Time the implementations
            time_base = timeit.timeit(lambda: lloyd.k_means_base(points, initial_centers), number=1)
            time_numpy = timeit.timeit(lambda: lloyd.k_means_numpy(np.array(points), initial_centers), number=1)
            time_dask = timeit.timeit(lambda: lloyd.k_means_dask(da.array(points), initial_centers), number=1)
            
            log.info(f'saving results for sample amount: {sample_amount}, iteration: {i}')
            np.savetxt(f'../output/output_sample_{sample_amount}.csv', f'{time_base},', fmt='%.8f')
            np.savetxt(f'../output/output_sample_{sample_amount}.csv', f'{time_numpy},', fmt='%.8f')
            np.savetxt(f'../output/output_sample_{sample_amount}.csv', f'{time_dask},', fmt='%.8f')

SyntaxError: incomplete input (3361026211.py, line 29)