# Dask client setup

In [10]:
from dask.distributed import LocalCluster
# Fully-featured local Dask cluster: four worker processes, one thread each.
cluster = LocalCluster(
    n_workers=4,
    processes=True,
    threads_per_worker=1,
)

# Client bound to the cluster above; the bare expression on the last line
# lets the notebook display it.
client = cluster.get_client()
client

In [2]:
import sys
# Extend the module search path with the parent directory so the `algorithms`
# package imported below can be resolved (presumably it lives one level up
# from this notebook — confirm against the repository layout).
sys.path.append('../')

# Local module, referenced as `lloyd` by the benchmark code further down.
import algorithms.lloyd_clustering as lloyd

# Generate data

In [18]:
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import numpy as np

# One entry per dataset to write: (output path, n_samples, centers, n_features).
# Three sweeps, each varying a single parameter while the others stay fixed:
#   * number of centers   2..10       (1000 samples, 2 features)
#   * number of features  2..10       (1000 samples, 5 centers)
#   * number of samples   1000..10000 (5 centers, 2 features)
dataset_specs = [
    *(
        (f'../data/data_center_{center_amount}.txt', 1000, center_amount, 2)
        for center_amount in range(2, 11)
    ),
    *(
        (f'../data/data_dimension_{dimension_amount}.txt', 1000, 5, dimension_amount)
        for dimension_amount in range(2, 11)
    ),
    *(
        (f'../data/data_sample_{sample_amount}.txt', sample_amount, 5, 2)
        for sample_amount in range(1000, 10001, 1000)
    ),
]

for out_path, n_samples, n_centers, n_features in dataset_specs:
    # Fixed seed so every run regenerates identical blobs; the labels `y`
    # returned by make_blobs are not written to disk.
    X, y = make_blobs(
        n_samples=n_samples,
        centers=n_centers,
        n_features=n_features,
        random_state=42,
    )
    # Scale the features with StandardScaler before saving them as text,
    # one point per row with 8 decimal places.
    X = StandardScaler().fit_transform(X)
    np.savetxt(out_path, X, fmt='%.8f')


## Test the Dask version with the dashboard to check that the code runs in parallel

In [None]:
# Manual smoke run, meant to be watched on the Dask dashboard.
# NOTE(review): `lloyd_algorithm` is not defined or imported in the cells shown
# here (the module is imported as `lloyd`) — confirm whether this should be
# `lloyd.lloyd_algorithm`.
# NOTE(review): '../data/data_05.txt' is not one of the files written by the
# data-generation cell (data_center_*, data_dimension_*, data_sample_*) —
# confirm the intended input file.
# NOTE(review): the meaning of the positional arguments (5, True, 3) is not
# visible from this notebook — verify against the function's signature.
lloyd_algorithm('../data/data_05.txt',5,True,3)

The dashboard shows the work visibly running in parallel, but the data set is quite small.

# Run all versions of the algorithm.
Use memray and timeit to measure performance.

## Import all required extra packages for benchmarking

In [None]:
import logging
import tqdm
import timeit
import memray
import ast

## Time benchmarks using timeit.

In [1]:
# Logger used by time_benchmark below to report progress.
log = logging.getLogger(__name__)

def time_benchmark():
    log.info('Starting time benchmark')
    
    for center_amount in range(2,11):
        # Read the data from the file
        with open(f'../data/data_center_{center_amount}.txt', encoding="utf-8") as file:
            filecontent = file.read()
            points: list[tuple[float, ...]] = list(ast.literal_eval(filecontent))
        
        # Iterate 100 times to check means and error margin.
        for i in range(0,100):
            initial_centers= lloyd._k_means_plus_plus(center_amount, points)
            
            # Time the implementations
            time_base = timeit.timeit(lambda: lloyd._k_means_base(points, initial_centers), number=1)
            time_numpy = timeit.timeit(lambda: lloyd._k_means_numpy(points, initial_centers), number=1)
            time_dask = timeit.timeit(lambda: lloyd._k_means_dask(points, initial_centers), number=1)
            
            log.info(f'saving results for center amount: {center_amount}, iteration: {i}')
            np.savetxt(f'../output/output_center_{center_amount}.csv', f'{time_base},', fmt='%.8f')
            np.savetxt(f'../output/output_center_{center_amount}.csv', f'{time_numpy},', fmt='%.8f')
            np.savetxt(f'../output/output_center_{center_amount}.csv', f'{time_dask},', fmt='%.8f')

    for dimensions in range(2,11):
        #Read the data from the file
        with open(f'../data/data_dimension_{dimensions}.txt'
                
        
        