## Benchmarking

In [1]:
from functions import compute_choleskyQR_parallel_optimal, direct_tsqr

### 1) Changing the number of partitions 

Let's spawn a cluster with 12 workers:

In [None]:
from dask.distributed import Client, SSHCluster

cluster = SSHCluster(
    ["10.67.22.154", "10.67.22.216", "10.67.22.116", "10.67.22.113"],
    connect_options={"known_hosts": None},
    remote_python="/home/ubuntu/miniconda3/bin/python",
    scheduler_options={"port": 8786, "dashboard_address": ":8797"},
    worker_options={
        "nprocs": 4,        # We spawn 4 processes for each VM (4-core) -> 12 workers
        "nthreads": 1       # We use 1 threads. Following Dask documentation, however, Numpy should release well the GIL lock thus we could use more than 1 thread. 
    }
)

client = Client(cluster)

In [3]:
# Let's see if everything went smoothly
cluster

0,1
Dashboard: http://10.67.22.154:8797/status,Workers: 12
Total threads: 12,Total memory: 23.25 GiB

0,1
Comm: tcp://10.67.22.154:8786,Workers: 0
Dashboard: http://10.67.22.154:8797/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://10.67.22.113:35699,Total threads: 1
Dashboard: http://10.67.22.113:42281/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.113:45595,
Local directory: /tmp/dask-scratch-space/worker-w4m5_uo_,Local directory: /tmp/dask-scratch-space/worker-w4m5_uo_

0,1
Comm: tcp://10.67.22.113:35779,Total threads: 1
Dashboard: http://10.67.22.113:45199/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.113:37363,
Local directory: /tmp/dask-scratch-space/worker-3_7yk27r,Local directory: /tmp/dask-scratch-space/worker-3_7yk27r

0,1
Comm: tcp://10.67.22.113:36141,Total threads: 1
Dashboard: http://10.67.22.113:36243/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.113:40497,
Local directory: /tmp/dask-scratch-space/worker-hlge5i0r,Local directory: /tmp/dask-scratch-space/worker-hlge5i0r

0,1
Comm: tcp://10.67.22.113:39159,Total threads: 1
Dashboard: http://10.67.22.113:36015/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.113:34615,
Local directory: /tmp/dask-scratch-space/worker-u5066s53,Local directory: /tmp/dask-scratch-space/worker-u5066s53

0,1
Comm: tcp://10.67.22.116:37217,Total threads: 1
Dashboard: http://10.67.22.116:39585/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.116:35253,
Local directory: /tmp/dask-scratch-space/worker-jifjjtf5,Local directory: /tmp/dask-scratch-space/worker-jifjjtf5

0,1
Comm: tcp://10.67.22.116:38751,Total threads: 1
Dashboard: http://10.67.22.116:34235/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.116:45821,
Local directory: /tmp/dask-scratch-space/worker-xkiizybe,Local directory: /tmp/dask-scratch-space/worker-xkiizybe

0,1
Comm: tcp://10.67.22.116:43411,Total threads: 1
Dashboard: http://10.67.22.116:38189/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.116:44073,
Local directory: /tmp/dask-scratch-space/worker-7t551a0e,Local directory: /tmp/dask-scratch-space/worker-7t551a0e

0,1
Comm: tcp://10.67.22.116:46523,Total threads: 1
Dashboard: http://10.67.22.116:41473/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.116:43699,
Local directory: /tmp/dask-scratch-space/worker-mg7te65q,Local directory: /tmp/dask-scratch-space/worker-mg7te65q

0,1
Comm: tcp://10.67.22.216:36277,Total threads: 1
Dashboard: http://10.67.22.216:46553/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.216:39657,
Local directory: /tmp/dask-scratch-space/worker-q3kdni9i,Local directory: /tmp/dask-scratch-space/worker-q3kdni9i

0,1
Comm: tcp://10.67.22.216:43213,Total threads: 1
Dashboard: http://10.67.22.216:40897/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.216:41635,
Local directory: /tmp/dask-scratch-space/worker-9d_i8x80,Local directory: /tmp/dask-scratch-space/worker-9d_i8x80

0,1
Comm: tcp://10.67.22.216:43347,Total threads: 1
Dashboard: http://10.67.22.216:36847/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.216:37151,
Local directory: /tmp/dask-scratch-space/worker-cp2ag9pp,Local directory: /tmp/dask-scratch-space/worker-cp2ag9pp

0,1
Comm: tcp://10.67.22.216:46565,Total threads: 1
Dashboard: http://10.67.22.216:37665/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.216:42869,
Local directory: /tmp/dask-scratch-space/worker-skj67ijy,Local directory: /tmp/dask-scratch-space/worker-skj67ijy


To test our algorithms, we will use the HIGGS dataset. This is particularly interestung because it's a sufficiently large dataset that wouldn't reside in a laptop RAM and can be easily retrieved from the web. Let's load it first into a dask.array

In [4]:
import dask.dataframe as dd
import os

os.chdir("/home/ubuntu") 
path_HIGGS = os.getcwd() + "/datasets/HIGGS.csv"

df = dd.read_csv(path_HIGGS, header=None, blocksize="200MB")    # The block size can be customized, let's start with 200 MB
X_df = df.iloc[:, 1:] 
X_da = X_df.to_dask_array(lengths=True)
print(X_da.npartitions)

40


In [5]:
X_da

Unnamed: 0,Array,Chunk
Bytes,2.29 GiB,58.75 MiB
Shape,"(11000000, 28)","(275002, 28)"
Dask graph,40 chunks in 1 graph layer,40 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 2.29 GiB 58.75 MiB Shape (11000000, 28) (275002, 28) Dask graph 40 chunks in 1 graph layer Data type float64 numpy.ndarray",28  11000000,

Unnamed: 0,Array,Chunk
Bytes,2.29 GiB,58.75 MiB
Shape,"(11000000, 28)","(275002, 28)"
Dask graph,40 chunks in 1 graph layer,40 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


Using blocks of $200 MB$, Dask has decided to divide the array into $40$ chunks. What we want to do now is to study how the performances of each algorithm change as the number of partition is varies. For this first experiment, we'll keep fixed the number of workers (12) and the dataset itself

In [39]:
import time
from dask.distributed import wait
import numpy as np
from dask.array.linalg import tsqr

def benchmark(X_da, n_partition, algorithm, rep = 20):
    res = []
    X_da = X_da.rechunk({0: X_da.shape[0] // n_partition, 1: -1}).persist()
    wait(X_da)   # Wait until Dask has finished to repartition (we don't want to count the time used to repartition
    print("Completion status: ", end = "")
    for _ in range(rep):
        print( _, end = " ")
        #start a timer
        start = time.time()
        # launch the function
        Q, R = algorithm(X_da)
        Q = Q.persist()
        R = R.persist()
        wait([Q, R])   # wait both Q and R are available in the worker's memory. This however should not send data to the client
        end = time.time()
        res.append(end-start)
        # Remove Q and R from the worker's memory, so that each run is the same
        client.cancel(Q)
        client.cancel(R)
    return np.mean(res), np.std(res), res
    
mean, std, res = benchmark(X_da, 20, compute_choleskyQR_parallel_optimal)

Completion status: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 

In [40]:
Qch, Rch = compute_choleskyQR_parallel_optimal(X_da)
Qd, Rd = direct_tsqr(X_da)

Qch = Qch.persist()
Rch = Rch.persist()
Qd = Qd.persist()
Rd = Rd.persist()

In [43]:
Rd = Rd.compute()
Rch = Rch.compute()

In [62]:
Rch.T[0]-Rd[0]

array([-2.27373675e-12, -2.60902411e-15,  8.32667268e-16,  4.54747351e-13,
       -2.05391260e-15,  4.54747351e-13,  1.72084569e-15,  1.22124533e-15,
        1.81898940e-12, -4.54747351e-13, -2.18922103e-15,  2.50494070e-15,
       -4.54747351e-13, -9.09494702e-13, -3.33066907e-16, -3.33066907e-16,
        9.09494702e-13,  1.36424205e-12, -2.13717932e-15,  2.44596010e-15,
        9.09494702e-13, -1.36424205e-12, -4.54747351e-13,  4.54747351e-13,
       -4.54747351e-13,  4.54747351e-13,  0.00000000e+00,  4.54747351e-13])

In [63]:
diff = np.abs(Rch - Rd).flatten()
print(np.sort(diff))
fro_error = np.linalg.norm(Rch.T - Rd, 'fro')
print("Errore Frobenius:", fro_error)

[1.36424205e-12 1.81898940e-12 2.27373675e-12 2.27373675e-12
 2.72848411e-12 3.63797881e-12 4.54747351e-12 8.41282599e-12
 1.20508048e-11 4.34172047e-03 4.34172047e-03 4.37311795e-03
 4.37311795e-03 4.44805613e-03 4.44805613e-03 5.91148248e-03
 5.91148248e-03 8.49588253e-03 8.49588253e-03 1.03344147e-02
 1.03344147e-02 1.46620036e-02 1.46620036e-02 2.14843187e-02
 2.14843187e-02 2.55261880e-02 2.55261880e-02 2.72814253e-02
 2.72814253e-02 2.95001054e-02 2.95001054e-02 3.76331240e-02
 3.76331240e-02 4.78753773e-02 4.78753773e-02 4.96262198e-02
 4.96262198e-02 5.09491727e-02 5.09491727e-02 5.16898833e-02
 5.16898833e-02 5.17671634e-02 5.17671634e-02 5.21464963e-02
 5.21464963e-02 5.82052365e-02 5.82052365e-02 5.84278641e-02
 5.84278641e-02 6.27291796e-02 6.27291796e-02 7.07052160e-02
 7.07052160e-02 7.97986364e-02 7.97986364e-02 8.75882914e-02
 8.75882914e-02 9.00420808e-02 9.00420808e-02 9.54706715e-02
 9.54706715e-02 1.03745440e-01 1.03745440e-01 1.04778718e-01
 1.04778718e-01 1.122142

In [64]:
client.close()
cluster.close()