In [1]:
from dask.distributed import Client, LocalCluster

# Example: 4 workers, 1 thread each
cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)

print("Dashboard:", client.dashboard_link)


Dashboard: http://127.0.0.1:8787/status


In [12]:
# CLUSTER DEPLOYMENT ON CLOUDVENETO
from dask.distributed import Client, SSHCluster

cluster = SSHCluster(
    ["10.67.22.154", "10.67.22.216", "10.67.22.116", "10.67.22.113"],
    connect_options={"known_hosts": None},
    remote_python="/home/ubuntu/miniconda3/bin/python",
    scheduler_options={"port": 3333, "dashboard_address": ":8797"},
    worker_options={
        "nprocs": 1,       # N. of processess per VM. CloudVeneto's large VM offers 4-core CPU, but for now we only spawn 1 process per VM
        "nthreads": 1      # N. of threads per process
    }
)

client = Client(cluster)




In [13]:
# check if everything went smoothly
print(client)
print(cluster)


<Client: 'tcp://10.67.22.154:3333' processes=2 threads=2, memory=3.88 GiB>
SSHCluster(SSHCluster, 'tcp://10.67.22.154:3333', workers=2, threads=2, memory=3.88 GiB)


In [43]:
import dask.array as da
import dask
import numpy as np
from scipy.linalg import solve_triangular 
from sklearn.datasets import fetch_california_housing

# Download California Housing dataset
data = fetch_california_housing(as_frame=True)

# Convert features into Dask Array (it's a matrix).
n_partition = 3        # number of partition in memory. We have 4 VMS (1 master + 3 workers), so let's start with just 3 partitions
length_partition = data.data.shape[0] // n_partition
X_da = da.from_array(data.data.values, chunks=(length_partition, data.data.shape[1]))

print("Number of Dask partitions:",  X_da.npartitions) 
print("Length of each partition:", length_partition, "rows")
print("Length of the whole dataset:", data.data.shape[0], "rows")


Number of Dask partitions: 3
Length of each partition: 6880 rows
Length of the whole dataset: 20640 rows


In [44]:
print(X_da)


dask.array<array, shape=(20640, 8), dtype=float64, chunksize=(6880, 8), chunktype=numpy.ndarray>


In [45]:
def indirect_serial(A, n_div):

    """
    Indirect TSQR (serial, NumPy).
    Splits A by rows into n_div blocks, computes local R_i via QR,
    reduces to global R by QR on the stacked R_i, then recovers Q = A R^{-1}.
    Returns (Q, R).
    """

    n_samp = A.shape[0]
    
    div_points = int(np.floor(n_samp/n_div))
    A_divided = []
    Ri = []
    
    A_divided = [A[div_points * i : div_points * (i + 1)] for i in range(n_div - 1)]
    A_divided.append(A[(n_div - 1) * div_points:, :])   # In the case n_samp wasn't divisible by n_div

    Ri = [np.linalg.qr(Ai, mode="reduced")[1] for Ai in A_divided]
    
    R_stack = np.concatenate(Ri, axis = 0)

    # Step 3. Stack R_i and compute global R

    _, R = np.linalg.qr(R_stack, mode="reduced")


    """try:
        I = np.eye(n, dtype=A.dtype)
        Rinv = solve_triangular(R, I, lower=False)
    except Exception:
        Rinv = np.linalg.inv(R)  # fallback if SciPy not available
    Q = A @ Rinv"""

    """
    np.linalg.inv(R) is numerically less stable than solving the triangular system.
    R is upper-triangular, so it is possible to try to use scipy.linalg.solve_triangular.
    """
    Q = A @ np.linalg.inv(R)


    return Q, R

def indirect_serialR(A, n_div):

    """
    Indirect TSQR (serial, NumPy).
    Splits A by rows into n_div blocks, computes local R_i via QR,
    reduces to global R by QR on the stacked R_i, then recovers Q = A R^{-1}.
    Returns (R).
    """

    n_samp = A.shape[0]
    
    div_points = int(np.floor(n_samp/n_div))
    A_divided = []
    Ri = []
    
    A_divided = [A[div_points * i : div_points * (i + 1)] for i in range(n_div - 1)]
    A_divided.append(A[(n_div - 1) * div_points:, :])   # In the case n_samp wasn't divisible by n_div

    Ri = [np.linalg.qr(Ai, mode="reduced")[1] for Ai in A_divided]
    
    R_stack = np.concatenate(Ri, axis = 0)

    # Step 3. Stack R_i and compute global R

    _, R = np.linalg.qr(R_stack, mode="reduced")
    return R


def compute_R(block):
    # np.linalg.qr with mode='r' gives just the R matrix
    R = np.linalg.qr(block, mode="r")
    return R


def indirect_parallel(X_da):

    """
    Indirect TSQR with Dask.
    Input:
        X_da : Dask Array (m x n), chunked row-wise
    Output:
        R    : final global triangular factor (n x n, NumPy array on driver)
        Q_da : Dask Array (m x n), representing Q = A R^{-1} (lazy)
    """


    n_cols = X_da.shape[1]

    R_blocks = X_da.map_blocks(compute_R, dtype=X_da.dtype, chunks=(n_cols, n_cols))
    # Now R_blocks is a stack of n x n matrices (one per partition)
    # Its shape is (#chunks * n, n)

    #R_blocks.visualize(filename="fig/R_blocks_graph", format="png")
 

    #Dask has da.linalg.qr, but it assumes the whole array is large and chunked regularly.
    #To get the final global R, you must combine all the Ri
    #That means at some point, the data has to come together into a single place (can’t keep it sharded).
    # So we bring the data to the driver because it is very small, because it optimizes the uses of np.linalg.qr, we are gathering the small stuff
    R_stack = R_blocks.compute()   # NumPy array, shape (p*n, n)

    # Small QR on driver to combine them into the final R
    R = np.linalg.qr(R_stack, mode="r")


    # Instead of materializing Q, compute a small R^{-1} (n x n).
    I = np.eye(n_cols, dtype=X_da.dtype)
    R_inv = solve_triangular(R, I, lower=False)  # stable

    # Broadcast Rinv to every chunk: Q = A @ R^{-1}
    Q_da = X_da @ R_inv   # still a Dask Array, lazy

    return Q_da, R      #Q_da because it is lazy, it is still a Dask array




def indirect_parallelR(X_da):

    """
    Indirect TSQR with Dask.
    Input:
        X_da : Dask Array (m x n), chunked row-wise
    Output:
        R    : final global triangular factor (n x n, NumPy array on driver)
    """


    n_cols = X_da.shape[1]

    R_blocks = X_da.map_blocks(compute_R, dtype=X_da.dtype, chunks=(n_cols, n_cols))
   
    R_stack = R_blocks.compute()   # NumPy array, shape (p*n, n)

    R = np.linalg.qr(R_stack, mode="r")

    return R      #Q_da because it is lazy, it is still a Dask array


In [46]:
%%time

Q, R = indirect_serial(data.data.values, 50)
size = 20640 / 50 * 8 * 8 #rows_per_chunk × n_cols × 8 bytes
print("The size for each chunk is :", size/1e6, "Mb")


The size for each chunk is : 0.0264192 Mb
CPU times: user 4.38 ms, sys: 0 ns, total: 4.38 ms
Wall time: 3.57 ms


In [47]:
%%time

Q_da, R = indirect_parallel(X_da)   # X_da is already splitted into three partitions


print("Final R shape:", R.shape)  # (n, n), small
print("Q is lazy:", Q_da)         # Dask Array, not yet computed

print("Chunks:", X_da.chunks)
print("Number of partitions:", X_da.npartitions)
print("Total size [MB]:", X_da.nbytes / 1e6)
print("Chunk size [MB]:", X_da.nbytes / X_da.npartitions / 1e6)    # 10–100 MB

print("Shape:", X_da.shape)
print("Chunks:", X_da.chunks)
print("Partitions:", X_da.npartitions)
print("Total size [MB]:", X_da.nbytes/1e6)
print("Avg chunk [MB]:", (X_da.nbytes/X_da.npartitions)/1e6)


#print("Compact sanity check:", X_da.shape, "(m,n);", R_blocks.shape, "(n *n_partitions, n);", R.shape, "(n,n)" )


Final R shape: (8, 8)
Q is lazy: dask.array<getitem, shape=(20640, 8), dtype=float64, chunksize=(6880, 8), chunktype=numpy.ndarray>
Chunks: ((6880, 6880, 6880), (8,))
Number of partitions: 3
Total size [MB]: 1.32096
Chunk size [MB]: 0.44032
Shape: (20640, 8)
Chunks: ((6880, 6880, 6880), (8,))
Partitions: 3
Total size [MB]: 1.32096
Avg chunk [MB]: 0.44032
CPU times: user 9.93 ms, sys: 596 μs, total: 10.5 ms
Wall time: 90 ms


In [50]:
%%time
R = indirect_serial(data.data.values, 50)

CPU times: user 4.13 ms, sys: 0 ns, total: 4.13 ms
Wall time: 3.52 ms


In [51]:
%%time
R = indirect_parallelR(X_da)

CPU times: user 7.15 ms, sys: 0 ns, total: 7.15 ms
Wall time: 36.4 ms


In [39]:

#Q_da.visualize("fig/q_mul_graph.png")


In [88]:
# create again a cluster
# CLUSTER DEPLOYMENT ON CLOUDVENETO
client.close()   # close the previous one
cluster.close()  # close the previous one

cluster = SSHCluster(
    ["10.67.22.154", "10.67.22.216", "10.67.22.116", "10.67.22.113"],
    connect_options={"known_hosts": None},
    remote_python="/home/ubuntu/miniconda3/bin/python",
    scheduler_options={"port": 3333, "dashboard_address": ":8797"},
    worker_options={
        "nprocs": 4,        # Now we use all 4 cores -> 12 workers
        "nthreads": 1       # We use 1 threads. Following Dask documentation, however, Numpy should release well the GIL lock thus we could use more than 1 thread. 
    }
)

client = Client(cluster)




In [89]:
print(client)
print(cluster)


<Client: 'tcp://10.67.22.154:3333' processes=8 threads=8, memory=15.50 GiB>
SSHCluster(SSHCluster, 'tcp://10.67.22.154:3333', workers=8, threads=8, memory=15.50 GiB)


In [90]:
cluster


0,1
Dashboard: http://10.67.22.154:8797/status,Workers: 9
Total threads: 9,Total memory: 17.44 GiB

0,1
Comm: tcp://10.67.22.154:3333,Workers: 0
Dashboard: http://10.67.22.154:8797/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://10.67.22.113:33377,Total threads: 1
Dashboard: http://10.67.22.113:32923/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.113:35949,
Local directory: /tmp/dask-scratch-space/worker-pw9cy51n,Local directory: /tmp/dask-scratch-space/worker-pw9cy51n

0,1
Comm: tcp://10.67.22.113:40491,Total threads: 1
Dashboard: http://10.67.22.113:34993/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.113:40053,
Local directory: /tmp/dask-scratch-space/worker-vryaiyik,Local directory: /tmp/dask-scratch-space/worker-vryaiyik

0,1
Comm: tcp://10.67.22.113:43971,Total threads: 1
Dashboard: http://10.67.22.113:40021/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.113:38429,
Local directory: /tmp/dask-scratch-space/worker-c5glgy0f,Local directory: /tmp/dask-scratch-space/worker-c5glgy0f

0,1
Comm: tcp://10.67.22.113:44525,Total threads: 1
Dashboard: http://10.67.22.113:34487/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.113:33969,
Local directory: /tmp/dask-scratch-space/worker-tgb6dd16,Local directory: /tmp/dask-scratch-space/worker-tgb6dd16

0,1
Comm: tcp://10.67.22.116:32873,Total threads: 1
Dashboard: http://10.67.22.116:44441/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.116:44849,
Local directory: /tmp/dask-scratch-space/worker-teo0uo3p,Local directory: /tmp/dask-scratch-space/worker-teo0uo3p

0,1
Comm: tcp://10.67.22.116:32979,Total threads: 1
Dashboard: http://10.67.22.116:43041/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.116:43577,
Local directory: /tmp/dask-scratch-space/worker-gd12x5eg,Local directory: /tmp/dask-scratch-space/worker-gd12x5eg

0,1
Comm: tcp://10.67.22.116:33443,Total threads: 1
Dashboard: http://10.67.22.116:39177/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.116:37111,
Local directory: /tmp/dask-scratch-space/worker-e21_8iae,Local directory: /tmp/dask-scratch-space/worker-e21_8iae

0,1
Comm: tcp://10.67.22.116:38123,Total threads: 1
Dashboard: http://10.67.22.116:33925/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.116:37009,
Local directory: /tmp/dask-scratch-space/worker-840e543v,Local directory: /tmp/dask-scratch-space/worker-840e543v

0,1
Comm: tcp://10.67.22.216:44491,Total threads: 1
Dashboard: http://10.67.22.216:39589/status,Memory: 1.94 GiB
Nanny: tcp://10.67.22.216:46091,
Local directory: /tmp/dask-scratch-space/worker-x4bb2h2h,Local directory: /tmp/dask-scratch-space/worker-x4bb2h2h


In [107]:
import dask.dataframe as dd
import os

os.chdir("/home/ubuntu") 
path_HIGGS = os.getcwd() + "/datasets/HIGGS.csv"
# A huge dataset
df = dd.read_csv(path_HIGGS, header=None, blocksize="200MB")    # The block size can be customized, let's start with 200 MB
X_df = df.iloc[:, 1:] 
X_da = X_df.to_dask_array(lengths=True)

X_da = X_da.persist()



In [108]:
#Let's print it
X_da



Unnamed: 0,Array,Chunk
Bytes,2.29 GiB,58.75 MiB
Shape,"(11000000, 28)","(275002, 28)"
Dask graph,40 chunks in 1 graph layer,40 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 2.29 GiB 58.75 MiB Shape (11000000, 28) (275002, 28) Dask graph 40 chunks in 1 graph layer Data type float64 numpy.ndarray",28  11000000,

Unnamed: 0,Array,Chunk
Bytes,2.29 GiB,58.75 MiB
Shape,"(11000000, 28)","(275002, 28)"
Dask graph,40 chunks in 1 graph layer,40 chunks in 1 graph layer
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [109]:
# Print the number of partition
X_da.npartitions

40

In [110]:

%%time

Q_da, R = indirect_parallel(X_da)

CPU times: user 45.8 ms, sys: 4.18 ms, total: 49.9 ms
Wall time: 1.19 s


In [111]:
%%time
R = indirect_parallelR(X_da)

CPU times: user 305 ms, sys: 20.2 ms, total: 325 ms
Wall time: 1.15 s


In [112]:
# Rechunk to ~1M rows per block (~100 MB) so each worker gets a good-sized task
X_da_cached = X_da.rechunk((1_000_000, -1)).persist()

# First persist actually loads CSV into memory on workers
# This itself will take a few seconds (same cost as run 1)


In [113]:
%%time
Q_da2, R2 = indirect_parallel(X_da_cached)   # Now works from cached chunks in RAM


CPU times: user 362 ms, sys: 13.4 ms, total: 375 ms
Wall time: 3.34 s


In [114]:
%%time
R = indirect_parallelR(X_da_cached)

CPU times: user 316 ms, sys: 21 ms, total: 337 ms
Wall time: 1.83 s
