In [1]:
# CLUSTER DEPLOYMENT, TO BE SUBSTITUED WHEN GOING CLUSTER
from dask.distributed import Client, LocalCluster

# For now, local deployment on my computer (multicore)
ncore = 8
cluster = LocalCluster(n_workers=ncore, threads_per_worker=1)
client = Client(cluster)

# Print the dashboard link over the port 8787
print(client.dashboard_link)

http://127.0.0.1:8787/status


In [2]:
import dask.array as da
import dask
import numpy as np
from sklearn.datasets import fetch_california_housing

# Download California Housing dataset
data = fetch_california_housing(as_frame=True)
# Convert features into Dask Array (it's a matrix). N partition = 16
X_da = da.from_array(data.data.values, chunks=(2580, data.data.shape[1]))

print(X_da)

dask.array<array, shape=(20640, 8), dtype=float64, chunksize=(2580, 8), chunktype=numpy.ndarray>


In [3]:
def compute_choleskyQR_parallel(X_da : dask.array.Array):
    # A list of delayed tasks for each partition of the dataset
    # Each partition computes the local Gram matrix
    chunks_delayed = [dask.delayed(lambda x : x.T @ x)(chunk) for chunk in X_da.to_delayed().ravel()]

    # Now sum all the local Gram matrices to get the global Gram matrix
    Gram_global_delayed = dask.delayed(sum)(chunks_delayed)

    # Compute R as the Cholesky decomposition on the global Gram matrix (as a delayed even if a serial operation just call .compute at the end)
    R = dask.delayed(np.linalg.cholesky)(Gram_global_delayed)
    #R.visualize("CholeskyR.png")
    R = R.compute() # Compute R
    R_inv = np.linalg.inv(R) # It's a small matrix, so this operation is fast even if serial

    Q = X_da.map_blocks(lambda block: block @ R_inv, dtype=X_da.dtype)
    #Q.visualize("CholeskyQ.png")
    Q = Q.compute() # Compute Q
    return Q, R

def compute_choleskyQR_serial(X):
    # Global gram matrix
    G = X.T @ X
    R = np.linalg.cholesky(G)
    R_inv = np.linalg.inv(R)
    Q = X @ R_inv
    
    return Q, R

def compute_choleskyR_parallel(X_da : dask.array.Array):
    # A list of delayed tasks for each partition of the dataset
    # Each partition computes the local Gram matrix
    chunks_delayed = [dask.delayed(lambda x : x.T @ x)(chunk) for chunk in X_da.to_delayed().ravel()]
    # Now sum all the local Gram matrices to get the global Gram matrix
    Gram_global_delayed = dask.delayed(sum)(chunks_delayed)
    # Compute R as the Cholesky decomposition on the global Gram matrix (as a delayed even if a serial operation just call .compute at the end)
    R = dask.delayed(np.linalg.cholesky)(Gram_global_delayed)
    R = R.compute() # Compute R
    return  R

def compute_choleskyR_serial(X):
    # Global gram matrix
    G = X.T @ X
    R = np.linalg.cholesky(G)
    return R

In [19]:
%%time
# parallel
Q_p, R_p = compute_choleskyQR_parallel(X_da)

CPU times: user 98.5 ms, sys: 12.7 ms, total: 111 ms
Wall time: 120 ms


In [31]:
%%time
# serial
Q_s, R_s = compute_choleskyQR_serial(data.data.values)

CPU times: user 1.49 ms, sys: 2.89 ms, total: 4.38 ms
Wall time: 3.01 ms


In [None]:
# Let's see whether the results are compatible
diffR = np.linalg.norm(R_p - R_s, 2)
diffQ = np.linalg.norm(Q_p - Q_s, 2)
print(f"||R_parallel - R_serial||_2 = {diffR}")
print(f"||Q_parallel - Q_serial||_2 = {diffQ}")

# Check orthogonality of Q
orthogonality_metric = np.linalg.norm(Q_s.T @ Q_s - np.eye(Q_s.shape[1]), 2)
print(f"||Q^T @ Q- I||_2 = {orthogonality_metric}")
# Check decomposition
decomp_metric = np.linalg.norm(data.data.values - Q_s @ R_s, 2)
print(f"||X - Q @ R||_2 = {decomp_metric}")

||R_parallel - R_serial||_2 = 6.6703992450451735e-09
||Q_parallel - Q_serial||_2 = 5.281864770776042e-10
||Q^T @ Q- I||_2 = 7971678.680290628
||X - Q @ R||_2 = 2.838161229486053e-10


As expected, the decomposition yielded a non reasonnable result (Q is not orthogonal, the algorithm is highly unstable)


Let's try with a different and larger dataset

In [None]:
import dask.dataframe as dd

df = dd.read_csv("HIGGS.csv", header=None, blocksize="400MB")
X_df = df.iloc[:, 1:]  # no labels
X_da = X_df.to_dask_array(lengths=True)

N = 1000000
X_sample = X_df.head(N).to_numpy() 

In [38]:
%%time
Q, R = compute_choleskyQR_parallel(X_da)

CPU times: user 5.08 s, sys: 3.92 s, total: 9 s
Wall time: 39.5 s


In [39]:
%%time
Q, R = compute_choleskyQR_serial(X_sample)

CPU times: user 100 ms, sys: 32 ms, total: 132 ms
Wall time: 118 ms
