### Environnement

In [1]:
import os
# force numpy to use only a single processor, by changing the environment of the underlying libraries
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
import numpy as np

# libraries for multiprocessing
import multiprocessing as mp
import threading as th

from scipy.stats import unitary_group
import random
import time

In [2]:
### Main execution procedure, taking the matrix, mapper and reducer as input

def execute(big_matrix, mapper, reducer, type = "T", n_split = 6, gamma = 1):
    """Execute map - reduce algorithm based on input mapper and reducer, using Threads / Processes depending on type,
    returns a tuple (result_matrix, execution_time)"""
    (nrow_big_matrix, ncol_big_matrix) = big_matrix.shape
    sub_matrix_list = chunkify(big_matrix, n_split=n_split)
    norms_array = norm(big_matrix)

    # initialize execution time
    start_time = 0
    end_time = 0

    if type == "P":
        pool = mp.Pool(n_split)

        # create list of arguments
        args_list = []
        for sub_mat in sub_matrix_list:
            gamma_copy = gamma + 0.
            args_list.append((sub_mat, norms_array.copy(), gamma_copy))

        start_time = time.time()
        # map
        mapped = pool.starmap(mapper, args_list)
        # reduce
        result = reducer(mapped, norms_array, gamma)
        end_time = time.time()
        return result, end_time - start_time

    if type == "T":
        thread_list = []
        # allocate output
        sub_output_list = []
        for i in range(n_split):
            sub_output_list.append(np.zeros((ncol_big_matrix, ncol_big_matrix)))
        start_time = time.time()

        # define thread content
        def thread_content(mapper, sub_matrix, sub_output, norms_array, gamma):
            """Calls the mapper on the sub_matrix and copies the values in sub_output"""
            sub_output[:, :] = mapper(sub_matrix, norms_array, gamma)

        # start threads
        for i in range(n_split):
            gamma_copy = gamma + 0. # necessary in order to copy the gamma value to avoid GIL
            args = (mapper, sub_matrix_list[i], sub_output_list[i], norms_array.copy(), gamma_copy)
            thread_current = th.Thread(target=thread_content, args=args)
            thread_current.start()
            thread_list.append(thread_current)
        # Wait until all threads are finished
        for thread in thread_list:
            thread.join()

        # reduce
        result = reducer(sub_output_list, norms_array, gamma)
        end_time = time.time()
        return result, end_time - start_time

    if not(isinstance(type, str)):
        raise TypeError("type must be an instance of str")
    raise ValueError("type must be either T for threads or P for processes")

In [3]:
def naive_mapper(mat, norms_array, gamma):
    return mat.T@mat


def naive_reducer(mat_list, norms_array, gamma):
    return sum(mat_list)


In [4]:
def paper_mapper(mat, norms_array, gamma):
    gamma_copy = gamma
    nrow, ncol = mat.shape
    output = np.zeros((ncol, ncol)) # note that ncol << nrow, so the for loops are OK
    for i_output in range(ncol):
        for j_output in range(ncol):
            # randomly choose pairs
            random_values = np.random.rand(nrow)
            probas = gamma_copy/(norms_array[i_output]*norms_array[j_output])*np.ones((nrow,))
            bool_vect = (probas < random_values)
            # sum chosen pairs
            output[i_output, j_output] = np.sum(mat[bool_vect, i_output]*mat[bool_vect, j_output])
    return output


def paper_reducer(mat_list, norms_array, gamma):
    return 1/np.minimum(np.outer(norms_array, norms_array), gamma)*sum(mat_list)

In [39]:

def chunkify(Mat, n_split):
    """Splits the matrix Mat into nsplit matrices, returns a list of np.ndarray"""
    (nrow, ncol) = Mat.shape
    indexes = (nrow // n_split) * np.arange(1, n_split)
    Mat_list = np.vsplit(Mat, indexes)
    assert len(Mat_list) == n_split
    return Mat_list


# Create a sparse matrix
def create_big_matrix(nrow_big_matrix, ncol_big_matrix, rank, verbose=True, threshold = 1e-5):
    if verbose:
        print("Creating big matrix")
    # Generate 2 random unitary matrix
    u, v = unitary_group.rvs(nrow_big_matrix), unitary_group.rvs(ncol_big_matrix)
    
    U, V = np.dot(u, u.conj().T), np.dot(v, v.conj().T)
    print("U =")
    print(U.shape)
    print("V=")
    print(V.shape)
    n = min(ncol_big_matrix,nrow_big_matrix)

    # Then generate a diagonal matrix (singular values) with the same rank as the big matrix
    D = np.zeros((nrow_big_matrix, ncol_big_matrix))
    print("D=")
    print(D.shape)
    non_zeros = random.sample([i for i in range(n)], rank)
    for elem in non_zeros:
        D[elem,elem] = 1
    
    # Return the singular value decomposition of the big matrix
    if verbose:
        print("Creating big matrix [OK]")
    A = U.dot(D.dot(V))
    print("A")
    print(A.shape)
    A[np.abs(A) <= threshold] = 0
    A.dtype = 'float64'# dédouble la taille de la matrice TODO
    A[np.abs(A) <= threshold] = 0
    return A

def time_basic(big_matrix):
    start_time = time.time()
    result = big_matrix.T@big_matrix
    end_time = time.time()
    return result, end_time - start_time


### Define several norms

#Recall that in finite dimension, all norms are equivalent
def max_diff(matrix1, matrix2):
    """Returns the maximum absolute difference between matrix1 and matrix2 (Linf distance)"""
    return np.max(np.absolute(matrix1 - matrix2))

def norm(Mat):
    """returns an array with the norm of the columns of matrix mat"""
    return np.sqrt(np.sum(np.square(Mat), axis=0))

def distance(mat1, mat2, norm = None):
    # return the distance between 2 matrix using different norms:
    # norm = 
        # 'fro' for the Froebenius norm
        # 'nuc' for the nuclear norm
        #  inf  for the spectral norm
    return np.linalg.norm(mat1-mat2,ord = norm)


In [40]:
create_big_matrix(nrow_big_matrix = 10, ncol_big_matrix = 5, rank = 5)

Creating big matrix
U =
(10, 10)
V=
(5, 5)
D=
(10, 5)
Creating big matrix [OK]
A
(10, 5)


array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

### Premier test

In [29]:
# create big matrix (input)
nrow_big_matrix, ncol_big_matrix = 15, 5
big_matrix = create_big_matrix(nrow_big_matrix=nrow_big_matrix, ncol_big_matrix=ncol_big_matrix, rank = 5)

# check rank
print("Big matrix size =", big_matrix.shape)
print("Big matrix rank =", np.linalg.matrix_rank(big_matrix))

Creating big matrix
Creating big matrix [OK]
Big matrix size = (15, 10)
Big matrix rank = 5


In [30]:
big_matrix

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [28]:
gamma = 1.

result_parallel, exec_time_parallel = execute(big_matrix=big_matrix,
                                              mapper=paper_mapper,
                                              reducer=paper_reducer,
                                              type="T",
                                              n_split=6,
                                              gamma=gamma)

# display execution time
print("Execution time parallel=", exec_time_parallel)

# compare with basic approach
result_basic, exec_time_basic = time_basic(big_matrix)
print("Execution time basic=", exec_time_basic)
print("Distance between results", distance(result_basic, result_parallel, norm = float("Inf")))
print(max_diff(result_basic, result_parallel))

Execution time parallel= 0.028798341751098633
Execution time basic= 7.200241088867188e-05
Distance between results nan
nan


  probas = gamma_copy/(norms_array[i_output]*norms_array[j_output])*np.ones((nrow,))
  return 1/np.minimum(np.outer(norms_array, norms_array), gamma)*sum(mat_list)
  return 1/np.minimum(np.outer(norms_array, norms_array), gamma)*sum(mat_list)


In [16]:
big_matrix

array([[ 1.00000000e+00,  2.01018938e-32,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        -2.02944868e-32,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  1.92592994e-34,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.

In [9]:
big_matrix

array([[ 1.00000000e+00,  2.01018938e-32,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        -2.02944868e-32,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.00000000e+00,  1.92592994e-34,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.

In [10]:
result_parallel

array([[ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]])

In [11]:
diff = result_basic - result_parallel

In [12]:
diff

array([[ 1.00000000e+00,  2.01018938e-32,  0.00000000e+00,
         0.00000000e+00,             nan,             nan,
         0.00000000e+00,  0.00000000e+00,             nan,
                    nan],
       [ 2.01018938e-32,  4.04086134e-64,  0.00000000e+00,
         0.00000000e+00,             nan,             nan,
         0.00000000e+00,  0.00000000e+00,             nan,
                    nan],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        -2.02944868e-32,             nan,             nan,
         0.00000000e+00,  0.00000000e+00,             nan,
                    nan],
       [ 0.00000000e+00,  0.00000000e+00, -2.02944868e-32,
         4.11866194e-64,             nan,             nan,
         0.00000000e+00,  0.00000000e+00,             nan,
                    nan],
       [            nan,             nan,             nan,
                    nan,             nan,             nan,
                    nan,             nan,             nan,
           

In [13]:
result_basic

array([[ 1.00000000e+00,  2.01018938e-32,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 2.01018938e-32,  4.04086134e-64,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        -2.02944868e-32,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00, -2.02944868e-32,
         4.11866194e-64,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.

In [14]:
result_parallel

array([[ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [ 0.,  0.,  0.,  0., nan, nan,  0.,  0., nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]])