# SD212: Graph mining

# Lab 1: Sparse matrices

The objective of this lab is to understand the structure and main properties of [sparse matrices](https://en.wikipedia.org/wiki/Sparse_matrix).

## Import

In [None]:
import numpy as np

In [None]:
from scipy import sparse

## Coordinate format

In [None]:
# random binary matrix (entries drawn in {0, 1}) of shape (5, 10), in dense format
A_dense = np.random.randint(2, size = (5,10))

In [None]:
A_dense

In [None]:
A_coo = sparse.coo_matrix(A_dense)

In [None]:
A_coo

In [None]:
A_coo.shape

In [None]:
A_coo.nnz

In [None]:
print(A_coo.row)
print(A_coo.col)
print(A_coo.data)

In [None]:
# reuse the sparsity pattern of A_coo (row indices, column indices and shape)
row = A_coo.row
col = A_coo.col
# new random values drawn in {0, ..., 9}: some stored values may be equal to 0
data = np.random.randint(10, size=len(A_coo.data))
shape = A_coo.shape

In [None]:
B_coo = sparse.coo_matrix((data, (row, col)), shape)

In [None]:
B_coo

In [None]:
B_coo.toarray()

In [None]:
B_coo.nnz

In [None]:
np.sum(B_coo.data > 0)

In [None]:
B_coo.eliminate_zeros()

In [None]:
B_coo

## To do

Complete the function below that converts a dense matrix into a sparse matrix in COO format. 

Needless to say...
* don't use `scipy`
* don't use any loop

In [None]:
class SparseCOO:
    '''Sparse matrix in COO (coordinate) format.

    Attributes
    ----------
    data : np.ndarray
        Stored values.
    row : np.ndarray
        Row index of each stored value.
    col : np.ndarray
        Column index of each stored value.
    shape : tuple
        Shape of the matrix.
    '''

    def __init__(self, data: np.ndarray = None, row: np.ndarray = None, 
                 col: np.ndarray = None, shape: tuple = None):
        # the arguments are stored as given (no copy, no validation)
        self.data, self.row, self.col = data, row, col
        self.shape = shape

In [None]:
def dense_to_coo(A):
    '''Convert dense matrix to sparse in COO format.
    
    Only the non-zero entries of the dense matrix are stored. They are listed
    in row-major order (sorted by row, then by column).
    
    Parameters
    ----------
    A : np.ndarray
        Dense matrix (2 dimensions).
        
    Returns
    -------
    A_coo : SparseCOO
        Sparse matrix in COO format.
        
    Raises
    ------
    ValueError
        If the input does not have exactly 2 dimensions.
    '''
    A = np.asarray(A)
    if A.ndim != 2:
        raise ValueError('The input must be a 2-dimensional array.')
    # coordinates of the non-zero entries, returned by numpy in row-major order
    row, col = np.nonzero(A)
    # fancy indexing extracts the stored values without any loop
    data = A[row, col]
    return SparseCOO(data, row, col, A.shape)

In [31]:
# random binary matrix of shape (5, 10): each entry is 1 with probability 0.25
A = (np.random.random((5,10)) < .25).astype(int)
# conversion with the function defined above
A_coo = dense_to_coo(A)

NameError: name 'dense_to_coo' is not defined

## CSR format

The CSR (Compressed Sparse Row) format is the most efficient one for arithmetic operations (see below).

In [None]:
A_dense

In [None]:
A_csr = sparse.csr_matrix(A_dense)

In [None]:
A_csr

In [None]:
A_csr.shape

In [None]:
A_csr.nnz

In [None]:
print(A_csr.indices)
print(A_csr.indptr)
print(A_csr.data)

In [None]:
A_csr[3, 4]

In [None]:
A_csr[3]

In [None]:
# reuse the sparsity pattern of A_csr (column indices, row pointers and shape)
indices = A_csr.indices
indptr = A_csr.indptr
# new random values drawn in {0, ..., 9}: some stored values may be equal to 0
data = np.random.randint(10, size=len(A_csr.data))
shape = A_csr.shape

In [None]:
B_csr = sparse.csr_matrix((data, indices, indptr), shape)

In [None]:
B_csr

In [None]:
B_csr.eliminate_zeros()

In [None]:
B_csr

In [None]:
# build a CSR matrix directly from COO-style triplets: (data, (row, col))
row = [0, 0, 1, 2, 2]
col = [2, 3, 0, 1, 2]
# one value per (row, col) pair, all equal to 1
data = np.ones(5)
A_csr = sparse.csr_matrix((data, (row, col)), shape = (3, 4))

In [None]:
A_csr.toarray()

In [None]:
# equivalently: build the matrix in COO format first, then convert it to CSR
A_coo = sparse.coo_matrix((data, (row, col)), shape = (3, 4))
A_csr = sparse.csr_matrix(A_coo)

In [None]:
A_csr.toarray()

## To do

Complete the functions below that convert:
* a dense matrix into a sparse matrix in CSR format,
* a sparse matrix in COO format to CSR format.

Again...
* don't use `scipy`
* don't use any loop

In [None]:
class SparseCSR:
    '''Sparse matrix in CSR (Compressed Sparse Row) format.

    Attributes
    ----------
    data : np.ndarray
        Stored values.
    indices : np.ndarray
        Column indices of the stored values.
    indptr : np.ndarray
        Row pointers.
    shape : tuple
        Shape of the matrix.
    '''

    def __init__(self, data: np.ndarray = None, indices: np.ndarray = None, 
                 indptr: np.ndarray = None, shape: tuple = None):
        # the arguments are stored as given (no copy, no validation)
        self.data, self.indices, self.indptr = data, indices, indptr
        self.shape = shape

In [None]:
def dense_to_csr(A):
    '''Convert dense matrix to sparse in CSR format.
    
    Only the non-zero entries of the dense matrix are stored.
    
    Parameters
    ----------
    A : np.ndarray
        Dense matrix (2 dimensions).
        
    Returns
    -------
    A_csr : SparseCSR
        Sparse matrix in CSR format.
        
    Raises
    ------
    ValueError
        If the input does not have exactly 2 dimensions.
    '''
    A = np.asarray(A)
    if A.ndim != 2:
        raise ValueError('The input must be a 2-dimensional array.')
    n_row = A.shape[0]
    # non-zero entries in row-major order, i.e., already grouped by row
    row, col = np.nonzero(A)
    data = A[row, col]
    # indptr[i] = number of stored entries in rows 0, ..., i - 1
    counts = np.bincount(row, minlength=n_row)
    indptr = np.concatenate(([0], np.cumsum(counts)))
    return SparseCSR(data, col, indptr, A.shape)

In [None]:
def coo_to_csr(A_coo):
    '''Convert a sparse matrix from COO to CSR format.
    
    Entries are sorted by row, then by column. Duplicate (row, col) pairs of
    the input, if any, are kept as separate stored entries (they are not summed).
    
    Parameters
    ----------
    A_coo : SparseCOO
        Sparse matrix in COO format.
        
    Returns
    -------
    A_csr : SparseCSR
        Sparse matrix in CSR format.
    '''
    n_row = A_coo.shape[0]
    data = np.asarray(A_coo.data)
    # integer dtype is required for indexing and for np.bincount
    row = np.asarray(A_coo.row, dtype=int)
    col = np.asarray(A_coo.col, dtype=int)
    # COO entries come in arbitrary order: sort by row (primary key), then column
    order = np.lexsort((col, row))
    # indptr[i] = number of stored entries in rows 0, ..., i - 1
    counts = np.bincount(row, minlength=n_row)
    indptr = np.concatenate(([0], np.cumsum(counts)))
    return SparseCSR(data[order], col[order], indptr, A_coo.shape)

## CSC format

In [None]:
A_dense

In [None]:
A_csc = sparse.csc_matrix(A_dense)

In [None]:
A_csc

In [None]:
A_csc.shape

In [None]:
A_csc.nnz

In [None]:
print(A_csc.indices)
print(A_csc.indptr)
print(A_csc.data)

## LIL format

In [None]:
A_dense

In [None]:
A_lil = sparse.lil_matrix(A_dense)

In [None]:
A_lil

In [None]:
print(A_lil.rows)
print(A_lil.data)

In [None]:
A_lil = sparse.lil_matrix(A_csr)

## To do

What is the best format to modify an entry of a sparse matrix?

In [None]:
A_csr[0, 2] = 1

In [None]:
A_lil[0, 2] = 1

## Diagonal format

In [None]:
A_diag = sparse.diags(np.arange(5))

In [None]:
A_diag

In [None]:
A_diag.toarray()

In [None]:
A_diag.diagonal()

In [None]:
A = sparse.csr_matrix(A_diag)

In [None]:
A

## To do

Complete the following function that returns a sparse CSR matrix with the pseudo-inverse of the input vector on the diagonal.

**Example:** pseudo inverse of (0, 1, 2) -> (0, 1, 1/2)

**Hint:** Use the property of sparse matrices!

In [None]:
def pseudo_inverse(vector):
    '''Return a sparse matrix with pseudo-inverse on the diagonal.
    
    The pseudo-inverse of a vector is obtained by inverting its non-zero
    entries and keeping its zeros, e.g., (0, 1, 2) -> (0, 1, 1/2).
    
    Parameters
    ----------
    vector : np.ndarray
        Input vector. 
        
    Returns
    -------
    A_csr : sparse.csr_matrix
        Sparse matrix in scipy CSR format.
    '''    
    vector = np.asarray(vector, dtype=float).ravel()
    A_csr = sparse.diags(vector, format='csr')
    # after this call, only the non-zero entries of the vector are stored...
    A_csr.eliminate_zeros()
    # ...so the stored values can be inverted without any division by zero
    A_csr.data = 1 / A_csr.data
    return A_csr

## Operations

Usual arithmetic operations apply to sparse matrices. The only constraint is to have a sparse matrix on the **left-hand side** of the operator.

In [None]:
A = sparse.csr_matrix(A_dense)

In [None]:
n_row, n_col = A.shape

In [None]:
A.dot(np.ones(n_col, dtype=int))

In [None]:
A.T.dot(np.ones(n_row, dtype=int))

In [None]:
# observe the format of the transpose
A.T

In [None]:
A.T.dot(A)

In [None]:
A.dot(A.T)

In [None]:
A.data = np.random.choice((1,2,3,4), size = len(A.data))

In [None]:
B = A > 1

In [None]:
B

In [None]:
# Explain the following warning...
B = A < 1

In [None]:
B

In [None]:
# second random binary matrix of shape (5, 10), converted to CSR format
B_dense = np.random.randint(2, size = (5,10))
B = sparse.csr_matrix(B_dense)

In [None]:
2 * A + 5 * B

## To do

* Complete the following function that normalizes a sparse CSR matrix with non-negative entries so that each row sums to 1 (or to 0 if the whole row is zero). 
* Do the same for the columns. 

In [None]:
def normalize_rows(A):
    '''Normalize the rows of a CSR matrix so that all sum to 1 (or 0).
    
    The input matrix is not modified.
    
    Parameters
    ----------
    A : sparse.csr_matrix
        Input matrix (non-negative entries).
    
    Returns
    -------
    X_ : sparse.csr_matrix
        Normalized matrix. 
    
    '''
    # work on floats so that the normalized values are not truncated
    A = sparse.csr_matrix(A, dtype=float)
    n_row, n_col = A.shape
    # row sums, computed as a sparse matrix-vector product
    sums = A.dot(np.ones(n_col))
    # pseudo-inverse of the row sums: zero rows keep a zero weight
    weights = np.zeros(n_row)
    positive = sums > 0
    weights[positive] = 1 / sums[positive]
    # left-multiplication by a diagonal matrix rescales each row
    return sparse.diags(weights, format='csr').dot(A)

## To do

Complete the following method that returns the dot product of a sparse CSR matrix with a vector.

* No loop allowed!

In [None]:
class SparseCSR():
    '''Sparse matrix in CSR (Compressed Sparse Row) format.

    Attributes
    ----------
    data : np.ndarray
        Stored values, row by row.
    indices : np.ndarray
        Column index of each stored value.
    indptr : np.ndarray
        Row pointers: the entries of row i are stored at positions
        indptr[i], ..., indptr[i + 1] - 1 of data and indices.
    shape : tuple
        Shape of the matrix, (n_row, n_col).
    '''
    def __init__(self, data: np.ndarray = None, indices: np.ndarray = None, 
                 indptr: np.ndarray = None, shape: tuple = None):
        self.data = data
        self.indices = indices
        self.indptr = indptr
        self.shape = shape
        
    def dot(self, x: np.ndarray) -> np.ndarray:
        '''Sparse-vector dot product.

        Parameters
        ----------
        x : np.ndarray
            Vector of length n_col.

        Returns
        -------
        y : np.ndarray
            Vector of length n_row, equal to the product of the matrix by x.

        Raises
        ------
        ValueError
            If x is not a vector of length n_col.
        '''
        x = np.asarray(x)
        n_row, n_col = self.shape
        if x.ndim != 1 or len(x) != n_col:
            raise ValueError('The vector must have length equal to the number of columns.')
        data = np.asarray(self.data)
        indices = np.asarray(self.indices, dtype=int)
        # row index of each stored entry, recovered from the row pointers
        row = np.repeat(np.arange(n_row), np.diff(self.indptr))
        # contribution of each stored entry to the result
        products = data * x[indices]
        result = np.zeros(n_row, dtype=products.dtype)
        # unbuffered addition: contributions of the same row are accumulated
        np.add.at(result, row, products)
        return result

## Slicing

Sparse matrices can be sliced like numpy arrays. The CSR format is more efficient for row slicing (although column slicing is possible), while the CSC format is more efficient for column slicing.

In [None]:
A = sparse.csr_matrix(A_dense)

In [None]:
A[:2]

In [None]:
A[1:4][:,2:]

In [None]:
A[np.array([0,2,4])]

## To do 

Consider the following matrix:

In [None]:
# random binary matrix of shape (20, 30), stored in CSR format
A = sparse.csr_matrix(np.random.randint(2, size = (20,30)))

* Extract the 10 rows of largest sums and build the corresponding matrix.

## Bonus

Complete all methods of the following CSR class.

In [None]:
class SparseCSR():
    '''Sparse matrix in CSR (Compressed Sparse Row) format.

    All methods are vectorized (no Python loop).

    Attributes
    ----------
    data : np.ndarray
        Stored values, row by row.
    indices : np.ndarray
        Column index of each stored value.
    indptr : np.ndarray
        Row pointers: the entries of row i are stored at positions
        indptr[i], ..., indptr[i + 1] - 1 of data and indices.
    shape : tuple
        Shape of the matrix, (n_row, n_col).
    '''
    def __init__(self, data: np.ndarray = None, indices: np.ndarray = None, 
                 indptr: np.ndarray = None, shape: tuple = None):
        self.data = data
        self.indices = indices
        self.indptr = indptr
        self.shape = shape

    def _row_index(self) -> np.ndarray:
        '''Return the row index of each stored entry.'''
        return np.repeat(np.arange(self.shape[0]), np.diff(self.indptr))

    @staticmethod
    def _gather(indptr, rows):
        '''Return the positions of the stored entries of the given rows
        (concatenated in the given order) and the number of entries per row.'''
        indptr = np.asarray(indptr)
        rows = np.asarray(rows, dtype=int)
        starts = indptr[rows]
        lengths = indptr[rows + 1] - starts
        ends = np.cumsum(lengths)
        total = int(ends[-1]) if len(ends) else 0
        # rank of each gathered entry within its own row
        within = np.arange(total) - np.repeat(ends - lengths, lengths)
        positions = np.repeat(starts, lengths) + within
        return positions, lengths

    @classmethod
    def _from_coo(cls, data, row, col, shape):
        '''Build a CSR matrix from COO triplets, summing duplicate entries.'''
        data = np.asarray(data)
        row = np.asarray(row, dtype=int)
        col = np.asarray(col, dtype=int)
        # sort by row (primary key), then by column
        order = np.lexsort((col, row))
        data, row, col = data[order], row[order], col[order]
        if len(data):
            # first position of each run of identical (row, col) pairs
            is_new = np.concatenate(([True], (row[1:] != row[:-1]) | (col[1:] != col[:-1])))
            first = np.flatnonzero(is_new)
            data = np.add.reduceat(data, first)
            row, col = row[first], col[first]
        counts = np.bincount(row, minlength=shape[0])
        indptr = np.concatenate(([0], np.cumsum(counts)))
        return cls(data, col, indptr, tuple(shape))
        
    def dot(self, x: np.ndarray) -> np.ndarray:
        '''Sparse-vector dot product.

        Parameters
        ----------
        x : np.ndarray
            Vector of length n_col.

        Returns
        -------
        y : np.ndarray
            Vector of length n_row.

        Raises
        ------
        ValueError
            If x is not a vector of length n_col.
        '''
        x = np.asarray(x)
        n_row, n_col = self.shape
        if x.ndim != 1 or len(x) != n_col:
            raise ValueError('The vector must have length equal to the number of columns.')
        indices = np.asarray(self.indices, dtype=int)
        # contribution of each stored entry to the result
        products = np.asarray(self.data) * x[indices]
        result = np.zeros(n_row, dtype=products.dtype)
        # unbuffered addition: contributions of the same row are accumulated
        np.add.at(result, self._row_index(), products)
        return result

    def dot_array(self, X: np.ndarray) -> np.ndarray:
        '''Sparse-array dot product.

        Parameters
        ----------
        X : np.ndarray
            Dense array of shape (n_col, k).

        Returns
        -------
        Y : np.ndarray
            Dense array of shape (n_row, k).

        Raises
        ------
        ValueError
            If X is not a 2-dimensional array with n_col rows.
        '''
        X = np.asarray(X)
        n_row, n_col = self.shape
        if X.ndim != 2 or X.shape[0] != n_col:
            raise ValueError('The array must have 2 dimensions and as many rows as the matrix has columns.')
        indices = np.asarray(self.indices, dtype=int)
        # each stored entry (i, j) contributes data * X[j] to row i of the result
        products = np.asarray(self.data)[:, np.newaxis] * X[indices]
        result = np.zeros((n_row, X.shape[1]), dtype=products.dtype)
        np.add.at(result, self._row_index(), products)
        return result
    
    def dot_sparse(self, X: 'SparseCSR') -> 'SparseCSR':
        '''Sparse-sparse dot product.

        Parameters
        ----------
        X : SparseCSR
            Sparse matrix with as many rows as this matrix has columns.

        Returns
        -------
        Y : SparseCSR
            Product of the two matrices (column indices sorted in each row).

        Raises
        ------
        ValueError
            If the shapes are not compatible.
        '''
        if self.shape[1] != X.shape[0]:
            raise ValueError('The shapes of the matrices are not compatible.')
        indices = np.asarray(self.indices, dtype=int)
        # each stored entry (i, k) of this matrix is combined with all entries of row k of X
        positions, lengths = self._gather(X.indptr, indices)
        source = np.repeat(np.arange(len(indices)), lengths)
        row = self._row_index()[source]
        col = np.asarray(X.indices, dtype=int)[positions]
        data = np.asarray(self.data)[source] * np.asarray(X.data)[positions]
        # products falling on the same (row, col) pair are summed
        return self._from_coo(data, row, col, (self.shape[0], X.shape[1]))
    
    def add_sparse(self, X: 'SparseCSR') -> 'SparseCSR':
        '''Add a sparse matrix.

        Parameters
        ----------
        X : SparseCSR
            Sparse matrix of the same shape.

        Returns
        -------
        Y : SparseCSR
            Sum of the two matrices (column indices sorted in each row).

        Raises
        ------
        ValueError
            If the shapes differ.
        '''
        if tuple(self.shape) != tuple(X.shape):
            raise ValueError('The matrices must have the same shape.')
        # stack the entries of both matrices; entries at the same position are summed
        data = np.concatenate((np.asarray(self.data), np.asarray(X.data)))
        row = np.concatenate((self._row_index(), X._row_index()))
        col = np.concatenate((np.asarray(self.indices, dtype=int), np.asarray(X.indices, dtype=int)))
        return self._from_coo(data, row, col, self.shape)
    
    def slice_row(self, index: np.ndarray) -> 'SparseCSR':
        '''Slice rows of a sparse matrix.

        Parameters
        ----------
        index : np.ndarray
            Rows to extract (integer indices or boolean mask).

        Returns
        -------
        Y : SparseCSR
            Matrix made of the selected rows, in the given order.
        '''
        n_row, n_col = self.shape
        # normalizes boolean masks and negative indices into row numbers
        rows = np.atleast_1d(np.arange(n_row)[index])
        positions, lengths = self._gather(self.indptr, rows)
        indptr = np.concatenate(([0], np.cumsum(lengths)))
        data = np.asarray(self.data)[positions]
        indices = np.asarray(self.indices)[positions]
        return type(self)(data, indices, indptr, (len(rows), n_col))
    
    def slice_col(self, index: np.ndarray) -> 'SparseCSR':
        '''Slice columns of a sparse matrix.

        Parameters
        ----------
        index : np.ndarray
            Columns to extract (integer indices or boolean mask).

        Returns
        -------
        Y : SparseCSR
            Matrix made of the selected columns, in the given order.
        '''
        n_row, n_col = self.shape
        # normalizes boolean masks and negative indices into column numbers
        cols = np.atleast_1d(np.arange(n_col)[index])
        # selection matrix S of shape (n_col, len(cols)) with S[cols[j], j] = 1,
        # so that the product with S extracts the selected columns
        ones = np.ones(len(cols), dtype=np.asarray(self.data).dtype)
        selector = self._from_coo(ones, cols, np.arange(len(cols)), (n_col, len(cols)))
        return self.dot_sparse(selector)
    
    def eliminate_zeros(self) -> 'SparseCSR':
        '''Eliminate zeros of a sparse matrix.

        The matrix is modified in place.

        Returns
        -------
        self : SparseCSR
            The matrix itself, without stored zeros.
        '''
        data = np.asarray(self.data)
        keep = data != 0
        # number of remaining entries per row (computed before updating indptr)
        counts = np.bincount(self._row_index()[keep], minlength=self.shape[0])
        self.data = data[keep]
        self.indices = np.asarray(self.indices)[keep]
        self.indptr = np.concatenate(([0], np.cumsum(counts)))
        return self