In [1]:
import numpy as np
import scipy.sparse
def SimHash(X, k):
    '''Documentation of SimHash:
        Parameters:
           X: is a sparse array (numpy array or can be scipy sparse matrix) of shape (n, d),
              where n is number of samples and d is feature dimension
           k: reduced dimension value (must be postive integer)

        Returns:
          scipy sparse csr_matrix of shape (n, k)
    '''
    X = scipy.sparse.csr_matrix(X)
    SignMatrix = np.random.choice([1, -1], (X.shape[1], k), p=[0.5,0.5])
    new_X = scipy.sparse.csr_matrix.dot(X, SignMatrix)
    new_X[new_X < 0] = 0
    new_X[new_X > 0] = 1
    return scipy.sparse.csr_matrix(new_X)

def Hamming_distance(a,b):
    'Funtion to calculate hamming distance between array a and b'
    ham = 0
    for i in range(a.shape[1]):
        if a[:,i] != b[:,i]:
            ham += 1
    return ham

In [3]:
#Example
X = scipy.sparse.load_npz('Sample.npz')
print('Shape of actual matrix:', X.shape)

new_X = SimHash(X, 1000)
print('Shape of compressed matrix:', new_X.shape)

Shape of actual matrix: (100, 102660)
Shape of compressed matrix: (100, 1000)


In [4]:
#hamming estimate
a = X[0,:]
b = X[1,:]
#hamming distance between a and b taken from actual matrix
print('Hamming distance between a and b is :', Hamming_distance(a,b))

a_new = new_X[0,:]
b_new = new_X[1,:]
# a_new and b_new corresponds to compressed sketch of a and b
print('Hamming estimate of a and b  using SimHash sketch is :', Hamming_distance(a_new, b_new))


Hamming distance between a and b is : 225
Hamming estimate of a and b  using SimHash sketch is : 487
