In [1]:
import sys 
import numpy as np

class MinHashing:
    """Build a minHash signature of length ``k`` for a set of integers.

    The input set is expected to contain integers (e.g. hashed shingles).
    ``k`` hash functions of the form ``h_i(x) = (a[i] * x + b[i]) % c[i]``
    are used; for each of them the signature stores the element of the set
    that yields the smallest hash value.

    Parameters
    ----------
    a, b, c : sequence of int, optional
        Coefficients of the ``k`` hash functions. Each one that is ``None``
        or empty is drawn at random; each one that is supplied must contain
        exactly ``k`` integers (and ``c`` must not contain zero).
    k : int
        Number of hash functions, i.e. the length of every signature.

    Raises
    ------
    ValueError
        If a supplied coefficient sequence does not have length ``k`` or a
        supplied ``c`` contains zero.
    """

    def __init__(self, a=None, b=None, c=None, k=5):
        self.k = k
        # Start from a random family, then let explicitly supplied
        # coefficients override the random draw.
        self._randomize()
        if self._is_supplied(a):
            self.a = self._as_coefficients(a, "a")
        if self._is_supplied(b):
            self.b = self._as_coefficients(b, "b")
        if self._is_supplied(c):
            self.c = self._as_coefficients(c, "c")
            if np.any(self.c == 0):
                raise ValueError("modulus 'c' must not contain zero")

    @staticmethod
    def _is_supplied(values):
        """Return True when *values* holds at least one coefficient."""
        return values is not None and len(values) > 0

    def _as_coefficients(self, values, label):
        """Return *values* as an int64 array after checking it has length k."""
        coefficients = np.asarray(values, dtype=np.int64)
        if coefficients.shape != (self.k,):
            raise ValueError(
                "coefficient '%s' must contain exactly k=%d integers"
                % (label, self.k)
            )
        return coefficients

    def _randomize(self):
        """Draw fresh random triples (a, b, c), one per hash function."""
        # dtype is explicit because sys.maxsize does not fit the default
        # integer type on platforms where that type is 32-bit.
        # 'a' and 'c' start at 1: a == 0 would make the hash constant and
        # c == 0 would be a modulo by zero.
        self.a = np.random.randint(1, sys.maxsize, size=self.k, dtype=np.int64)
        self.b = np.random.randint(0, sys.maxsize, size=self.k, dtype=np.int64)
        self.c = np.random.randint(1, sys.maxsize, size=self.k, dtype=np.int64)

    def _h(self, number, index):
        """Return the value of the ``index``-th hash function for *number*."""
        # Convert to Python ints so that a * number cannot wrap around the
        # way fixed-width NumPy integers do.
        multiplier = int(self.a[index])
        offset = int(self.b[index])
        modulus = int(self.c[index])
        return (multiplier * int(number) + offset) % modulus

    def _minHash(self, set1, index):
        """Return the element of *set1* with the smallest ``index``-th hash.

        ``index`` selects which triple (a, b, c) is used by ``_h``. Ties are
        resolved in favour of the element met first while iterating; an
        empty *set1* yields 0.
        """
        return min(set1, key=lambda elem: self._h(elem, index), default=0)

    def signature(self, doc_set):
        """Return ``(id, signature)`` for a ``(id, set_of_integers)`` pair.

        The signature is a list of ``k`` entries; entry ``i`` is the minHash
        of the set under the ``i``-th stored triple (a, b, c).
        """
        doc_id = doc_set[0]
        doc = doc_set[1]
        return (doc_id, [self._minHash(doc, i) for i in range(self.k)])