In [348]:
import hashlib
import random
import numpy as np
import sys


In [349]:
class Shingling:
    """Build the set of hashed k-shingles of a document.

    A k-shingle is a substring of ``k`` consecutive characters. Every distinct
    shingle is hashed with MD5 and the digest is kept as a Python int, so a
    document is represented by a (plain, unordered) set of integers.
    """

    def __init__(self):
        pass

    def shingle(self, document, k = 10):
        """Return the set of hashed k-shingles of ``document``.

        Parameters
        ----------
        document: string
                  A string that contains the text of the document to shingle
        k: int, optional
           the shingle size (number of characters per shingle), must be >= 1

        Returns
        -------
        set of int
            One integer (the MD5 digest read as a base-16 number) per distinct
            shingle. The set is empty when the document is shorter than ``k``.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1. A zero or negative size would otherwise
            silently hash empty or arbitrary slices of the document.
        """
        if k < 1:
            raise ValueError("The shingle size k must be a positive integer")
        # The last shingled text is kept on the instance, as callers may read it back.
        self.document = document
        # range() is empty when len(document) < k, which yields an empty set.
        return {
            int(hashlib.md5(document[start:start + k].encode()).hexdigest(), 16)
            for start in range(len(document) - k + 1)
        }
            
        
        

In [350]:
class Matrix:
    """Builds matrices derived from the hashed shingle sets of several documents."""

    def characteristicMatrix(self, shinglesList ):
        """Build the characteristic matrix of a list of hashed shingle sets.

        Parameters
        ----------
        shinglesList: list of set of int
            One set of hashed shingles per document.

        Returns
        -------
        numpy.ndarray of int, shape (number of distinct shingles, number of documents)
            Entry ``[r][c]`` is 1 when the r-th distinct shingle occurs in
            document ``c`` and 0 otherwise. Rows follow the ascending order of
            the shingle hashes, so the layout is reproducible between runs.
        """
        # Distinct shingles over all documents; sorting fixes the row order.
        rows = sorted(set().union(*shinglesList))
        # The np.int alias was removed from NumPy; the builtin int is the same dtype.
        charMatrix = np.zeros((len(rows), len(shinglesList)), dtype=int)

        # Map each shingle to its row once, so only the shingles actually present
        # in a document are visited instead of every (row, column) combination.
        rowOf = {value: r for r, value in enumerate(rows)}
        for c, shingles in enumerate(shinglesList):
            for value in shingles:
                charMatrix[rowOf[value], c] = 1
        return charMatrix
        

In [351]:
class CompareSets:
    """Compares two sets of integers (for example two sets of hashed shingles).

    The comparison offered is the Jaccard similarity.
    """

    def __init__(self):
        pass

    def jaccardSimilarity(self, set1, set2):
        """Return the Jaccard similarity |set1 & set2| / |set1 | set2|.

        Parameters
        ----------
        set1, set2: set
            The two sets to compare.

        Returns
        -------
        float
            A value between 0 and 1. Two empty sets are identical, so they are
            given similarity 1.0 instead of failing on a 0/0 division.

        Raises
        ------
        TypeError
            If either argument is not a ``set``.
        """
        #check that the arguments are two sets
        if not isinstance(set1, set):
            raise TypeError("The fisrt argument must be a Set")
        if not isinstance(set2, set):
            raise TypeError("The second argument must be a Set")
        union = set1 | set2
        if not union:
            # Both sets are empty: the ratio is undefined, use the usual convention.
            return 1.0
        return len(set1 & set2) / len(union)

In [352]:
class CompareSignatures:
    """Compares two signatures (equally long sequences of integers).

    The similarity is the fraction of positions in which the two signatures
    agree, i.e. one minus their normalised Hamming distance.
    """

    def __init__(self):
        pass

    def signatureSimilarity(self, sign1, sign2):
        """Return the fraction of positions where ``sign1`` and ``sign2`` agree.

        Parameters
        ----------
        sign1, sign2: sequence of int
            The two signatures; they must have the same length.

        Returns
        -------
        float
            Number of agreeing positions divided by the signature length.

        Raises
        ------
        ValueError
            If the signatures have a different number of elements.
        ZeroDivisionError
            If both signatures are empty (the fraction is undefined).
        """
        #check that the signatures have the same number of elements
        if len(sign1) != len(sign2):
            # The previous code raised an undefined name here, which produced a
            # NameError instead of a meaningful exception.
            raise ValueError("The signatures have different number of elements")
        agreements = sum(1 for left, right in zip(sign1, sign2) if left == right)
        return agreements / len(sign1)

In [353]:
class MinHashing:
    """Builds a minHash signature (a list of length k) from a set of integers
    (for example a set of hashed shingles).

    The i-th hash function is ``h_i(x) = (a[i]*x + b[i]) % c[i]``; the
    coefficient lists ``a``, ``b`` and ``c`` are either supplied by the caller
    or filled in by ``randomize()``.
    """

    def __init__(self, a = None, b = None, c = None, k = 5):
        """
        Parameters
        ----------
        a, b, c: list of int, optional
            Coefficients of the hash functions. They are copied, so a later
            ``randomize()`` never modifies the lists owned by the caller.
        k: int, optional
            Number of hash functions, i.e. the length of a signature.
        """
        self.a = [] if a is None else list(a)
        self.b = [] if b is None else list(b)
        self.c = [] if c is None else list(c)
        self.k = k

    def randomize(self):
        """Append k random triples (a, b, c) to the stored coefficient lists."""
        # k is the number of hash functions (and so triples a,b,c) to create
        for _ in range(self.k):
            # a starts at 1 so the hash never degenerates to a constant;
            # c starts at 1 because it is the modulus (0 would divide by zero).
            self.a.append(random.randint(1, sys.maxsize))
            self.b.append(random.randint(0, sys.maxsize))
            self.c.append(random.randint(1, sys.maxsize))

    def h(self, number, index):
        """Hash ``number`` with the ``index``-th triple: (a*number + b) % c."""
        return (self.a[index]*number+self.b[index])%self.c[index]

    def minHash(self, set1, index = 1):
        """Return the element of ``set1`` whose hash value is the smallest.

        Parameters
        ----------
        set1: set of int
        index: int, optional
            Identifies which triple a,b,c to use for the hash function ``h()``.

        Returns
        -------
        int
            The element itself (not its hash value); 0 when ``set1`` is empty.
            On equal hash values the element met first while iterating wins.

        Raises
        ------
        TypeError
            If ``set1`` is not a ``set``.
        """
        if not isinstance(set1, set):
            raise TypeError("The fisrt argument must be a Set")
        # min() evaluates h once per element and puts no upper bound on the
        # hash values, unlike a running minimum initialised to sys.maxsize.
        return min(set1, key=lambda elem: self.h(elem, index), default=0)

    def signature(self, set1):
        """Return the signature of ``set1``: the minHash under each of the
        first k stored triples a,b,c, in order."""
        return [self.minHash(set1, i) for i in range(self.k)]
  

In [354]:
class LSH:
    """Locality-sensitive hashing of signatures with the banding technique.

    Every signature is cut into ``b`` bands of ``r`` consecutive values; two
    signatures form a candidate pair when at least one band hashes to the
    same value in both.
    """

    def __init__(self, b = 20, r = 5):
        """
        Parameters
        ----------
        b: int, optional
            Number of bands.
        r: int, optional
            Number of signature values (rows) per band.
        """
        # (1/b)**(1/r) is stored as t; it is commonly used as the approximate
        # similarity threshold of a b-band / r-row scheme.
        self.t = (1/b)**(1/r)
        self.b = b
        self.r = r

    def h(self, intList):
        """Hash a list of integers into the range [0, 119983).

        The i-th number is weighted by 5**(2**i) and the weighted sum is taken
        modulo the prime 119983. Reducing after every step gives the same
        result as reducing once at the end, without building huge integers.
        """
        # p is a prime number , a is the starting multiplier
        p = 119983
        a = 5
        total = 0
        for number in intList:
            total = (total + a*number) % p
            a = (a*a) % p
        return total

    def lsh(self, signList):
        """Return the candidate pairs among the signatures in ``signList``.

        Parameters
        ----------
        signList: list of list of int
            One signature per document. Bands that reach past the end of a
            signature are hashed on whatever values remain (possibly none).

        Returns
        -------
        list of [int, int]
            Pairs ``[i, j]`` with ``i < j`` of signature indexes that share the
            hash of at least one band, in ascending order.
        """
        pairs = set()
        for band in range(self.b):
            start = band*self.r
            # bucket the columns by the hash of this band
            buckets = {}
            for col, signature in enumerate(signList):
                key = self.h(signature[start:start+self.r])
                buckets.setdefault(key, []).append(col)
            # columns are appended in ascending order, so i < j below
            for cols in buckets.values():
                for position, i in enumerate(cols):
                    for j in cols[position+1:]:
                        pairs.add((i, j))
        # sorted: the same order as scanning every i < j combination in turn
        return [[i, j] for i, j in sorted(pairs)]
    

In [395]:
# Read the dataset: every line of the topic file becomes one entry of the docs list.
import os
import csv  # NOTE(review): csv is not used by the code visible in this notebook - confirm before removing
# The dataset folder is looked up inside the current working directory.
path = os.getcwd()+"/OpinosisDataset1.0/topics/sound_ipod_nano_8gb.txt.data"
#for filename in os.listdir(directory):
    
# open() is called without an encoding, so the platform default text encoding is used.
# split('\n') keeps blank lines as empty strings in docs (including a trailing one
# when the file ends with a newline).
with open(path) as f:
    docs = f.read().split('\n')


In [398]:
# Toy documents for quick manual experiments: uncomment the docs line below to
# run the pipeline on them instead of the dataset loaded earlier.
doc1 = "wetfdsghsfhdr"
doc2 = "egdfbdbbbbddddsef"
doc3 = "sfdseeeewwwwwwfd"
doc4 = "sfdfdsfdsfd"
#docs = [doc1,doc2,doc3,doc4]

# Parameters of the pipeline
shingleSize = 5
band = 20
r = 5
# signature-similarity threshold derived from the banding parameters
threshold = (1/band)**(1/r)
jaccardThreshold = 0.5

shingling = Shingling()
compareSets = CompareSets()
matrix = Matrix()
compareSignatures = CompareSignatures()
# LSH reads band*r values from each signature, so the signature length is
# derived from the banding parameters (20*5 = 100) instead of being hard-coded.
minHashing = MinHashing(k=band*r)
minHashing.randomize()
lsh = LSH(band,r)


# create a list containing the set of hashed shingles of each document
shinglesList = [shingling.shingle(doc,shingleSize) for doc in docs]
# print("Charactheristic matrix: \n {}".format(matrix.characteristicMatrix(shinglesList)))
print("log: finished to create the shingles for all the documents")

# create the list of signatures of the previous shingle sets
signList = [minHashing.signature(shingles) for shingles in shinglesList]
print("log: finished to create the signatures")

# using LSH find the candidatePairs
candidatePairs = lsh.lsh(signList)
# print(candidatePairs)

# check that candidate pairs have similar signature with threshold (1/band)**(1/r),
# otherwise remove the candidate pair
checkedPairs = []
for pair in candidatePairs:
    signSimilarity = compareSignatures.signatureSimilarity(signList[pair[0]],signList[pair[1]])
    if signSimilarity > threshold:
        checkedPairs.append(pair)
# print(checkedPairs)

# for the checkedPairs, check the shingles similarity and remove the ones under the defined jaccard Threshold (Optional)
signCheckedPairs = []
for pair in checkedPairs:
    # Documents shorter than shingleSize (blank lines, for instance) have no
    # shingles at all. Two of them get identical signatures and so arrive here,
    # but their Jaccard similarity is undefined (0/0): skip such pairs.
    if not (shinglesList[pair[0]] or shinglesList[pair[1]]):
        continue
    shingleSimilarity = compareSets.jaccardSimilarity(shinglesList[pair[0]],shinglesList[pair[1]])
    if shingleSimilarity > jaccardThreshold:
        signCheckedPairs.append([pair[0],pair[1],shingleSimilarity])

# print the results:
if not signCheckedPairs:
    print("all the documents are different!")
else:
    for pair in signCheckedPairs:
        print("document {} and document {} are similar {} (jaccardSimilarity)".format(pair[0],pair[1],pair[2]))
        print("document {} and document {} ".format(docs[pair[0]],docs[pair[1]]))


log: finished to create the shingles for all the documents
log: finished to create the signatures
document 62 and document 76 are similar 0.6170212765957447 (jaccardSimilarity)
document  It is compact and the sound quality is very good . and document  the sound quality is very good . 
