In [2]:
import csv
import numpy as np
import re
import hashlib
import itertools
from collections import Counter
from pprint import pprint
import pandas as pd

In [3]:
# Produces shingles for a given text
def shingling(text, k = 2, trim = True):
    # possibly trim text
    s = re.sub('[\s+]', '', text) if trim else text

    # produce k-shingles and map shingles to shingle IDs using hash()
    shingles = {s[i:i + k] for i in range(len(s) - k + 1)}

    hashes = {hash(i) for i in shingles}
    return hashes

shingling('this is a test')

{-5604688695559431049,
 -4709290484209255997,
 -3048653072066474761,
 -2597013127494721567,
 -2311060957885767026,
 -1221511435840044675,
 638636724992558854,
 2790279245339122502,
 7881566121019139748}

In [4]:
# Jaccard similarity for two sets
def compareSets(setA, setB):
    return len(setA.intersection(setB)) / len(setA.union(setB))

setA = shingling("how a nice day")
setB = shingling("how are you today")   
compareSets(setA,setB)

0.2777777777777778

In [5]:
# Creates the Minhash signature of length n from the shingle set
algorithms = [x for x in hashlib.algorithms_guaranteed if x not in ["shake_128", "shake_256"]] 
# they need a fixed-length argument(The shake_128() and shake_256() algorithms provide variable length digests with length_in_bits//2 up to 128 or 256 bits of security)

#hash a str into 8 digits using hash functions (hexdigest returns a HEX string representing the hash)
def hashWith(alg, i):
    return int(hashlib.new(alg,str(i).encode('UTF-8')).hexdigest(), 16)#位数截取

def minHashing(shingleSet, n = 3):
    # throw error if not enough hash functions are available
    if (n > len(algorithms)):
        raise ValueError('The maximum number of hash functions available is {0}.'.format(len(algorithms)))

    # iterate over n hash functions and compute h_min(s) for the set.
    signature = [min(hashWith(alg, i) for i in shingleSet) for alg in algorithms[0:n]] 
    return signature

setA = shingling("how a nice day")
signature = minHashing(setA, 12)
signature

[3468371960717953315807008496198668124375223090456972966412311387751206841934,
 397118405388484913744664667614821681691839482159293229557058501465,
 226757767749371909531207584897730841400940432352794478572285057525,
 28920365370204966916117120546581656280030706346385192108829855539375900627416,
 3559200500008080058854152276585065511378008220259491308366799905749890829006655223110333460537426623780409604896644,
 4840777009214519595518720026003457255,
 1293996948067994664268517545657085027782055034557215295262305348393180854772056794107009603915702251683679573050436838094426658926769236532885017192192506,
 703550327303886624207891997267108181387205307098285552408488502533057369094895775402362393953077113127388711491245773329631060442617722466041690294722275,
 48843565933115204161573791519806065909052469633859268220806802239622918433718,
 302287561583754436898542403105080676622730707280,
 7334246456494885270275531798370060210075921267825310803090677779247735142289605024194044516899443532

In [12]:
# Compare the signatures, the returned probability will approximate the jaccard similarity of the original shingle sets
def compareSignatures(signatureA, signatureB):
    if (len(signatureA) != len(signatureB)):
        raise ValueError('The signatures have different length({0}, {1} respectively) and should not be compared!', len(signatureA), len(signatureB))

    A = np.array(signatureA)
    B = np.array(signatureB)
    count = np.count_nonzero(A==B)
    # Important: Not jaccard similarity -> Probability instead (number_of_same/number_of_total)
    probability = count / len(signatureA)
    #print(A==B, count, probability)
    
    return probability

setA = shingling("what about you")
setB = shingling("how about you")
print("Set A", setA)
print("Set B", setB)
similarity = compareSets(setA, setB)
print(similarity)
signatureA = minHashing(setA, 12)
signatureB = minHashing(setB, 12)
print("Signature A", signatureA)
print("Signature B", signatureB)
compareSignatures(signatureA, signatureB)

Set A {-6261026196006845022, 6260079725503236383, 5760101317112817636, 2790279245339122502, 1679236819157627884, 5354901128432663950, 3834972162194260274, 8770486948190440786, -389265070190448133, -7634035276630498110}
Set B {-140246809374288832, -6261026196006845022, 6260079725503236383, -7408966727226440914, 1679236819157627884, 5354901128432663950, 5143716844873627055, 3834972162194260274, -7634035276630498110}
0.46153846153846156
Signature A [25853483646181122659267848476276115042781602294605427968987598776947588829706, 792368716865843020180744681815288505202435654755365803751882185774, 1365580069627314265996603121570410320322217725498685698844222116357, 2834720272755034997510467689504801536503115861925810927247086108151867651177, 520490046288962548321443995835636439242001397764372396157513853741360305995411940935422206495474270653685378214341, 3070406864355700703117178557952178062, 1461895404756468104106637779868947662621725968036964887043867979940323413158320858642351985365034910

0.5833333333333334

In [10]:
# Locality sensitive hashing , input parameter: # of bands to separate the signatures into
def lhs(signatures, similarity_threshold, nr_bands = 5, nr_buckets = 5):
    if (len(signatures) < 1 or not all(len(s) == len(signatures[0]) for s in signatures)):
        raise ValueError('The signatures need to have all the same length and be non empty.')

    # 1.) Iterate over signatures, cut in bands and hash each band into a bucket
    buckets = [set() for x in range(0, nr_buckets)]
    bands = np.linspace(0, len(signatures[0]), nr_bands).astype(int).tolist()

    for index, signature in enumerate(signatures):
        for i in range(0, nr_bands-1):
            band_start = bands[i]
            band_end = bands[i+1]
            # join band to be hashed "as one entity"
            band = "".join(str(x) for x in signature[band_start:band_end])
            bucket = hash(band) % nr_buckets
            # add the signature-set-join ("identifier") to the bucket
            #buckets[bucket].add("".join(str(x) for x in signature)) # store stringified signature
            buckets[bucket].add("index %s" % index) # replaced above line with a human readable string in order to validate results after
    
    # 2.) Use sets of buckets to determine candidate pairs based on threshold 
    relevant_buckets = [x for x in buckets if len(x) >= 2] # only check buckets with more than one signature       
    relevant_pairs = []
    for bucket in relevant_buckets:
        # get all relevant pairs from all buckets and append to a huge list
        pairs = [x for x in itertools.combinations(bucket, 2) if x[0] != x[1]]
        relevant_pairs += pairs

    count = Counter(relevant_pairs)
    # count the occourences of each pair in a list and see if the pairs similarity (based on same hashed buckets) crossed the threshold
    indices = [index for index, x in enumerate(count.values()) if (x/(nr_bands-1)) >= similarity_threshold] 
    
    # use the indices for the final candidate pairs
    candidate_pairs = [pair for index, pair in enumerate(count.keys()) if index in indices]

    return candidate_pairs

print(lhs([signatureA, signatureB], 0.3))
print(lhs([signatureA, signatureB], 0.8))

[('index 0', 'index 1')]
[]


In [11]:
import codecs
from pprint import pprint

# Import hotel review data
f = codecs.open('data.txt', encoding='utf-8')
dataSet = [line.strip() for line in f]

print('avg char count', sum(len(d) for d in dataSet)/len(dataSet))

# Executes given comparison function over combinations of elements in input array
def compare(fn, arr):
    return [(s[0][0], s[1][0], fn(s[0][1], s[1][1])) for s in itertools.combinations(arr, 2)]

# jaccard similarity with shinglings
shinglings = [(i+1, shingling(t, k=4)) for i, t in enumerate(dataSet)] # value i+1 is the line number and it is stored to evalute results
similarities = compare(compareSets, shinglings)

similarities = [s for s in similarities if s[2] > 0.3]
pprint(similarities)

# jaccard similarity with minHashing
signatures = [(i, minHashing(s, n=12)) for i, s in shinglings]
similarities = compare(compareSignatures, signatures)

similarities = [s for s in similarities if s[2] > 0.4]
pprint(similarities)

# local sensitive hashing
pprint(lhs([s for i, s in signatures], 0.5, nr_buckets=200)) # index i represents the comment with line number i+1

avg char count 268.6708229426434
[(73, 74, 1.0),
 (90, 186, 0.4090909090909091),
 (90, 194, 0.3235294117647059),
 (102, 140, 0.3492063492063492),
 (153, 154, 1.0),
 (178, 241, 0.375),
 (178, 395, 0.3333333333333333),
 (235, 347, 0.34146341463414637),
 (239, 258, 0.34285714285714286),
 (239, 347, 0.3333333333333333),
 (241, 395, 0.30303030303030304),
 (258, 320, 0.34285714285714286),
 (258, 347, 0.34375),
 (258, 351, 0.3055555555555556),
 (258, 366, 0.3783783783783784),
 (320, 328, 0.34615384615384615),
 (320, 330, 0.3055555555555556),
 (320, 347, 0.3333333333333333),
 (320, 377, 0.3055555555555556),
 (320, 382, 0.32075471698113206),
 (330, 377, 0.3142857142857143)]
[(6, 90, 0.4166666666666667),
 (6, 134, 0.4166666666666667),
 (60, 185, 0.4166666666666667),
 (73, 74, 1.0),
 (84, 217, 0.4166666666666667),
 (91, 180, 0.4166666666666667),
 (99, 320, 0.4166666666666667),
 (99, 366, 0.4166666666666667),
 (103, 318, 0.4166666666666667),
 (153, 154, 1.0),
 (178, 186, 0.5833333333333334),
 (178