In [2]:
# Import packages
from datasketch import MinHash, MinHashLSHForest

In [3]:
# Tokenize each product: every non-blank line of products.txt is "<sku>,<title>".
skus = []
titles = []
# The titles contain non-ASCII characters (e.g. 'Câm.', '4.7”'), so do not rely
# on the platform's default encoding.
# NOTE(review): assumes products.txt is saved as UTF-8 — confirm.
with open('products.txt', 'r', encoding='utf-8') as products:
    for line in products:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        # Split on the first comma only, so a comma inside the title stays in the title.
        sku, title = line.split(',', 1)
        skus.append(sku)
        # Whitespace-separated tokens as a set: duplicates and word order are dropped.
        titles.append(set(title.split()))

print(skus)
print(titles)

['217024200', '218670000', '123490270', '216973200', '155536400', '216412900', '193394900', '193393800', '193358500']
[{'Gavetas', '-', 'Araplac', '3', '10', 'Casal', 'Guarda-roupa', 'Rusti', '18490-88', 'Portas'}, {'-', 'Espelho', '3', 'Casal', 'City', 'Madesa', 'Guarda-roupa', '1056-1E', 'com', 'Portas'}, {'Gavetas', '-', 'Araplac', 'Lucca', '3', 'Casal', 'de', 'Guarda-roupa', 'Correr', 'Portas'}, {'Apple', '-', 'Retina', '12MP', '6s', 'Ouro', '4G', '4.7”', 'Câm.', 'Rosa', 'Proc.', 'A9', '32GB', '11', 'iPhone', 'Tela', '+', 'iOS', 'Selfie', '5MP'}, {'-', 'Ouro', '4G', 'Câm.', '13MP', 'Duo', 'Rosa', 'Plus', 'Chip', '32GB', 'Smartphone', 'Dual', '+', 'G5s', 'Moto', 'Motorola'}, {'Samsung', 'Dourado', '16GB', '-', 'Câm', '4G', '13MP', 'Chip', 'Galaxy', 'Smartphone', 'Dual', '+', 'Selfie', 'Metal', '5MP', 'J7', 'Flash'}, {'Conversor', '-', '5102', 'TV', 'USB', 'LED', '3', 'HDMI', 'Digital', '43PFG5102', '2', '43”', 'Smart', 'Série', 'Philips', 'Tela', '5.5”'}, {'Conversor', 'webOS', '-',

In [4]:
# Build MinHash Signatures for each product title
NUM_PERM = 128  # permutation count shared by the forest and every signature
forest = MinHashLSHForest(num_perm=NUM_PERM)
signatures = []

# Walk the parsed products themselves instead of a hard-coded count, so the
# cell keeps working when the number of products changes.
for sku, tokens in zip(skus, titles):
    signature = MinHash(num_perm=NUM_PERM)
    for token in tokens:
        # MinHash.update() takes bytes, hence the explicit encode.
        signature.update(token.encode('utf8'))
    forest.add(sku, signature)
    # Keep the signature around: it is reused later as the query for this product.
    signatures.append(signature)

print(len(signatures))

9


In [5]:
# Build the index for search. In datasketch, keys added to the forest are not
# searchable until index() has been called.
forest.index()

# For each product, ask the forest for up to 3 candidate similar products.
# Iterate over the products themselves rather than a hard-coded count.
# Note: a product is normally returned among its own candidates.
for sku, signature in zip(skus, signatures):
    result = forest.query(signature, 3)
    print('Produtos candidatos a similares ao sku {}:'.format(sku), result)

Produtos candidatos a similares ao sku 217024200: ['218670000', '123490270', '217024200']
Produtos candidatos a similares ao sku 218670000: ['218670000', '217024200', '193394900']
Produtos candidatos a similares ao sku 123490270: ['218670000', '123490270', '217024200']
Produtos candidatos a similares ao sku 216973200: ['216973200', '155536400']
Produtos candidatos a similares ao sku 155536400: ['216412900', '216973200', '155536400']
Produtos candidatos a similares ao sku 216412900: ['216412900', '155536400']
Produtos candidatos a similares ao sku 193394900: ['218670000', '123490270', '193394900']
Produtos candidatos a similares ao sku 193393800: ['193393800', '193358500']
Produtos candidatos a similares ao sku 193358500: ['193393800', '193358500']


In [6]:
# TODO: comment on...

# - False negatives and false positives among the candidates returned by the LSH query.
# - The computational complexity of this approach.