In [11]:
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

# preprepare strips punctuation, lowercases the text, and splits it into individual tokens/shingles on whitespace.
def preprepare(text):
    """Return the lowercase unigram tokens (shingles) of ``text``.

    Every character that is neither a word character nor whitespace is
    removed, the remainder is lowercased, and the result is split on runs
    of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens; empty when ``text`` holds no word
        characters.
    """
    # Punctuation is deleted rather than replaced by a space, so 'C++'
    # collapses to 'C' and "don't" to 'dont'.
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower().split()

# Quick sanity check of the tokenizer on a sample sentence.
text = 'My name is RAM'
print('The shingles (tokens) are:', preprepare(text))

# Number of permutations per MinHash signature; this value is passed both
# to get_forest (index build) and to predict (query).
permutations = 128

def get_forest(data, perms):
    """Build a MinHash LSH Forest over the ``'text'`` column of ``data``.

    Each entry of ``data['text']`` is tokenized with ``preprepare``, hashed
    into a MinHash signature, and added to the forest under its 0-based
    position in the column, so the keys returned by a query can be used as
    positional row indices. The elapsed build time is printed.

    Args:
        data: Object whose ``data['text']`` yields one text string per
            record (the script passes a pandas DataFrame).
        perms: Number of permutations used for every MinHash signature and
            for the forest itself.

    Returns:
        The indexed ``MinHashLSHForest``.
    """
    # perf_counter is monotonic and high-resolution, so the reported
    # duration cannot be skewed by wall-clock adjustments.
    start_time = time.perf_counter()

    forest = MinHashLSHForest(num_perm=perms)

    for i, text in enumerate(data['text']):
        m = MinHash(num_perm=perms)
        # A MinHash signature depends only on the set of distinct tokens,
        # so repeated tokens are hashed once instead of once per occurrence.
        for s in set(preprepare(text)):
            m.update(s.encode('utf8'))
        # Add the signature right away instead of collecting all of them
        # in an intermediate list first.
        forest.add(i, m)

    # The added keys only become searchable once index() has been called.
    forest.index()

    print('It took %s seconds to build forest.' % (time.perf_counter() - start_time))
    return forest

def predict(text, database, perms, num_results, forest):
    """Look up the records whose text is most similar to ``text``.

    The query is tokenized with ``preprepare`` and hashed into a MinHash
    signature, which is then used to query ``forest``. The keys returned by
    the forest are treated as positional row indices into ``database``.
    The elapsed query time is printed when at least one match is found.

    Args:
        text: Query string.
        database: pandas DataFrame with a ``'Book_title'`` column.
            Assumes its row order matches the data the forest was built
            from; verify against the caller.
        perms: Number of permutations for the query MinHash; should equal
            the value used to build ``forest``.
        num_results: Number of matches requested from the forest.
        forest: An indexed ``MinHashLSHForest``.

    Returns:
        A pandas Series with the ``'Book_title'`` values of the matching
        rows, or ``None`` when the forest returns no candidates.
        NOTE(review): the forest lookup is approximate, so fewer than
        ``num_results`` rows may come back and their order is presumably
        not a similarity ranking; confirm against the datasketch docs.
    """
    # perf_counter is monotonic and high-resolution, which matters for
    # millisecond-scale queries.
    start_time = time.perf_counter()

    m = MinHash(num_perm=perms)
    # A MinHash signature depends only on the set of distinct tokens, so
    # repeated tokens are hashed once.
    for s in set(preprepare(text)):
        m.update(s.encode('utf8'))

    idx_array = np.array(forest.query(m, num_results))
    if idx_array.size == 0:
        return None  # no candidate found for this query

    result = database.iloc[idx_array]['Book_title']

    print('It took %s seconds to query forest.' % (time.perf_counter() - start_time))

    return result

# Load the book catalogue. NOTE: this is a machine-specific absolute path.
db = pd.read_csv(r'C:\Users\Ami\Desktop\Entity Resolution\prog_book.csv')
# Text indexed for each book: title followed by description. pandas reads
# missing cells as NaN (a float); concatenating with NaN yields NaN, which
# would crash the regex in preprepare, so blanks are replaced by '' first.
db['text'] = db['Book_title'].fillna('') + ' ' + db['Description'].fillna('')
forest = get_forest(db, permutations)


# Number of matches requested from the forest for the query below.
num_recommendations = 20
# preprepare strips punctuation, so this query is reduced to the single
# token 'c'. Other queries that were tried are kept in the trailing comment.
Book_title = 'C++ '           #java art   #Google
result = predict(Book_title, db, permutations, num_recommendations, forest)
print('\n Top Entity Match(es) is(are) \n',  result)

The shingles (tokens) are: ['my', 'name', 'is', 'ram']
It took 1.1430654525756836 seconds to build forest.
It took 0.0060002803802490234 seconds to query forest.

 Top Entity Match(es) is(are) 
 192    Think Like a Programmer: An Introduction to Cr...
173    Modern C++ Design: Generic Programming and Des...
87     Learn You a Haskell for Great Good!: A Beginne...
88                   Learn You a Haskell for Great Good!
123    The Clean Coder: A Code of Conduct for Profess...
157    More Exceptional C++: 40 New Engineering Puzzl...
Name: Book_title, dtype: object
