In [21]:
import pysparnn as snn
from sklearn.feature_extraction import DictVectorizer

# Toy corpus: these strings are vectorized to build the index and are also
# passed to ClusterIndex as the values to hand back for matching rows.
data = [
    'hello world',
    'oh hello there',
    'Play it',
    'Play it again Sam',
]

# build a feature representation for each sentence
def scentence2features(scentence):
    """Return a bag-of-words dict for a sentence.

    The sentence is split on whitespace and every distinct token becomes a key
    mapped to 1 (presence only; repeated words are not counted).
    """
    return dict.fromkeys(scentence.split(), 1)

# one bag-of-words dict per sentence of the corpus
features_list = [scentence2features(sentence) for sentence in data]

# learn the word -> column mapping from the corpus
dv = DictVectorizer()
dv.fit(features_list)

# build the search index!
cp = snn.ClusterIndex(dv.transform(features_list), data)

# search the index: queries go through the same fitted vectorizer as the corpus
search_items = dv.transform([
    scentence2features('oh there'),
    scentence2features('Play it again Frank'),
])
cp.search(search_items, min_threshold=0.50, k=1, k_clusters=2, return_metric=False)

[['oh hello there'], ['Play it again Sam']]

In [1]:
# Copyright (c) 2016-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree. An additional grant
# of patent rights can be found in the PATENTS file in the same directory.


import numpy as np
import time
from scipy.sparse import csr_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.neighbors import LSHForest
from sklearn.feature_extraction import DictVectorizer

In [2]:
import pysparnn

# Fetch data

In [3]:
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)

# Tokenize every post on whitespace. Posts differ in length, so the container
# has to be a 1-D object array holding one token list per slot. Letting
# np.array() infer a shape from ragged nested lists raises ValueError on
# NumPy >= 1.24, and would silently build a 2-D string array if all posts
# happened to have the same length. Filling slot by slot avoids both.
tokenized = [x.split() for x in dataset.data]
docs = np.empty(len(tokenized), dtype=object)
for i, tokens in enumerate(tokenized):
    docs[i] = tokens

# Record ids: document i is identified by the integer i.
datas = np.arange(len(docs))

# Build LSH & pysparnn indexes

In [4]:
class SNNSearch:
    """Nearest-neighbour search over tokenized documents using a pysparnn ClusterIndex."""

    def __init__(self, docs, datas):
        """Fit a vocabulary on `docs` and index their word-presence features.

        docs:  iterable of token sequences, one per document.
        datas: values passed to ClusterIndex together with the feature rows.
        """
        self.dv = DictVectorizer()
        bags = self._bags_of_words(docs)
        self.dv.fit(bags)
        features = csr_matrix(self.dv.transform(bags), dtype=int)
        self.cp = pysparnn.ClusterIndex(
            features, datas, pysparnn.matrix_similarity.UnitCosineSimilarity)

    @staticmethod
    def _bags_of_words(docs):
        """Turn each token sequence into a {token: 1} presence dict."""
        return [dict.fromkeys(tokens, 1) for tokens in docs]

    def search(self, docs):
        """Vectorize `docs` with the fitted vocabulary and query the index.

        Returns whatever ClusterIndex.search yields for k=1, k_clusters=1,
        min_threshold=0.1 and return_metric=False.
        """
        features = csr_matrix(self.dv.transform(self._bags_of_words(docs)), dtype=int)
        return self.cp.search(
            features, return_metric=False, k=1, k_clusters=1, min_threshold=0.1)
        

# Wall-clock seconds spent vectorizing the corpus and building the PySparNN index.
t0 = time.time()
snn_search = SNNSearch(docs, datas)
print(time.time() - t0)

5.30067110062


In [5]:
class LSHSearch:
    """Nearest-neighbour search over tokenized documents using scikit-learn's LSHForest."""

    def __init__(self, docs):
        """Fit a vocabulary on `docs` and build a single-estimator LSH forest.

        docs: iterable of token sequences, one per document.
        """
        self.lshf = LSHForest(n_estimators=1, n_candidates=1, n_neighbors=1)
        self.dv = DictVectorizer()
        bags = self._bags_of_words(docs)
        self.dv.fit(bags)
        # Features are left as floats (no cast to int): floats are faster.
        self.lshf.fit(self.dv.transform(bags))

    @staticmethod
    def _bags_of_words(docs):
        """Turn each token sequence into a {token: 1} presence dict."""
        return [dict.fromkeys(tokens, 1) for tokens in docs]

    def search(self, docs):
        """Vectorize `docs` with the fitted vocabulary and return kneighbors() indices only."""
        # Features are left as floats (no cast to int): floats are faster.
        features = self.dv.transform(self._bags_of_words(docs))
        return self.lshf.kneighbors(features, return_distance=False)
    
# Wall-clock seconds spent vectorizing the corpus and fitting the LSH forest.
t0 = time.time()
lsh = LSHSearch(docs)
print(time.time() - t0)


4.43253207207


### Compare query speed and accuracy

In [6]:
import time
import random
def accuracy(result, truth):
    """Score each query as a hit (1) or a miss (0).

    result: one collection of returned items per query.
    truth:  the expected item for each query, in the same order.

    Returns a numpy array with a 1 wherever the expected item appears among
    that query's returned items and a 0 elsewhere.
    """
    return np.array([int(expected in found) for found, expected in zip(result, truth)])

def time_it(search_index, docs, query_index):
    """Run one batch of queries and report its wall-clock time and hit rate.

    search_index: object exposing search(docs).
    docs:         indexable collection of documents; docs[query_index] is the batch.
    query_index:  positions of the documents to use as queries; also the
                  expected answer for each query.

    Returns (elapsed_seconds, mean_accuracy). The document lookup is part of
    the timed region.
    """
    started = time.time()
    hits = search_index.search(docs[query_index])
    elapsed = time.time() - started

    hit_rate = accuracy(hits, query_index).mean()
    return elapsed, hit_rate

def time_it_n(search_index, docs, n=100, k_docs=100):
    """Benchmark `search_index` over `n` rounds of `k_docs` randomly sampled queries.

    Each round samples `k_docs` distinct document positions, times the batch
    with time_it and divides the elapsed time by `k_docs`.

    Returns (median per-query seconds, median accuracy) across the rounds.
    """
    per_query_times = []
    hit_rates = []
    positions = range(len(docs))
    for _ in range(n):
        query_index = random.sample(positions, k_docs)
        elapsed, hit_rate = time_it(search_index, docs, query_index)
        per_query_times.append(elapsed / k_docs)
        hit_rates.append(hit_rate)
    return np.median(per_query_times), np.median(hit_rates)

In [7]:
# Benchmark the LSH forest with time_it_n's defaults (100 rounds of 100 random queries).
lsh_time, lsh_accuracy = time_it_n(lsh, docs)
print('LSH median time per query: {0}'.format(lsh_time))
print('LSH median accuracy: {0}'.format(lsh_accuracy))

LSH median time per query: 0.0028506398201
LSH median accuracy: 1.0


In [8]:
# Benchmark the PySparNN index with time_it_n's defaults (100 rounds of 100 random queries).
snn_time, snn_accuracy = time_it_n(snn_search, docs)
print('PySparNN median time per query: {0}'.format(snn_time))
print('PySparNN median accuracy: {0}'.format(snn_accuracy))

PySparNN median time per query: 0.00248134613037
PySparNN median accuracy: 1.0


In [9]:
# Ratio of median per-query times; a value above 1 means the LSH forest was slower.
lsh_time / snn_time

1.1488279628576261

# SlowEuclideanDistance 

In [10]:
class SNNSearchEuclideanDistance:
    """PySparNN ClusterIndex wrapper that uses SlowEuclideanDistance as its metric.

    NOTE(review): unlike SNNSearch, documents are handed to ClusterIndex as a
    plain list of {token: 1} dicts rather than a vectorized matrix. The
    recorded output of this cell is "AttributeError: 'list' object has no
    attribute 'shape'", so the constructor presumably needs a matrix-like
    input. Confirm the expected format against the installed pysparnn version
    and vectorize before indexing.
    """
    def __init__(self, docs, datas):
        
        # One presence dict per document: every token maps to 1.
        features = []
        for d in docs:
            features.append(dict([(w, 1) for w in d]))
        # NOTE(review): `features` is still a list of dicts at this point.
        self.cp = pysparnn.ClusterIndex(features, datas, pysparnn.matrix_similarity.SlowEuclideanDistance)
        
    def search(self, docs):
        # Queries are built the same way as the indexed documents.
        dicts = []
        for d in docs:
            dicts.append(dict([(w, 1) for w in d]))
        # NOTE(review): queries are passed unvectorized as well; same concern as in __init__.
        return self.cp.search(dicts, return_metric=False, k=1, k_clusters=1, min_threshold=0.0, max_threshold=0.5)
        

# Wall-clock seconds spent building the SlowEuclideanDistance index.
# NOTE(review): the recorded output for this cell is an AttributeError, so
# `snn_search_euclidean` was never bound in the saved run.
t0 = time.time()
snn_search_euclidean = SNNSearchEuclideanDistance(docs, datas)
print(time.time() - t0)

AttributeError: 'list' object has no attribute 'shape'

In [None]:
# n=1: a single round of 100 random queries (k_docs keeps its default).
# Depends on `snn_search_euclidean` from the previous cell having been built.
snn_euclidean_time, snn_euclidean_accuracy = time_it_n(snn_search_euclidean, docs, n=1)
print('PySparNN SlowEuclideanDistance median time per query: {0}'.format(snn_euclidean_time))
print('PySparNN SlowEuclideanDistance median accuracy: {0}'.format(snn_euclidean_accuracy))