In [None]:
pip install faiss-cpu  sentence-transformers annoy nmslib

In [None]:
import pickle
import faiss
import tensorflow_datasets as tfds
import numpy as np
from sentence_transformers import SentenceTransformer
import annoy, nmslib
model = SentenceTransformer('all-MiniLM-L6-v2')

# Exhaustive Search

https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/

In [None]:
ratings = tfds.load("movielens/100k-ratings", split="train")
df = tfds.as_dataframe(ratings)
df.head()

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,raw_user_age,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,45.0,[7],b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",46.0,879024327,True,b'138',4,b'doctor',4.0,b'53211'
1,25.0,"[4, 14]",b'709',b'Strictly Ballroom (1992)',32.0,875654590,True,b'92',5,b'entertainment',2.0,b'80525'
2,18.0,[4],b'412',"b'Very Brady Sequel, A (1996)'",24.0,882075110,True,b'301',17,b'student',4.0,b'55439'
3,50.0,"[5, 7]",b'56',b'Pulp Fiction (1994)',50.0,883326919,True,b'60',4,b'healthcare',4.0,b'06472'
4,50.0,"[10, 16]",b'895',b'Scream 2 (1997)',55.0,891409199,True,b'197',18,b'technician',3.0,b'75094'


In [None]:
df_unique = df.drop_duplicates('movie_id').copy()[['movie_id', 'movie_title']]
df_unique['vector'] = model.encode(df_unique['movie_title'].tolist()).tolist()

# use faiss to search for all
vector = np.array(df_unique['vector'].tolist())
index = faiss.IndexFlatL2(vector.shape[1])
index.add(vector.astype('float32'))

# query
query = np.array(df_unique[df_unique['movie_title'] == b'Toy Story (1995)'].vector.tolist()).astype('float32')
distances, indices = index.search(query, 10) 
[df_unique['movie_title'].tolist()[i] for i in indices[0]]

[b'To Wong Foo, Thanks for Everything! Julie Newmar (1995)',
 b'Toy Story (1995)',
 b'Top Gun (1986)',
 b'To Kill a Mockingbird (1962)',
 b'To Die For (1995)',
 b'Tombstone (1993)',
 b'Tommy Boy (1995)',
 b'To Gillian on Her 37th Birthday (1996)',
 b'Tomorrow Never Dies (1997)',
 b'To Catch a Thief (1955)']

# Approximate Nearest Neighbor

https://towardsdatascience.com/comprehensive-guide-to-approximate-nearest-neighbors-algorithms-8b94f057d6b6
https://towardsdatascience.com/understanding-locality-sensitive-hashing-49f6d1f6134

In [None]:
# Ann using annoy search forest
vectors = np.array(df_unique['vector'].tolist())
query = np.array(df_unique[df_unique['movie_title'] == b'Toy Story (1995)'].vector.tolist()).astype('float32')

index = annoy.AnnoyIndex(vectors.shape[1])
for i, vec in enumerate(vectors):
    index.add_item(i, vec.tolist())
index.build(5)

indices = index.get_nns_by_vector(query[0].tolist(), 10, search_k=5)    
[df_unique['movie_title'].tolist()[i] for i in indices]    

  """


[b'To Kill a Mockingbird (1962)',
 b'To Wong Foo, Thanks for Everything! Julie Newmar (1995)',
 b'To Catch a Thief (1955)',
 b'Toy Story (1995)',
 b'Top Gun (1986)',
 b'Tomorrow Never Dies (1997)',
 b'To Die For (1995)',
 b'Tombstone (1993)',
 b'Tommy Boy (1995)',
 b'To Gillian on Her 37th Birthday (1996)']

In [None]:
# ANN using Faiss
# use faiss to search for all
vector = np.array(df_unique['vector'].tolist())
# num_bits — A larger value will give more accurate results, but larger indexes.
index = faiss.IndexLSH(vector.shape[1], 8)
index.add(vector.astype('float32'))

# query
query = np.array(df_unique[df_unique['movie_title'] == b'Toy Story (1995)'].vector.tolist()).astype('float32')
distances, indices = index.search(query, 10) 
[df_unique['movie_title'].tolist()[i] for i in indices[0]]

[b'Scream 2 (1997)',
 b'To Catch a Thief (1955)',
 b'To Wong Foo, Thanks for Everything! Julie Newmar (1995)',
 b'Scream (1996)',
 b'Toy Story (1995)',
 b'Top Gun (1986)',
 b'Tomorrow Never Dies (1997)',
 b'To Die For (1995)',
 b"Schindler's List (1993)",
 b'To Kill a Mockingbird (1962)']

In [None]:
# ANN using Hierarchical Navigable Small World Graphs(hnsw)
vectors = np.array(df_unique['vector'].tolist())
query = np.array(df_unique[df_unique['movie_title'] == b'Toy Story (1995)'].vector.tolist()).astype('float32')

index = nmslib.init(method='hnsw', space='cosinesimil')
index.addDataPointBatch(vectors)
index.createIndex({'post': 2})

indices = index.knnQuery(query, k=10)
[df_unique['movie_title'].tolist()[i] for i in indices[0]]

[b'To Gillian on Her 37th Birthday (1996)',
 b'Touch (1997)',
 b'To Kill a Mockingbird (1962)',
 b'To Wong Foo, Thanks for Everything! Julie Newmar (1995)',
 b'To Catch a Thief (1955)',
 b'Toy Story (1995)',
 b'Touki Bouki (Journey of the Hyena) (1973)',
 b'Tom and Huck (1995)',
 b'To Live (Huozhe) (1994)',
 b'Tomorrow Never Dies (1997)']

# Faiss Quantization
https://www.pinecone.io/learn/faiss-tutorial/