In [21]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Binarizer
import faiss

In [1]:
# Toy corpus: each entry is one document, stored as a plain string.
# Extend the list to index more documents.
documents = [
    "Document one is here.",
    "This is the second document.",
    "And this is document number three.",
]

# Step 1: fit a TF-IDF vocabulary on the corpus and vectorize it.
# The vocabulary is capped at MAX_FEATURES terms, which bounds the width of
# the vectors that are later handed to the LSH index.
MAX_FEATURES = 128
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
tfidf_matrix = vectorizer.fit_transform(documents)

In [3]:
# Step 2: reduce the TF-IDF weights to presence/absence flags.
# With threshold 0.0, every weight strictly greater than zero becomes 1
# and everything else becomes 0.
binarizer = Binarizer(threshold=0.0)
binary_sparse = binarizer.transform(tfidf_matrix)

# Densify and cast to float32 before the vectors are handed to FAISS.
binary_vectors = binary_sparse.toarray().astype(np.float32)

In [22]:
# Show the binary vector of every document, one document per line.
for doc_id, row in enumerate(binary_vectors):
    print(f"Document {doc_id}:", row)

Document 0: [0. 1. 1. 1. 0. 1. 0. 0. 0. 0.]
Document 1: [0. 1. 0. 1. 0. 0. 1. 1. 1. 0.]
Document 2: [1. 1. 0. 1. 1. 0. 0. 0. 1. 1.]


In [5]:
# Step 3: settings for the FAISS LSH index.
num_bits = 64  # number of bits used by the LSH index; tune as needed
d = binary_vectors.shape[1]  # vector dimensionality = number of columns

In [6]:
# Initialize the FAISS LSH index
# Arguments are the vector dimensionality `d` and the number of LSH bits
# `num_bits`, both set in the previous cell.
index = faiss.IndexLSH(d, num_bits)

# Add binary vectors to the index
# `binary_vectors` holds one float32 row of 0/1 flags per document.
index.add(binary_vectors)

In [23]:
# Step 4: encode a query with the same fitted vectorizer and binarizer
# that produced the indexed document vectors.
query_doc = "second document is here."

# TF-IDF weights of the query as a dense float32 array holding one row.
query_tfidf = vectorizer.transform([query_doc])
query_vector = query_tfidf.toarray().astype(np.float32)

# Presence/absence flags of the query terms.
query_binary = binarizer.transform(query_vector)
print("Query Document:", query_binary[0])

Query Document: [0. 1. 1. 1. 0. 0. 1. 0. 0. 0.]


In [18]:
# Search for the top 3 nearest neighbors
k = 3  # number of neighbors requested per query
# The search call returns two arrays, unpacked here as distances and indices;
# with a single query, the hits are in row 0 of each.
distances, indices = index.search(query_binary, k)

In [19]:
# Report the raw search output, then the text of each matching document.
print("Nearest neighbors indices:", indices)
print("Distances:", distances)
print("Similar documents:")
# Only one query was searched, so its hits are in row 0.
for neighbor_id in indices[0]:
    if neighbor_id == -1:
        # -1 marks a slot for which no result was found; skip it.
        continue
    print(documents[neighbor_id])


Nearest neighbors indices: [[0 1 2]]
Distances: [[10. 12. 23.]]
Similar documents:
Document one is here.
This is the second document.
And this is document number three.
