In [1]:
import faiss
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("data/sentences.txt", "r") as f:
    sentences = f.read().splitlines()

In [3]:
sentences[:5]

['Four people rowing down a river.',
 'A woman in a black dress is pulling a cart and is standing near two men who are seated on a park bench',
 'Spansion Flash memory solutions are available worldwide from AMD and Fujitsu.',
 'A yellow kayak is being ridden by a man and a young boy',
 'The driver, Eugene Rogers, helped to remove children from the bus, Wood said.']

In [5]:
len(sentences)

14504

In [4]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [6]:
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

(14504, 384)

In [7]:
d = sentence_embeddings.shape[1]
d

384

## Flat

In [8]:
index = faiss.IndexFlatL2(d)

In [None]:
index.is_trained

True

In [10]:
index.add(sentence_embeddings)

In [11]:
index.ntotal

14504

In [15]:
k = 5
xq = model.encode(["Soccer"])

In [28]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[ 3238 12554  8171  7714 13359]]
CPU times: total: 0 ns
Wall time: 6.89 ms


In [30]:
[f'{I[0][i]}, Similarity: {D[0][i]}: {sentences[I[0][i]]}' for i in range(len(I[0]))]

['3238, Similarity: 0.7008035182952881: Two teams are playing soccer',
 '12554, Similarity: 0.8051016330718994: There is no team playing soccer',
 '8171, Similarity: 0.8078267574310303: Two teams are enthusiastically playing soccer',
 '7714, Similarity: 0.8641998171806335: The players is maneuvering for the soccer ball',
 '13359, Similarity: 0.8754724860191345: Two soccer teams are playing, one is in white the other in red.']

In [32]:
sentence_embeddings[I[0]]

array([[-0.00052541,  0.03629366, -0.09062289, ..., -0.00027818,
         0.03138516,  0.01272458],
       [ 0.04979156,  0.02657485, -0.07570338, ..., -0.03477927,
         0.03460299, -0.01035636],
       [ 0.04664638,  0.01123674, -0.06384049, ...,  0.03923417,
         0.00555222, -0.00035002],
       [ 0.02777832,  0.04824036, -0.03446573, ...,  0.00280244,
         0.0400941 ,  0.03012369],
       [ 0.01219965,  0.00510379, -0.09950106, ...,  0.04080795,
         0.01295557,  0.05050651]], shape=(5, 384), dtype=float32)

## IVFlat

In [None]:
n_clusters = 50
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, n_clusters)

In [None]:
index.is_trained

False

In [None]:
index.train(sentence_embeddings)
index.is_trained

True

In [36]:
index.add(sentence_embeddings)

In [37]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[ 3238  8171  7714 13359  1378]]
CPU times: total: 0 ns
Wall time: 5.99 ms


In [38]:
index.nprobe = 10

In [None]:
%%time
D, I = index.search(xq, k)  # search
print(I)

[[ 3238  8171  7714 13359  1378]]
CPU times: total: 0 ns
Wall time: 692 μs
