# FAISS example from pinecone.com

## Set up

In [2]:
# -*- coding: utf-8 -*-
"""
Created on Fri Mar  7 11:07:55 2025
https://www.pinecone.io/learn/series/faiss/faiss-tutorial/

@author: wb268970
"""

import requests
from io import StringIO
import pandas as pd
import numpy

#The first dataset is in a slightly different format:

## Read the example data

In [3]:
#%%    
res = requests.get('https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/sick2014/SICK_train.txt')
# create dataframe
data = pd.read_csv(StringIO(res.text), sep='\t')
data.head()

#%%
sentences = data['sentence_A'].tolist()
sentences[:5]

#%%
# we take all samples from both sentence A and B
sentences = data['sentence_A'].tolist()
sentence_b = data['sentence_B'].tolist()
sentences.extend(sentence_b)  # merge them
len(set(sentences))  # together we have ~4.5K unique sentences

#%% Extend the dbase
urls = [
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.train.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/MSRpar.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2012/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2013/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/OnWN.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2014/images.test.tsv',
    'https://raw.githubusercontent.com/brmson/dataset-sts/master/data/sts/semeval-sts/2015/images.test.tsv'
]

# each of these dataset have the same structure, so we loop through each creating our sentences data
for url in urls:
    res = requests.get(url)
    # extract to dataframe
    data = pd.read_csv(StringIO(res.text), sep='\t', header=None, on_bad_lines='skip')
    # add to columns 1 and 2 to sentences list
    sentences.extend(data[1].tolist())
    sentences.extend(data[2].tolist())
    
len(set(sentences))    

14505

## Do the embeddings


In [4]:
#%%

# remove duplicates and NaN
sentences = [word for word in list(set(sentences)) if type(word) is str]

from sentence_transformers import SentenceTransformer
# initialize sentence transformer model
model = SentenceTransformer('bert-base-nli-mean-tokens')
# create sentence embeddings
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

(14504, 768)

In [5]:
import pickle
with open("Embeddings.pkl", 'wb') as outp:  # Overwrites any existing file.
        pickle.dump(sentence_embeddings, outp, pickle.HIGHEST_PROTOCOL)

print(numpy.__version__)


2.2.2


## Now we try and do the FAIss stuff

In [6]:
import faiss

d = sentence_embeddings.shape[1]
d
index=faiss.IndexFlatL2(d)
index.is_trained

True

In [7]:
index.add(sentence_embeddings)
index.ntotal

14504

# Our L2 index is created yeah

In [8]:

k = 4
xq = model.encode(["Someone sprints with a football"])


do teh search

In [9]:
%%time
D, I = index.search(xq, k)  # search
print(I)




[[ 9062 10750 11182  9223]]
CPU times: total: 46.9 ms
Wall time: 17.6 ms


In [10]:
len(sentences)

14504

In [11]:
#NB the print expression in the tutorial does not work, this does
[f'{i}: {sentences[i]}' for i in I[0]]

['9062: A group of football players is running in the field',
 '10750: A group of people playing football is running in the field',
 '11182: Two groups of people are playing football',
 '9223: A person playing football is running past an official carrying a football']

## Using approximate search methods

Above is an exhaustive search if the database, below we pre-organize the dataset so that likes are grouped together and then we first search over the central tendancy of all tegh groups and then within the group (or groups) taht arethe closest match.

This gives us an approximation of the results from the exhaustive search

Voronoi cell method



In [12]:
#Create  50 groups
#feed the list from above as a quantizer and then generate the upper level index
nlist = 50  # how many cells
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [13]:
# This new index needs to be trained (is_trained returns false
index.is_trained

# we train it on the embeddings (takes a moment)
index.train(sentence_embeddings)
index.is_trained  # check 

index.add(sentence_embeddings)
index.ntotal  



14504

## Now we ask the same query as before, but it runs way faster



In [14]:
%%time
D, I = index.search(xq, k)  # search
print(I)




[[ 9062 10750 11182  9223]]
CPU times: total: 0 ns
Wall time: 0 ns


In [15]:
[f'{i}: {sentences[i]}' for i in I[0]]

['9062: A group of football players is running in the field',
 '10750: A group of people playing football is running in the field',
 '11182: Two groups of people are playing football',
 '9223: A person playing football is running past an official carrying a football']

This search just searched exhaustively the closest cell.  As it happens it generate sthe same result as the echaustive search but it need not.  

If we want to increase accuracy we can also search the "closesT adjacent cells.

Below searches the 10 closest adjacent cell (nprobe=10). This will be slower but will catch edges that might be better than th  best answers in the closest cell.



In [18]:
index.nprobe=10

D, I = index.search(xq, k)  # search
print(I)

[[ 9062 10750 11182  9223]]


## Vector reconstruction

We can't pull the answers out of VECTOR_DB on this search because of tegh two step search.  However we can make a mapping and then reconstruct the vector

index.make_direct_map()
index.reconstruct(7460)[:100]

In [19]:
index.make_direct_map() 
index.reconstruct(7460)[:100]

array([ 0.15472068, -0.5122029 ,  1.3126652 ,  0.6024759 , -0.06100069,
       -0.93629616, -0.70237976,  0.73408574,  0.5450875 , -0.27290782,
       -0.5584474 ,  0.83948326,  0.54898095,  0.12920375,  1.3358675 ,
        0.08322317, -0.93952936, -0.86793655, -0.28046873, -0.20230664,
       -1.0648402 ,  0.08017921, -0.59579366, -0.7956636 ,  0.37972102,
       -0.8312959 , -0.22539063, -1.3049499 , -1.2862183 ,  0.17064631,
       -0.22948855,  0.93691784,  0.65546316, -0.18186367, -0.5413576 ,
        0.56543756,  1.4237095 ,  0.0801444 , -0.03141323, -0.6890686 ,
        0.92161417, -0.08932924,  0.61229473, -0.8886215 , -0.8715622 ,
       -0.42966792, -0.07079749,  0.8167949 , -0.22693019, -1.410869  ,
        0.42714944, -0.01733742,  0.0430317 , -0.59035105, -0.7381914 ,
        0.31170622, -0.07283385, -0.23089044, -0.13359231,  0.7888765 ,
       -0.01654837, -0.52540797,  0.7211192 ,  0.27512184, -0.39492846,
        0.64258885,  1.3808063 ,  0.23299615, -0.830509  , -0.88

In [20]:
D, I = index.search(xq, k)  # search
print(I)

[[ 9062 10750 11182  9223]]


In [21]:
[f'{i}: {sentences[i]}' for i in I[0]]

['9062: A group of football players is running in the field',
 '10750: A group of people playing football is running in the field',
 '11182: Two groups of people are playing football',
 '9223: A person playing football is running past an official carrying a football']

## Quantization

Dbase is still very large (it can be searched more efficiently but it still uses a lot of space.

Quantization reduces the scale of the search problem and increases the speed of search (at the cost of some accuracy)

In [22]:
m = 8  # number of centroid IDs in final compressed vectors
bits = 8 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(d)  # we keep the same L2 distance flat index
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits) 
index.is_trained

index.train(sentence_embeddings)
index.add(sentence_embeddings)


In [24]:
index.nprobe = 10  # align to previous IndexIVFFlat nprobe value

D, I = index.search(xq, k)
print(I)

[[ 8474  9062 10750   182]]


In [25]:
[f'{i}: {sentences[i]}' for i in I[0]]

['8474: A group of football players running down the field.',
 '9062: A group of football players is running in the field',
 '10750: A group of people playing football is running in the field',
 '182: position in football played by a team member']