In [1]:
import pandas as pd
# Raise pandas' per-column display width to 200 characters so the sentence text
# shown in DataFrame outputs below is not cut off at the default width.
pd.set_option('display.max_colwidth', 200)  # Adjust the width as needed
import faiss

In [3]:
# Source CSV holding one labelled sentence per row (columns: Topic, Sentence).
CSV_URL = 'https://raw.githubusercontent.com/DhunganaKB/customchat/main/VectorIndexSearch/TextClassification_Indexing.csv'
df = pd.read_csv(CSV_URL)

In [4]:
# Rebind df to a frame with a fresh 0..n-1 index; drop=True discards the old
# index instead of keeping it as a column. Positional lookups with df.iloc
# further down rely on row order only.
df = df.reset_index(drop=True)

In [5]:
# Preview the first five rows to confirm the Topic / Sentence layout.
df.head(n=5)

Unnamed: 0,Topic,Sentence
0,Soccer World Cup 2020,"The 2020 FIFA World Cup, a globally anticipated event, brought together nations from around the world."
1,Soccer World Cup 2020,"Hosted in multiple cities, the 2020 Soccer World Cup was a month-long celebration of the beautiful game."
2,Soccer World Cup 2020,"It featured the world's top soccer teams, each vying for the coveted championship title in the Soccer World Cup."
3,Soccer World Cup 2020,"With palpable anticipation, fans from diverse backgrounds united to support their respective nations in the Soccer World Cup."
4,Soccer World Cup 2020,"The matches were held in state-of-the-art stadiums, showcasing the pinnacle of soccer infrastructure in the Soccer World Cup."


In [6]:
# (rows, columns) of the loaded dataset.
df.shape

(109, 2)

### Creating Embedding Vectors

In [7]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding checkpoint used for both the corpus and the queries.
MODEL_NAME = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(MODEL_NAME)

In [8]:
# Encode every sentence of the corpus into a dense vector (one row per sentence).
sentences = df['Sentence'].tolist()
sentence_embeddings = model.encode(sentences)

# The last axis of the embedding matrix is the vector size; both ANN indexes
# built below need it as their dimensionality.
embedding_dimension = sentence_embeddings.shape[-1]
print(embedding_dimension)

768


#### Using hnswlib

In [None]:
# hnswlib project page and API documentation: https://github.com/nmslib/hnswlib

In [9]:
import hnswlib

In [31]:
# Create a new HNSW index that compares vectors in hnswlib's 'l2' space.
hnsw_index = hnswlib.Index(space='l2', dim=embedding_dimension)

# hnswlib needs the capacity up front and add_items fails once it is exceeded,
# so size it from the data (keeping the original 1000-slot minimum) instead of
# relying on a hard-coded limit that breaks when the corpus grows.
max_elements = max(1000, len(sentence_embeddings))
hnsw_index.init_index(max_elements=max_elements, ef_construction=200, M=16)

# Element insertion (can be called several times). No explicit ids are passed,
# so labels are assigned sequentially and line up with df's row positions.
hnsw_index.add_items(sentence_embeddings)

# Controlling the recall by setting ef (query-time candidate list size):
# higher values improve recall at the cost of slower searches.
hnsw_index.set_ef(50)

In [32]:
# Number of nearest neighbours to retrieve per query.
k = 3

# Embed the query with the same model that produced the corpus embeddings.
query_text = 'I would like to know about Nobel Prize from 2020'
xq = model.encode([query_text])

# knn_query returns (labels, distances); the labels are used as row positions in df.
I, D = hnsw_index.knn_query(xq, k=k)
print(I)

[[20 28 22]]


In [33]:
# I has one row of labels per query; take the first (only) query's labels and
# use them as positional row indices to display the matching sentences.
df.iloc[I[0]]

Unnamed: 0,Topic,Sentence
20,Nobel Prize 2020,"The Nobel Prize, one of the world's most prestigious awards, recognized exceptional achievements across multiple fields in 2020."
28,Nobel Prize 2020,The Nobel Prize in 2020 celebrated accomplishments and served as a reminder of the potential for positive change and innovation across the world.
22,Nobel Prize 2020,"In 2020, the Nobel Prize awarded laureates in Physics, Chemistry, Medicine, Literature, and Peace for their remarkable accomplishments."


In [34]:
## Save the index to disk so it can be reloaded later without rebuilding it:


In [35]:
# Persist the built index to a file (relative path, so it lands in the current working directory).
hnsw_index.save_index('hnsw_index_saved.index')

In [36]:
# Create an empty index object to load the saved file into; space and dim are
# given the same values that were used when hnsw_index was built.
new_index = hnswlib.Index(space='l2', dim=embedding_dimension)

In [26]:
# Restore the index contents written earlier by save_index.
new_index.load_index('hnsw_index_saved.index')

# The query-time ef setting is not stored in the index file, so re-apply the
# value used on the original index; otherwise searches on the reloaded index
# silently run with hnswlib's default ef and may return different neighbours.
new_index.set_ef(50)

In [27]:
# Repeat the same query against the reloaded index; knn_query returns (labels, distances).
I, D = new_index.knn_query(xq, k=k)


### FAISS implementation

In [39]:
import faiss

# Build a FAISS HNSW index over the raw (uncompressed) vectors.
# M is the number of graph neighbours kept per node.
M = 16
faiss_index = faiss.IndexHNSWFlat(embedding_dimension, M)

# Graph tuning knobs: efConstruction is used while building the graph,
# efSearch while querying it.
faiss_index.hnsw.efConstruction = 40
faiss_index.hnsw.efSearch = 200

# Insert the corpus embeddings into the index.
faiss_index.add(sentence_embeddings)

# Run the same query vector used for hnswlib. Note the return order:
# FAISS gives (distances, ids), the reverse of hnswlib's knn_query.
k = 3
D, I = faiss_index.search(xq, k)
print(I, D, sep='\n')


[[20 28 22]]
[[154.76157 186.0599  188.53912]]
