In [9]:
import pandas as pd

# Let pandas show up to this many characters per cell before truncating the display.
MAX_COLWIDTH = 100
pd.set_option('display.max_colwidth', MAX_COLWIDTH)

## Read the sample data

In [10]:
# Load the sample sentences (column `text`) and their labels (column `category`).
sample_csv = 'sample_text.csv'
df = pd.read_csv(sample_csv)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,text,category
0,Meditation and yoga can improve mental health,Health
1,"Fruits, whole grains and vegetables helps control blood pressure",Health
2,These are the latest fashion trends for this week,Fashion
3,Vibrant color jeans for male are becoming a trend,Fashion
4,The concert starts at 7 PM tonight,Event


## Load Sentence Transformer model and extract embeddings

In [11]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name.
encoder = SentenceTransformer('all-mpnet-base-v2')

# encode() is documented to accept a string or a list of strings. Passing the
# pandas Series directly only works while its labels happen to be 0..n-1,
# so hand over a plain Python list to keep row order independent of the index.
vectors = encoder.encode(df["text"].tolist())

# One embedding row per input sentence.
vectors.shape

(8, 768)

In [12]:
# Embedding width: the size of the last axis of the 2-D `vectors` array.
dim = vectors.shape[-1]
dim

768

## Create the index for vectors

In [13]:
import faiss

# Create a flat FAISS index that compares vectors by L2 distance; `dim` must
# equal the width of the embeddings that will be added and searched.
index = faiss.IndexFlatL2(dim)

# Add the sentence embeddings to the index. Search results refer to vectors by
# the order in which they were added, i.e. row i of `vectors` gets id i.
index.add(vectors)

## Search for similar text in the index

In [14]:
# Free-text query to match against the indexed sentences.
search_query = "I wanna buy a polo t-shirt."
# Wrap the query in a list so encode() returns a 2-D array (one row per query).
search_vector = encoder.encode([search_query])
search_vector.shape

(1, 768)

In [15]:
# Number of nearest neighbours to retrieve for the query.
top_k = 2

# Search the index for the `top_k` vectors closest to the query. The two
# returned arrays are parallel: row 0 holds the results for the single query,
# with `indexes` giving the positions of the matches in insertion order.
distances, indexes = index.search(search_vector, k=top_k)

In [16]:
# FAISS reports matches as positions in insertion order, not as DataFrame
# labels, so select rows positionally with iloc. FAISS pads the result with -1
# when fewer than k neighbours exist; drop those so iloc does not wrap around
# to the last row.
hit_positions = indexes[0]
df.iloc[hit_positions[hit_positions >= 0]]

Unnamed: 0,text,category
3,Vibrant color jeans for male are becoming a trend,Fashion
2,These are the latest fashion trends for this week,Fashion


It retrieves the indexes of the fashion-related texts from the dataframe, since the search query is about a polo t-shirt.\
When we change the search query, we get the indexes of the texts that relate to the new query.