In [None]:
!pip install pandas
!pip install numpy
!pip install faiss-cpu
!pip install sentence-transformers

In [None]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding model shared by generate_index() and search().
model = SentenceTransformer('all-mpnet-base-v2')
# Directory that holds the input CSV and the saved FAISS index file.
data_path = "../data"

# SentenceTransformers
SentenceTransformers is a Python framework for state-of-the-art sentence, text, and image embeddings. The initial work is described in the paper *Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks* (Reimers & Gurevych, 2019).

# Faiss
Faiss is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning.

In [None]:
def generate_index(data):
    """Embed ``data`` and persist a FAISS inner-product index of the vectors.

    Every text is encoded with the module-level ``model``. The vectors are
    added to an ``IndexFlatIP`` wrapped in an ``IndexIDMap``, each labelled
    with its 0-based position in ``data``, and the index is written to
    ``{data_path}/sentences.index``.

    Args:
        data: The texts to index: a list, pandas Series, or any other
            iterable of strings. A single string is treated as a
            one-element collection.

    Raises:
        ValueError: If ``data`` contains no texts.
    """
    # Materialise as a plain list so that positional access inside
    # ``model.encode`` does not depend on a pandas index being 0..n-1.
    sentences = [data] if isinstance(data, str) else list(data)
    if not sentences:
        raise ValueError("generate_index() requires at least one text to index")

    # FAISS only accepts float32 vectors.
    encoded_data = np.asarray(model.encode(sentences), dtype='float32')

    # Take the dimensionality from the embeddings themselves rather than
    # hard-coding it, so swapping ``model`` does not silently break indexing.
    dimension = encoded_data.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatIP(dimension))

    # FAISS requires int64 ids; the id of a vector is its position in ``data``.
    ids = np.arange(len(sentences), dtype='int64')
    index.add_with_ids(encoded_data, ids)
    faiss.write_index(index, f'{data_path}/sentences.index')

def search(query, k=5):
    """Return the ids of the ``k`` indexed texts most similar to ``query``.

    The index is read from ``{data_path}/sentences.index`` on every call, and
    ``query`` is embedded with the module-level ``model``.

    Args:
        query: The text to search for.
        k: Number of neighbours to request. Defaults to 5.

    Returns:
        A list of ``k`` integer ids in the order returned by FAISS (for an
        inner-product index, highest score first). Per the FAISS
        documentation, slots with no result are filled with ``-1``, which
        happens when the index holds fewer than ``k`` vectors.
    """
    index = faiss.read_index(f'{data_path}/sentences.index')
    # FAISS only accepts float32 queries; cast explicitly, mirroring
    # generate_index(), instead of relying on the encoder's output dtype.
    query_vector = np.asarray(model.encode([query]), dtype='float32')
    # ``search`` returns (scores, ids), each of shape (n_queries, k).
    _, neighbour_ids = index.search(query_vector, k)
    return neighbour_ids[0].tolist()

In [None]:
# Load the tab-separated input file and pick the column whose text is indexed.
# NOTE(review): "data_file.csv" and "col" look like placeholders — confirm the
# real file and column names before running.
df = pd.read_csv(f'{data_path}/data_file.csv', sep="\t")
data = df["col"]

# Encode the column and write the FAISS index to disk.
generate_index(data)

In [None]:
# Example query. NOTE(review): "string" looks like a placeholder — replace it
# with the text to look up.
query = "string"

# Last expression in the cell, so the returned list of ids is shown as output.
search(query)