In [10]:
# Useful references:
# https://api.python.langchain.com/en/latest/embeddings/langchain_huggingface.embeddings.huggingface.HuggingFaceEmbeddings.html
# https://medium.com/@milana.shxanukova15/embeddings-normalisation-b279e32ca958
# https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.faiss.FAISS.html
# https://medium.com/@pankaj_pandey/faiss-efficient-similarity-search-and-clustering-of-dense-vectors-dace1df1e235
# https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode
# https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances
# https://github.com/langchain-ai/langchain/discussions/16224

In [9]:
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import ast

# The vector store can be built in one of two consistent configurations:
#   1. With embedding normalization:
#      'normalize_embeddings' = True  together with
#      'distance_strategy' = DistanceStrategy.MAX_INNER_PRODUCT
#   2. Without embedding normalization:
#      'normalize_embeddings' = False together with
#      'distance_strategy' = DistanceStrategy.EUCLIDEAN_DISTANCE

# !!! EDIT THE SETTINGS BELOW !!!
DATASET_PATH = ''                        # CSV file read by the dataset-loading cell
SAVE_DIR = ''                            # base directory for everything this notebook writes
DB_SAVE_PATH = SAVE_DIR + '/'            # folder handed to FAISS.save_local
DB_LOG_PATH = SAVE_DIR + '/db_info.log'  # text file recording the build settings

EMBEDDING_MODEL_PATH = '../../models/'
MODEL_KWARGS = dict(device='cpu')
# NOTE(review): 'prompt' is forwarded as an encode kwarg; 'passage: ' looks like
# an E5-style document prefix -- confirm it matches the chosen embedding model.
ENCODE_KWARGS = dict(normalize_embeddings=False, prompt='passage: ')
FAISS_KWARGS = dict(distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE)
# !!! EDIT THE SETTINGS ABOVE !!!

In [None]:
# Instantiate the embedding model.
# The model location, the device settings and the encode-time options all come
# from the configuration constants defined earlier in the notebook.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=ENCODE_KWARGS,
    model_kwargs=MODEL_KWARGS,
    model_name=EMBEDDING_MODEL_PATH,
)

In [None]:
# Read the source dataset from DATASET_PATH.
df = pd.read_csv(DATASET_PATH)

# Each 'metadata' cell is stored in the CSV as the text of a Python literal
# (presumably a dict -- confirm against the dataset); parse it back into an object.
df['metadata'] = df['metadata'].map(ast.literal_eval)

In [None]:
# Build the FAISS vector store: every row's 'text' is embedded with `embeddings`
# and stored alongside the matching 'metadata' entry. FAISS_KWARGS carries the
# distance strategy selected in the configuration cell.
faiss = FAISS.from_texts(
    texts=df['text'].tolist(),
    embedding=embeddings,
    metadatas=df['metadata'].tolist(),
    **FAISS_KWARGS,
)

In [None]:
# Persist the vector store, then record the settings it was built with.
#
# The index is saved first so that the log is only written for a store that was
# actually saved. NOTE(review): langchain's FAISS.save_local is expected to
# create the target folder if it is missing -- confirm for the installed version.
faiss.save_local(DB_SAVE_PATH)

# The log must be opened for writing ('w'); read mode ('r') makes the write
# call fail. writelines() adds no separators, so each value is terminated with
# a newline explicitly to keep one setting per line.
db_info = [DB_SAVE_PATH, EMBEDDING_MODEL_PATH, str(ENCODE_KWARGS), str(FAISS_KWARGS)]
with open(DB_LOG_PATH, 'w', encoding='utf-8') as fd:
    fd.writelines(f'{entry}\n' for entry in db_info)