In [1]:
# Useful references:
# https://api.python.langchain.com/en/latest/embeddings/langchain_huggingface.embeddings.huggingface.HuggingFaceEmbeddings.html
# https://medium.com/@milana.shxanukova15/embeddings-normalisation-b279e32ca958
# https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.faiss.FAISS.html
# https://medium.com/@pankaj_pandey/faiss-efficient-similarity-search-and-clustering-of-dense-vectors-dace1df1e235
# https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer.encode
# https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances
# https://github.com/langchain-ai/langchain/discussions/16224

In [2]:
import ast
import json
import os
import pickle
import sys

# The project root is put first on sys.path so that the `src` package below resolves.
sys.path.insert(0, "/home/aisummer/mikhail_workspace/nlp-service")

import pandas as pd
from langchain.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_huggingface import HuggingFaceEmbeddings

from src.DocumentsParser.utils import INFO_FILE, TABLES_DIR_TABLE_NAME, DBS_DIR_DENSE_VECTORDB_NAME, DBS_DIR_SPARSE_VECTORDB_NAME

# The database can be built in one of two ways; keep the two settings in sync:
# 1. With vector normalization:
#    'normalize_embeddings' = True  and 'distance_strategy' = DistanceStrategy.MAX_INNER_PRODUCT
# 2. Without vector normalization:
#    'normalize_embeddings' = False and 'distance_strategy' = DistanceStrategy.EUCLIDEAN_DISTANCE

# !!! BELOW TO CHANGE !!!
# --- Input: directory with the source table and the path of the table itself.
LOAD_DIR = "../data/infsec_gosts/tables/v1/"
DATASET_PATH = f'{LOAD_DIR}/{TABLES_DIR_TABLE_NAME}'

# --- Output: directory for the built databases plus the run-info file.
SAVE_DIR = '../data/infsec_gosts/dbs/v2/'
DENSE_DB_SAVE_PATH = f'{SAVE_DIR}/{DBS_DIR_DENSE_VECTORDB_NAME}'
SPARSE_DB_SAVE_PATH = f'{SAVE_DIR}/{DBS_DIR_SPARSE_VECTORDB_NAME}'
DB_LOG_PATH = f'{SAVE_DIR}/{INFO_FILE}'

# --- Embedding model and the keyword arguments forwarded to it / to FAISS.
EMBEDDING_MODEL_PATH = '../models/intfloat/multilingual-e5-small'
MODEL_KWARGS = dict(device='cuda')
ENCODE_KWARGS = dict(normalize_embeddings=True, prompt='passage: ')
FAISS_KWARGS = dict(distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)
# !!! ABOVE TO CHANGE !!!

### Loading

In [3]:
# Instantiate the embedding model from the configured local path.
# The options are collected first so the constructor call stays a one-liner.
embedder_options = {
    "model_name": EMBEDDING_MODEL_PATH,
    "model_kwargs": MODEL_KWARGS,
    "encode_kwargs": ENCODE_KWARGS,
}
embeddings = HuggingFaceEmbeddings(**embedder_options)

  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name ../models/intfloat/multilingual-e5-small. Creating a new one with mean pooling.


In [4]:
# Read the ';'-separated table and turn the 'metadata' column from its
# textual Python-literal form back into Python objects.
raw_table = pd.read_csv(DATASET_PATH, sep=';')
df = raw_table.assign(metadata=raw_table['metadata'].map(ast.literal_eval))

### Vectorizing

In [6]:
# Build the dense vector store: every chunk is embedded and stored together
# with its metadata entry (same row of the table).
dense_texts = df['chunks'].to_list()
dense_metadatas = df['metadata'].to_list()

faiss = FAISS.from_texts(
    dense_texts,
    embeddings,
    metadatas=dense_metadatas,
    **FAISS_KWARGS,
)

In [7]:
# Build the sparse (BM25) retriever over the same chunks and metadata.
sparse_texts = df['chunks'].to_list()
sparse_metadatas = df['metadata'].to_list()

retriever_BM25 = BM25Retriever.from_texts(sparse_texts, metadatas=sparse_metadatas)

### Saving

In [8]:
# Saving Logs
# This is the first write into SAVE_DIR, so create the directory up front;
# otherwise open() raises FileNotFoundError whenever SAVE_DIR points to a
# location that has not been created yet.
os.makedirs(SAVE_DIR, exist_ok=True)

# Everything needed to tell later how this database was produced.
db_build_info = {
    "load_dir": LOAD_DIR,
    "save_dir": SAVE_DIR,
    "model_name": EMBEDDING_MODEL_PATH,
    "encode_kwargs": ENCODE_KWARGS,
    "faiss_kwargs": FAISS_KWARGS,
}

with open(DB_LOG_PATH, 'w', encoding='utf-8') as fd:
    json.dump(db_build_info, fd, indent=1)

In [9]:
# Persist the dense (FAISS) vector store to its configured location.
faiss.save_local(folder_path=DENSE_DB_SAVE_PATH)

In [10]:
# Saving sparse DataBase
# The context manager closes the file even if pickling fails part-way,
# so no file handle is leaked.
# NOTE: the result is a pickle file; it must only ever be loaded from a
# trusted location, since unpickling can execute arbitrary code.
with open(SPARSE_DB_SAVE_PATH, "wb") as sparse_db_file:
    pickle.dump(retriever_BM25, sparse_db_file)