In [1]:
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
import pandas as pd
import json
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.embeddings import Embeddings
import pickle
import ast
from time import time

# Absolute path of the project root on the author's machine.
# NOTE(review): not referenced by any cell visible in this notebook; the paths
# used below are relative ('../data/...').
BASE_DIR = "/home/dzigen/Desktop/Projects/rag_project"

In [2]:
class MyEmbeddingFunction(EmbeddingFunction):
    """Expose an embedder's ``embed_documents`` through the callable interface.

    The wrapped object only needs an ``embed_documents`` method; every call is
    forwarded to it unchanged.

    NOTE(review): ``Embeddings`` is imported twice at the top of the notebook
    (from ``chromadb`` and from ``langchain_core.embeddings``); the later
    import wins, so the return annotation below refers to that one.
    """

    def __init__(self, embedder):
        """Keep a reference to ``embedder`` for later calls."""
        self.embedder = embedder

    def __call__(self, input: Documents) -> Embeddings:
        """Return the result of ``embedder.embed_documents(input)``."""
        vectors = self.embedder.embed_documents(input)
        return vectors

In [3]:
# ---- Run settings: edit the values between the two markers ----
# !!! BELOW TO CHANGE !!!
DATA_NAME = 'mtssquad'
TABLE_VERSION = 'v3'
DB_VERSION = 'v3'

# Local directory holding the embedding model weights.
EMBEDDING_MODEL_PATH = '/home/dzigen/Desktop/PersonalAI/Personal-AI/models/intfloat/multilingual-e5-small'
MODEL_KWARGS = dict(device='cuda')
# NOTE(review): 'passage: ' looks like the E5 document prefix -- confirm against the model card.
ENCODE_KWARGS = dict(normalize_embeddings=True, prompt='passage: ')
# NOTE(review): 'ip' presumably selects inner-product distance -- confirm against the Chroma docs.
CHROMA_KWARGS = {"hnsw:space": "ip"}
# !!! ABOVE TO CHANGE !!!

# Derived locations: input table for this dataset/table version ...
LOAD_DIR = '/'.join(('../data', DATA_NAME, 'tables', TABLE_VERSION))
DATASET_PATH = LOAD_DIR + '/chunked_docs.csv'
# ... and output directory for this dataset/db version.
SAVE_DIR = '/'.join(("../data", DATA_NAME, 'dbs', DB_VERSION))
DENSE_DB_SAVE_PATH = SAVE_DIR + '/densedb'
DB_LOG_PATH = SAVE_DIR + '/operation_info.json'

#### Preparing

In [4]:
# Build the embedder from the configured model path and kwargs, then wrap it
# so it can be handed to the collection as its embedding function.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=ENCODE_KWARGS,
    model_kwargs=MODEL_KWARGS,
    model_name=EMBEDDING_MODEL_PATH,
)
ef = MyEmbeddingFunction(embedder=embeddings)

No sentence-transformers model found with name /home/dzigen/Desktop/PersonalAI/Personal-AI/models/intfloat/multilingual-e5-small. Creating a new one with MEAN pooling.


In [5]:
# Open the on-disk Chroma store at the configured path and fetch the
# collection named after the dataset, creating it if it is missing.
client = chromadb.PersistentClient(path=DENSE_DB_SAVE_PATH)
collection = client.get_or_create_collection(
    name=DATA_NAME,
    embedding_function=ef,
    metadata=CHROMA_KWARGS,
)

In [6]:
# Read the ';'-separated chunk table; the 'metadata' column holds Python
# literals as text, so parse each cell back into an object.
df = pd.read_csv(DATASET_PATH, sep=';')
df['metadata'] = df['metadata'].map(ast.literal_eval)

In [7]:
# Show the parsed metadata of the row labelled 0 as a quick sanity check.
df.loc[0, 'metadata']

{'doc_id': '0e410ba54401034d094c72346ca0b8fe',
 'chunk_id': '0e410ba54401034d094c72346ca0b8fe',
 'total_n_distance': 0.05,
 'mean_dependency_distance': 0.0336}

In [8]:
# Number of rows whose 'chunks' value is not missing.
df['chunks'].dropna().shape[0]

9031

#### Vectorizing 

In [9]:
# Embed every chunk and store it in the collection, timing the whole step.
#
# The rows are added in fixed-size batches rather than one call: Chroma
# rejects an add() whose size exceeds the client's maximum batch size, so a
# single call stops working once the table grows past that limit.
ADD_BATCH_SIZE = 1000

vectorize_t_start = time()

chunk_texts = df['chunks'].to_list()
chunk_metadatas = df['metadata'].to_list()
# Each row's id is the string form of the 'chunk_id' stored in its metadata.
chunk_ids = [str(meta['chunk_id']) for meta in chunk_metadatas]

# Checked up front because duplicates that land in different batches would
# never be seen together by a single add() call.
if len(set(chunk_ids)) != len(chunk_ids):
    raise ValueError("chunk_id values must be unique to be used as collection ids")

for batch_start in range(0, len(chunk_ids), ADD_BATCH_SIZE):
    batch_stop = batch_start + ADD_BATCH_SIZE
    collection.add(
        documents=chunk_texts[batch_start:batch_stop],
        metadatas=chunk_metadatas[batch_start:batch_stop],
        ids=chunk_ids[batch_start:batch_stop],
    )

# Wall-clock seconds for the full embedding + insert pass, rounded to 5 places.
VECTORIZE_ELAPSED_TIME = round(time() - vectorize_t_start, 5)

In [10]:
# Number of items now in the collection; compare with the non-null chunk
# count shown earlier to confirm nothing was dropped.
collection.count()

9031

#### Saving Log

In [11]:
# Persist the settings and timing of this run next to the database.
with open(DB_LOG_PATH, 'w') as fd:
    operation_info = {
        "data_name": DATA_NAME,
        "table_version": TABLE_VERSION,
        "db_version": DB_VERSION,
        "model_name": EMBEDDING_MODEL_PATH,
        "encode_kwargs": ENCODE_KWARGS,
        "chroma_kwargs": CHROMA_KWARGS,
        "vectorize_elapsed_sec_time": VECTORIZE_ELAPSED_TIME,
    }
    serialized_info = json.dumps(operation_info, indent=1)
    fd.write(serialized_info)