# E-com Assistant ChromaDB supplier

## Инициализация настроек

Установка и инициализация библиотек

In [4]:
!pip install chromadb
!pip install sentence_transformers




[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: C:\Users\tuman\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: C:\Users\tuman\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


In [5]:
import polars as pl
import gc
import os
import pickle
import chromadb
from chromadb.api import ClientAPI
from sentence_transformers import SentenceTransformer

## Векторизация признаков

In [6]:
# Path of the file used to persist batch progress between runs
progress_file = 'chroma_data_items_progress.txt'

def save_progress(batch_num):
    """Persist ``batch_num`` to ``progress_file`` so a later run can resume.

    The value is written to a temporary sibling file first and then moved over
    ``progress_file`` with ``os.replace``, so an interruption during the write
    cannot leave a truncated progress file behind.

    Args:
        batch_num: Integer stored in the file; ``load_progress`` returns it
            unchanged on the next run.
    """
    temp_file = progress_file + '.tmp'
    with open(temp_file, 'w') as f:
        f.write(str(batch_num))
    os.replace(temp_file, progress_file)
    print(f"Progress saved: batch {batch_num + 1}")


def load_progress():
    """Return the value stored by ``save_progress``, or 0 when there is none.

    A missing file means a fresh start. A file that is empty, does not contain
    an integer, or contains a negative number is treated as unusable and also
    results in a fresh start instead of raising ``ValueError``.

    Returns:
        The stored non-negative integer, or 0.
    """
    if not os.path.exists(progress_file):
        print("No progress found. Starting from batch 1.")
        return 0

    with open(progress_file, 'r') as f:
        raw_value = f.read().strip()

    try:
        progress = int(raw_value)
    except ValueError:
        progress = -1  # Marker for "unusable content", handled just below.

    if progress < 0:
        print(f"Progress file '{progress_file}' is empty or corrupted. Starting from batch 1.")
        return 0

    print(f"Resuming from batch {progress + 1}")
    return progress


def _add_records_in_chunks(collection, ids, documents, embeddings, metadatas, chunk_size=1000):
    """Write parallel record lists to ``collection``, one ``add`` call per chunk.

    Sending many records per call avoids one client round trip per record,
    while ``chunk_size`` keeps every call bounded. Empty input results in no
    calls at all.
    """
    for chunk_start in range(0, len(ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        collection.add(
            ids=ids[chunk_start:chunk_end],
            documents=documents[chunk_start:chunk_end],
            embeddings=embeddings[chunk_start:chunk_end],
            metadatas=metadatas[chunk_start:chunk_end],
        )


def create_chroma_collection(client: chromadb.ClientAPI,
                             categories_dataframe: pl.DataFrame,
                             collection_name,
                             batch_size) -> None:
    """Embed the ``name`` column of a dataframe and store it in a Chroma collection.

    The dataframe is split into ``batch_size`` contiguous slices. Each slice is
    encoded with the ``intfloat/multilingual-e5-small`` sentence-transformer
    model, written to the collection, and then recorded via ``save_progress``
    so an interrupted run can continue from the next unfinished slice.

    Args:
        client: Chroma client used to open or create the collection.
        categories_dataframe: Source data. The text to embed comes from its
            ``name`` column; record id and metadata are read by column
            position (0 -> ``category_id``, 1 -> ``id``, 2 -> ``name``).
            NOTE(review): this assumes the first three columns are in exactly
            that order - confirm against the parquet schema.
        collection_name: Name of the collection to open or create.
        batch_size: NUMBER of batches to split the data into (despite the
            name, not the number of items per batch). The last batch also
            takes the remainder of the integer division.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        Progress is stored in a single module-level file that is not tied to
        ``collection_name``, the dataframe, or ``batch_size``; remove that
        file before starting an unrelated run.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive number of batches, got {batch_size}")

    print("Creating Chroma collection...")

    # A single call replaces the manual "list, then get or create" sequence, so
    # this no longer depends on what list_collections() returns.
    items_collection = client.get_or_create_collection(collection_name)

    print(f"Collection '{collection_name}' created / loaded.")

    # Initialise the embedding model
    print("Initializing embedding model...")
    embedding_model = SentenceTransformer('intfloat/multilingual-e5-small')
    print("Embedding model initialized.")

    # Convert the data to a list so it can be sliced into batches
    print("Converting dataframe to list...")
    items_specific = categories_dataframe["name"].to_list()  # CHOOSE THE COLUMN TO EMBED HERE
    total_items = len(items_specific)
    print(f"Total items to process: {total_items}")

    # Number of items in every batch except the last one
    batch_length = total_items // batch_size
    print(f"Each batch will contain approximately {batch_length} items.")

    # Load saved progress
    start_batch = load_progress()

    print(f"Total items: {total_items}. Processing in {batch_size} batches.")
    print(f"Starting from batch {start_batch + 1}.")

    for batch_num in range(start_batch, batch_size):
        print(f"\nProcessing batch {batch_num + 1}/{batch_size}...")

        start_index = batch_num * batch_length
        # The last batch picks up whatever the integer division left over
        end_index = start_index + batch_length if batch_num < batch_size - 1 else total_items
        print(f"Batch range: {start_index} to {end_index}")

        batch_specific = items_specific[start_index:end_index]

        if not batch_specific:
            # Happens when there are fewer items than batches: nothing to
            # encode or store, but the batch still counts as done.
            print(f"Batch {batch_num + 1} is empty, skipping.")
            save_progress(batch_num + 1)
            continue

        # Encode the embeddings for the current batch
        # NOTE(review): E5-family models are usually fed "query: " / "passage: "
        # prefixed text - confirm whether embedding the raw names is intended.
        print(f"Encoding embeddings for batch {batch_num + 1}...")
        specific_embeddings = embedding_model.encode(batch_specific, show_progress_bar=True)
        # Values are rounded through float16 before being converted to Python
        # floats, which lowers the precision of what gets stored.
        specific_embeddings_list = [embedding.astype('float16').tolist() for embedding in specific_embeddings]
        print(f"Embeddings encoded for batch {batch_num + 1}.")

        print(f"Adding batch {batch_num + 1} to ChromaDB...")
        # Materialise the rows of this batch once instead of one lookup per record.
        batch_rows = categories_dataframe.slice(start_index, end_index - start_index).rows()

        # Keep only the first record for each id inside the batch. The previous
        # one-record-per-call code had that effect as long as add() skips ids
        # that are already stored, whereas a bulk add() is expected to reject
        # repeated ids within one call - TODO confirm both behaviours for the
        # installed chromadb version.
        seen_ids = set()
        batch_ids = []
        batch_documents = []
        batch_embeddings = []
        batch_metadatas = []
        for row, document, embedding in zip(batch_rows, batch_specific, specific_embeddings_list):
            record_id = str(row[1])
            if record_id in seen_ids:
                continue
            seen_ids.add(record_id)
            batch_ids.append(record_id)
            batch_documents.append(document)
            batch_embeddings.append(embedding)
            batch_metadatas.append({"category_id": row[0], "id": row[1], "name": row[2]})

        _add_records_in_chunks(items_collection, batch_ids, batch_documents,
                               batch_embeddings, batch_metadatas)
        print(f"Batch {batch_num + 1} added to ChromaDB.")

        # Persist progress
        save_progress(batch_num + 1)

        # Release the per-batch data before the next iteration
        del specific_embeddings
        del specific_embeddings_list
        del batch_rows, batch_ids, batch_documents, batch_embeddings, batch_metadatas
        gc.collect()
        print(f"Memory cleared after batch {batch_num + 1}.")

if __name__ == "__main__":
    # Driver: open the persistent store, read the items table, then embed and
    # index it in 10 batches.
    print("Starting ChromaDB item collection process...")

    # On-disk Chroma database located relative to the working directory.
    chroma_client = chromadb.PersistentClient(path="chroma_light_items_v3/")
    print("ChromaDB client initialized.")

    # Source table with the items to index.
    print("Loading dataframe...")
    items_dataframe = pl.read_parquet("submission/data/items.parquet")
    print("Dataframe loaded.")

    create_chroma_collection(
        client=chroma_client,
        categories_dataframe=items_dataframe,
        collection_name="items",
        batch_size=10,
    )
    print("ChromaDB collection process completed.")

Starting ChromaDB item collection process...
ChromaDB client initialized.
Loading dataframe...
Dataframe loaded.
Creating Chroma collection...
Collection 'items' already exists. Using the existing collection.
Collection 'items' created / loaded.
Initializing embedding model...




Embedding model initialized.
Converting dataframe to list...
Total items to process: 873237
Each batch will contain approximately 87323 items.
No progress found. Starting from batch 1.
Total items: 873237. Processing in 10 batches.
Starting from batch 1.

Processing batch 1/10...
Batch range: 0 to 87323
Encoding embeddings for batch 1...


Batches:   0%|          | 0/2729 [00:00<?, ?it/s]

Embeddings encoded for batch 1.
Adding batch 1 to ChromaDB...
Batch 1 added to ChromaDB.
Progress saved: batch 2
Memory cleared after batch 1.

Processing batch 2/10...
Batch range: 87323 to 174646
Encoding embeddings for batch 2...


Batches:   0%|          | 0/2729 [00:00<?, ?it/s]

Embeddings encoded for batch 2.
Adding batch 2 to ChromaDB...
Batch 2 added to ChromaDB.
Progress saved: batch 3
Memory cleared after batch 2.

Processing batch 3/10...
Batch range: 174646 to 261969
Encoding embeddings for batch 3...


Batches:   0%|          | 0/2729 [00:00<?, ?it/s]

Embeddings encoded for batch 3.
Adding batch 3 to ChromaDB...
Batch 3 added to ChromaDB.
Progress saved: batch 4
Memory cleared after batch 3.

Processing batch 4/10...
Batch range: 261969 to 349292
Encoding embeddings for batch 4...


Batches:   0%|          | 0/2729 [00:00<?, ?it/s]

Embeddings encoded for batch 4.
Adding batch 4 to ChromaDB...
Batch 4 added to ChromaDB.
Progress saved: batch 5
Memory cleared after batch 4.

Processing batch 5/10...
Batch range: 349292 to 436615
Encoding embeddings for batch 5...


Batches:   0%|          | 0/2729 [00:00<?, ?it/s]

Embeddings encoded for batch 5.
Adding batch 5 to ChromaDB...
Batch 5 added to ChromaDB.
Progress saved: batch 6
Memory cleared after batch 5.

Processing batch 6/10...
Batch range: 436615 to 523938
Encoding embeddings for batch 6...


Batches:   0%|          | 0/2729 [00:00<?, ?it/s]

Embeddings encoded for batch 6.
Adding batch 6 to ChromaDB...
Batch 6 added to ChromaDB.
Progress saved: batch 7
Memory cleared after batch 6.

Processing batch 7/10...
Batch range: 523938 to 611261
Encoding embeddings for batch 7...


Batches:   0%|          | 0/2729 [00:00<?, ?it/s]

Embeddings encoded for batch 7.
Adding batch 7 to ChromaDB...
Batch 7 added to ChromaDB.
Progress saved: batch 8
Memory cleared after batch 7.

Processing batch 8/10...
Batch range: 611261 to 698584
Encoding embeddings for batch 8...


Batches:   0%|          | 0/2729 [00:00<?, ?it/s]

Embeddings encoded for batch 8.
Adding batch 8 to ChromaDB...
Batch 8 added to ChromaDB.
Progress saved: batch 9
Memory cleared after batch 8.

Processing batch 9/10...
Batch range: 698584 to 785907
Encoding embeddings for batch 9...


Batches:   0%|          | 0/2729 [00:00<?, ?it/s]

Embeddings encoded for batch 9.
Adding batch 9 to ChromaDB...
Batch 9 added to ChromaDB.
Progress saved: batch 10
Memory cleared after batch 9.

Processing batch 10/10...
Batch range: 785907 to 873237
Encoding embeddings for batch 10...


Batches:   0%|          | 0/2730 [00:00<?, ?it/s]

Embeddings encoded for batch 10.
Adding batch 10 to ChromaDB...
Batch 10 added to ChromaDB.
Progress saved: batch 11
Memory cleared after batch 10.
ChromaDB collection process completed.
