# Chroma DB

## Collection erstellen

In [28]:
import chromadb

### Test Collection

Server starten und client verbinden

In [None]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [29]:
collection = chroma_client.create_collection(name="test_collection")

In [3]:
collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["1", "2"]
)

In [5]:
results = collection.query(
    query_texts=["This is a query document"],
    n_results=2
)
results

{'ids': [['1', '2']],
 'distances': [[0.7111214399337769, 1.0109773874282837]],
 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],
 'embeddings': None,
 'documents': [['This is a document', 'This is another document']],
 'uris': None,
 'data': None}

## MINI_LM hinzufügen

In [66]:
import os

from dotenv import load_dotenv
import sys

sys.path.append("..")
from db_connect import db_get_df, db_save_df, save_pkl, load_pkl, save_npz, load_npz
from embedding_creation.embedding_creator_MINI_L6 import MINI_LM_embed
import numpy as np
import pandas as pd
from tqdm import tqdm

load_dotenv()
DATABASE_PATH = os.getenv("DATABASE_PATH")
DATA_PATH = os.getenv("DATA_PATH")
CHROMADB_PATH = os.getenv("CHROMADB_PATH")

In [105]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [31]:
df = db_get_df("transcript_sentences")
embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [50]:
embeddings = [embed.tolist() for embed in tqdm(embeddings)]

  0%|          | 0/433562 [00:00<?, ?it/s]

100%|██████████| 433562/433562 [00:27<00:00, 15554.67it/s]


In [104]:
collection_mini = chroma_client.create_collection(name="sentence_MINI_LM")

### Daten vorbereiten

In [57]:
documents = df["sentence"].to_list()
metadatas = []
ids = []
for i, row in tqdm(df.iterrows()):
    filename = row["filename"]
    start = row["start"]
    end = row["end"]
    sentence_id = row["sentence_id"]
    id = filename + str(sentence_id)
    ids.append(id)
    metadatas.append({"filename": filename, "start": start, "end": end, "sentence_id": sentence_id})

433562it [00:46, 9385.82it/s] 


In [58]:
print(len(ids))

433562


### Daten einfügen

In [34]:
def split_into_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

In [106]:
batch_size = 100  
start_point = 0

documents_batches = list(split_into_batches(documents, batch_size))[start_point:]
metadatas_batches = list(split_into_batches(metadatas, batch_size))[start_point:]
ids_batches = list(split_into_batches(ids, batch_size))[start_point:]
embeddings_batches = list(split_into_batches(embeddings, batch_size))[start_point:]

# Insert each batch into the collection
for i in tqdm(range(len(documents_batches))):
    collection_mini.add(
        documents=documents_batches[i],
        embeddings=embeddings_batches[i],
        metadatas=metadatas_batches[i],
        ids=ids_batches[i]
    )

  2%|▏         | 75/4336 [00:38<24:49,  2.86it/s]  Exception occurred invoking consumer for subscription 641805bce0e345bca79556b300900f4cto topic persistent://default/default/f2ee7c5f-69fd-4717-8626-3b3cde1f2f18 
  5%|▌         | 220/4336 [01:03<11:44,  5.84it/s]Exception occurred invoking consumer for subscription 641805bce0e345bca79556b300900f4cto topic persistent://default/default/f2ee7c5f-69fd-4717-8626-3b3cde1f2f18 
100%|██████████| 4336/4336 [12:45<00:00,  5.67it/s]  


In [127]:
collection_mini.count()

433482

In [118]:
chroma_client.list_collections()

[Collection(name=sentece_MINI_LM), Collection(name=test)]

In [117]:
chroma_client.list_collections()

[Collection(name=sentece_MINI_LM), Collection(name=test)]

### Suche starten

In [65]:
collection._embedding_function

<chromadb.utils.embedding_functions.ONNXMiniLM_L6_V2 at 0x7fb2d0129790>

In [79]:
query_embedding = MINI_LM_embed("Sonnensystem")

In [86]:
result = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=10
)

In [87]:
result["documents"]

[['Vor Sonnenaufgang.',
  'Also der Tagesablauf, das Sonnenaufgang, Sonnenuntergang hat natürlich das Leben bestimmt.',
  'Auch nicht an den vielen grassierenden schlechten Übersetzungen.',
  'Heuschmann ergänzt ihn mit einer Tabelle zu Pendellängen und Schwingungszahlen.',
  'Also zunächst mal bedeutete es natürlich Ehelosigkeit.',
  'Die Vulkanasche hat das Sonnenlicht noch zusätzlich abgeschirmt.',
  'Sie wollen ihn kennenlernen.',
  'Ausdauersportler haben oft einen erhöhten Bedarf am Spurenelement Eisen.',
  'Etwas, das sich lohnt kennenzulernen.',
  'Nadine Bräsicke fordert neben verstärkter Forschung auch mehr Zusammenarbeit beim Waldmanagement.']]

## Laden der Daten

In [1]:
import chromadb

In [110]:
client_mini_v2 = chromadb.HttpClient(host='localhost', port=8000)

In [113]:
collection = client_mini_v2.get_collection(name="sentence_MINI_LM")

Exception: {"error":"ValueError('Collection sentence_MINI_LM does not exist.')"}

In [112]:
collection.count()

0

In [101]:
client.list_collections()

[Collection(name=sentece_MINI_LM), Collection(name=test)]

In [96]:
client_mini.list_collections()

[Collection(name=sentece_MINI_LM), Collection(name=test)]

In [123]:
collection_my = client_mini.create_collection(name="my_collection")

In [103]:
collection_mini.peek()

{'ids': [],
 'embeddings': [],
 'metadatas': [],
 'documents': [],
 'data': None,
 'uris': None}

In [3]:
client = chromadb.PersistentClient(path=CHROMADB_PATH)

In [21]:
client.count_collections()

2

In [None]:
collection.peek()

In [26]:
client.heartbeat()

1708369938856539000

In [None]:
 
client.reset()

In [126]:
collection_my.add(
    documents=["WARUM ZUM TEFEL IST HIER NIIIIIICHTS DIKUMNETIERT???"],
    metadatas=[{"metadata":"Metadata2"}],
    ids=["3"]
)

In [9]:
collection = client.create_collection(name="sentece_MINI_LM")

UniqueConstraintError: Collection sentece_MINI_LM already exists

In [None]:
vectordb = Chroma(persist_directory=CHROMADB_PATH, embedding_function=embeddings)
