# Chroma DB

In [1]:
import os
import chromadb
from dotenv import load_dotenv
import sys

sys.path.append("..")
from db_connect import db_get_df, db_save_df, save_pkl, load_pkl, save_npz, load_npz
from embedding_creation.embedding_creator_MINI_L6 import MINI_LM_embed
import numpy as np
import pandas as pd
from tqdm import tqdm

load_dotenv()
DATABASE_PATH = os.getenv("DATABASE_PATH")
DATA_PATH = os.getenv("DATA_PATH")
CHROMADB_PATH = os.getenv("CHROMADB_PATH")
OPENAI_KEY=os.getenv("OPENAI_KEY")

  from .autonotebook import tqdm as notebook_tqdm


## Collection erstellen

### Test Collection

Server starten und client verbinden

In [None]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [9]:
collection = chroma_client.create_collection(name="sentence_voyage")

In [3]:
collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["1", "2"]
)

In [5]:
results = collection.query(
    query_texts=["This is a query document"],
    n_results=2
)
results

{'ids': [['1', '2']],
 'distances': [[0.7111214399337769, 1.0109773874282837]],
 'metadatas': [[{'source': 'my_source'}, {'source': 'my_source'}]],
 'embeddings': None,
 'documents': [['This is a document', 'This is another document']],
 'uris': None,
 'data': None}

## Collections löschen

In [6]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [11]:
chroma_client.list_collections()

[Collection(name=sentence_voyage), Collection(name=sentence_openai)]

In [15]:
test_collection = chroma_client.get_collection("sentence_voyage")

In [16]:
test_collection.count()

0

In [17]:
chroma_client.delete_collection("sentence_voyage")

In [4]:
chroma_client.clear_system_cache()

## MINI_LM hinzufügen

### Daten laden

In [2]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [3]:
chroma_client.list_collections()

[Collection(name=my_collection),
 Collection(name=sentence_MINI_LM),
 Collection(name=sentece_MINI_LM),
 Collection(name=test)]

In [7]:
df = db_get_df("transcript_sentences")
embeddings = load_pkl("MINI_L6_embeddings.pkl")

In [8]:
embeddings = [embed.tolist() for embed in tqdm(embeddings)]

100%|██████████| 433562/433562 [00:24<00:00, 18056.96it/s]


In [9]:
collection_mini = chroma_client.create_collection(name="sentence_MINI_LM")

### Daten vorbereiten

In [4]:
def get_data(df):
    documents = df["sentence"].to_list()
    metadatas = []
    ids = []
    for i, row in tqdm(df.iterrows()):
        filename = row["filename"]
        start = row["start"]
        end = row["end"]
        sentence_id = row["sentence_id"]
        id = filename + str(sentence_id)
        ids.append(id)
        metadatas.append({"filename": filename, "start": start, "end": end, "sentence_id": sentence_id})
    return (documents, metadatas, ids)

In [None]:
documents, metadatas, ids = get_data(df)

In [58]:
print(len(ids))

433562


### Daten einfügen

In [None]:
def split_into_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

In [9]:
def insert_collection_batchwise(my_collection, documents, metadatas, ids, embeddings):
    batch_size = 100  
    start_point = 0
    documents_batches = list(split_into_batches(documents, batch_size))[start_point:]
    metadatas_batches = list(split_into_batches(metadatas, batch_size))[start_point:]
    ids_batches = list(split_into_batches(ids, batch_size))[start_point:]
    embeddings_batches = list(split_into_batches(embeddings, batch_size))[start_point:]

    for i in tqdm(range(len(documents_batches))):
        my_collection.add(
            documents=documents_batches[i],
            embeddings=embeddings_batches[i],
            metadatas=metadatas_batches[i],
            ids=ids_batches[i],
        )

In [None]:
insert_collection_batchwise(documents, metadatas, ids, embeddings)

In [13]:
collection_mini.count()

433562

In [14]:
client = chromadb.PersistentClient(path="data/chromadb")

### Suche starten

In [65]:
collection._embedding_function

<chromadb.utils.embedding_functions.ONNXMiniLM_L6_V2 at 0x7fb2d0129790>

In [79]:
query_embedding = MINI_LM_embed("Sonnensystem")

In [86]:
result = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=10
)

In [87]:
result["documents"]

[['Vor Sonnenaufgang.',
  'Also der Tagesablauf, das Sonnenaufgang, Sonnenuntergang hat natürlich das Leben bestimmt.',
  'Auch nicht an den vielen grassierenden schlechten Übersetzungen.',
  'Heuschmann ergänzt ihn mit einer Tabelle zu Pendellängen und Schwingungszahlen.',
  'Also zunächst mal bedeutete es natürlich Ehelosigkeit.',
  'Die Vulkanasche hat das Sonnenlicht noch zusätzlich abgeschirmt.',
  'Sie wollen ihn kennenlernen.',
  'Ausdauersportler haben oft einen erhöhten Bedarf am Spurenelement Eisen.',
  'Etwas, das sich lohnt kennenzulernen.',
  'Nadine Bräsicke fordert neben verstärkter Forschung auch mehr Zusammenarbeit beim Waldmanagement.']]

## OpenAI hinzufügen

### Daten laden

Die daten mussten in 2 verschiedenen Splits geladen werden

In [7]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [22]:
df = db_get_df("transcript_sentences")

In [None]:
embeddings_2 = load_pkl("embeddings_OPENAI_180793.pkl")

In [21]:
len(embeddings_2)

180793

In [22]:
documents_2, metadatas_2, ids_2 = get_data(df.tail(len(embeddings_2)))

180793it [00:15, 11989.04it/s]


In [23]:
len(documents_2)

180793

In [13]:
collection_openai = chroma_client.create_collection(name="sentence_openai")

### Daten einfügen

In [24]:
insert_collection_batchwise(collection_openai, documents_2, metadatas_2, ids_2, embeddings_2)

100%|██████████| 1808/1808 [25:36<00:00,  1.18it/s]


In [25]:
chroma_client.list_collections()

[Collection(name=my_collection),
 Collection(name=sentence_MINI_LM),
 Collection(name=sentece_MINI_LM),
 Collection(name=sentence_openai),
 Collection(name=test)]

In [None]:
collection_openai.peek()

### Suchen

In [4]:
import chromadb.utils.embedding_functions as embedding_functions
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=OPENAI_KEY,
                model_name="text-embedding-3-small"
            )
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [30]:
chroma_client.list_collections()

[Collection(name=my_collection),
 Collection(name=sentence_MINI_LM),
 Collection(name=sentece_MINI_LM),
 Collection(name=sentence_openai),
 Collection(name=test)]

In [5]:
collection = chroma_client.get_collection(name="sentence_openai", embedding_function=openai_ef)

In [6]:
collection.count()

433562

In [7]:
result = collection.query(
    query_texts=["Marsmission"],
    n_results=10
)

In [25]:
result

{'ids': [['gibt-es-ausserirdisches-leben-entdeckungsreise-in-unserem-sonnensystem.mp326',
   'gibt-es-ausserirdisches-leben-entdeckungsreise-in-unserem-sonnensystem.mp375',
   'mars-mission-die-erforschung-des-roten-planeten.mp3134',
   'marsmaennchen-realitaet-fiktion-goetter-der-gottlosen.mp334',
   'meteor-mit-mission-kam-das-leben-aus-dem-all.mp3190',
   'mars-mission-die-erforschung-des-roten-planeten.mp3124',
   'marsmaennchen-realitaet-fiktion-goetter-der-gottlosen.mp346',
   'marsmaennchen-realitaet-fiktion-goetter-der-gottlosen.mp348',
   'meteor-mit-mission-kam-das-leben-aus-dem-all.mp319',
   'mars-mission-die-erforschung-des-roten-planeten.mp3131']],
 'distances': [[0.695876955986023,
   0.8734498023986816,
   0.8734498023986816,
   0.8853272795677185,
   0.9264781475067139,
   0.926581621170044,
   0.979823112487793,
   0.986320972442627,
   0.9892181754112244,
   0.9927668571472168]],
 'embeddings': None,
 'metadatas': [[{'end': 144.09,
    'filename': 'gibt-es-ausserirdi

In [18]:
df = pd.DataFrame({"sentence":result["documents"][0]})
df["end"] = [metadata["end"] for metadata in result["metadatas"][0]]
df["start"] = [metadata["start"] for metadata in result["metadatas"][0]]
df["filename"] = [metadata["filename"] for metadata in result["metadatas"][0]]

## Laden der Daten

In [1]:
import chromadb

In [2]:
client = chromadb.HttpClient(host='localhost', port=8000)

In [3]:
collection = client.get_collection(name="sentence_openai")

In [4]:
collection.count()

433562

In [5]:
client.list_collections()

[Collection(name=sentence_openai)]

In [8]:
peek = collection.peek()

In [10]:
len(peek['embeddings'][0])

1536

In [3]:
client = chromadb.PersistentClient(path=CHROMADB_PATH)

In [21]:
client.count_collections()

2

In [None]:
collection.peek()

## voyage hinzufügen

In [2]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [3]:
df = db_get_df("transcript_sentences")
embeddings = load_pkl("embeddings_voyage.pkl")

In [5]:
len(embeddings)

433562

In [7]:
documents, metadatas, ids = get_data(df)

433562it [00:51, 8457.09it/s] 


In [8]:
collection_voyage = chroma_client.create_collection(name="sentence_voyage")

In [None]:
def insert_collection_batchwise(my_collection, ids, embeddings, documents=None, metadatas=None):
    batch_size = 100  
    
    ids_batches = list(split_into_batches(ids, batch_size))
    embeddings_batches = list(split_into_batches(embeddings, batch_size))
    if documents and metadatas:
        documents_batches = list(split_into_batches(documents, batch_size))
        metadatas_batches = list(split_into_batches(metadatas, batch_size))
        for i in tqdm(range(len(embeddings_batches))):
            my_collection.add(
                embeddings=embeddings_batches[i],
                ids=ids_batches[i],
                documents=documents_batches[i],
                metadatas=metadatas_batches[i]
            )
    else:
        for i in tqdm(range(len(embeddings_batches))):
            my_collection.add(
                embeddings=embeddings_batches[i],
                ids=ids_batches[i],
            )

In [13]:
insert_collection_batchwise(collection_voyage, ids, embeddings)

100%|██████████| 4336/4336 [42:05<00:00,  1.72it/s]  


## Collection umziehen

In [2]:
chroma_client_1 = chromadb.HttpClient(host='localhost', port=8000)
collection_openai_1 = chroma_client_1.get_collection(name="sentence_openai")

In [3]:
collection_openai_1.count()

433562

In [5]:
data =collection_openai_1.get()

In [7]:
data["ids"]

['15-jahre-spaeter-ist-suedafrika-versoehnt.mp30',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp31',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp310',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp3100',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp3101',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp311',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp312',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp313',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp314',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp315',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp316',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp317',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp318',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp319',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp32',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp320',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp321',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp322',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp323',
 '15-jahre-sp

In [4]:
data_embeds =collection_openai_1.get(ids=["15-jahre-spaeter-ist-suedafrika-versoehnt.mp30"],include=['embeddings'])

Your new secret key is: KDLLQBXBSBBWMRYLPHSCA2YYMA
Your verification code is 577400
Your emergency scratch codes are:
  26573021
  30451711
  98109648
  61828770
  66268366

In [18]:
data.keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'data', 'uris'])

In [22]:
data["ids"]

['15-jahre-spaeter-ist-suedafrika-versoehnt.mp30',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp31',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp310',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp3100',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp3101',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp311',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp312',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp313',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp314',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp315',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp316',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp317',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp318',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp319',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp32',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp320',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp321',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp322',
 '15-jahre-spaeter-ist-suedafrika-versoehnt.mp323',
 '15-jahre-sp

In [6]:
chroma_client_2 = chromadb.HttpClient(host='localhost', port=8001)
collection_openai_2 = chroma_client_2.get_or_create_collection(name="sentence_openai")

<chromadb.api.client.Client at 0x7fc705ad1be0>

In [24]:
data

{'ids': ['15-jahre-spaeter-ist-suedafrika-versoehnt.mp30',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp31',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp310',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp3100',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp3101',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp311',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp312',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp313',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp314',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp315',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp316',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp317',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp318',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp319',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp32',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp320',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp321',
  '15-jahre-spaeter-ist-suedafrika-versoehnt.mp322',
  '15-jahre-spaeter-ist-suedafrika-verso

In [7]:
data=collection_openai_1.get(include=['embeddings'])

KeyboardInterrupt: 

In [23]:
insert_collection_batchwise(
    collection_openai_2, 
    ids=data['ids'],
    embeddings=data['embeddings'],
    documents=data['documents'],
    metadatas=data['metadatas']
    )

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [15]:
collection_openai_2.add(
    embeddings=collection_openai_1.get()['embeddings'],
    metadatas=collection_openai_1.get()['metadatas'],
    documents=collection_openai_1.get()['documents'],
    ids=collection_openai_1.get()['ids']
)

KeyboardInterrupt: 

In [25]:
collection_openai_2.count()

ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /api/v1/collections/f22d26a0-950f-4159-8dfc-012931d73188/count (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fc6e2e89a90>: Failed to establish a new connection: [Errno 61] Connection refused'))