# Local Persistence Demo
This notebook demonstrates how to configure Chroma to persist to disk, then load it back in. 

In [8]:
%pip install chromadb 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Build a small sample dataset and write it to a Parquet file under `db`.
#
# Create a Pandas DataFrame. Each dict key becomes a column name, so the keys
# must be string literals (bare names would raise NameError). All four lists
# hold eight entries, giving one row per record.
data = {
    "embeddings": [
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
        [1.1, 2.3, 3.2],
        [4.5, 6.9, 4.4],
    ],
    "metadatas": [
        {"uri": "img1.png", "style": "style1"},
        {"uri": "img2.png", "style": "style2"},
        {"uri": "img3.png", "style": "style1"},
        {"uri": "img4.png", "style": "style1"},
        {"uri": "img5.png", "style": "style1"},
        {"uri": "img6.png", "style": "style1"},
        {"uri": "img7.png", "style": "style1"},
        {"uri": "img8.png", "style": "style1"},
    ],
    "documents": ["doc1", "doc2", "doc3", "doc4", "doc5", "doc6", "doc7", "doc8"],
    "ids": ["id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8"],
}
df = pd.DataFrame(data)

# Convert the DataFrame to a PyArrow Table
table = pa.Table.from_pandas(df)

# Define the Parquet file path. os.path.join uses the platform's separator, so
# the path is valid on every OS and avoids a backslash escape inside a string
# literal. The target directory is created first because writing the file does
# not create missing parent directories.
parquet_dir = "db"
os.makedirs(parquet_dir, exist_ok=True)
file_path = os.path.join(parquet_dir, "example.parquet")

# Write the PyArrow Table to a Parquet file
pq.write_table(table, file_path)

In [2]:
import chromadb

In [3]:
# Settings shared by the rest of the notebook: the collection's name and the
# directory the client is pointed at.
collection_name = "peristed_collection"
persist_directory = "db"

# Build a Chroma client configured with an on-disk path.
client = chromadb.PersistentClient(path=persist_directory)

# Reuse the collection if it is already there; otherwise create it.
collection = client.get_or_create_collection(name=collection_name)

In [4]:
# Insert eight sample records. The embeddings alternate between two vectors,
# and every record carries the "style1" tag except the second one.
sample_records = {
    "embeddings": [
        list(vector)
        for _ in range(4)
        for vector in ((1.1, 2.3, 3.2), (4.5, 6.9, 4.4))
    ],
    "metadatas": [
        {"uri": "img1.png", "style": "style1"},
        {"uri": "img2.png", "style": "style2"},
        {"uri": "img3.png", "style": "style1"},
        {"uri": "img4.png", "style": "style1"},
        {"uri": "img5.png", "style": "style1"},
        {"uri": "img6.png", "style": "style1"},
        {"uri": "img7.png", "style": "style1"},
        {"uri": "img8.png", "style": "style1"},
    ],
    "documents": ["doc1", "doc2", "doc3", "doc4", "doc5", "doc6", "doc7", "doc8"],
    "ids": ["id1", "id2", "id3", "id4", "id5", "id6", "id7", "id8"],
}

# Expand the dict into the keyword arguments of `add`.
collection.add(**sample_records)

In [5]:
# Re-open the store: a fresh client pointed at the same directory as before.
client = chromadb.PersistentClient(path=persist_directory)

# Look the existing collection up by name through the new client.
collection = client.get_collection(name=collection_name)

In [6]:
# Ask the collection for the single closest match to one query embedding.
results = collection.query(n_results=1, query_embeddings=[[1.1, 2.3, 3.2]])

# Show the raw result dictionary.
print(results)

{'ids': [['id1']], 'distances': [[5.1159076593562386e-15]], 'metadatas': [[{'style': 'style1', 'uri': 'img1.png'}]], 'embeddings': None, 'documents': [['doc1']]}


In [7]:
# Retrieve the stored records (no id or filter is given), asking for
# embeddings to be returned alongside metadata and documents.
collection.get(
    include=["embeddings", "metadatas", "documents"],
)

{'ids': ['id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8'],
 'embeddings': [[1.100000023841858, 2.299999952316284, 3.200000047683716],
  [4.5, 6.900000095367432, 4.400000095367432],
  [1.100000023841858, 2.299999952316284, 3.200000047683716],
  [4.5, 6.900000095367432, 4.400000095367432],
  [1.100000023841858, 2.299999952316284, 3.200000047683716],
  [4.5, 6.900000095367432, 4.400000095367432],
  [1.100000023841858, 2.299999952316284, 3.200000047683716],
  [4.5, 6.900000095367432, 4.400000095367432]],
 'metadatas': [{'style': 'style1', 'uri': 'img1.png'},
  {'style': 'style2', 'uri': 'img2.png'},
  {'style': 'style1', 'uri': 'img3.png'},
  {'style': 'style1', 'uri': 'img4.png'},
  {'style': 'style1', 'uri': 'img5.png'},
  {'style': 'style1', 'uri': 'img6.png'},
  {'style': 'style1', 'uri': 'img7.png'},
  {'style': 'style1', 'uri': 'img8.png'}],
 'documents': ['doc1', 'doc2', 'doc3', 'doc4', 'doc5', 'doc6', 'doc7', 'doc8']}

In [7]:
# Clean up: delete the on-disk `db` directory used by this notebook.
# shutil.rmtree replaces the shell's `rm -rf`, which is unavailable in the
# default Windows shell. ignore_errors=True keeps the `-f` behaviour of not
# failing when the directory is missing.
import shutil

shutil.rmtree("db", ignore_errors=True)