# Clean Chroma Ingestion Pipeline: Cell-by-Cell

In [2]:
import os
import shutil
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from pathlib import Path
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

# Input and output locations used by the rest of the pipeline.
DATA_PATH = "data/books"  # folder scanned for source documents
CHROMA_PATH = "chroma_fresh"  # vector-store folder; prefer a fresh one per re-ingest while developing

# Read the local .env file so API keys are available as environment variables.
load_dotenv()

# Cell 2: Load Documents

In [3]:
def load_documents():
    """Load every Markdown file located directly inside ``DATA_PATH``.

    Files are read as UTF-8 and visited in sorted path order, so repeated
    runs return the documents (and therefore the derived chunks) in the
    same order regardless of how the filesystem lists the directory.

    Returns:
        list: Everything returned by ``TextLoader.load()`` for each file,
        concatenated in path order. Empty if no ``*.md`` file is present.

    Raises:
        FileNotFoundError: If ``DATA_PATH`` is not an existing directory.
    """
    data_dir = Path(DATA_PATH)
    # Globbing a missing directory typically yields nothing instead of
    # raising, which would only surface later as an empty ingest. Fail early
    # with a message that shows the resolved location.
    if not data_dir.is_dir():
        raise FileNotFoundError(f"Data directory not found: {data_dir.resolve()}")

    docs = []
    # sorted(): glob() makes no ordering guarantee.
    for path in sorted(data_dir.glob("*.md")):
        loader = TextLoader(str(path), encoding="utf-8")
        docs.extend(loader.load())
    print(f"Loaded {len(docs)} documents.")
    return docs

# Run the loader once; the splitting step below consumes `documents`.
documents = load_documents()

Loaded 1 documents.


# Cell 3: Split Documents into Chunks

In [4]:
def split_text(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split, as returned by the loading step.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 1000.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 200.

    Returns:
        list: The chunks produced by ``split_documents``. Because
        ``add_start_index=True`` is set, each chunk's metadata carries a
        ``start_index`` entry.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

# Chunk the loaded documents; `chunks` is what later cells inspect and embed.
chunks = split_text(documents)

Split 1 documents into 217 chunks.


# Chunk Sanity Check

In [None]:
# Preview the first three chunks to eyeball split boundaries and metadata.
for i, chunk in enumerate(chunks[:3]):
    print(f"--- Chunk {i} ---")
    print(chunk.page_content[:300])  # show first 300 characters
    print(chunk.metadata)

--- Chunk 0 ---
The Project Gutenberg eBook of Alice's Adventures in Wonderland

This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenb
{'source': 'data\\books\\alice_in_wonderland.md', 'start_index': 0}
--- Chunk 1 ---
**_ START OF THE PROJECT GUTENBERG EBOOK ALICE'S ADVENTURES IN WONDERLAND _**
[Illustration]

Alice’s Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents

CHAPTER I. Down the Rabbit-Hole
CHAPTER II. The Pool of Tears
CHAPTER III. A Caucus-Race and a Long Tale
CHA
{'source': 'data\\books\\alice_in_wonderland.md', 'start_index': 714}
--- Chunk 2 ---
So she was considering in her own mind (as well as she could, for the
hot day made her feel very sleepy and stupid), whether the pleasure of
making a daisy-chain would be worth the trouble of getting up and
pic

# Cell 4: (Optional) Sanity Check for Target Phrase

In [None]:
# Gather every chunk whose text contains the target phrase, keeping its index.
matches = [
    (index, chunk)
    for index, chunk in enumerate(chunks)
    if "after such a fall as this" in chunk.page_content
]

# Show each hit in full, together with its metadata.
for index, chunk in matches:
    print(f"\nChunk {index}:")
    print(chunk.page_content)
    print(chunk.metadata)

found = bool(matches)
if not found:
    print("Phrase not found in any chunk. Check chunk_size or source document.")

# Cell 5: Garbage Collection & Ensure No Chroma Locks Exist

In [6]:
import gc

# Drop the `db` name if an earlier run left one in the namespace.
if "db" in globals():
    del db

# Force a collection pass; the cell displays the value returned by gc.collect().
gc.collect()

0

# Cell 6: Delete Old DB Folder (if it exists) and Save Chunks to Chroma

In [7]:
# Validate everything this cell needs BEFORE touching the existing store, so a
# missing key or an empty chunk list cannot leave us with a deleted database
# and nothing to replace it.
if not chunks:
    raise ValueError("No chunks to ingest; run the load and split cells first.")

openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )

# Build new embeddings object
embeddings = OpenAIEmbeddings(
    api_key=openai_api_key,
    model="text-embedding-3-small"
)

# Delete previous vector store to avoid file-lock issues and ensure a clean ingest
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

# Create and persist the Chroma vector DB
db = Chroma.from_documents(
    chunks,
    embeddings,
    persist_directory=CHROMA_PATH
)
print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Saved 217 chunks to chroma_fresh.


# Cell 7: Confirm DB Content (Debugging/Verification)

In [8]:
# Ask the underlying collection how many entries it holds and report it.
stored_count = db._collection.count()
print("Number of chunks in vector DB:", stored_count)

Number of chunks in vector DB: 217
