In [1]:
# CELL 1
# Purpose: Imports, loads environment, and defines constants used across the notebook.
import os
import glob
import shutil
import time
from datetime import datetime
from pathlib import Path

from dotenv import load_dotenv
import numpy as np

# LangChain + embeddings + LCChroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# chroma settings (modern)
from chromadb.config import Settings

# Load environment variables (if any)
load_dotenv(override=True)

# Constants
DB_DIR = "vector_db"
EMBED_MODEL = "all-MiniLM-L6-v2"  # HuggingFace model
print("Setup complete. DB_DIR:", DB_DIR, "Embedding model:", EMBED_MODEL)


Setup complete. DB_DIR: vector_db Embedding model: all-MiniLM-L6-v2


In [2]:
# CELL 2
# Purpose: If an older sqlite-based DB exists, move it to a timestamped backup folder before creating new DuckDB DB.
old_sqlite = Path(DB_DIR) / "chroma.sqlite3"
if old_sqlite.exists():
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    backup_dir = f"{DB_DIR}_sqlite_backup_{ts}"
    print(f"Found old sqlite DB: {old_sqlite}. Moving to backup: {backup_dir}")
    shutil.move(DB_DIR, backup_dir)
    print("Move complete. You can inspect the backup later.")
else:
    print("No old sqlite DB found — safe to create new DuckDB+Parquet database in", DB_DIR)


No old sqlite DB found — safe to create new DuckDB+Parquet database in vector_db


In [3]:
# CELL 3
# Purpose: Optionally remove the existing new-format vector_db to start fresh.
# Run this only if you want a fresh DB. On Windows, locked files may require restarting the kernel.
def safe_remove_db(path=DB_DIR):
    if not os.path.exists(path):
        print("No existing DB directory found.")
        return
    try:
        shutil.rmtree(path)
        print(f"Removed existing DB directory: {path}")
    except Exception as e:
        print("Could not remove directory (likely locked). Error:", e)
        print("If you need to force removal, restart the kernel and try again.")

# Uncomment to remove DB, or call safe_remove_db() explicitly.
# safe_remove_db(DB_DIR)
print("Cell loaded. Call safe_remove_db() if you want to delete the DB.")


Cell loaded. Call safe_remove_db() if you want to delete the DB.


In [4]:
# CELL 4
# Purpose: Load markdown files from knowledge-base/* and tag them by folder name (doc_type).
knowledge_base_path = "knowledge-base/**/*.md"
files = glob.glob(knowledge_base_path, recursive=True)
print(f"Found {len(files)} files in the knowledge base")

documents = []
folders = glob.glob("knowledge-base/*")
for folder in folders:
    doc_type = os.path.basename(folder)
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs={'encoding': 'utf-8'})
    folder_docs = loader.load()
    for doc in folder_docs:
        if not isinstance(doc.metadata, dict):
            doc.metadata = {}
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

print(f"Loaded {len(documents)} documents")


Found 30 files in the knowledge base
Loaded 30 documents


In [5]:
# CELL 5
# Purpose: Split loaded documents into chunks for embedding/indexing.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)
print(f"Divided into {len(chunks)} chunks")
if len(chunks) > 0:
    print("Sample chunk metadata:", chunks[0].metadata)
    print("Sample chunk snippet:", chunks[0].page_content[:400])


Divided into 57 chunks
Sample chunk metadata: {'source': 'knowledge-base\\about\\bio.md', 'doc_type': 'about'}
Sample chunk snippet: # Professional Bio

Ayush Tyagi is a dedicated Computer Science Engineering student at JIMS, Greater Noida (IP University), on track to graduate in 2025. With a strong foundation built during his PCM schooling at Vivekanand School, Anand Vihar, he has channeled his analytical mindset into a passion for technology. 

Ayush thrives on the creative process of front-end development, specializing in bu


In [6]:
# CELL 6
# Purpose: Prepare the HuggingFace embedding model and Settings for new Chroma (duckdb+parquet).
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

client_settings = Settings(
    chroma_db_impl="duckdb+parquet",   # NEW recommended backend
    persist_directory=DB_DIR,          # where new parquet files will be stored
)

print("Embeddings and client settings prepared for duckdb+parquet.")


Embeddings and client settings prepared for duckdb+parquet.


In [7]:
from langchain_chroma import Chroma
import inspect
import sys

print("Chroma class file:", inspect.getfile(Chroma))
print("Python interpreter:", sys.executable)


Chroma class file: c:\Users\ayush\AppData\Local\anaconda3\envs\llms\Lib\site-packages\langchain_chroma\vectorstores.py
Python interpreter: c:\Users\ayush\AppData\Local\anaconda3\envs\llms\python.exe


In [8]:
# CELL 7
# Purpose: Create LCChroma vectorstore from text chunks using DuckDB+Parquet backend and persist to disk.
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=DB_DIR,
    client_settings=client_settings
)


# Force persist to ensure files are written to disk
try:
    vectorstore.persist()
except Exception as e:
    print("Warning: persist() raised an exception (but creation may have succeeded):", e)

# Access collection info via LCChroma wrapper
collection = getattr(vectorstore, "_collection", None)
collection_name = collection.name if collection is not None else "unknown"
print("Collection created:", collection_name)

# If possible print stored vector count
try:
    print("Stored vectors:", collection.count())
except Exception:
    print("Could not read collection.count(); creation may have succeeded.")


ValueError: [91mYou are using a deprecated configuration of Chroma.

[94mIf you do not have data you wish to migrate, you only need to change how you construct
your Chroma client. Please see the "New Clients" section of https://docs.trychroma.com/deployment/migration.
________________________________________________________________________________________________

If you do have data you wish to migrate, we have a migration tool you can use in order to
migrate your data to the new Chroma architecture.
Please `pip install chroma-migrate` and run `chroma-migrate` to migrate your data and then
change how you construct your Chroma client.

See https://docs.trychroma.com/deployment/migration for more information or join our discord at https://discord.gg/MMeYNTmh3x for help![0m