In [6]:
import os

from dotenv import load_dotenv
from langchain_openai.embeddings import OpenAIEmbeddings

# Copy variables from a local .env file into the process environment.
load_dotenv()

# Read the API key and fail fast with an actionable message: without this
# check a missing key only shows up later as a less obvious client error.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your .env file or environment."
    )

# Embedding model used for the vector store. The key is not passed
# explicitly, so the client is expected to read it from the environment.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


In [56]:
from langchain_community.document_loaders import ObsidianLoader
from langchain_experimental.text_splitter import SemanticChunker

# The chunker uses its own embeddings instance (text-embedding-3-small),
# separate from the text-embedding-3-large model configured for storage.
# `OpenAIEmbeddings` is imported in an earlier cell.
text_splitter = SemanticChunker(OpenAIEmbeddings(model="text-embedding-3-small"))

# Initialize the loader with the path to your Obsidian vault
loader = ObsidianLoader("./data/Personal")

# Load every note, then split the notes into chunks. This is the explicit
# form of the deprecated `loader.load_and_split(text_splitter=...)` helper
# and yields a list (its length is taken below), not an iterator.
docs = text_splitter.split_documents(loader.load())

# Whitespace-delimited word count across all chunks
total_words = sum(len(doc.page_content.split()) for doc in docs)

# Report the number of chunks and the total word count
print(f"{len(docs)} documents loaded with a total of {total_words:,} words.")

794 documents loaded with a total of 144,505 words.


In [7]:
# Create a persistent chroma db to store the embeddings
import hashlib

from langchain_chroma import Chroma

# Requires `embeddings` and `docs` from the earlier cells.
vector_store = Chroma(
    collection_name="obsidian",
    embedding_function=embeddings,
    persist_directory="./chroma",
)


def _stable_chunk_ids(chunks):
    """Return one deterministic ID per chunk, unique within `chunks`.

    Each ID is the SHA-256 of the chunk's origin and text, suffixed with an
    occurrence counter so identical chunks from the same note still get
    distinct IDs. The origin is read from the `path` or `source` metadata
    key (presumably set by ObsidianLoader -- verify); it falls back to an
    empty string when neither is present.
    """
    seen = {}
    ids = []
    for chunk in chunks:
        origin = str(chunk.metadata.get("path") or chunk.metadata.get("source") or "")
        digest = hashlib.sha256(
            f"{origin}\x00{chunk.page_content}".encode("utf-8")
        ).hexdigest()
        occurrence = seen.get(digest, 0)
        seen[digest] = occurrence + 1
        ids.append(f"{digest}-{occurrence}")
    return ids


# Embed the documents and store them in the chroma db. Explicit, stable IDs
# make this cell safe to re-run: without them every run appends a fresh copy
# of every chunk to the persistent collection under new random IDs.
doc_ids = _stable_chunk_ids(docs)
vector_store.add_documents(docs, ids=doc_ids);  # ';' hides the returned ID list
