### Importing libraries

In [52]:
import os
# Set before the LangChain / HuggingFace imports below.
# NOTE(review): presumably silences the huggingface_hub symlink warning — confirm.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

from langchain.document_loaders import DirectoryLoader
# from langchain.document_loaders import UnstructuredMarkdownLoader

from langchain.text_splitter import MarkdownTextSplitter
# from langchain.text_splitter import MarkdownHeaderTextSplitter

from langchain.embeddings import HuggingFaceEmbeddings

from langchain_community.vectorstores import FAISS

### Recursively loading file(s)

In [None]:
md_notes_dir = "../data/md-notes"
# Make sure the notes directory exists (no-op when it is already there).
os.makedirs(md_notes_dir, exist_ok=True)

# Load all md files under the data/md-notes directory, including nested
# sub-directories: the "**/" prefix makes the glob recursive, whereas a bare
# "*.md" only matches files sitting directly inside md_notes_dir.
loader = DirectoryLoader(md_notes_dir, glob="**/*.md")
documents = loader.load()
print(f"Found {len(documents)} documents")

print("documents found: ")
for doc in documents:
    print(doc.metadata)
    print(doc.page_content[:200])

# The directory may have just been created (and therefore be empty), so only
# preview the first document when there is one instead of raising IndexError.
first_document = documents[0] if documents else None
if first_document is not None:
    print(first_document.page_content[:200])
else:
    print(f"No markdown files found in {md_notes_dir}")

### Text Splitting

In [None]:
# Markdown-aware splitter configured for chunks of 1000 characters with an
# overlap of 100 between neighbouring chunks.
markdown_splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=100)

# Split the loaded documents into chunks
chunks = markdown_splitter.split_documents(documents)
print(f"{len(chunks)} chunks")

# Preview the first chunk only when splitting actually produced one;
# indexing an empty list would raise IndexError.
if chunks:
    print(f"chunk content: {chunks[0].page_content}")
    print(f"metadata: {chunks[0].metadata}")
else:
    print("No chunks were produced - nothing to preview")

### Embedding & Indexing using FAISS

In [None]:
# Sentence-transformers model used to embed every chunk
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Build the FAISS vector store by embedding all chunks
vectorstore = FAISS.from_documents(documents=chunks, embedding=embeddings)

# Target directory for the persisted index (created if it does not exist)
data_store_dir = "../data/vectorstore"
os.makedirs(data_store_dir, exist_ok=True)

# Persist the FAISS index into the data-store directory
vectorstore.save_local(folder_path=data_store_dir)

print(f"FAISS index saved to {data_store_dir}")