In [1]:
import uuid
from collections import defaultdict
from pathlib import Path

from dotenv import load_dotenv
from langchain_mistralai import MistralAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Mistral API key consumed by
# MistralAIEmbeddings further down — confirm the expected variable name.
load_dotenv()

True

In [3]:
# Chunking configuration read by split_markdown().
USE_SECONDARY_SPLIT = False  # when True, a file that yields exactly one header section is re-split by RecursiveCharacterTextSplitter
CHUNK_SIZE = 256  # chunk_size passed to RecursiveCharacterTextSplitter
CHUNK_OVERLAP = 30  # chunk_overlap passed to RecursiveCharacterTextSplitter
HEADERS_TO_SPLIT = [("#", "H1"), ("##", "H2"), ("###", "H3")]  # (markdown heading marker, label) pairs passed as headers_to_split_on

In [4]:
def split_markdown(text: str):
    """Split markdown text into documents along its headings.

    The text is first cut on the heading levels listed in ``HEADERS_TO_SPLIT``;
    the heading lines stay inside each chunk's text. When
    ``USE_SECONDARY_SPLIT`` is enabled and the heading pass produced exactly one
    document, that document is cut again by character count using
    ``CHUNK_SIZE`` / ``CHUNK_OVERLAP``.

    Args:
        text: Raw markdown content of one file.

    Returns:
        The list of documents produced by the splitter(s).
    """
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=HEADERS_TO_SPLIT,
        strip_headers=False,  # headings remain part of the chunk text
    )
    sections = header_splitter.split_text(text)

    # The character-based pass only applies to the single-section case.
    if not USE_SECONDARY_SPLIT or len(sections) != 1:
        return sections

    fallback_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    return fallback_splitter.split_documents(sections)

def _collect_markdown_chunks(in_dir: Path):
    """Split every ``*.md`` file under ``in_dir`` (recursively) into chunks.

    Returns:
        A ``(chunks, chunk_ids, counts)`` tuple: the split documents with a
        ``"source"`` metadata entry added, one deterministic UUID string per
        chunk, and a ``{source_path: chunk_count}`` mapping.
    """
    chunks = []
    chunk_ids = []
    counts = {}
    # Sorted so that chunk order (and therefore output) is stable across runs.
    for md_path in sorted(in_dir.rglob("*.md")):
        source = md_path.as_posix()
        text = md_path.read_text(encoding="utf-8", errors="ignore")
        docs = split_markdown(text)
        for index, doc in enumerate(docs):
            # Splitter metadata wins over "source" on a key clash, as before.
            doc.metadata = {"source": source, **(doc.metadata or {})}
            # Same file + same position -> same point id, so re-ingesting
            # overwrites the previous point instead of adding a duplicate.
            chunk_ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, f"{source}#{index}")))
        chunks.extend(docs)
        counts[source] = len(docs)
    return chunks, chunk_ids, counts


def _index_chunks(chunks, chunk_ids, qdrant_dir: Path, collection: str):
    """Embed ``chunks`` and upsert them into a local Qdrant collection."""
    embeddings = MistralAIEmbeddings(model="mistral-embed")

    # Local embedded Qdrant (persists under qdrant_dir).
    qdrant_dir.mkdir(parents=True, exist_ok=True)
    client = QdrantClient(path=str(qdrant_dir))
    try:
        # Create the collection only if missing; the probe embedding that
        # determines the vector size is only requested in that case.
        if not client.collection_exists(collection_name=collection):
            vector_size = len(embeddings.embed_query("sample text"))
            client.create_collection(
                collection_name=collection,
                vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
            )

        store = QdrantVectorStore(
            client=client,
            collection_name=collection,
            embedding=embeddings,
        )
        store.add_documents(chunks, ids=chunk_ids)
    finally:
        # Close the embedded client so the storage folder is released and the
        # cell can be re-run in the same kernel.
        client.close()


def _print_chunk_stats(counts):
    """Print per-file chunk counts (descending) and an overall summary."""
    print("\nChunks per file (desc):")
    for src, c in sorted(counts.items(), key=lambda x: x[1], reverse=True):
        print(f"{c:5d}  {src}")
    total = sum(counts.values())
    avg = total / len(counts) if counts else 0
    print(f"\nFiles: {len(counts)} | Total chunks: {total} | Avg/file: {avg:.2f}")


def process_markdown_directory(in_dir: str, qdrant_dir: str, collection: str):
    """Split markdown files and store the chunks in a local Qdrant database.

    Every ``*.md`` file under ``in_dir`` is split with ``split_markdown``,
    embedded with the ``mistral-embed`` model and upserted into ``collection``
    inside the embedded Qdrant store at ``qdrant_dir``. The collection is
    created (cosine distance) when it does not exist yet.

    Point ids are derived from the source path and the chunk's position in its
    file, so running the function again on the same input overwrites existing
    points rather than duplicating them. Points left over from a file that
    now produces fewer chunks are not removed.

    Args:
        in_dir: Directory searched recursively for ``*.md`` files.
        qdrant_dir: Directory of the local Qdrant store; created if missing.
        collection: Name of the Qdrant collection to write to.

    Returns:
        None. Progress and per-file statistics are printed.
    """
    in_dir = Path(in_dir)
    qdrant_dir = Path(qdrant_dir)

    # 1) Collect & split
    all_docs, doc_ids, counts = _collect_markdown_chunks(in_dir)
    print(f"Collected {len(all_docs)} chunks from {len(counts)} files.")

    # 2) Embed & upsert -- skipped when there is nothing to index, so no
    #    embedding request is made and no empty store is created.
    if all_docs:
        _index_chunks(all_docs, doc_ids, qdrant_dir, collection)
        print(f"Qdrant local DB ready at: {qdrant_dir} | collection: {collection}")
    else:
        print(f"No markdown chunks found under: {in_dir} | nothing indexed")

    # 3) Simple per-file stats
    _print_chunk_stats(counts)


In [None]:
# Ingest the markdown files found under the relative directory "1" into the
# "rag_chunks_test" collection of the local Qdrant store at
# stores/qdrant_db_test_1.
process_markdown_directory(
    in_dir="1",
    qdrant_dir="stores/qdrant_db_test_1",
    collection="rag_chunks_test"
)

Collected 31 chunks from 1 files.
