In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_openai import OpenAIEmbeddings
from qdrant_client import models
import os
import dotenv
# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups for the OpenAI and Qdrant settings can find them.
dotenv.load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


In [95]:
# Read the Markdown export of the Houston Code of Ordinances as one Document.
# The encoding is pinned to UTF-8: without it TextLoader falls back to the
# platform default encoding, which can fail or garble non-ASCII characters
# (section signs, typographic quotes) on non-UTF-8 locales such as Windows.
loader = TextLoader(
    "Houston, TX Code of Ordinances(2).pdf.md",
    encoding="utf-8",
)
documents = loader.load()

In [97]:
# Markdown heading markers paired with the metadata key under which each
# heading's text is recorded on the resulting chunks.
headers_to_split_on = [("#", "Header_1"), ("##", "Header_2"), ("###", "Header_3")]

# strip_headers=False keeps the heading lines inside the chunk text as well.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)

# Only the first (and, for a single text file, only) loaded document is split.
md_header_splits = markdown_splitter.split_text(
    documents[0].page_content
)

In [99]:
# Dense embedding model (OpenAI); the key is read from the environment.
vector_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Sparse BM25 embedding model served through FastEmbed.
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# Qdrant connection settings; each is None when the variable is not set.
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")

Fetching 29 files: 100%|██████████| 29/29 [00:00<00:00, 230368.97it/s]


In [100]:
# Disabled ingestion step, kept for reference: embeds the header-based chunks
# (md_header_splits) and uploads them to the "city-of-houston" collection with
# both dense and sparse vectors (RetrievalMode.HYBRID).
# NOTE(review): presumably left commented out so that re-running the notebook
# does not re-embed and re-upload the whole corpus — confirm before enabling.
# qdrant_hybrid = QdrantVectorStore.from_documents(
#     md_header_splits,
#     vector_embeddings,
#     url=qdrant_url,
#     collection_name="city-of-houston",
#     prefer_grpc=True,
#     api_key=qdrant_api_key,
#     sparse_embedding=sparse_embeddings,
#     retrieval_mode=RetrievalMode.HYBRID
# )

In [101]:
# Attach to the already-populated "city-of-houston" collection.
# retrieval_mode must be given explicitly: from_existing_collection defaults
# to RetrievalMode.DENSE, in which case the sparse embedding passed here is
# never used and searches are dense-only despite the store's name.
qdrant_hybrid = QdrantVectorStore.from_existing_collection(
    embedding=vector_embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
    collection_name="city-of-houston",
    url=qdrant_url,
    api_key=qdrant_api_key,
)

In [None]:
# Sample retrieval: fetch the ten best-matching chunks for a zoning question.
# Left as the cell's final expression so the results are displayed.
qdrant_hybrid.similarity_search(
    query="Buffer Yards Landscape requirements for a high-rise structure adjacent to single family home",
    k=10,
)