In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_openai import OpenAIEmbeddings
from qdrant_client import models
import os
import dotenv
dotenv.load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


In [95]:
# Load the markdown export of the Houston Code of Ordinances as one document.
# The text contains non-ASCII characters (e.g. "§" and "—"), so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding.
# NOTE(review): assumes the .md file is UTF-8 encoded — confirm.
loader = TextLoader("Houston, TX Code of Ordinances(2).pdf.md", encoding="utf-8")
documents = loader.load()

In [97]:
# Pairs of (markdown heading marker, metadata key stored on each chunk).
headers_to_split_on = [("#", "Header_1"), ("##", "Header_2"), ("###", "Header_3")]

# strip_headers=False keeps the heading line inside each chunk's page_content.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)

# Split the single loaded document into one chunk per headed section.
md_header_splits = markdown_splitter.split_text(documents[0].page_content)

In [98]:
# Preview only the first few chunks; displaying the whole list would flood
# the notebook output with every split section.
md_header_splits[:3]

[Document(metadata={'Header_2': 'Sec. 1-1. - How Code designated and cited.'}, page_content='## Sec. 1-1. - How Code designated and cited.  \nThe ordinances embraced in the following chapters and sections shall constitute and be designated the "Code of Ordinances, City of Houston, Texas," and may be so cited.  \nThe Construction Code, including the Fire Code, constitutes a part of this Code and is adopted herein by reference. The Construction Code is published by separate promulgation and is not set forth in this two-volume edition of the Code. Interested persons may contact the city secretary for purchase information.  \n(Code 1968, § 1-1; Ord. No. 2011-1168, § 3, 12-14-2011; Ord. No. 2021-1037, § 20(Exh. I.1, 12-1-2021, eff. 4-1-2022)  \nCharter reference— Printed ordinances as evidence, Art. IX, § 7; force and effect of Codes of Ordinances and admission thereof into evidence, Art. IX, § 13.'),
 Document(metadata={'Header_1': 'Chapter 33 - PLANNING AND DEVELOPMENT'}, page_content='# 

In [99]:
# Qdrant connection settings are read from the environment
# (dotenv.load_dotenv() is called in the imports cell).
qdrant_url = os.environ.get("QDRANT_URL")
qdrant_api_key = os.environ.get("QDRANT_API_KEY")

# Dense embeddings from OpenAI.
vector_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Sparse embeddings via FastEmbed using the "Qdrant/bm25" model.
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")


Fetching 29 files: 100%|██████████| 29/29 [00:00<00:00, 230368.97it/s]


In [100]:
# NOTE(review): presumably the one-time ingestion step that embeds
# md_header_splits and creates the "city-of-houston" collection in hybrid
# mode; it appears to be left commented out so that re-running the notebook
# does not re-embed and re-upload the documents — confirm before enabling.
# qdrant_hybrid = QdrantVectorStore.from_documents(
#     md_header_splits,
#     vector_embeddings,
#     url=qdrant_url,
#     collection_name="city-of-houston",
#     prefer_grpc=True,
#     api_key=qdrant_api_key,
#     sparse_embedding=sparse_embeddings,
#     retrieval_mode=RetrievalMode.HYBRID
# )

In [101]:
# Connect to the already-populated "city-of-houston" collection.
# retrieval_mode must be set explicitly: without it the store uses the
# library's default (dense-only) mode and the sparse embedding passed here is
# not used for retrieval, despite the collection being built for hybrid search.
qdrant_hybrid = QdrantVectorStore.from_existing_collection(
    embedding=vector_embeddings,
    sparse_embedding=sparse_embeddings,
    retrieval_mode=RetrievalMode.HYBRID,
    collection_name="city-of-houston",
    url=qdrant_url,
    api_key=qdrant_api_key,
)

In [15]:
# Fetch the ten chunks that best match the landscape-buffer question.
qdrant_hybrid.similarity_search(
    query="Buffer Yards Landscape requirements for a high-rise structure adjacent to single family home",
    k=10,
)

[Document(metadata={'Header 1': 'ARTICLE V. - TREES, SHRUBS AND SCREENING FENCES', 'Header 2': 'Sec. 33-128. - Landscape buffer required.', '_id': '42f52274-74ba-4691-aea4-eb2f65d27def', '_collection_name': 'test-city-of-houston'}, page_content='## Sec. 33-128. - Landscape buffer required.  \nThe owner of a building site included under section 33-121 and which is to be developed or expanded for a nonresidential or a multifamily residential use adjacent to any existing single-family residential property shall provide a landscape buffer adhering to at least one of the following two buffer types:  \n(1)\n---\nExcept as may otherwise be provided in chapter 19 of this Code, either a wood, concrete or masonry opaque screening fence with a minimum height of six feet along the entire property line or entire artificial lot line, if any, adjacent to the single-family residential property.  \n(2) Evergreen screening on the property line or artificial lot line.\na. The evergreen screening shall co