## Import packages

In [1]:
import time
from tqdm import tqdm
import os
from dotenv import load_dotenv
from typing import Dict

from langchain.storage import InMemoryStore
from langchain_voyageai import VoyageAIEmbeddings

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

from langchain_community.document_loaders.json_loader import JSONLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

from langchain_core.documents import Document


# Load variables from a .env file into the process environment so that
# os.getenv("VOYAGE_API_KEY") can find the API key when the embeddings are created.
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
# NOTE(review): the import cell above already calls load_dotenv(), so this repeat
# looks redundant — consider removing this cell.
load_dotenv()

True

## Retrieve the directories and documents

In [3]:
%store -r DATA_DIR
%store -r PROJECT_DIR
%store -r VECTOR_DIR


In [4]:
# Restore the documents persisted earlier with %store; they are indexed further
# down through retriever_qdrant_voyage.add_documents(...).
# NOTE(review): presumably a list of LangChain Document objects — confirm against
# the notebook that stored them.
%store -r json_documents

## Create Embeddings (`voyage-3-large`)


In [5]:
# Voyage AI embedding client. The API key is read from the environment
# (VOYAGE_API_KEY) instead of being hard-coded in the notebook.
voyage_embeddings = VoyageAIEmbeddings(
    model="voyage-3-large",
    voyage_api_key=os.getenv("VOYAGE_API_KEY"),
)

## On-disk storage of Qdrant Vector Store

In [6]:
# Keep the Qdrant data in a "qdrant" sub-folder of the restored VECTOR_DIR.
QDRANT_DIR = os.path.join(VECTOR_DIR, "qdrant")
# Passing `path=` opens on-disk storage at that location (no server URL is given).
client = QdrantClient(path=QDRANT_DIR)

In [7]:
start_time = time.time()

# Declared once so the create call and the vector-store wrapper cannot drift apart.
COLLECTION_NAME = 'EO-Collection'
# Must equal the output dimension of the embedding model (1024 for the
# voyage-3-large configuration used in this notebook).
EMBEDDING_DIM = 1024

# The Qdrant storage is on disk, so the collection survives kernel restarts and a
# plain create_collection() fails on the second run because it already exists.
# The parent docstore used by the retriever is in-memory only, so vectors left
# over from a previous run would point at parent ids that no longer exist.
# Drop any stale collection and rebuild it from scratch.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE)
)

# LangChain wrapper that embeds texts with Voyage and reads/writes the collection.
qdrant_voyage = QdrantVectorStore(
    client=client,
    collection_name=COLLECTION_NAME,
    embedding=voyage_embeddings
)

end_time = time.time()

elapsed_qdrant_initiation = end_time - start_time
print(f"Qdrant initiation took {elapsed_qdrant_initiation} seconds")

Qdrant initiation took 0.3645589351654053 seconds


In [8]:
# This text splitter is used to create the child documents
child_splitter = RecursiveCharacterTextSplitter(chunk_size=600, separators=['\n'])

# The vectorstore to use to index the child chunks
store = InMemoryStore()

In [9]:
start_time = time.time()

# Wire the pieces together: child chunks are indexed in the Qdrant vector store,
# while the parent documents they came from are kept in the docstore.
retriever_qdrant_voyage = ParentDocumentRetriever(
    child_splitter=child_splitter,
    docstore=store,
    vectorstore=qdrant_voyage,
)

end_time = time.time()

# Construction only stores references, so this timing is expected to be tiny.
elapsed_parent_retriever = end_time - start_time
print(
    f"Elapsed time for ParentDocumentRetriever Initialization: {elapsed_parent_retriever} seconds"
)

Elapsed time for ParentDocumentRetriever Initialization: 0.0001010894775390625 seconds


In [10]:
# Index the restored documents through the retriever. This is the expensive step
# of the notebook (the recorded run took roughly 78 seconds).
start_time = time.time()
# `ids` is left at its default (None).
retriever_qdrant_voyage.add_documents(json_documents)
end_time = time.time()

elapsed_qdrant_voyage_indexing = end_time - start_time
print(
    f"Elapsed time for Qdrant and Voyage Embeddings Indexing: {elapsed_qdrant_voyage_indexing} seconds"
)

Elapsed time for Qdrant and Voyage Embeddings Indexing: 78.20874071121216 seconds


In [11]:
# Query the vector store directly (bypassing the parent retriever) to inspect the
# five best-matching child chunks together with their similarity scores.
sub_docs = qdrant_voyage.similarity_search_with_score(
    "Is there a hiring freeze?",
    k=5
)

# One line per hit: score followed by the chunk's metadata.
# `.3f` rounds the score to three decimals; the previous `3f` spec only set a
# minimum field width of 3 and therefore still printed six decimals.
for res, score in sub_docs:
    print(f"* [SIM={score:.3f}] [{res.metadata}]")


* [SIM=0.596351] [{'Title': 'Hiring Freeze', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/hiring-freeze/', 'DateSigned': 'January 20, 2025', 'doc_id': 'bd709913-4042-419b-91c6-a673a55a7b72', '_id': '94618e7ab10a4ed4a22991c6619d0e67', '_collection_name': 'EO-Collection'}]
* [SIM=0.513083] [{'Title': 'Hiring Freeze', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/hiring-freeze/', 'DateSigned': 'January 20, 2025', 'doc_id': 'bd709913-4042-419b-91c6-a673a55a7b72', '_id': '2e23519ef5b141b5a17b329d9f917414', '_collection_name': 'EO-Collection'}]
* [SIM=0.475389] [{'Title': 'Hiring Freeze', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/hiring-freeze/', 'DateSigned': 'January 20, 2025', 'doc_id': 'bd709913-4042-419b-91c6-a673a55a7b72', '_id': 'dec6ebc7a2f84eb1aa8d517da53d2c71', '_collection_name': 'EO-Collection'}]
* [SIM=0.475304] [{'Title': 'Hiring Freeze', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/hiring-freeze/', '

In [12]:


# List the collections known to the Qdrant client so the presence of
# 'EO-Collection' can be checked by eye (nothing is asserted here).
collections = client.get_collections()
print("Available collections:", collections)

Available collections: collections=[CollectionDescription(name='EO-Collection')]
