### Import packages

In [18]:
import time
from tqdm import tqdm
import faiss
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import FAISS
from langchain_voyageai import VoyageAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoModel, AutoTokenizer
import torch
from langchain_community.document_loaders.json_loader import JSONLoader
from langchain_community.docstore.in_memory import InMemoryDocstore

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from uuid import uuid4

from langchain_core.documents import Document
import os
from dotenv import load_dotenv
from typing import Dict

# Load environment variables from a .env file so that later os.getenv() calls
# (e.g. for VOYAGE_API_KEY) can see them
load_dotenv()

True

In [19]:
# Show which langchain-core version this notebook was run against
import langchain_core

print(langchain_core.__version__)

0.3.33


### Set the directories

In [20]:
# Project paths, all derived from the directory the notebook is executed in.
# The project root is taken to be the parent of the current working directory.
CURRENT_DIR = os.getcwd()
PROJECT_DIR = os.path.split(CURRENT_DIR)[0]

# Raw JSON inputs and the location where vector stores are written
DATA_DIR = os.path.join(PROJECT_DIR, 'data/raw')
VECTOR_DIR = os.path.join(PROJECT_DIR, 'data/vector_stores')
FAISS_DIR = os.path.join(PROJECT_DIR, 'data/vector_stores', 'faiss')

print('FAISS_DIR:', FAISS_DIR)

# Persist the path constants with IPython's %store magic so that another
# notebook/kernel can restore them with `%store -r`
%store PROJECT_DIR
%store DATA_DIR
%store VECTOR_DIR
%store FAISS_DIR

FAISS_DIR: /Users/bharathbeeravelly/Desktop/RAG-System/data/vector_stores/faiss
Stored 'PROJECT_DIR' (str)
Stored 'DATA_DIR' (str)
Stored 'VECTOR_DIR' (str)
Stored 'FAISS_DIR' (str)


In [42]:
def metadata_extractor(record: Dict, metadata: Dict) -> Dict:
    """Build the metadata dict for a single ExecutiveOrder record.

    Args:
        record: One ExecutiveOrder entry as a dict.
        metadata: Metadata supplied by the caller; it is not used, and the
            returned dict is built from ``record`` only.

    Returns:
        A new dict with the keys ``Title``, ``URL`` and ``DateSigned``. A missing
        ``Title`` becomes ``"Unknown"``; a missing ``URL`` or ``DateSigned``
        becomes an empty string.
    """
    # (field name, fallback used when the record lacks that field)
    field_defaults = (
        ("Title", "Unknown"),
        ("URL", ""),
        ("DateSigned", ""),
    )
    return {field: record.get(field, fallback) for field, fallback in field_defaults}

def json_loader(json_path):
    """Load one executive-order JSON file through ``JSONLoader``.

    Every element of the file's ``ExecutiveOrder`` array is selected by the jq
    schema; its ``Description`` field is used as the content and
    ``metadata_extractor`` supplies the metadata.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The documents produced by ``JSONLoader.load()`` for that file.
    """
    return JSONLoader(
        file_path=json_path,
        jq_schema='.ExecutiveOrder[]',
        content_key='Description',
        metadata_func=metadata_extractor,
    ).load()

json_documents = []  # Accumulates the documents extracted from every JSON file

# os.listdir() returns entries in arbitrary order and lists everything in the
# directory (hidden files such as .DS_Store, sub-directories, other file types).
# Keep only regular *.json files and sort the names so the resulting document
# order is the same on every run.
for item in sorted(os.listdir(DATA_DIR)):
    item_path = os.path.join(DATA_DIR, item)
    if not (os.path.isfile(item_path) and item.lower().endswith('.json')):
        continue  # Not a JSON data file; JSONLoader would fail on it
    docs = json_loader(item_path)  # Extract documents from this JSON file
    json_documents.extend(docs)

# json_documents now holds the documents from all JSON files in DATA_DIR
print(f"Total Documents Extracted: {len(json_documents)}")

# Spot check: show the metadata of the third document when there is one
if len(json_documents) > 2:
    print(json_documents[2].metadata)

Total Documents Extracted: 79
{'Title': 'Additional Measures to Combat Anti-Semitism', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/additional-measures-to-combat-anti-semitism/', 'DateSigned': 'January 29, 2025'}


In [22]:
# Persist the loaded documents with %store so another notebook/kernel can
# restore them with `%store -r json_documents`
%store json_documents

Stored 'json_documents' (list)


## Voyage Embeddings

### Create Embeddings (`voyage-3-large`)

In [23]:
# Embedding client for the `voyage-3-large` model; the API key is read from
# the VOYAGE_API_KEY environment variable.
voyage_embeddings = VoyageAIEmbeddings(
    model='voyage-3-large',
    voyage_api_key=os.getenv("VOYAGE_API_KEY"),
)

In [24]:
# Embed a throwaway query once to learn the vector dimension of the model,
# then build a flat FAISS index that compares vectors by L2 (Euclidean) distance
embedding_dim = len(voyage_embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

In [25]:
# Time the construction of the FAISS vector store wrapper.
# start_time / end_time are read by the next cell to report the elapsed time.
start_time = time.time()

# Wrap the FAISS index together with a fresh in-memory docstore and an empty
# index -> docstore-id mapping; nothing has been added to the store yet.
faiss_voyage = FAISS(
    embedding_function=voyage_embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

end_time = time.time()


In [26]:
# Uses the start_time / end_time recorded around the FAISS(...) constructor call.
# This covers object construction only: no documents have been added to the
# vector store at this point.
elapsed_time_faiss_voyage = end_time - start_time
print(f"Elapsed time for FAISS and Voyage Embeddings Initialization: {elapsed_time_faiss_voyage} seconds")

Elapsed time for FAISS and Voyage Embeddings Initialization: 7.486343383789062e-05 seconds


In [27]:
# This text splitter is used to create the child documents
# NOTE(review): '\n' is the only separator, so presumably a single line longer
# than chunk_size cannot be split any further and yields an oversized chunk —
# confirm that this is acceptable for the source texts.
child_splitter = RecursiveCharacterTextSplitter(chunk_size=600, separators=['\n'])

# In-memory store that is handed to ParentDocumentRetriever as its `docstore`
# (the storage layer for the parent documents). The child chunks are indexed in
# the FAISS vector store, not here.
store = InMemoryStore()

In [28]:
# Time the construction of the parent-document retriever
start_time = time.time()

# Combine the three pieces built above: the FAISS vector store, the in-memory
# docstore and the splitter that produces the child chunks.
retriever_faiss_voyage = ParentDocumentRetriever(
    vectorstore=faiss_voyage,
    docstore=store,
    child_splitter=child_splitter
)

end_time = time.time()

# Construction only; documents are added in a later cell
elapsed_parent_retriever = end_time - start_time
print(f"Elapsed time for ParentDocumentRetriever Initialization: {elapsed_parent_retriever} seconds")

Elapsed time for ParentDocumentRetriever Initialization: 0.002836942672729492 seconds


In [29]:
# Time how long it takes to add all loaded documents to the retriever
# (this is the expensive step of the notebook; see the printed duration).
# NOTE(review): ids=None presumably lets the retriever generate the parent ids
# itself — confirm against the ParentDocumentRetriever documentation.
# NOTE(review): re-running this cell without rebuilding faiss_voyage / store
# presumably adds the same documents a second time — verify before re-running.
start_time = time.time()
retriever_faiss_voyage.add_documents(json_documents, ids=None)
end_time = time.time()

elapsed_faiss_voyage_indexing = end_time - start_time

print(f"Elapsed time for FAISS and Voyage Embeddings Indexing: {elapsed_faiss_voyage_indexing} seconds")

Elapsed time for FAISS and Voyage Embeddings Indexing: 124.06875395774841 seconds


In [30]:
# Persist the measured indexing time with %store so another notebook/kernel can
# restore it with `%store -r elapsed_faiss_voyage_indexing`
%store elapsed_faiss_voyage_indexing

Stored 'elapsed_faiss_voyage_indexing' (float)


In [31]:
# Query the vector store directly (bypassing the parent-document retriever) to
# inspect the five best-matching child chunks and their scores.
sub_docs = faiss_voyage.similarity_search_with_score(
    "Is there a hiring freeze?",
    k=5
)

# The underlying index is IndexFlatL2, so each score is an L2 distance:
# a smaller value means a closer match.
# Format with `.3f` (three decimals); the previous `3f` spec only set a minimum
# field width of 3 and therefore printed the default six decimals.
for res, score in sub_docs:
    print(f"* [SIM={score:.3f}] [{res.metadata}]")


* [SIM=0.807298] [{'Title': 'Hiring Freeze', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/hiring-freeze/', 'DateSigned': 'January 20, 2025', 'doc_id': 'b5efd982-e4ed-4d29-9eac-206fe6784495'}]
* [SIM=0.973834] [{'Title': 'Hiring Freeze', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/hiring-freeze/', 'DateSigned': 'January 20, 2025', 'doc_id': 'b5efd982-e4ed-4d29-9eac-206fe6784495'}]
* [SIM=1.048505] [{'Title': 'Hiring Freeze', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/hiring-freeze/', 'DateSigned': 'January 20, 2025', 'doc_id': 'b5efd982-e4ed-4d29-9eac-206fe6784495'}]
* [SIM=1.049223] [{'Title': 'Hiring Freeze', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/hiring-freeze/', 'DateSigned': 'January 20, 2025', 'doc_id': 'b5efd982-e4ed-4d29-9eac-206fe6784495'}]
* [SIM=1.117409] [{'Title': 'REFORMING THE FEDERAL HIRING PROCESS AND RESTORING MERIT TO GOVERNMENT SERVICE', 'URL': 'https://www.whitehouse.gov/presidentia

In [32]:
# Ask the same question through the ParentDocumentRetriever; as the cell output
# shows, this returns whole parent documents rather than the child chunks
retriever_faiss_voyage.invoke("Is there a hiring freeze?")

[Document(metadata={'Title': 'Hiring Freeze', 'URL': 'https://www.whitehouse.gov/presidential-actions/2025/01/hiring-freeze/', 'DateSigned': 'January 20, 2025'}, page_content='By the authority vested in me as President by the Constitution and the laws of the United States of America, I hereby order a\xa0freeze on the hiring of Federal civilian employees, to be applied throughout the executive branch. \xa0As part of this freeze, no Federal civilian position that is vacant at noon on January 20, 2025, may be filled, and no new position may be created except as otherwise provided for in this memorandum or other applicable law. \xa0Except as provided below, this freeze applies to all executive departments and agencies regardless of their sources of operational and programmatic funding.\nThis order does not apply to military personnel of the armed forces or to positions related to immigration enforcement, national security, or public safety. \xa0Moreover, nothing in this memorandum shall ad

In [33]:
# Write the FAISS vector store to FAISS_DIR.
# NOTE(review): only faiss_voyage is saved here; the InMemoryStore `store` used
# as the retriever's docstore is not persisted in the visible cells — confirm
# whether the parent documents need to be saved as well.
faiss_voyage.save_local(FAISS_DIR)

In [34]:
# List the packages installed in the environment of the `pip` found on the shell PATH
!pip list

Package                                  Version
---------------------------------------- ------------
accelerate                               0.34.2
adal                                     1.2.7
aiobotocore                              2.17.0
aiohttp                                  3.9.5
aioitertools                             0.12.0
aiolimiter                               1.2.1
aiosignal                                1.3.1
alembic                                  1.14.0
altair                                   5.3.0
annotated-types                          0.6.0
antlr4-python3-runtime                   4.9.3
anyio                                    4.3.0
appnope                                  0.1.4
argcomplete                              3.5.0
asgiref                                  3.8.1
asttokens                                2.4.1
async-timeout                            4.0.3
attrs                                    23.2.0
azure-common                             1.1.2

In [36]:
# Snapshot the installed packages into requirements.txt one directory above the
# notebook's working directory.
# NOTE(review): `!pip` runs whichever pip is first on the shell PATH, which may
# not be the kernel's environment; `%pip` targets the kernel's environment.
!pip freeze > ../requirements.txt

In [37]:
# Show which pip (and which Python installation) the shell commands above used
!pip --version

pip 25.0 from /Users/bharathbeeravelly/.pyenv/versions/3.9.19/lib/python3.9/site-packages/pip (python 3.9)


In [40]:
# Show the installed version and dependencies of the langchain-voyageai package
!pip show langchain-voyageai

Name: langchain-voyageai
Version: 0.1.4
Summary: An integration package connecting VoyageAI and LangChain
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/bharathbeeravelly/.pyenv/versions/3.9.19/lib/python3.9/site-packages
Requires: langchain-core, pydantic, voyageai
Required-by: 
