### Sample Data Used in the Examples Below

In [1]:
# Five one-line function summaries; these are the documents that get embedded and searched.
data = [
    "Function that randomly generates a list of integers",
    "Function that makes calls to the OpenAI API for summary generation",
    "Function that manages textual context as a directory is navigated through DFS",
    "Function that prints current textual context",
    "Function that scans a document for LaTeX texts"
]

# One metadata record per summary, in the same order as `data`.
# Record n (1-based) is tagged "source n" / "source code n"; all share one placeholder date.
metadatas = [
    {
        "source": f"source {n}",
        "code": f"source code {n}",
        "date": "2025-03-04",
    }
    for n in range(1, len(data) + 1)
]

### Chroma Vector Database Usage
- Free, local
- Metadata filtering possible
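- Query results come back as a dict of parallel nested lists (`ids`, `documents`, `metadatas`, `distances`), with one inner list per query text, which is awkward to consume directly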

In [2]:
pip install datetime chromadb --quiet

In [None]:
# ChromaDB offers metadata filtering
# https://docs.trychroma.com/docs/overview/introduction

import os
from datetime import datetime

import chromadb
import chromadb.utils.embedding_functions as embedding_functions

# Default client with no arguments (no path or server configured here).
chroma_client = chromadb.Client()

# # Need API Key
# openai_ef = embedding_functions.OpenAIEmbeddingFunction(
#     api_key="YOUR_API_KEY",
#     model_name="text-embedding-3-small"
# )

# With huggingface embeddings, in case we ever transition to open source implementation.
# The token is read from the environment so it never has to be written into the
# notebook; when the variable is unset this falls back to the empty string.
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=os.environ.get("CHROMA_HUGGINGFACE_API_KEY", ""),
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# get_or_create_collection makes this cell safe to re-run against the same client.
collection = chroma_client.get_or_create_collection(
    name="test",
    embedding_function=huggingface_ef,
    metadata={
        # Rank neighbours by cosine distance in the HNSW index.
        "hnsw:space": "cosine"
    }
)

In [4]:
# Insert the sample documents with their metadata.
# Ids are "<insertion timestamp>id<index>", so each run of this cell produces a
# distinct id set (at one-second resolution).
now = datetime.now()
id_prefix = now.strftime("%Y-%m-%d %H:%M:%S") + "id"
collection.add(
    documents=data,
    metadatas=metadatas,
    # One id per document: the count follows `data` instead of a hard-coded 5,
    # so the three lists stay the same length if the sample data changes.
    ids=[id_prefix + str(i) for i in range(len(data))]
)

In [5]:
# Ask for the five nearest documents to a natural-language question.
question = "How does the library come up with the code summaries?"
results = collection.query(query_texts=[question], n_results=5)

# Can look at all documents inserted into the vector database with peek()
# collection.peek()

results

{'ids': [['2025-03-10 16:19:48id1',
   '2025-03-10 16:19:48id3',
   '2025-03-10 16:19:48id4',
   '2025-03-10 16:19:48id2',
   '2025-03-10 16:19:48id0']],
 'embeddings': None,
 'documents': [['Function that makes calls to the OpenAI API for summary generation',
   'Function that prints current textual context',
   'Function that scans a document for LaTeX texts',
   'Function that manages textual context as a directory is navigated through DFS',
   'Function that randomly generates a list of integers']],
 'uris': None,
 'data': None,
 'metadatas': [[{'code': 'source code 2',
    'date': '2025-03-04',
    'source': 'source 2'},
   {'code': 'source code 4', 'date': '2025-03-04', 'source': 'source 4'},
   {'code': 'source code 5', 'date': '2025-03-04', 'source': 'source 5'},
   {'code': 'source code 3', 'date': '2025-03-04', 'source': 'source 3'},
   {'code': 'source code 1', 'date': '2025-03-04', 'source': 'source 1'}]],
 'distances': [[0.6876525282859802,
   0.7072367072105408,
   0.7780

### FAISS Vector Database Integration in LangChain
- Included one LangChain example because LangChain / LangGraph may eventually be adopted
- Metadata filtering is also possible

In [6]:
pip install --upgrade --quiet  sentence_transformers langchain_huggingface langchain_community faiss-cpu

In [9]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents import Document

# Embedding model configuration (same model name as the Chroma example).
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# If we get access to a GPU, we can use the GPU implementation to accelerate search
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)

hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# LangChain Integration of FAISS
# Plain-text variant (no metadata):
# db = FAISS.from_texts(texts = data, embedding = hf_embeddings)

# Pair each summary with its metadata record so searches can filter on metadata.
docs = [
    Document(page_content=summary, metadata=record)
    for summary, record in zip(data, metadatas)
]
db = FAISS.from_documents(documents=docs, embedding=hf_embeddings)

In [10]:
# Unfiltered search; no k is passed, so the library's default result count applies.
db.similarity_search(
    query="How does the library come up with the code summaries?"
)

[Document(id='602c63d6-8534-475a-a5b4-360fb59b49e1', metadata={'source': 'source 2', 'code': 'source code 2', 'date': '2025-03-04'}, page_content='Function that makes calls to the OpenAI API for summary generation'),
 Document(id='a0cd807c-826d-42c0-b69a-7c94100b7bc3', metadata={'source': 'source 4', 'code': 'source code 4', 'date': '2025-03-04'}, page_content='Function that prints current textual context'),
 Document(id='1e75ee8d-e248-4a31-aba4-9ce9810fe9c2', metadata={'source': 'source 5', 'code': 'source code 5', 'date': '2025-03-04'}, page_content='Function that scans a document for LaTeX texts'),
 Document(id='a93d51f8-afff-468f-856f-1e8c8b54f07a', metadata={'source': 'source 3', 'code': 'source code 3', 'date': '2025-03-04'}, page_content='Function that manages textual context as a directory is navigated through DFS')]

In [12]:
# Same question, restricted to documents whose metadata "source" equals "source 5";
# at most three results are requested.
db.similarity_search(
    query="How does the library come up with the code summaries?",
    filter={"source": "source 5"},
    k=3,
)

[Document(id='1e75ee8d-e248-4a31-aba4-9ce9810fe9c2', metadata={'source': 'source 5', 'code': 'source code 5', 'date': '2025-03-04'}, page_content='Function that scans a document for LaTeX texts')]