In [4]:
import os
from dotenv import load_dotenv

# Load variables from the .env file one directory up ('../.env' is resolved
# relative to the current working directory). The Weaviate connection cell
# below reads its WEAVIATE_* settings through os.getenv.
load_dotenv('../.env')

True

# Weaviate collection definition

[Weaviate](https://weaviate.io) is a powerful vector database. It can be deployed in several ways, for example as a Docker image or on Kubernetes, and a fully managed database service is also available. In particular, Weaviate allows storing objects with custom embeddings and performing hybrid searches.

Custom embeddings allow us to use powerful Cohere embeddings with the database. Hybrid search fuses a keyword (BM25F) search with a vector search, which gives highly accurate search results. Many settings can be tuned to trade off performance against accuracy; here we go with the default settings.

In [5]:
import weaviate
import weaviate.classes as wvc

In [7]:
# Connect to the remote database

# os.getenv returns None for unset variables; check up front so a missing
# setting is reported by name instead of as an obscure client error.
required_settings = ('WEAVIATE_HOST', 'WEAVIATE_PORT', 'WEAVIATE_GRPC_HOST', 'WEAVIATE_GRPC_PORT')
missing_settings = [name for name in required_settings if not os.getenv(name)]
if missing_settings:
    raise RuntimeError(
        "Missing Weaviate settings (check ../.env): " + ", ".join(missing_settings)
    )

client = weaviate.connect_to_custom(
    http_host=os.environ['WEAVIATE_HOST'],
    # Environment variables are always strings; the port parameters are integers.
    http_port=int(os.environ['WEAVIATE_PORT']),
    http_secure=False,
    grpc_host=os.environ['WEAVIATE_GRPC_HOST'],
    grpc_port=int(os.environ['WEAVIATE_GRPC_PORT']),
    grpc_secure=False,
    auth_credentials=wvc.init.Auth.api_key(os.getenv('WEAVIATE_API_KEY')),
    # Timeouts for client initialisation, queries and inserts.
    additional_config=wvc.init.AdditionalConfig(timeout=wvc.init.Timeout(init=30, query=60, insert=120)),
)

# Query readiness once, show it, and stop the notebook here if the server is not usable.
client_ready = client.is_ready()
print(client_ready)

assert client_ready, "Weaviate is not ready; check host/port settings and the server status"

True


In [8]:
# Show the collections currently defined in the database.
client.collections.list_all()

{}

In [9]:
# Create the collection. We define filename, title and text fields for our PDF chunks.
# Additionally, we store the embedding of each text chunk under the named vector
# "chunk_vector" (no vectorizer is configured: embeddings are supplied by us),
# indexed with HNSW.

# Creating a collection that already exists fails, so reuse it when this cell
# is re-run (e.g. Restart Kernel -> Run All against a populated database).
# NOTE: an existing "Documents" collection is reused as-is; delete it first if
# its schema differs from the one defined here.
if client.collections.exists("Documents"):
    questions = client.collections.get("Documents")
else:
    questions = client.collections.create(
        "Documents",
        vectorizer_config=[wvc.config.Configure.NamedVectors.none(name="chunk_vector", vector_index_config=wvc.config.Configure.VectorIndex.hnsw())],
        properties=[
            wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="filename", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="chunk_content", data_type=wvc.config.DataType.TEXT),
        ]
    )
questions.exists()

True

In [10]:
# List every chunk stored in the database, together with its vector.

questions = client.collections.get("Documents")

for chunk in questions.iterator(include_vector=True):
    # One line for the properties, one line for the vector.
    print(chunk.properties, chunk.vector, sep="\n")

In [15]:
# Run this cell only if you want to delete the collection.
# It removes the "Documents" collection from the database.
# NOTE(review): presumably this also discards every stored chunk and vector - confirm before running.

client.collections.delete("Documents")