In [1]:
import os
from dotenv import load_dotenv

load_dotenv('../.env')

True

# Weaviate collection definition

[Weaviate](https://weaviate.io) is a powerful vector database. It can be deployed in numerous ways, through a docker image or a kubernetes images. They also offer a fully managed database service. In particular, weaviate allows for storing objects with custom embeddings and performing hybrid searches. 

Custom embeddings allows us to use powerful Cohere embeddings with the database. Hybrid search performs a fusion of a keyword (BM25F) search and a vector search, allowing high accuracy in search results. A lot of settings are configurable to choose between performance and accuracy, we go with the default settings.

In [2]:
import weaviate

import weaviate.classes as wvc
from weaviate.classes.init import Auth

In [3]:
# Connect to the remote database

weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

client = weaviate.connect_to_local(
    auth_credentials=Auth.api_key(weaviate_api_key)
)

print(client.is_ready())

assert client.is_ready()

True


In [4]:
# Create the collection. we define a filename, title and text fields for our pdf chunks. Additionally, we store the embedding of the text chunk.

questions = client.collections.create(
    "Documents",
    vectorizer_config=[wvc.config.Configure.NamedVectors.none(name="chunk_vector", vector_index_config=wvc.config.Configure.VectorIndex.hnsw())],
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="filename", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="chunk_content", data_type=wvc.config.DataType.TEXT),
    ]
)
questions.exists()

True

In [8]:
# This allows you to list all the chunk in the database

questions = client.collections.get("Documents")

for item in questions.iterator(include_vector=True):
    print(item.properties)
    print(item.vector)

{'title': '1: Digital Adoption: Accelerating Postpandemic, yet a Widening Divide: Digital uptake by businesses varies significantly across countries and types of technology: • Firms’ digital readiness and management practices before the pandemic predicted a higher', 'filename': 'World Bank Digital Progress and Trends Report 2023.pdf', 'chunk_content': '##### Digital uptake by businesses varies significantly across countries and types of technology'}
{'chunk_vector': [0.0268096923828125, 0.0029621124267578125, -0.03533935546875, 0.001033782958984375, 0.006439208984375, -0.03900146484375, 0.0191497802734375, -0.00689697265625, -0.059906005859375, 0.0078582763671875, -0.053497314453125, 0.068115234375, -0.060577392578125, -0.00959014892578125, 0.03790283203125, -0.0347900390625, 0.00550079345703125, 0.0499267578125, 0.03302001953125, 0.0037097930908203125, -0.050140380859375, 0.00940704345703125, -0.008819580078125, -0.035064697265625, 0.019927978515625, -0.0077362060546875, -0.0689697265

KeyboardInterrupt: 

In [15]:
# Run this cell if you want to delete the collection

client.collections.delete("Documents")