In [1]:
import os
from dotenv import load_dotenv

load_dotenv('../.env')

True

# Weaviate collection definition

[Weaviate](https://weaviate.io) is a powerful vector database. It can be deployed in numerous ways, through a docker image or a kubernetes images. They also offer a fully managed database service. In particular, weaviate allows for storing objects with custom embeddings and performing hybrid searches. 

Custom embeddings allows us to use powerful Cohere embeddings with the database. Hybrid search performs a fusion of a keyword (BM25F) search and a vector search, allowing high accuracy in search results. A lot of settings are configurable to choose between performance and accuracy, we go with the default settings.

In [2]:
import weaviate
import weaviate.classes as wvc

In [3]:
# Connect to the remote database

client = weaviate.connect_to_custom(
    http_host=os.getenv('WV_HTTP_HOST'),
    http_port=os.getenv('WV_HTTP_PORT'),
    http_secure=False,
    grpc_host=os.getenv('WV_GRPC_HOST'),
    grpc_port=os.getenv('WV_GRPC_PORT'),
    grpc_secure=False,
    auth_credentials=wvc.init.Auth.api_key(os.getenv('WV_API_KEY')),
    additional_config=wvc.init.AdditionalConfig(timeout=wvc.init.Timeout(init=30, query=60, insert=120)),
)

print(client.is_ready())

assert client.is_ready()

True


In [4]:
client.collections.list_all()

{'Documents': _CollectionConfigSimple(name='Documents', description=None, generative_config=None, properties=[_Property(name='title', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none'), _Property(name='filename', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none'), _Property(name='chunk_content', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none')], references=[], reranker_config=None, vectorizer_config=None, vectorizer=None, vector_config={'chun

In [8]:
# Create the collection. we define a filename, title and text fields for our pdf chunks. Additionally, we store the embedding of the text chunk.

questions = client.collections.create(
    "Documents",
    vectorizer_config=[wvc.config.Configure.NamedVectors.none(name="chunk_vector", vector_index_config=wvc.config.Configure.VectorIndex.hnsw())],
    properties=[
        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="filename", data_type=wvc.config.DataType.TEXT),
        wvc.config.Property(name="chunk_content", data_type=wvc.config.DataType.TEXT),
    ]
)
questions.exists()

UnexpectedStatusCodeError: Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name Documents already exists'}]}.

In [9]:
# This allows you to list all the chunk in the database

questions = client.collections.get("Documents")

for item in questions.iterator(include_vector=True):
    print(item.properties)
    print(item.vector)

{'title': 'Unser Land verdient mehr!: Wahlprogramm für die Bundestagswahl 2025: Zweitstimme: Beste Bildung für alle, von der Küste', 'filename': 'bsw.pdf', 'chunk_content': 'Die Grundschule muss wieder die Kernkompetenzen Lesen, Schreiben und Rechnen in den\nMittelpunkt stellen. Das „Schreiben nach Gehör“\nwar dabei kontraproduktiv und muss vollständig\nverschwinden. Studien belegen außerdem, dass\nanaloge Lernmittel den digitalen beim Erwerb\nvon Lese- und Rechtschreibfähigkeiten überlegen sind.\n\n✔ Handys und Tablets wollen wir mindestens bis\nzum Ende der Grundschule aus den Klassenzimmern verbannen und auch danach möglichst wenig im Unterricht einsetzen.'}
{'chunk_vector': [0.006122589111328125, 0.0019369125366210938, 0.016693115234375, 0.0080108642578125, -0.00717926025390625, 0.0148162841796875, -0.00274658203125, -0.0287017822265625, 0.0479736328125, 0.033477783203125, 0.0025482177734375, -0.0115509033203125, 0.00682830810546875, -0.025421142578125, -0.034698486328125, 0.005214

KeyboardInterrupt: 

In [10]:
# Run this cell if you want to delete the collection

client.collections.delete("Documents")