# How to Use Weaviate Vector Database

1. Create new collection
2. Add new data (vector)
3. Retrieve data (hybrid)

## Setup

In [47]:
import os
from dotenv import load_dotenv
load_dotenv()
import weaviate
from weaviate.classes.init import Auth
import weaviate.classes.config as wc
from pprint import pprint

In [42]:
# Weaviate Cloud endpoint and API key, read from the process environment
# (load_dotenv() was called in the imports cell). A missing variable raises
# KeyError here rather than failing later at connect time.
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

In [45]:
# Open a client against the Weaviate Cloud cluster, authenticating with the API key.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,
    auth_credentials=Auth.api_key(weaviate_api_key),
)

# Connectivity smoke test: prints the boolean returned by is_ready().
print(client.is_ready())

True


In [46]:
# Release the cloud connection; the helper functions defined further down
# open (and close) their own connections via weaviate.connect_to_local().
client.close()

## 1. Create new collection

In [None]:





# Default keyword arguments unpacked into weaviate.connect_to_local() by the
# helper functions in this notebook.
CONNECTION_CONFIG = dict(port=8080, grpc_port=50051, skip_init_checks=True)


def add_collection(name, connection_config=CONNECTION_CONFIG):
    """Create a collection holding a ``text`` property and one self-provided named vector.

    Args:
        name: Requested collection name. It is passed through ``str.capitalize()``
            before use, so the first character is upper-cased and all remaining
            characters are lower-cased.
        connection_config: Keyword arguments unpacked into
            ``weaviate.connect_to_local()``.

    Returns:
        None. On success the created collection's name is printed; any exception
        is caught and printed instead of being raised.
    """
    collection_name = name.capitalize()

    client = None
    try:
        client = weaviate.connect_to_local(**connection_config)

        # HNSW index with cosine distance for the named vector.
        hnsw_index = wc.Configure.VectorIndex.hnsw(
            ef_construction=300,
            distance_metric=wc.VectorDistances.COSINE,
        )
        # Vectors are supplied by the caller at insert time under the name "custom_vector".
        custom_vector = wc.Configure.Vectors.self_provided(
            name="custom_vector",
            vector_index_config=hnsw_index,
        )
        text_property = wc.Property(name="text", data_type=wc.DataType.TEXT)
        # Optional inverted-index (BM25) settings.
        inverted_index = wc.Configure.inverted_index(
            bm25_b=0.7,
            bm25_k1=1.25,
            index_null_state=True,
            index_property_length=True,
            index_timestamps=True,
        )

        created = client.collections.create(
            name=collection_name,
            vector_config=[custom_vector],
            properties=[text_property],
            inverted_index_config=inverted_index,
        )

        print("New Collection created!")
        print(f" name: {created.config.get().name}")

    except Exception as err:
        print(err)
    finally:
        # The connection is closed on both the success and the failure path.
        if client:
            client.close()


def delete_all_collections(connection_config=CONNECTION_CONFIG):
    """Delete every collection on the connected Weaviate instance.

    Args:
        connection_config: Keyword arguments unpacked into
            ``weaviate.connect_to_local()``.

    Returns:
        None. A confirmation is printed on success; any exception is caught
        and printed instead of being raised.
    """
    client = None
    try:
        client = weaviate.connect_to_local(**connection_config)
        client.collections.delete_all()
    except Exception as err:
        print(err)
    else:
        # Only reached when the deletion call did not raise.
        print("All collections has been removed!")
    finally:
        if client:
            client.close()


def delete_collection(collections, connection_config=CONNECTION_CONFIG):
    """Delete the collection(s) identified by ``collections``.

    Args:
        collections: Passed unchanged to ``client.collections.delete()``.
        connection_config: Keyword arguments unpacked into
            ``weaviate.connect_to_local()``.

    Returns:
        None. A confirmation is printed on success; any exception is caught
        and printed instead of being raised.
    """
    client = None
    try:
        client = weaviate.connect_to_local(**connection_config)
        client.collections.delete(collections)
    except Exception as err:
        print(err)
    else:
        # Only reached when the deletion call did not raise.
        print(f"deleted collections: {collections}")
    finally:
        if client:
            client.close()


def list_collections(connection_config=CONNECTION_CONFIG):
    """Return the names of all collections on the connected Weaviate instance.

    Args:
        connection_config: Keyword arguments unpacked into
            ``weaviate.connect_to_local()``.

    Returns:
        A list of collection names on success. If any step raises, the
        exception is printed and the function implicitly returns ``None``.
    """
    client = None
    try:
        client = weaviate.connect_to_local(**connection_config)
        configs_by_name = client.collections.list_all(simple=True)
        collection_names = list(configs_by_name.keys())
    except Exception as err:
        print(err)
    else:
        return collection_names
    finally:
        # Runs before the value above is handed back to the caller.
        if client:
            client.close()

In [14]:
# delete_all_collections()

In [15]:
# add_collection('document')

In [16]:
# Show the names of the collections that currently exist on the local instance.
list_collections()

['Custom_splitter_hf',
 'Custom_splitter_openai',
 'Custom_splitter_w_context_hf',
 'Custom_splitter_w_context_openai',
 'Normal_splitter_hf',
 'Normal_splitter_openai',
 'Normal_splitter_w_context_hf',
 'Normal_splitter_w_context_openai']

## 2. Add new data (vector)

In [17]:
from sentence_transformers import SentenceTransformer

# Load a multilingual embedding model from the Hugging Face Model Hub.
# It is kept as a notebook-level global because add_text() and
# retrieve_documents() both call model.encode() to embed text client-side.
model = SentenceTransformer("intfloat/multilingual-e5-base")

In [18]:
def add_text(text, collection_name, connection_config=CONNECTION_CONFIG, vector_name=None):
    """Embed ``text`` with the notebook-level ``model`` and insert it into a collection.

    Args:
        text: The string stored in the object's ``text`` property and embedded.
        collection_name: Name of the target collection.
        connection_config: Keyword arguments unpacked into
            ``weaviate.connect_to_local()``.
        vector_name: Optional name of the named vector to store the embedding
            under. With the default ``None`` the embedding is sent as a bare
            vector, exactly as before. Collections created by
            ``add_collection`` declare a named vector called
            ``"custom_vector"``, so pass ``vector_name="custom_vector"`` for those.

    Returns:
        None. A confirmation is printed on success; any exception is caught
        and printed instead of being raised.
    """
    client = None
    try:
        client = weaviate.connect_to_local(**connection_config)
        documents = client.collections.get(collection_name)

        embedding = model.encode(text)
        # Named vectors must be addressed as {name: vector}; a bare vector
        # only targets the collection's default (unnamed) vector.
        vector = embedding if vector_name is None else {vector_name: embedding}

        documents.data.insert(properties={"text": text}, vector=vector)
        print('New data is inserted!')
        print(f'text: {text}')
    except Exception as e:
        print(e)
    finally:
        if client:
            client.close()

In [19]:
# add_text('hello world', 'Document')

In [20]:
def list_objects(collection_name, connection_config=CONNECTION_CONFIG):
    """Pretty-print every object in a collection, including its vector(s).

    Args:
        collection_name: Name of the collection to iterate over.
        connection_config: Keyword arguments unpacked into
            ``weaviate.connect_to_local()``.

    Returns:
        None. Each object is written with ``pprint``; any exception is caught
        and printed instead of being raised.
    """
    client = None
    try:
        client = weaviate.connect_to_local(**connection_config)
        collection = client.collections.get(collection_name)
        stored_objects = collection.iterator(include_vector=True)
        for stored in stored_objects:
            pprint(stored)
    except Exception as err:
        print(err)
    finally:
        if client:
            client.close()

In [21]:
# Dump every object (with vectors) stored in the 'Requirements' collection.
# NOTE(review): the saved output of this cell reports that no such collection
# exists in the schema — confirm the intended collection name.
list_objects('Requirements')

Query call with protocol GRPC search failed with message extract params: no such class with name 'Requirements' found in the schema. Check your schema files for which classes are available.


## 3. Retrieve data (hybrid)

**Hybrid Search** = Keyword Search + Semantic Search
- An alpha of 1 is a pure vector search.
- An alpha of 0 is a pure keyword search.

In [38]:
def retrieve_documents(query, collection_name, connection_config=CONNECTION_CONFIG, alpha=1, limit=5):
    """Run a hybrid (keyword + vector) search and print the matching objects.

    For every hit the length of its stored vector and its properties are
    printed; a message is printed when nothing matches.

    Args:
        query: Search text. It is used for the keyword (BM25) part of the
            search and is also embedded with the notebook-level ``model`` to
            supply the query vector.
        collection_name: Name of the collection to search.
        connection_config: Keyword arguments unpacked into
            ``weaviate.connect_to_local()``.
        alpha: Weighting between the two search modes. 1 is a pure vector
            search, 0 is a pure keyword search. Defaults to 1.
        limit: Maximum number of objects to return. Defaults to 5.

    Returns:
        None. Any exception is caught and printed instead of being raised.
    """
    # Named vector used for the vector component of the search.
    target_vector = "custom_vector"

    client = None
    try:
        client = weaviate.connect_to_local(**connection_config)
        documents = client.collections.get(collection_name)
        response = documents.query.hybrid(
            query=query,
            alpha=alpha,
            target_vector=target_vector,
            # Properties searched by the keyword (BM25) component.
            # NOTE(review): add_collection() only declares a "text" property;
            # confirm these names match the schema of the collection queried.
            query_properties=[
                "title",
                "content",
            ],
            # The query embedding is supplied explicitly: with self-provided
            # vectors there is no server-side vectorizer to embed `query`.
            vector=model.encode(query),
            limit=limit,
            include_vector=True,
        )
        for hit in response.objects:
            vectors = hit.vector
            # Returned vectors are keyed by name. Prefer the named vector that
            # was searched and fall back to the unnamed "default" vector, so a
            # missing key no longer aborts the listing with a KeyError.
            embedding = vectors.get(target_vector, vectors.get("default"))
            if embedding is not None:
                print(len(embedding))
            print(hit.properties)

        if len(response.objects) == 0:
            print("there is no similar documents")
    except Exception as e:
        print(e)
    finally:
        if client:
            client.close()

In [39]:
# Free-text search phrase used by the retrieval cell that follows.
query = 'tourism requirements'

In [40]:
# The search text is sent with a "query: " prefix. retrieve_documents() uses
# this same prefixed string both for the keyword search and as the input to
# model.encode().
# NOTE(review): presumably the prefix follows the E5 model family's input
# convention ("query: " / "passage: ") — confirm against the model card.
retrieve_documents(f'query: {query}', 'Custom_splitter_w_context_hf')

768
{'relationships': None, '_node_type': 'TextNode', 'doc_id': 'None', 'document_id': 'None', 'ref_doc_id': 'None', '_node_content': '{"id_": "ea93acfb-26be-4fc0-a455-e79b59ff3ff4", "embedding": null, "metadata": {"chunk_text": "visa \\u2013 stay \\u2264 90 days for sightseeing, business, conferences, visiting relatives/acquaintances. o not allowed: employment, entertainment, training with allowance, or on-the-job training. o for longer stays or compensated activities: obtain coe and apply for appropriate visa. apply together if two or more applicants have same travel schedule and purpose. \\u2022 submission of requirements does not guarantee issuance. \\u2022 fake/tampered documents \\u2192 visa will not be issued. \\u2022 documents will not be returned. \\u2022 applicants have 3 months to submit additional requirements; otherwise, application will be terminated. reason for denial will not be disclosed. \\u2022 if denied, reapplication for the same purpose is only allowed after 6 mon