# How to Use Weaviate Vector Database

1. Create new collection
2. Add new data (vector)
3. Retrieve data (hybrid)

## 1. Create new collection

In [1]:
import weaviate
import weaviate.classes.config as wc
from pprint import pprint


# Default keyword arguments that the helper functions below expand into
# `weaviate.connect_to_local(**connection_config)`.
CONNECTION_CONFIG = {
    "port": 8080,
    "grpc_port": 50051,
    "skip_init_checks": True,
}


def add_collection(name, connection_config=CONNECTION_CONFIG):
    """Create a collection that stores text alongside self-provided vectors.

    The collection is configured with:

    * one ``text`` property of type TEXT,
    * one named vector, ``custom_vector``, whose values are supplied by the
      caller and indexed with HNSW (``ef_construction=300``, cosine distance),
    * an inverted index with explicit BM25 parameters and null-state,
      property-length and timestamp indexing enabled.

    On success the created collection's name is printed. Any error raised
    while connecting or creating the collection is printed, not re-raised.

    Args:
        name: Desired collection name. Only its first character is
            upper-cased; the remainder is kept exactly as given.
        connection_config: Keyword arguments passed to
            ``weaviate.connect_to_local``.
    """
    # Upper-case the first character only. ``str.capitalize()`` would also
    # lower-case the rest of the string ("myDocs" -> "Mydocs"), silently
    # renaming mixed-case collections. Slicing keeps an empty name from
    # raising here, so the error surfaces from Weaviate inside the try block.
    name = name[:1].upper() + name[1:]

    client = None
    try:
        client = weaviate.connect_to_local(**connection_config)

        collection = client.collections.create(
            name=name,
            vector_config=[
                wc.Configure.Vectors.self_provided(
                    name="custom_vector",
                    vector_index_config=wc.Configure.VectorIndex.hnsw(
                        ef_construction=300,
                        distance_metric=wc.VectorDistances.COSINE,
                    ),
                )
            ],
            properties=[wc.Property(name="text", data_type=wc.DataType.TEXT)],
            inverted_index_config=wc.Configure.inverted_index(  # Optional
                bm25_b=0.7,
                bm25_k1=1.25,
                index_null_state=True,
                index_property_length=True,
                index_timestamps=True,
            ),
        )

        print("New Collection created!")
        # Read the name back from the server-side config rather than echoing
        # the local variable.
        print(f" name: {collection.config.get().name}")

    except Exception as e:
        print(e)
    finally:
        # `client` is still None when the connection attempt itself failed.
        if client:
            client.close()


def delete_all_collections(connection_config=CONNECTION_CONFIG):
    """Remove every collection from the Weaviate instance.

    Prints a confirmation on success. Any error raised while connecting or
    deleting is printed, not re-raised.

    Args:
        connection_config: Keyword arguments passed to
            ``weaviate.connect_to_local``.
    """
    try:
        conn = weaviate.connect_to_local(**connection_config)
    except Exception as err:
        # Nothing was opened, so there is nothing to close.
        print(err)
        return

    try:
        conn.collections.delete_all()
        print("All collections has been removed!")
    except Exception as err:
        print(err)
    finally:
        conn.close()


def delete_collection(collections, connection_config=CONNECTION_CONFIG):
    """Delete the given collection(s) from the Weaviate instance.

    Prints the deleted name(s) on success. Any error raised while connecting
    or deleting is printed, not re-raised.

    Args:
        collections: Value handed unchanged to ``client.collections.delete``
            (presumably a single name or a list of names — confirm against
            the client documentation).
        connection_config: Keyword arguments passed to
            ``weaviate.connect_to_local``.
    """
    try:
        conn = weaviate.connect_to_local(**connection_config)
    except Exception as err:
        # Nothing was opened, so there is nothing to close.
        print(err)
        return

    try:
        conn.collections.delete(collections)
        print(f"deleted collections: {collections}")
    except Exception as err:
        print(err)
    finally:
        conn.close()


def list_collections(connection_config=CONNECTION_CONFIG):
    """Return the names of all collections in the Weaviate instance.

    Args:
        connection_config: Keyword arguments passed to
            ``weaviate.connect_to_local``.

    Returns:
        A list with the keys of ``client.collections.list_all(simple=True)``,
        or ``None`` when connecting or listing fails (the error is printed).
    """
    try:
        conn = weaviate.connect_to_local(**connection_config)
    except Exception as err:
        # Nothing was opened, so there is nothing to close.
        print(err)
        return None

    try:
        summary = conn.collections.list_all(simple=True)
        names = list(summary.keys())
        return names
    except Exception as err:
        print(err)
        return None
    finally:
        conn.close()



In [2]:
# delete_all_collections()

In [3]:
# add_collection('document')

In [4]:
list_collections()

['Requirements']

## 2. Add new data (vector)

In [5]:
from sentence_transformers import SentenceTransformer

# Load a multilingual embedding model from the Hugging Face Model Hub.
# `model.encode` is what the helpers below call to embed both the text that
# is stored and the queries that are searched.
model = SentenceTransformer("intfloat/multilingual-e5-base")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def add_text(text, collection_name, connection_config=CONNECTION_CONFIG):
    """Embed ``text`` with the global ``model`` and insert it into a collection.

    The object is stored with a single ``text`` property and the embedding as
    its vector. Prints a confirmation on success. Any error raised while
    connecting, embedding or inserting is printed, not re-raised.

    Args:
        text: The string to embed and store.
        collection_name: Name of the collection to insert into.
        connection_config: Keyword arguments passed to
            ``weaviate.connect_to_local``.
    """
    try:
        conn = weaviate.connect_to_local(**connection_config)
    except Exception as err:
        # Nothing was opened, so there is nothing to close.
        print(err)
        return

    try:
        target = conn.collections.get(collection_name)
        embedding = model.encode(text)
        # NOTE(review): `add_collection` creates a *named* vector
        # ("custom_vector"); confirm that a bare vector passed here ends up
        # in it, or pass {"custom_vector": embedding} instead.
        target.data.insert(properties={"text": text}, vector=embedding)
        print('New data is inserted!')
        print(f'text: {text}')
    except Exception as err:
        print(err)
    finally:
        conn.close()

In [7]:
# add_text('hello world', 'Document')

In [8]:
def list_objects(collection_name, connection_config=CONNECTION_CONFIG):
    """Pretty-print every object in a collection, including its vector.

    Any error raised while connecting or iterating is printed, not re-raised.

    Args:
        collection_name: Name of the collection to dump.
        connection_config: Keyword arguments passed to
            ``weaviate.connect_to_local``.
    """
    try:
        conn = weaviate.connect_to_local(**connection_config)
    except Exception as err:
        # Nothing was opened, so there is nothing to close.
        print(err)
        return

    try:
        target = conn.collections.get(collection_name)
        for stored in target.iterator(include_vector=True):
            pprint(stored)
    except Exception as err:
        print(err)
    finally:
        conn.close()

In [9]:
list_objects('Requirements')

Object(uuid=_WeaviateUUIDInt('1f9544c8-b288-4187-970b-244edf36cc59'),
       metadata=MetadataReturn(creation_time=None,
                               last_update_time=None,
                               distance=None,
                               certainty=None,
                               score=None,
                               explain_score=None,
                               is_consistent=None,
                               rerank_score=None),
       properties={'chunk_id': 'tourism.pdf_chunk_4',
                   'content': '【In case that applicant will shoulder part/all '
                              'of travel expense】\n'
                              '(6) Applicant’s Bank Certificate (balance '
                              'within the last six months must be shown) * If '
                              'the Average Daily\n'
                              'Balance (ADB) in the last six months is not '
                              'indicated, bank statement must be 

## 3. Retrieve data (hybrid)

**Hybrid Search** = Keyword Search + Semantic Search
- An alpha of 1 is a pure vector search.
- An alpha of 0 is a pure keyword search.

In [10]:
def retrieve_documents(
    query,
    collection_name,
    connection_config=CONNECTION_CONFIG,
    *,
    alpha=1,
    limit=5,
    target_vector="custom_vector",
    query_properties=("title", "content"),
):
    """Run a hybrid (keyword + vector) search and print the matching objects.

    The query string is used for the keyword (BM25) side and is also embedded
    with the global ``model`` for the vector side. The properties of each hit
    are printed; if nothing matches, a short notice is printed instead. Any
    error raised while connecting, embedding or querying is printed, not
    re-raised.

    Args:
        query: Search text.
        collection_name: Name of the collection to search.
        connection_config: Keyword arguments passed to
            ``weaviate.connect_to_local``.
        alpha: Balance between the two searches: 1 is a pure vector search,
            0 is a pure keyword search. Defaults to 1.
        limit: Maximum number of objects to return. Defaults to 5.
        target_vector: Named vector used for the vector component.
        query_properties: Property names the keyword (BM25) search looks at.
    """
    client = None
    try:
        client = weaviate.connect_to_local(**connection_config)
        documents = client.collections.get(collection_name)
        response = documents.query.hybrid(
            query=query,
            alpha=alpha,
            target_vector=target_vector,
            # Hand the client a fresh list so the (immutable) default tuple
            # and any caller-owned sequence are never shared or mutated.
            query_properties=list(query_properties),
            # A collection that only has self-provided vectors has no
            # server-side vectorizer, so `query` is not embedded
            # automatically; the query embedding has to be supplied here for
            # the vector side of the hybrid search to work.
            vector=model.encode(query),
            limit=limit,
        )
        for hit in response.objects:
            print(hit.properties)

        if not response.objects:
            print("there is no similar documents")
    except Exception as e:
        print(e)
    finally:
        # `client` is still None when the connection attempt itself failed.
        if client:
            client.close()

In [11]:
query = 'tourism requirements'

In [12]:
# NOTE(review): the "query: " prefix is presumably the input convention of the
# E5 embedding model loaded above — confirm against the model card. The same
# prefixed string is also passed as the keyword side of the hybrid search.
retrieve_documents(f'query: {query}', 'Requirements')

{'chunk_id': 'tourism.pdf_chunk_4', 'title': 'TOURISM', 'content': '【In case that applicant will shoulder part/all of travel expense】\n(6) Applicant’s Bank Certificate (balance within the last six months must be shown) * If the Average Daily\nBalance (ADB) in the last six months is not indicated, bank statement must be submitted to prove\ntransactions within the last six months.\n(7) Applicant’s Tax Payment Certificate * Form 2316, must have signature of the Employer and Employee *\nFor business owners, proof of actual payment of tax must be submitted in addition to the BIR Tax\nPayment Certificate Form (copy acceptable)\n(8) Applicant’s Employment Certificate (must indicate period of employment, salary and position) * For\nbusiness owners, Department of Trade and Industry (DTI) “Certificate of Business Name Registration” and\nMayor’s Permit from the City Hall must be submitted. * If for any reason other than being a retiree,\nfull-time housewife or unemployed, applicant is unable to s

  return forward_call(*args, **kwargs)
