### Installing

In [1]:
pip install -U weaviate-client

Collecting weaviate-client
  Downloading weaviate_client-4.16.9-py3-none-any.whl.metadata (3.7 kB)
Downloading weaviate_client-4.16.9-py3-none-any.whl (579 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m579.1/579.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: weaviate-client
  Attempting uninstall: weaviate-client
    Found existing installation: weaviate-client 4.16.7
    Uninstalling weaviate-client-4.16.7:
      Successfully uninstalled weaviate-client-4.16.7
Successfully installed weaviate-client-4.16.9

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install python-dotenv


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
from dotenv import load_dotenv

In [2]:
import os
import weaviate
from weaviate.classes.init import Auth

# --- 1. Connection Details ---
load_dotenv()
WEAVIATE_URL = os.environ.get("WEAVIATE_URL")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# --- 2. Connect to Weaviate Cloud (v4 method) ---
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    headers={"X-OpenAI-Api-Key": OPENAI_API_KEY}
)

print(client.is_ready())

True


In [3]:
import weaviate.classes.config as wvc

In [4]:
# --- 3. Define and Create the Collection ---
collection_name = "EcommerceProducts"

try:
    if not client.collections.exists(collection_name):
        print(f"Creating collection '{collection_name}'...")
        products = client.collections.create(
            name=collection_name,
            vectorizer_config=wvc.Configure.Vectorizer.text2vec_openai(),
            properties=[
                # Properties for semantic search
                wvc.Property(name="product_title", data_type=wvc.DataType.TEXT),
                wvc.Property(name="product_description", data_type=wvc.DataType.TEXT),
                wvc.Property(name="product_bullet_point", data_type=wvc.DataType.TEXT),
                
                # Properties for filtering ONLY (vectorization is skipped)
                wvc.Property(name="product_id", data_type=wvc.DataType.TEXT, skip_vectorization=True),
                wvc.Property(name="product_brand", data_type=wvc.DataType.TEXT, skip_vectorization=True),
                wvc.Property(name="product_color", data_type=wvc.DataType.TEXT, skip_vectorization=True),
            ]
        )
        print(f"Successfully created collection '{collection_name}'")
    else:
        print(f"Collection '{collection_name}' already exists.")

finally:
    client.close()
    print("Connection closed.")

Creating collection 'EcommerceProducts'...


            Use the `vector_config` argument instead.
            


Successfully created collection 'EcommerceProducts'
Connection closed.


In [5]:
import pandas as pd

# --- 1. Connection Details ---
load_dotenv()
WEAVIATE_URL = os.environ.get("WEAVIATE_URL")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# --- 2. Connect to Weaviate Cloud (v4 method) ---
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    headers={"X-OpenAI-Api-Key": OPENAI_API_KEY}
)

# --- 3. Load Data and Ingest ---
try:
    # Get the collection object
    products = client.collections.get("EcommerceProducts")

    df = pd.read_parquet('../Dataset/shopping_queries_dataset_products_us.parquet')
    df.fillna('', inplace=True)

    print("Starting data ingestion to Weaviate Cloud...")

    # Use the collection's batch manager for dynamic batching
    with products.batch.dynamic() as batch:
        for i, row in df.iterrows():
            properties = {
                "product_id": row["product_id"],
                "product_title": row["product_title"],
                "product_description": row["product_description"],
                "product_bullet_point": row["product_bullet_point"],
                "product_brand": row["product_brand"],
                "product_color": row["product_color"]
            }
            # Add object to the batch
            batch.add_object(properties=properties)

            if (i + 1) % 10000 == 0:
                print(f"Imported {i + 1} of {len(df)} products")
    
    print(f"Data ingestion complete! Total objects in collection: {len(products)}")

finally:
    client.close()
    print("Connection closed.")

Starting data ingestion to Weaviate Cloud...
Imported 10000 of 1215854 products
Imported 20000 of 1215854 products
Imported 30000 of 1215854 products


{'message': 'Failed to send 451 in a batch of 1000', 'errors': {"unmarshal response body. Got: upstream connect error or disconnect/reset before headers. reset reason: connection termination: invalid character 'u' looking for beginning of value"}}
{'message': 'Failed to send 451 objects in a batch of 1000. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}


Imported 40000 of 1215854 products
Imported 50000 of 1215854 products
Imported 60000 of 1215854 products
Imported 70000 of 1215854 products
Imported 80000 of 1215854 products
Imported 90000 of 1215854 products
Imported 100000 of 1215854 products
Imported 110000 of 1215854 products
Imported 120000 of 1215854 products
Imported 130000 of 1215854 products
Imported 140000 of 1215854 products
Imported 150000 of 1215854 products
Imported 160000 of 1215854 products
Imported 170000 of 1215854 products
Imported 180000 of 1215854 products
Imported 190000 of 1215854 products
Imported 200000 of 1215854 products
Imported 210000 of 1215854 products
Imported 220000 of 1215854 products
Imported 230000 of 1215854 products
Imported 240000 of 1215854 products
Imported 250000 of 1215854 products
Imported 260000 of 1215854 products
Imported 270000 of 1215854 products
Imported 280000 of 1215854 products


{'message': 'Failed to send all objects in a batch of 1000', 'error': "WeaviateBatchError('Query call with protocol GRPC batch failed with message Channel closed!.')"}
{'message': 'Failed to send all objects in a batch of 1000', 'error': "WeaviateBatchError('Query call with protocol GRPC batch failed with message Channel closed!.')"}
{'message': 'Failed to send all objects in a batch of 1000', 'error': "WeaviateBatchError('Query call with protocol GRPC batch failed with message Channel closed!.')"}
{'message': 'Failed to send 1000 objects in a batch of 1000. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}
{'message': 'Failed to send all objects in a batch of 1000', 'error': "WeaviateBatchError('Query call with protocol GRPC batch failed with message Channel closed!.')"}
{'message': 'Failed to send all objects in a batch of 1000', 'error': "WeaviateBatchError('Query call with protocol GRPC batch failed with message Channel closed!.

Connection closed.


{'message': 'Failed to send 1000 objects in a batch of 1000. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}
{'message': 'Failed to send 1000 objects in a batch of 1000. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}


KeyboardInterrupt: 