### Installing

In [4]:
pip install -U weaviate-client

Collecting weaviate-client
  Downloading weaviate_client-4.16.7-py3-none-any.whl.metadata (3.7 kB)
Collecting httpx<0.29.0,>=0.26.0 (from weaviate-client)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting validators<1.0.0,>=0.34.0 (from weaviate-client)
  Downloading validators-0.35.0-py3-none-any.whl.metadata (3.9 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client)
  Downloading authlib-1.6.1-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting pydantic<3.0.0,>=2.8.0 (from weaviate-client)
  Using cached pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting grpcio<1.80.0,>=1.59.5 (from weaviate-client)
  Using cached grpcio-1.74.0-cp311-cp311-macosx_11_0_universal2.whl.metadata (3.8 kB)
Collecting grpcio-health-checking<1.80.0,>=1.59.5 (from weaviate-client)
  Downloading grpcio_health_checking-1.74.0-py3-none-any.whl.metadata (1.0 kB)
Collecting deprecation<3.0.0,>=2.1.0 (from weaviate-client)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.me

In [5]:
pip install python-dotenv

Collecting python-dotenv
  Using cached python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Using cached python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/opt/homebrew/opt/python@3.11/bin/python3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
from dotenv import load_dotenv

In [30]:
import os
import weaviate
from weaviate.classes.init import Auth

# --- 1. Connection Details ---
# load_dotenv() copies entries from a local .env file into the process
# environment; the three settings below are then read from os.environ.
load_dotenv()
WEAVIATE_URL = os.environ.get("WEAVIATE_URL")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# os.environ.get() returns None for unset variables. Stop here with a message
# that names the missing settings instead of handing None to the client.
required_settings = {
    "WEAVIATE_URL": WEAVIATE_URL,
    "WEAVIATE_API_KEY": WEAVIATE_API_KEY,
    "OPENAI_API_KEY": OPENAI_API_KEY,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_settings)}"
    )

# --- 2. Connect to Weaviate Cloud (v4 method) ---
# The OpenAI key is forwarded as a request header alongside the Weaviate API key.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    headers={"X-OpenAI-Api-Key": OPENAI_API_KEY}
)

# Quick readiness check of the freshly opened connection.
print(client.is_ready())

True


In [31]:
import weaviate.classes.config as wvc

In [32]:
# --- 3. Define and Create the Collection ---
collection_name = "EcommerceProducts"

# Text properties intended for semantic search (no skip flag is passed).
searchable_fields = ("product_title", "product_description", "product_bullet_point")
# Text properties intended for filtering only; each is created with
# skip_vectorization=True.
filter_only_fields = ("product_id", "product_brand", "product_color")

try:
    if client.collections.exists(collection_name):
        # Nothing to do: the collection is already present.
        print(f"Collection '{collection_name}' already exists.")
    else:
        print(f"Creating collection '{collection_name}'...")

        # Same six TEXT properties, in the same order: searchable first,
        # filter-only afterwards.
        property_definitions = [
            wvc.Property(name=field, data_type=wvc.DataType.TEXT)
            for field in searchable_fields
        ]
        property_definitions += [
            wvc.Property(name=field, data_type=wvc.DataType.TEXT, skip_vectorization=True)
            for field in filter_only_fields
        ]

        products = client.collections.create(
            name=collection_name,
            vectorizer_config=wvc.Configure.Vectorizer.text2vec_openai(),
            properties=property_definitions,
        )
        print(f"Successfully created collection '{collection_name}'")

finally:
    # The connection is closed whether or not the creation succeeded.
    client.close()
    print("Connection closed.")

Creating collection 'EcommerceProducts'...
Successfully created collection 'EcommerceProducts'
Connection closed.


In [None]:
import pandas as pd

# --- 1. Connection Details ---
# load_dotenv() copies entries from a local .env file into the process
# environment; the three settings below are then read from os.environ.
load_dotenv()
WEAVIATE_URL = os.environ.get("WEAVIATE_URL")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# os.environ.get() returns None for unset variables. Stop here with a message
# that names the missing settings instead of handing None to the client.
required_settings = {
    "WEAVIATE_URL": WEAVIATE_URL,
    "WEAVIATE_API_KEY": WEAVIATE_API_KEY,
    "OPENAI_API_KEY": OPENAI_API_KEY,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_settings)}"
    )

# --- 2. Connect to Weaviate Cloud (v4 method) ---
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    headers={"X-OpenAI-Api-Key": OPENAI_API_KEY}
)

# DataFrame columns copied verbatim into each Weaviate object's properties.
PRODUCT_COLUMNS = [
    "product_id",
    "product_title",
    "product_description",
    "product_bullet_point",
    "product_brand",
    "product_color",
]
# Print a progress line every this many rows.
PROGRESS_EVERY = 10000

# --- 3. Load Data and Ingest ---
try:
    # Get the collection object
    products = client.collections.get("EcommerceProducts")

    # Missing values are replaced with empty strings; fillna returns a new
    # frame, so nothing is mutated in place.
    df = pd.read_parquet('../Dataset/shopping_queries_dataset_products_us.parquet').fillna('')
    total_rows = len(df)

    print("Starting data ingestion to Weaviate Cloud...")

    # Use the collection's batch manager for dynamic batching
    with products.batch.dynamic() as batch:
        # itertuples avoids building a Series per row. enumerate supplies a
        # positional counter, so progress no longer depends on the DataFrame
        # index being a default 0..n-1 range.
        rows = df[PRODUCT_COLUMNS].itertuples(index=False, name=None)
        for row_number, values in enumerate(rows, start=1):
            # Add object to the batch
            batch.add_object(properties=dict(zip(PRODUCT_COLUMNS, values)))

            if row_number % PROGRESS_EVERY == 0:
                print(f"Imported {row_number} of {total_rows} products")

    # Objects the server rejected (e.g. on a gateway error) are collected by
    # the client rather than raised. Surface them instead of silently
    # reporting success; `failed_objects` stays available for a later re-send.
    failed_objects = products.batch.failed_objects
    if failed_objects:
        print(f"WARNING: {len(failed_objects)} of {total_rows} objects failed to import.")
        print(f"First failure: {failed_objects[0].message}")
    else:
        print("No failed objects reported by the batch.")

    print(f"Data ingestion complete! Total objects in collection: {len(products)}")

finally:
    # The connection is closed whether or not the ingestion succeeded.
    client.close()
    print("Connection closed.")

Starting data ingestion to Weaviate Cloud...
Imported 10000 of 1215854 products
Imported 20000 of 1215854 products
Imported 30000 of 1215854 products
Imported 40000 of 1215854 products
Imported 50000 of 1215854 products
Imported 60000 of 1215854 products
Imported 70000 of 1215854 products
Imported 80000 of 1215854 products
Imported 90000 of 1215854 products
Imported 100000 of 1215854 products
Imported 110000 of 1215854 products
Imported 120000 of 1215854 products
Imported 130000 of 1215854 products
Imported 140000 of 1215854 products
Imported 150000 of 1215854 products
Imported 160000 of 1215854 products
Imported 170000 of 1215854 products
Imported 180000 of 1215854 products
Imported 190000 of 1215854 products
Imported 200000 of 1215854 products


{'message': 'Failed to send 485 in a batch of 1000', 'errors': {"unmarshal response body. Got: <html>\r\n<head><title>502 Bad Gateway</title></head>\r\n<body>\r\n<center><h1>502 Bad Gateway</h1></center>\r\n<hr><center>cloudflare</center>\r\n</body>\r\n</html>\r\n: invalid character '<' looking for beginning of value"}}
{'message': 'Failed to send 485 objects in a batch of 1000. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}


Imported 210000 of 1215854 products


{'message': 'Failed to send 544 in a batch of 1000', 'errors': {"unmarshal response body. Got: upstream connect error or disconnect/reset before headers. reset reason: connection termination: invalid character 'u' looking for beginning of value"}}
{'message': 'Failed to send 544 objects in a batch of 1000. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}


Imported 220000 of 1215854 products
Imported 230000 of 1215854 products
Imported 240000 of 1215854 products
Imported 250000 of 1215854 products
