Testing whether we can connect to our Dockerized Weaviate container

In [1]:
import weaviate
import json

from datetime import datetime

In [2]:
# Open a v4 client connection to the Weaviate instance on this machine
local_endpoint = {"host": "localhost", "port": 8080}
client = weaviate.connect_to_local(**local_endpoint)

# Report whether the instance says it is ready
is_ready = client.is_ready()
print(f"Connected to Weaviate: {is_ready}")

Connected to Weaviate: True


In [4]:
# Delete the existing "Document" class if it exists (to start fresh).
# The name must match the class created in the schema cell; otherwise a stale
# "Document" collection survives and the create step fails on a re-run.
try:
    if client.collections.exists("Document"):
        client.collections.delete("Document")
        print("Deleted existing Document class")
    else:
        # Only claim a deletion when there was actually something to delete.
        print("No existing class to delete")
except Exception as e:
    # Best-effort cleanup: report the problem and let the notebook continue.
    print(f"Error deleting class: {e}")


Deleted existing Document class


In [4]:
# Define an advanced RAG schema with contextual retrieval
from weaviate.classes.config import Configure, Property, DataType

# Schema for the "Document" class in Weaviate's dictionary (REST) format.
# "vectorizer": "none" means Weaviate does not embed anything itself; every
# object must be inserted together with a vector computed by the caller.
#
# "source" and "tags" previously used the deprecated "string" / "string[]"
# data types. They are declared as "text" / "text[]" instead, with
# "whitespace" tokenization so values such as "documentation.pdf" or
# "machine-learning" are still matched as whole tokens.
# NOTE(review): "whitespace" is believed to be what Weaviate maps legacy
# "string" properties to - confirm against the server version in use.
class_obj = {
    "class": "Document",
    "vectorizer": "none",  # Manual vectorization - you compute vectors yourself
    "properties": [
        {
            "name": "content",
            "dataType": ["text"],
            "description": "The actual text to embed & search"
        },
        {
            "name": "context",
            "dataType": ["text"],
            "description": "Contextual information for this chunk (parent paragraph, section, etc)"
        },
        {
            "name": "source",
            "dataType": ["text"],
            "tokenization": "whitespace",
            "description": "Where it came from (file, URL, etc)"
        },
        {
            "name": "chunk_id",
            "dataType": ["int"],
            "description": "If you split docs into chunks"
        },
        {
            "name": "page_number",
            "dataType": ["int"],
            "description": "For PDFs"
        },
        {
            "name": "created_at",
            "dataType": ["date"],
            "description": "When added"
        },
        {
            "name": "tags",
            "dataType": ["text[]"],
            "tokenization": "whitespace",
            "description": "Categories/labels for filtering"
        }
    ]
}

# Create the class; any failure is reported instead of stopping the notebook.
try:
    client.collections.create_from_dict(class_obj)
    print("Advanced RAG Document class with context created successfully!")
except Exception as e:
    print(f"Error creating class: {e}")

Advanced RAG Document class with context created successfully!


In [5]:
# Add sample data with manual vectors and context
import numpy as np
from datetime import datetime, timezone

documents = client.collections.get("Document")

# Example: Create sample vectors (in real RAG, you'd use embeddings model)
# For testing, we'll use random vectors (384 dimensions is common for sentence-transformers)
vector_dim = 384

# Get current time in RFC3339 format with timezone
current_time = datetime.now(timezone.utc).isoformat()

sample_data = [
    {
        "content": "Python is a powerful programming language",
        "context": "Chapter 2: Python Basics - This section covers the fundamentals of Python programming language including syntax, data types, and control flow. Python is widely used in data science, machine learning, and web development.",
        "source": "documentation.pdf",
        "chunk_id": 1,
        "page_number": 5,
        "created_at": current_time,
        "tags": ["python", "programming", "basics"],
        "vector": np.random.rand(vector_dim).tolist()  # Replace with real embeddings
    },
    {
        "content": "Machine learning models require large amounts of quality training data",
        "context": "Chapter 5: Data Preparation - Before training any ML model, data must be cleaned, validated, and split into training and testing sets. The quality of training data directly impacts model performance.",
        "source": "ml_guide.md",
        "chunk_id": 2,
        "page_number": None,
        "created_at": current_time,
        "tags": ["machine-learning", "data", "training"],
        "vector": np.random.rand(vector_dim).tolist()  # Replace with real embeddings
    },
    {
        "content": "Vector databases enable fast similarity search over high-dimensional embeddings",
        "context": "Section 3.2: Vector Databases - Vector databases like Weaviate are specifically designed to store and query vector embeddings. They use sophisticated indexing techniques like HNSW to enable fast approximate nearest neighbor search.",
        "source": "vector_db_intro.md",
        "chunk_id": 1,
        "page_number": None,
        "created_at": current_time,
        "tags": ["vectors", "database", "search"],
        "vector": np.random.rand(vector_dim).tolist()  # Replace with real embeddings
    }
]

# Insert data with vectors and context
for obj in sample_data:
    documents.data.insert(obj)
    
print(f"Inserted {len(sample_data)} documents with vectors and context")

Inserted 3 documents with vectors and context


In [6]:
# Retrieve all documents with context
documents = client.collections.get("Document")

# Fetch all objects using fetch_objects
response = documents.query.fetch_objects(limit=100)

print(f"Total documents in DB: {len(response.objects)}")
print("\nRetrieved documents with context:")
for obj in response.objects:
    props = obj.properties
    print(f"\n- Source: {props['source']} (chunk {props['chunk_id']})")
    print(f"  Content: {props['content']}")
    print(f"  Context: {props['context'][:100]}...")
    print(f"  Tags: {props['tags']}")
    print(f"  Vectors: {props['vector']}")

Total documents in DB: 3

Retrieved documents with context:

- Source: ml_guide.md (chunk 2)
  Content: Machine learning models require large amounts of quality training data
  Context: Chapter 5: Data Preparation - Before training any ML model, data must be cleaned, validated, and spl...
  Tags: ['machine-learning', 'data', 'training']
  Vectors: [0.8706178311937327, 0.8963631780427518, 0.8899277512648187, 0.907055530401593, 0.23928190717254205, 0.5745661240952066, 0.03998999729059738, 0.7129130422786611, 0.36167533415701314, 0.23291333785332713, 0.5065746900476178, 0.2289496715770466, 0.8723620403958771, 0.9350409623805995, 0.48833757146228896, 0.5601387467879811, 0.8976089493621104, 0.8219294056345957, 0.6171987793584935, 0.5234343739389438, 0.759651627680428, 0.12400983559593703, 0.656880086622081, 0.3037999270460118, 0.02662667590510759, 0.9291983826449923, 0.9546613243451486, 0.7413599781529561, 0.7574259825215093, 0.7964938134793373, 0.5075485355241558, 0.7967959675691287, 0.717