In [1]:
# Cell 1 - Imports and Setup
from pathlib import Path
import os
from dotenv import load_dotenv
from txtai.embeddings import Embeddings
import logging
import json

# Timestamped INFO-level logging so each cell's progress shows up in its output.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Pull settings from a .env file into the process environment. Kept as the
# cell's last expression so its boolean return value is displayed as output.
load_dotenv()

True

# Cloud Storage Configuration

This notebook explores txtai's cloud storage capabilities. We'll test:
1. Google Cloud Storage integration
2. Cloud-specific configurations
3. Remote index management
4. Performance considerations


Here, we'll configure the cloud embeddings to use Google Cloud Storage.

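The GCS bucket and prefix are read from environment variables (loaded by `load_dotenv()` in Cell 1).

Both of the following environment variables must be set before running the next cell (the values used in this run are shown):
- `GOOGLE_CLOUD_BUCKET`: name of the GCS bucket, e.g. `aurite-txtai-dev`
- `EMBEDDINGS_PREFIX`: folder-like prefix inside the bucket, e.g. `txtai`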

In [2]:
# Cell 2 - Cloud Configuration
# Both variables are required. Fail fast with a clear message instead of
# letting os.getenv() silently hand None to the cloud settings below.
required_env_vars = ("GOOGLE_CLOUD_BUCKET", "EMBEDDINGS_PREFIX")
missing_env_vars = [name for name in required_env_vars if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

cloud_config = {
    "path": "sentence-transformers/nli-mpnet-base-v2",
    "content": True,  # keep document text in the index; Cell 4 reads result["text"]
    "backend": "faiss",
    "hybrid": True,
    "scoring": {
        "method": "bm25",
        "terms": True,
        "normalize": True
    },
    "batch": 32,
    "normalize": True,
    # NOTE(review): txtai's documentation describes these cloud settings as
    # arguments to save()/load(); confirm that placing them in the constructor
    # config has any effect. Also confirm "gcs" is a provider id the storage
    # backend recognises (libcloud names its Google driver "google_storage").
    "cloud": {
        "provider": "gcs",  # intended to select Google Cloud Storage
        "container": os.environ["GOOGLE_CLOUD_BUCKET"],  # bucket that should hold the index
        "prefix": os.environ["EMBEDDINGS_PREFIX"]  # folder-like prefix inside the bucket
    }
}

logger.info("Initializing cloud embeddings...")
cloud_embeddings = Embeddings(cloud_config)

2024-11-17 14:26:09,070 - INFO - Initializing cloud embeddings...


# Cell 3 - Cloud Storage Test Documents

We'll create test documents to verify cloud storage functionality.
These documents will help us test:
1. Remote storage and retrieval
2. Index persistence across sessions
3. Cloud-specific performance characteristics

In [3]:
# Cell 3 - Cloud Storage Test Documents
# Two small fixture documents. Each has an "id", a "text" body and a
# "metadata" dict; note that the metadata keys are not identical across the
# two documents ("version" appears only in the first, "difficulty" only in
# the second).
test_docs = [
    {
        "id": "cloud1",
        "text": "Document about cloud storage systems",
        "metadata": {
            "type": "technical",
            "tags": ["cloud", "storage"],
            "version": 1.0,
            "environment": "production"
        }
    },
    {
        "id": "cloud2",
        "text": "Guide to distributed data management",
        "metadata": {
            "type": "guide",
            "tags": ["distributed", "data"],
            "difficulty": "advanced",
            "environment": "development"
        }
    }
]

# Prepare documents for indexing
def prepare_docs(docs):
    """Turn document dicts into ``(id, text, metadata_json)`` tuples.

    Args:
        docs: Iterable of dicts, each with ``"id"``, ``"text"`` and
            ``"metadata"`` keys.

    Returns:
        A list with one tuple per document, in input order. The third element
        is the document's metadata serialised with ``json.dumps``.

    Raises:
        KeyError: If a document lacks one of the three keys.
    """
    rows = []
    for entry in docs:
        # The metadata dict is flattened to a JSON string for the third slot.
        rows.append((entry["id"], entry["text"], json.dumps(entry["metadata"])))
    return rows

# Build the index from the fixture tuples produced by prepare_docs().
# NOTE(review): nothing in this cell calls save(), so the "to cloud storage"
# wording in the log message may overstate what happens here - confirm.
logger.info("Indexing test documents to cloud storage...")
cloud_embeddings.index(prepare_docs(test_docs))

2024-11-17 14:28:15,519 - INFO - Indexing test documents to cloud storage...


# Cell 4 - Remote Index Management

Let's test saving and loading indices to/from cloud storage:
1. Save index to cloud
2. Load index from cloud
3. Verify data consistency

In [4]:
# Cell 4 - Remote Index Management
def _top_text(results, stage):
    """Return the text of the first search hit, or raise if there are no hits."""
    if not results:
        raise RuntimeError(f"Search returned no results {stage}")
    return results[0]["text"]


def test_cloud_persistence():
    """Save the index, reload it into a fresh instance, and verify the top hit.

    Uses the notebook-level ``cloud_embeddings`` and ``cloud_config``. The same
    query is run before saving and after reloading, and the two top results
    must carry the same text.

    Raises:
        RuntimeError: If ``EMBEDDINGS_PREFIX`` is unset/empty, or if either
            search returns no results.
        AssertionError: If the reloaded index returns a different top result.
    """
    query = "cloud storage"

    # Resolve the target path up front so an unset variable fails before any
    # work is done, rather than producing a path that starts with "None/".
    prefix = os.getenv("EMBEDDINGS_PREFIX")
    if not prefix:
        raise RuntimeError("EMBEDDINGS_PREFIX is not set; cannot build the index path")
    cloud_path = f"{prefix}/test_index"

    logger.info("\nTesting cloud storage persistence:")

    # Initial search
    initial_text = _top_text(cloud_embeddings.search(query, 1), "before saving")
    logger.info("Initial search result:")
    logger.info(f"Text: {initial_text}")

    # Save to cloud
    # NOTE(review): txtai's documentation describes cloud sync as a `cloud`
    # argument to save()/load() applied to compressed archives; neither is used
    # here and the recorded save took ~30 ms, so this may be a purely local
    # round trip - confirm that objects actually land in the bucket.
    logger.info(f"Saving index to cloud: {cloud_path}")
    cloud_embeddings.save(cloud_path)

    # Create new instance and load from cloud
    logger.info("Loading index from cloud")
    new_embeddings = Embeddings(cloud_config)
    new_embeddings.load(cloud_path)

    # Verify data
    reloaded_text = _top_text(new_embeddings.search(query, 1), "after reload")
    logger.info("Search result after reload:")
    logger.info(f"Text: {reloaded_text}")

    # The actual consistency check: logging alone would not catch a mismatch.
    assert reloaded_text == initial_text, (
        f"Top result changed after reload: {initial_text!r} -> {reloaded_text!r}"
    )

# Test cloud persistence
test_cloud_persistence()

2024-11-17 14:28:27,183 - INFO - 
Testing cloud storage persistence:
2024-11-17 14:28:27,211 - INFO - Initial search result:
2024-11-17 14:28:27,212 - INFO - Text: Document about cloud storage systems
2024-11-17 14:28:27,212 - INFO - Saving index to cloud: txtai/test_index
2024-11-17 14:28:27,243 - INFO - Loading index from cloud
2024-11-17 14:28:28,096 - INFO - Search result after reload:
2024-11-17 14:28:28,097 - INFO - Text: Document about cloud storage systems
