In [1]:
# Cell 1 - Imports and Setup
from pathlib import Path
import os
from dotenv import load_dotenv
from txtai.embeddings import Embeddings
import logging
import json

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
load_dotenv()

True

# Content Storage Notebook

This notebook explores txtai's content storage capabilities. We'll test:
1. Different storage backends (SQLite vs Memory)
2. Custom index configurations
3. Document storage patterns
4. Metadata handling

Here we define two different storage configurations for the embeddings. One using SQLite and one using in-memory storage.

We will then initialize the embeddings with both configurations and compare the results.

In [2]:
# Cell 2 - Storage Configuration Comparison
# Define different storage configurations
sqlite_config = {
    "path": "sentence-transformers/nli-mpnet-base-v2",
    "content": True,
    "contentpath": "txtai/test/content.db",  # SQLite storage
    "backend": "faiss",
    "hybrid": True,
    "normalize": True
}

memory_config = {
    "path": "sentence-transformers/nli-mpnet-base-v2",
    "content": True,  # In-memory storage
    "backend": "faiss",
    "hybrid": True,
    "normalize": True
}

# Initialize both configurations
logger.info("Initializing embeddings with different storage configs...")
sqlite_embeddings = Embeddings(sqlite_config)
memory_embeddings = Embeddings(memory_config)

2024-11-17 14:15:24,870 - INFO - Initializing embeddings with different storage configs...


# Cell 3 - Test Documents with Rich Metadata

We'll create test documents with various metadata structures to understand storage patterns.
This helps us test how different storage backends handle complex data.

In [3]:
# Cell 3 - Complex Test Documents
test_docs = [
    {
        "id": "doc1",
        "text": "Technical document about database indexing",
        "metadata": {
            "type": "technical",
            "tags": ["database", "index"],
            "version": 1.0,
            "authors": ["John Doe", "Jane Smith"]
        }
    },
    {
        "id": "doc2",
        "text": "Guide to vector similarity search",
        "metadata": {
            "type": "guide",
            "tags": ["vector", "similarity"],
            "difficulty": "intermediate",
            "prerequisites": ["basic math", "python"]
        }
    }
]

# Helper function to prepare documents for indexing
def prepare_docs(docs):
    return [(i, doc["text"], json.dumps(doc["metadata"])) for i, doc in enumerate(docs)]

# Cell 4 - Storage Comparison Tests

We'll index the same documents in both storage configurations and compare:
1. Storage persistence
2. Query response formats
3. Metadata retrieval
4. Performance characteristics

In [4]:
# Cell 4 - Storage Comparison Tests
# Index documents in both configurations
prepared_docs = prepare_docs(test_docs)

logger.info("Indexing documents in SQLite storage...")
sqlite_embeddings.index(prepared_docs)

logger.info("Indexing documents in memory storage...")
memory_embeddings.index(prepared_docs)

# Test query to compare results
test_query = "database indexing guide"

logger.info("\nComparing search results:")
logger.info("\nSQLite Storage Results:")
sqlite_results = sqlite_embeddings.search(test_query, 2)
for result in sqlite_results:
    logger.info(f"Text: {result['text']}")
    logger.info(f"Score: {result['score']}")
    logger.info(f"Metadata: {json.loads(result['metadata']) if result.get('metadata') else None}")
    logger.info("---")

logger.info("\nMemory Storage Results:")
memory_results = memory_embeddings.search(test_query, 2)
for result in memory_results:
    logger.info(f"Text: {result['text']}")
    logger.info(f"Score: {result['score']}")
    logger.info(f"Metadata: {json.loads(result['metadata']) if result.get('metadata') else None}")
    logger.info("---")

2024-11-17 14:15:31,295 - INFO - Indexing documents in SQLite storage...
2024-11-17 14:15:31,467 - INFO - Indexing documents in memory storage...
2024-11-17 14:15:31,477 - INFO - 
Comparing search results:
2024-11-17 14:15:31,477 - INFO - 
SQLite Storage Results:
2024-11-17 14:15:31,502 - INFO - Text: Technical document about database indexing
2024-11-17 14:15:31,502 - INFO - Score: 0.729941209462623
2024-11-17 14:15:31,502 - INFO - Metadata: None
2024-11-17 14:15:31,503 - INFO - ---
2024-11-17 14:15:31,503 - INFO - Text: Guide to vector similarity search
2024-11-17 14:15:31,503 - INFO - Score: 0.4563615721003911
2024-11-17 14:15:31,503 - INFO - Metadata: None
2024-11-17 14:15:31,504 - INFO - ---
2024-11-17 14:15:31,504 - INFO - 
Memory Storage Results:
2024-11-17 14:15:31,516 - INFO - Text: Technical document about database indexing
2024-11-17 14:15:31,518 - INFO - Score: 0.729941209462623
2024-11-17 14:15:31,519 - INFO - Metadata: None
2024-11-17 14:15:31,519 - INFO - ---
2024-11-17 

# Cell 5 - Storage Persistence Test

Let's test how storage persistence works:
1. Save and reload SQLite storage
2. Verify data persistence
3. Compare with memory storage after restart

In [5]:
# Cell 5 - Storage Persistence Test
import time
import os

# Create test directory if it doesn't exist
os.makedirs("txtai/test", exist_ok=True)

def test_persistence(embeddings, name):
    logger.info(f"\nTesting {name} persistence:")
    index_path = f"txtai/test/{name.lower()}"

    # Initial search
    results = embeddings.search("database", 1)
    logger.info("Initial search result:")
    logger.info(f"Text: {results[0]['text']}")

    # Save index
    logger.info(f"Saving index to {index_path}")
    embeddings.save(index_path)

    # Create new instance and load saved index
    logger.info("Loading saved index")
    if name == "SQLite":
        new_embeddings = Embeddings(sqlite_config)
        new_embeddings.load(index_path)
    else:
        new_embeddings = Embeddings(memory_config)
        new_embeddings.load(index_path)

    # Search with new instance
    new_results = new_embeddings.search("database", 1)
    logger.info("Search result after reload:")
    logger.info(f"Text: {new_results[0]['text'] if new_results else 'No results'}")

# Test both storage types
test_persistence(sqlite_embeddings, "SQLite")
test_persistence(memory_embeddings, "Memory")

2024-11-17 14:15:35,111 - INFO - 
Testing SQLite persistence:
2024-11-17 14:15:35,128 - INFO - Initial search result:
2024-11-17 14:15:35,131 - INFO - Text: Technical document about database indexing
2024-11-17 14:15:35,131 - INFO - Saving index to txtai/test/sqlite
2024-11-17 14:15:35,162 - INFO - Loading saved index
2024-11-17 14:15:36,316 - INFO - Search result after reload:
2024-11-17 14:15:36,317 - INFO - Text: Technical document about database indexing
2024-11-17 14:15:36,323 - INFO - 
Testing Memory persistence:
2024-11-17 14:15:36,336 - INFO - Initial search result:
2024-11-17 14:15:36,339 - INFO - Text: Technical document about database indexing
2024-11-17 14:15:36,339 - INFO - Saving index to txtai/test/memory
2024-11-17 14:15:36,369 - INFO - Loading saved index
2024-11-17 14:15:37,167 - INFO - Search result after reload:
2024-11-17 14:15:37,167 - INFO - Text: Technical document about database indexing
