# DocTree-NLP Demo

This notebook demonstrates the core functionality of the DocTree-NLP library, including:
- Authentication with the Notion API
- Listing and accessing documents
- Processing text with NLP capabilities
- Building document hierarchies
- Using lazy document loading
- Working with document windows
- Automatic tagging and keyword extraction
- Displaying documents in Jupyter notebooks
- Loading documents from alternative sources (Obsidian vaults and local directories)

First, let's import the required modules:

In [ ]:
import os
from datetime import datetime

# Import core components
from doctree_nlp import (
    NotionClient,
    TextProcessor,
    Tagger,
    DocTree,
    DEFAULT_CACHE_DIR
)

# Import structure components
from doctree_nlp.structure import Document, Block

# Import performance optimization components
from doctree_nlp.lazy_document import LazyDocument, LazyDocumentCollection
from doctree_nlp.windowing import DocumentWindower, DocumentWindow

# Import notebook display helpers
from doctree_nlp.notebook import display_document, display_document_tree, display_document_table

## 1. Authentication

First, we'll initialize the Notion client with our API token:

*Note: The library now supports auto-discovery of your token from environment variables.*

In [ ]:
# Create the Notion client. Passing token="auto" lets the library discover the
# API token from environment variables (the recommended approach).
client = NotionClient(
    token="auto",
    cache_enabled=True,           # cache responses for faster repeat queries
    cache_dir=DEFAULT_CACHE_DIR,  # use the library's default cache directory
    max_cache_age_days=1,         # cached entries stay valid for 1 day
    rate_limit=3,                 # at most 3 requests per second (Notion API limit)
)

# To supply the token explicitly instead:
# notion_token = os.environ.get('NOTION_API_TOKEN')
# client = NotionClient(token=notion_token, ...)

# Check whether the credentials are accepted and report the outcome.
auth_status = client.authenticate()
status_label = "Successful" if auth_status else "Failed"
print(f"Authentication status: {status_label}")

# Report the cache settings. This is best-effort: any failure is printed
# rather than raised, so the rest of the notebook can still run.
try:
    cache_info = client.get_cache_info()
    print("\nCache configuration:")
    for template, key in (
        ("- Cache enabled: {}", "enabled"),
        ("- Cache directory: {}", "cache_dir"),
        ("- Max age: {} days", "max_age_days"),
    ):
        print(template.format(cache_info[key]))
except Exception as err:
    print(f"Could not get cache info: {err}")

## 2. Listing Documents

Let's retrieve and display the list of available documents:

In [ ]:
# Retrieve the document list. As the message says, repeat runs are expected to
# be served from the cache.
print("Fetching documents (will use cache after first run)...")
documents = client.list_documents()
print(f"Found {len(documents)} documents:")

# Print only the first few entries to keep the output short.
max_shown = 5
for doc in documents[:max_shown]:
    print(f"- {doc.title} (ID: {doc.id})")
    print(f"  Last edited: {doc.last_edited_time}")

## 3. Lazy Document Loading

The library now supports lazy loading of documents, which is more memory efficient:

In [ ]:
# Build a collection whose documents fetch their content lazily.
lazy_collection = LazyDocumentCollection(
    client=client,
    preload_metadata=True,      # load document metadata up front, but not content
    load_strategy="on_demand",  # load content only when it is accessed
)

# Report how many documents the collection holds.
collection_size = len(lazy_collection.documents)
print(f"Lazy collection contains {collection_size} documents")

# Look up the first document from the earlier listing. Its content is not
# expected to be loaded yet.
if documents:
    first_id = documents[0].id
    lazy_doc = lazy_collection.get_document(first_id)
    print(f"Lazy document: {lazy_doc.title}")
    print(f"Content loaded: {lazy_doc._blocks_loaded}")  # expected: False

    # A preview needs the content, so requesting one triggers the load.
    print("\nAccessing document content...")
    preview = lazy_doc.preview_text(n_chars=150)
    print(f"Content loaded now: {lazy_doc._blocks_loaded}")  # expected: True
    print(f"Preview: {preview}")

## 4. Processing Document Content

Now, let's fetch and process the content of a document:

In [ ]:
if documents:
    # Work with the first document from the listing.
    doc = documents[0]
    print(f"Processing document: {doc.title}")

    # Fetch the full document, including its blocks.
    document = client.get_document(doc.id)
    block_count = len(document.blocks)
    print(f"Retrieved {block_count} blocks from document")
    print(f"Document last edited: {document.last_edited_time}")
    print(f"Last fetched from API: {document.last_fetched}")

    # Run the NLP pipeline over every block.
    processor = TextProcessor()
    processed_blocks = processor.process_blocks(document.blocks)

    # Summarize only the first two processed blocks to keep the output short.
    print("\nProcessed content:")
    for block in processed_blocks[:2]:
        print(f"\nBlock type: {block['type']}")
        entity_texts = [entity['text'] for entity in block['entities']]
        print(f"Entities found: {entity_texts}")
        print(f"Keywords: {block['keywords']}")
        sentence_count = len(block['sentences'])
        print(f"Sentences: {sentence_count}")

    # Fetch the same document again with use_cache=False to bypass the cache.
    print("\nForcing refresh (bypassing cache)...")
    refreshed_document = client.get_document(doc.id, use_cache=False)
    refreshed_count = len(refreshed_document.blocks)
    print(f"Retrieved {refreshed_count} blocks directly from API")

## 5. Document Windowing

For large documents, you can use windowing to view portions of the document:

In [ ]:
def _print_blocks(blocks, show_id=False):
    """Print one line per block: its type and the first 50 characters of content.

    When show_id is true, the block id is appended in parentheses.
    """
    for block in blocks:
        line = f"- [{block.type}] {block.content[:50]}..."
        if show_id:
            line = f"{line} ({block.id})"
        print(line)


if documents and len(document.blocks) > 0:
    # A windower that yields 5 blocks per window by default.
    windower = DocumentWindower(default_window_size=5)

    # First window over the document.
    window = windower.create_window(document)
    print(f"Window {window.start_index}-{window.end_index} of {window.total_blocks} blocks")
    _print_blocks(window.blocks, show_id=True)

    # Advance to the following window when one exists.
    if window.has_next:
        next_window = windower.get_next_window(window, document)
        print(f"\nNext window {next_window.start_index}-{next_window.end_index}:")
        _print_blocks(next_window.blocks)

    # Text search: take the start of a block farther down the document and ask
    # the windower for a window around the place where that text occurs.
    if len(document.blocks) > 10:
        search_text = document.blocks[10].content[:20]
        print(f"\nSearching for: '{search_text}'")
        search_window = windower.find_text_window(
            document,
            search_text,
            window_size=3,
            context_blocks=1,
        )

        if search_window:
            print(f"Found text in window {search_window.start_index}-{search_window.end_index}")
            _print_blocks(search_window.blocks)

## 6. Building Document Hierarchy

Let's analyze the document's structure using the improved DocTree class:

In [ ]:
import re

if documents and hasattr(document, 'blocks') and len(document.blocks) > 0:
    # Build the document's tree from its blocks.
    document.build_tree()

    # Convert to a dictionary and print the top-level metadata.
    structure = document.to_dict()
    print("Document structure:")
    print(f"- Title: {structure['title']}")
    print(f"- Created: {structure['created_time']}")
    print(f"- Last edited: {structure['last_edited_time']}")

    # Query the tree only if one was actually built.
    if document.tree:
        # Look up nodes by block type.
        headings = document.tree.find_nodes_by_type('heading_1')
        print(f"\nFound {len(headings)} level 1 headings:")
        for heading in headings:
            print(f"- {heading.block.content}")

        # Look up nodes by content using a regex pattern.
        if len(document.blocks) > 0:
            # Use the first word of the first block as the search term. Content
            # that is empty or whitespace-only has no words, so fall back to
            # "the" instead of indexing into an empty list.
            first_block_words = (document.blocks[0].content or "").split()
            search_term = first_block_words[0] if first_block_words else "the"

            # The term is arbitrary document text. Escape it before embedding
            # it in the pattern so characters such as "(" or "+" are matched
            # literally instead of being read as regex syntax.
            pattern = f".*{re.escape(search_term)}.*"
            matching_nodes = document.tree.find_nodes_by_content(pattern)
            print(f"\nFound {len(matching_nodes)} blocks containing '{search_term}'")

    # Report cache usage after the operations above. This is best-effort: any
    # failure is printed rather than raised.
    try:
        cache_info = client.get_cache_info()
        print("\nCache status after operations:")
        print(f"- Files in cache: {cache_info['num_files']}")
        print(f"- Cache size: {cache_info['total_size_mb']:.2f} MB")
    except Exception as e:
        print(f"Could not get cache info: {e}")

## 7. Jupyter Notebook Integration

The library includes built-in display functions for Jupyter notebooks:

In [ ]:
# Only render when a document with at least one block is available.
has_blocks = bool(
    documents
    and document
    and hasattr(document, 'blocks')
    and len(document.blocks) > 0
)

if has_blocks:
    # Summary view with a preview.
    print("Document summary:\n")
    display_document(document)

    # Tabular view of the blocks.
    print("\nDocument blocks table:\n")
    display_document_table(document)

    # Tree view, shown only if the document has a tree.
    if document.tree:
        print("\nDocument structure tree:\n")
        display_document_tree(document)

## 8. Automatic Tagging

Next, let's generate tags for the document content and analyze the sentiment of each block:

In [ ]:
if documents and hasattr(document, 'blocks') and len(document.blocks) > 0:
    # Create the tagger and register a few custom tags for matching.
    tagger = Tagger()
    custom_tags = ["important", "review", "followup"]
    tagger.add_custom_tags(custom_tags)

    print("Generated tags:")
    # Only the first three blocks are processed for this demonstration.
    for block in document.blocks[:3]:
        tags = tagger.generate_tags(block)
        print(f"\nBlock content: {block.content[:50]}...")
        tag_names = [tag.name for tag in tags]
        print(f"Tags: {tag_names}")

        # Sentiment is computed from the block's text content.
        sentiment = tagger.analyze_sentiment(block.content)
        print(f"Sentiment: {sentiment}")

## 9. Using Alternative Document Sources

The library also supports loading documents from Obsidian vaults and local directories:

```python
# --- Obsidian vault ---
from doctree_nlp import ObsidianClient

# The vault path below is a placeholder; replace it with your own.
obsidian_client = ObsidianClient(
    vault_path='/path/to/obsidian/vault',
    cache_enabled=True,
)
obsidian_docs = obsidian_client.list_documents()

# --- Local directory ---
from doctree_nlp import LocalSource

# The directory path below is a placeholder; file_pattern selects which files
# are picked up.
local_client = LocalSource(
    directory_path='/path/to/markdown/files',
    file_pattern='**/*.md',
    cache_enabled=True,
)
local_docs = local_client.list_documents()
```