# DocTree-NLP Demo

This notebook demonstrates the core functionalities of the DocTree-NLP library, including:
- Authentication with Notion API
- Listing and accessing documents
- Processing text with NLP capabilities
- Building document hierarchies
- Automatic tagging and keyword extraction

First, let's import the required modules:

In [None]:
import os
from doctree_nlp import (
    NotionClient,
    Hierarchy,
    Tagger,
    TextProcessor,
    DEFAULT_CACHE_DIR
)

## 1. Authentication

Next, we'll initialize the Notion client with an API token read from the `NOTION_API_TOKEN` environment variable:

In [None]:
# Read the Notion API token from the environment (never hard-code credentials)
notion_token = os.environ.get('NOTION_API_TOKEN')
if not notion_token:
    # os.environ.get returns None when the variable is unset; say so explicitly
    # instead of silently handing an empty token to the client.
    print("Warning: NOTION_API_TOKEN is not set; creating the client without a token.")

# Initialize client with caching and rate limiting
client = NotionClient(
    token=notion_token,
    cache_enabled=True,                 # Enable caching for faster repeat queries
    cache_dir=DEFAULT_CACHE_DIR,        # Use default cache directory
    max_cache_age_days=1,               # Cache valid for 1 day
    rate_limit=3                        # Limit to 3 requests per second (Notion API limit)
)

# Verify authentication and report the outcome
auth_status = client.authenticate()
print(f"Authentication status: {'Successful' if auth_status else 'Failed'}")

# Display cache information. This is best-effort: any failure (including a
# missing key in the returned mapping) is reported rather than aborting the cell.
try:
    cache_info = client.get_cache_info()
    print("\nCache configuration:")
    print(f"- Cache enabled: {cache_info['enabled']}")
    print(f"- Cache directory: {cache_info['cache_dir']}")
    print(f"- Max age: {cache_info['max_age_days']} days")
except Exception as e:
    print(f"Could not get cache info: {e}")

## 2. Listing Documents

Let's retrieve and display the list of available documents:

In [None]:
# Retrieve the document listing (served from the cache when one is available)
print("Fetching documents (will use cache after first run)...")
documents = client.list_documents()
print(f"Found {len(documents)} documents:")

# One two-line summary per document: title with ID, then last edit time
for doc in documents:
    doc_title, doc_id = doc.title, doc.id
    print(f"- {doc_title} (ID: {doc_id})")
    last_edited = doc.last_edited_time
    print(f"  Last edited: {last_edited}")

## 3. Processing Document Content

Now, let's fetch and process the content of the first document:

In [None]:
if documents:
    # Work with the first document in the listing
    doc = documents[0]
    print(f"Processing document: {doc.title}")

    # get_document_content hands back a (metadata, blocks) pair
    content = client.get_document_content(doc.id)
    metadata, blocks = content
    print(f"Retrieved {len(blocks)} blocks from document")
    print(f"Document last edited: {metadata.last_edited_time}")
    print(f"Last fetched from API: {metadata.last_fetched}")

    # Run the text processor over every block
    processor = TextProcessor()
    processed_blocks = processor.process_blocks(blocks)

    # Preview only the first two processed blocks to keep the output short
    print("\nProcessed content:")
    preview = processed_blocks[:2]
    for block in preview:
        print(f"\nBlock type: {block['type']}")
        entity_texts = [e['text'] for e in block['entities']]
        print(f"Entities found: {entity_texts}")
        print(f"Keywords: {block['keywords']}")

    # Fetch the same document again with use_cache=False to bypass the cache
    print("\nForcing refresh (bypassing cache):")
    refreshed = client.get_document_content(doc.id, use_cache=False)
    metadata, blocks = refreshed
    print(f"Retrieved {len(blocks)} blocks directly from API")

## 4. Building Document Hierarchy

Let's build a hierarchy from the blocks fetched in the previous section and inspect the document's structure:

In [None]:
if documents:
    # Construct the hierarchy from the blocks fetched in the previous section
    hierarchy = Hierarchy()
    root = hierarchy.build_hierarchy(blocks)

    # Dictionary form of the hierarchy, printed for inspection
    structure = hierarchy.to_dict()
    print("Document structure:")
    print(structure)

    # Report the cache contents after the fetches above; any failure here
    # (including a missing key) is reported instead of stopping the cell.
    try:
        cache_info = client.get_cache_info()
        print("\nCache status after operations:")
        file_count = cache_info['num_files']
        print(f"- Files in cache: {file_count}")
        size_mb = format(cache_info['total_size_mb'], '.2f')
        print(f"- Cache size: {size_mb} MB")
    except Exception as err:
        print(f"Could not get cache info: {err}")

## 5. Automatic Tagging

Finally, let's generate tags and a sentiment reading for the first few blocks of the document:

In [None]:
if documents:
    # Create the tagger and register a few custom tags
    custom_tags = ["important", "review", "followup"]
    tagger = Tagger()
    tagger.add_custom_tags(custom_tags)

    print("Generated tags:")
    # Only the first three blocks are tagged, to keep the demo output short
    for block in blocks[:3]:
        tags = tagger.generate_tags(block)
        snippet = block.content[:50]
        print(f"\nBlock content: {snippet}...")
        tag_names = [tag.name for tag in tags]
        print(f"Tags: {tag_names}")

        # Sentiment is computed from the block's full content, not the snippet
        sentiment = tagger.analyze_sentiment(block.content)
        print(f"Sentiment: {sentiment}")