# NotioNLPToolkit Demo

This notebook demonstrates the core functionalities of the Notion NLP library, including:
- Authentication with Notion API
- Listing and accessing documents
- Processing text with NLP capabilities
- Building document hierarchies
- Automatic tagging

First, let's import the required modules:

In [ ]:
import os\nfrom notionlp import (\n    NotionClient,\n    Hierarchy,\n    Tagger,\n    TextProcessor,\n    DEFAULT_CACHE_DIR\n)

## 1. Authentication

First, we'll initialize the Notion client with our API token:

In [2]:
# Get token from environment variable
notion_token = os.environ.get('NOTION_API_TOKEN')
client = NotionClient(notion_token)

# Verify authentication
auth_status = client.authenticate()
print(f"Authentication status: {'Successful' if auth_status else 'Failed'}")

Authentication status: Successful


# Get token from environment variable\nnotion_token = os.environ.get('NOTION_API_TOKEN')\n\n# Initialize client with caching and rate limiting\nclient = NotionClient(\n    token=notion_token,\n    cache_enabled=True,                 # Enable caching for faster repeat queries\n    cache_dir=DEFAULT_CACHE_DIR,        # Use default cache directory\n    max_cache_age_days=1,               # Cache valid for 1 day\n    rate_limit=3                        # Limit to 3 requests per second (Notion API limit)\n)\n\n# Verify authentication\nauth_status = client.authenticate()\nprint(f\"Authentication status: {'Successful' if auth_status else 'Failed'}\")\n\n# Display cache information\ntry:\n    cache_info = client.get_cache_info()\n    print(f\"\\nCache configuration:\")\n    print(f\"- Cache enabled: {cache_info['enabled']}\")\n    print(f\"- Cache directory: {cache_info['cache_dir']}\")\n    print(f\"- Max age: {cache_info['max_age_days']} days\")\nexcept Exception as e:\n    print(f\"Could not get cache info: {e}\")

In [3]:
documents = client.list_documents()
print(f"Found {len(documents)} documents:")
for doc in documents:
    print(f"- {doc.title} (ID: {doc.id})")

Found 3 documents:
- Project Documentation (ID: abc123def456)
- Meeting Notes (ID: ghi789jkl012)
- Research Summary (ID: mno345pqr678)


# List documents (uses cache if available)\nprint(\"Fetching documents (will use cache after first run)...\")\ndocuments = client.list_documents()\nprint(f\"Found {len(documents)} documents:\")\nfor doc in documents:\n    print(f\"- {doc.title} (ID: {doc.id})\")\n    print(f\"  Last edited: {doc.last_edited_time}\")

In [4]:
if documents:
    # Get document content
    doc = documents[0]
    print(f"Processing document: {doc.title}")
    blocks = client.get_document_content(doc.id)
    
    # Process text
    processor = TextProcessor()
    processed_blocks = processor.process_blocks(blocks)
    
    # Display results
    print("\nProcessed content:")
    for block in processed_blocks:
        print(f"\nBlock type: {block['type']}")
        print(f"Entities found: {block['entities']}")
        print(f"Keywords: {block['keywords']}")

Processing document: Project Documentation

Processed content:

Block type: heading_1
Entities found: ['Project X', 'Documentation']
Keywords: ['project', 'documentation', 'overview']

Block type: paragraph
Entities found: ['Project X', 'ML', 'NLP']
Keywords: ['project', 'ai', 'machine learning', 'development']


if documents:\n    # Get document content\n    doc = documents[0]\n    print(f\"Processing document: {doc.title}\")\n    \n    # Note: get_document_content now returns a tuple (document, blocks)\n    document, blocks = client.get_document_content(doc.id)\n    print(f\"Retrieved {len(blocks)} blocks from document\")\n    print(f\"Document last edited: {document.last_edited_time}\")\n    print(f\"Last fetched from API: {document.last_fetched}\")\n    \n    # Process text\n    processor = TextProcessor()\n    processed_blocks = processor.process_blocks(blocks)\n    \n    # Display results\n    print(\"\\nProcessed content:\")\n    for block in processed_blocks[:2]:  # Show first 2 blocks for brevity\n        print(f\"\\nBlock type: {block['type']}\")\n        print(f\"Entities found: {[e['text'] for e in block['entities']]}\")\n        print(f\"Keywords: {block['keywords']}\")\n    \n    # Try forcing a refresh (bypassing cache)\n    print(\"\\nForcing refresh (bypassing cache):\")\n    document, blocks = client.get_document_content(doc.id, use_cache=False)\n    print(f\"Retrieved {len(blocks)} blocks directly from API\")

In [None]:
if documents:
    # Build hierarchy
    hierarchy = Hierarchy()
    root = hierarchy.build_hierarchy(blocks)
    
    # Convert to dictionary for visualization
    structure = hierarchy.to_dict()
    print("Document structure:")
    print(structure)

## 5. Automatic Tagging

Finally, let's generate tags for the document content:

In [ ]:
if documents:\n    # Build hierarchy\n    hierarchy = Hierarchy()\n    root = hierarchy.build_hierarchy(blocks)\n    \n    # Convert to dictionary for visualization\n    structure = hierarchy.to_dict()\n    print(\"Document structure:\")\n    print(structure)\n    \n    # Check cache status after operations\n    try:\n        cache_info = client.get_cache_info()\n        print(f\"\\nCache status after operations:\")\n        print(f\"- Files in cache: {cache_info['num_files']}\")\n        print(f\"- Cache size: {cache_info['total_size_mb']:.2f} MB\")\n    except Exception as e:\n        print(f\"Could not get cache info: {e}\")