# Converting Notion API Data to Portable Text for the Sanity.io Dataset
##### By Cdaprod (David Cannan)
##### November 19, 2024
## Prerequsite Step:
### Generate JSON Log for Persistence needed later

In [1]:
import json
import os

# File path for storing the progress log
PROGRESS_LOG_FILE = 'notion_progress_log.json'

# Load existing progress log or create a new one if it doesn't exist
if os.path.exists(PROGRESS_LOG_FILE):
    with open(PROGRESS_LOG_FILE, 'r') as file:
        processed_pages = json.load(file)
else:
    processed_pages = []

print(f"Loaded progress log. Number of pages already processed: {len(processed_pages)}")

Loaded progress log. Number of pages already processed: 1


## Setup Environment Variables and Notion Client

In [2]:
# Import necessary libraries
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get Notion API token and Database ID from environment variables
NOTION_INTEGRATION_TOKEN = os.getenv('NOTION_INTEGRATION_TOKEN')
NOTION_DATABASE_ID = os.getenv('NOTION_DATABASE_ID')

# Ensure environment variables are loaded
if not NOTION_INTEGRATION_TOKEN or not NOTION_DATABASE_ID:
    raise EnvironmentError("Please set NOTION_INTEGRATION_TOKEN and NOTION_DATABASE_ID in the .env file")

In [3]:
# Import the Notion client
from notion_client import Client

# Initialize the Notion client
notion = Client(auth=NOTION_INTEGRATION_TOKEN)

# Confirm that the client is initialized
print("Notion client initialized successfully")

Notion client initialized successfully


# Inspect the database connection 

In [4]:
# Function to fetch and print pages from a Notion database
def fetch_notion_database(database_id):
    response = notion.databases.query(database_id=database_id)
    return response['results']

# Fetch and display results from the Notion database
try:
    notion_pages = fetch_notion_database(NOTION_DATABASE_ID)
    if notion_pages:
        print(f"Number of pages found: {len(notion_pages)}")
        for page in notion_pages:
            print(page)  # Print each page's details
    else:
        print("No pages found in the database.")
except Exception as e:
    print(f"Error fetching Notion database: {e}")

Number of pages found: 100
{'object': 'page', 'id': '14390ef7-bebc-80da-990c-ee3e541444d2', 'created_time': '2024-11-19T20:28:00.000Z', 'last_edited_time': '2024-11-19T21:56:00.000Z', 'created_by': {'object': 'user', 'id': '5c0ade10-6fb0-459c-bd30-0571b86d02e5'}, 'last_edited_by': {'object': 'user', 'id': '5c0ade10-6fb0-459c-bd30-0571b86d02e5'}, 'cover': None, 'icon': {'type': 'emoji', 'emoji': '📹'}, 'parent': {'type': 'database_id', 'database_id': '371d24c0-59fd-4622-a0b7-fa8369bbede8'}, 'archived': False, 'in_trash': False, 'properties': {'Created time': {'id': 'EOiB', 'type': 'created_time', 'created_time': '2024-11-19T20:28:00.000Z'}, 'Last edited by': {'id': 'MNQk', 'type': 'last_edited_by', 'last_edited_by': {'object': 'user', 'id': '5c0ade10-6fb0-459c-bd30-0571b86d02e5', 'name': 'David Cannan', 'avatar_url': 'https://lh3.googleusercontent.com/a-/AFdZucpV9CJe58bbkJi0GMZiBz9zgA-FyjhTzsemCkJy488=s100', 'type': 'person', 'person': {'email': 'davidacannan@gmail.com'}}}, 'Last edited 

## Fetch All Pages found in Database

In [5]:
# Function to handle pagination for large databases
def fetch_all_notion_pages(database_id):
    results = []
    next_cursor = None

    while True:
        response = notion.databases.query(database_id=database_id, start_cursor=next_cursor)
        results.extend(response['results'])
        next_cursor = response.get('next_cursor')
        
        if not next_cursor:
            break

    return results

# Fetch all pages and display the number of pages found
all_notion_pages = fetch_all_notion_pages(NOTION_DATABASE_ID)
print(f"Total number of pages found: {len(all_notion_pages)}")

Total number of pages found: 148


## List All Pages by Title

In [6]:
# Function to display the titles of all pages
def display_page_titles(pages):
    for page in pages:
        title_property = page['properties']['Name']['title']
        if title_property:
            title = title_property[0]['plain_text']
            print(f"Page Title: {title}")
        else:
            print("Unnamed page")

# Display the titles of all pages
display_page_titles(all_notion_pages)

Page Title: Raspberry Pi Cluster
Page Title: Mastering Emotional Detachment: Nurturing Relationships Without Losing Yourself
Page Title: Sanity Blog Registry Clap Back 👏 
Page Title: Guide to Focus
Page Title: Hash Tables and Hashing in Data Structures and Algorithms (DSA)
Page Title: Why Mastering Emotional Detachment is a Good Thing for Connection
Page Title: What’s an Edge Case in Problem Solving
Page Title: David’s 10 Year Guide to Starting Fresh and being Successful 
Page Title: Learn Go Through Real Questions 
Page Title: RTMP Golang Project
Page Title: From Service to Micro Service
Page Title: BlogPost 
Page Title: Designing a Principled Registry
Page Title: Go-Kube-Compiler
Page Title: Practical Data Structures in Golang
Page Title: Unsafe Memory Operations in Golang
Page Title: Building VM in Golang
Page Title: How Erasure Coding Works
Page Title: “Raising Elephants Is So Utterly Boring” 
Page Title: TJS - Rails Web Portfolio with Dynamic iFrames
Page Title: Multi-axis motoriz

## Part 2: Filtering Fetched Pages

In [7]:
# Function to fetch pages filtered by a specific metadata property (e.g., tag)
def fetch_filtered_notion_pages(database_id, filter_property, filter_value):
    response = notion.databases.query(
        database_id=database_id,
        filter={
            "property": filter_property,
            "multi_select": {
                "contains": filter_value
            }
        }
    )
    return response['results']

# Example usage: Replace 'Tags' with the property name and 'Important' with the filter value
filter_property = 'Tags'  # Replace with the name of your property
filter_value = 'Sanity Dataset'  # Replace with the value you want to filter by

filtered_pages = fetch_filtered_notion_pages(NOTION_DATABASE_ID, filter_property, filter_value)
print(f"Number of filtered pages found: {len(filtered_pages)}")

Number of filtered pages found: 1


# Function to fetch content from a Notion page and convert to Portable Text
## Function to parse Notion rich text (same as before)

In [8]:
def fetch_page_content(page_id):
    response = notion.blocks.children.list(page_id)
    return response['results']

# Function to convert Notion page content to Portable Text schema
def parse_page_content_to_portable_text(page_content):
    portable_text = []
    for block in page_content:
        block_type = block.get('type')
        if block_type in ['paragraph', 'heading_1', 'heading_2', 'heading_3']:
            style = 'normal' if block_type == 'paragraph' else block_type.replace('_', '')
            text_content = block[block_type].get('rich_text', [])
            children = parse_notion_rich_text(text_content)
            portable_text.append({
                "_type": "block",
                "style": style,
                "children": children
            })
        # Add cases for other block types as needed
    return portable_text

# Function to parse Notion rich text (same as before)
def parse_notion_rich_text(rich_text_array):
    children = []
    for text in rich_text_array:
        marks = []
        annotations = text.get('annotations', {})
        if annotations.get('bold'):
            marks.append('strong')
        if annotations.get('italic'):
            marks.append('em')
        if annotations.get('underline'):
            marks.append('underline')
        if annotations.get('strikethrough'):
            marks.append('strike-through')
        if annotations.get('code'):
            marks.append('code')
        # Add more annotations if needed

        children.append({
            "_type": "span",
            "text": text.get('plain_text', ''),
            "marks": marks
        })
    return children

## Convert Filtered Pages to Portable Text
### with JSON Log (cell #1) 

In [9]:
# Function to convert filtered pages into Portable Text and track progress
def convert_filtered_pages_to_portable_text(filtered_pages):
    all_portable_text = []
    new_processed_pages = []

    for page in filtered_pages:
        page_id = page['id']

        # Skip if the page has already been processed
        if page_id in processed_pages:
            print(f"Page {page_id} already processed. Skipping...")
            continue

        # Fetch and parse the page content
        page_content = fetch_page_content(page_id)
        portable_text = parse_page_content_to_portable_text(page_content)

        # Include metadata properties (e.g., page title or tags)
        page_title = page['properties']['Name']['title'][0]['plain_text'] if page['properties']['Name']['title'] else "Untitled"
        metadata = {
            "title": page_title,
            "tags": [tag['name'] for tag in page['properties'].get('Tags', {}).get('multi_select', [])]
        }

        # Append converted content and metadata
        all_portable_text.append({
            "metadata": metadata,
            "content": portable_text
        })

        # Mark page as processed
        new_processed_pages.append(page_id)
        print(f"Page {page_id} processed successfully.")

    # Update the progress log with new processed pages
    with open(PROGRESS_LOG_FILE, 'w') as file:
        json.dump(processed_pages + new_processed_pages, file)

    return all_portable_text

# Convert and display filtered pages as Portable Text
portable_text_data = convert_filtered_pages_to_portable_text(filtered_pages)
print("Conversion complete.")

Page 14090ef7-bebc-8027-8209-ebc86820299e processed successfully.
Conversion complete.


## Convert Notion Pages to Portable Text

In [10]:
# Display a summary of converted pages
for page_data in portable_text_data:
    print(f"Page Title: {page_data['metadata']['title']}")
    print("Portable Text Content:")
    print(page_data['content'])
    print("\n")

# Optionally save the results to a file
with open('converted_pages.json', 'w') as output_file:
    json.dump(portable_text_data, output_file, indent=2)
    print("Converted pages saved to converted_pages.json.")

Page Title: Why Mastering Emotional Detachment is a Good Thing for Connection
Portable Text Content:
[{'_type': 'block', 'style': 'heading3', 'children': [{'_type': 'span', 'text': 'Table of Contents', 'marks': []}]}, {'_type': 'block', 'style': 'normal', 'children': [{'_type': 'span', 'text': 'David’s Philosophical Hour 🍻 “Emotional detachment” isn’t ghosting, avoiding, or manipulating—it’s maintaining your peace while being present and supportive, not using distance as a weapon.', 'marks': []}]}, {'_type': 'block', 'style': 'normal', 'children': [{'_type': 'span', 'text': 'I will educate anyone today that wants to learn… and I’ll even go as far as to say knowing what “emotional detachment” is an important aspect of how we grow a healthy relationship, and is an essential part that helps maintain individuality.', 'marks': []}]}, {'_type': 'block', 'style': 'normal', 'children': [{'_type': 'span', 'text': '🙌 Let’s talk about why controlling our emotions in relationships is so important 

## The above data contained content [ ] so we to modify our parsing function to ensure we fetch the data appropriately

### Explanation:

#### Enhanced Block Support:
- List Items: Handles bulleted_list_item and numbered_list_item blocks, ensuring list items are parsed and included in the Portable Text output.
- Code Blocks: Adds support for code blocks, including specifying the programming language.
- Toggle Blocks: Adds parsing for toggle blocks, which are collapsible sections in Notion.
Fallbacks:
- Ensures content is only added to the portable_text list if there is rich text (rich_text) content present in the block.

#### Extensibility:

- Easily add more block types (e.g., quote, image, divider) as needed by extending the elif conditions.

In [11]:
# Function to fetch content from a Notion page, including nested blocks
def fetch_page_content(page_id):
    response = notion.blocks.children.list(page_id)
    return response['results']

# Function to convert Notion page content to Portable Text schema
def parse_page_content_to_portable_text(page_content):
    portable_text = []
    for block in page_content:
        block_type = block.get('type')

        # Parse paragraphs and headings
        if block_type in ['paragraph', 'heading_1', 'heading_2', 'heading_3']:
            style = 'normal' if block_type == 'paragraph' else block_type.replace('_', '')
            text_content = block[block_type].get('rich_text', [])
            if text_content:
                children = parse_notion_rich_text(text_content)
                portable_text.append({
                    "_type": "block",
                    "style": style,
                    "children": children
                })

        # Parse list items (bulleted or numbered)
        elif block_type in ['bulleted_list_item', 'numbered_list_item']:
            style = 'normal'
            text_content = block[block_type].get('rich_text', [])
            if text_content:
                children = parse_notion_rich_text(text_content)
                portable_text.append({
                    "_type": "block",
                    "style": style,
                    "listItem": 'bullet' if block_type == 'bulleted_list_item' else 'number',
                    "children": children
                })

        # Parse code blocks
        elif block_type == 'code':
            style = 'code'
            text_content = block[block_type].get('text', [])
            if text_content:
                children = parse_notion_rich_text(text_content)
                portable_text.append({
                    "_type": "block",
                    "style": style,
                    "children": children,
                    "codeLanguage": block[block_type].get('language', 'plaintext')
                })

        # Parse toggle blocks
        elif block_type == 'toggle':
            text_content = block[block_type].get('rich_text', [])
            if text_content:
                children = parse_notion_rich_text(text_content)
                portable_text.append({
                    "_type": "block",
                    "style": "toggle",
                    "children": children
                })

        # Handle other block types (e.g., quote, image, etc.) as needed

    return portable_text

# Function to parse Notion rich text to Portable Text format
def parse_notion_rich_text(rich_text_array):
    children = []
    for text in rich_text_array:
        marks = []
        annotations = text.get('annotations', {})
        if annotations.get('bold'):
            marks.append('strong')
        if annotations.get('italic'):
            marks.append('em')
        if annotations.get('underline'):
            marks.append('underline')
        if annotations.get('strikethrough'):
            marks.append('strike-through')
        if annotations.get('code'):
            marks.append('code')

        children.append({
            "_type": "span",
            "text": text.get('plain_text', ''),
            "marks": marks
        })
    return children

In [13]:
# Display a summary of converted pages
for page_data in portable_text_data:
    print(f"Page Title: {page_data['metadata']['title']}")
    print("Portable Text Content:")
    print(page_data['content'])
    print("\n")

# Optionally save the results to a file
with open('converted_pages.json', 'w') as output_file:
    json.dump(portable_text_data, output_file, indent=2)
    print("Converted pages saved to converted_pages.json.")

Page Title: Why Mastering Emotional Detachment is a Good Thing for Connection
Portable Text Content:
[{'_type': 'block', 'style': 'heading3', 'children': [{'_type': 'span', 'text': 'Table of Contents', 'marks': []}]}, {'_type': 'block', 'style': 'normal', 'children': [{'_type': 'span', 'text': 'David’s Philosophical Hour 🍻 “Emotional detachment” isn’t ghosting, avoiding, or manipulating—it’s maintaining your peace while being present and supportive, not using distance as a weapon.', 'marks': []}]}, {'_type': 'block', 'style': 'normal', 'children': [{'_type': 'span', 'text': 'I will educate anyone today that wants to learn… and I’ll even go as far as to say knowing what “emotional detachment” is an important aspect of how we grow a healthy relationship, and is an essential part that helps maintain individuality.', 'marks': []}]}, {'_type': 'block', 'style': 'normal', 'children': [{'_type': 'span', 'text': '🙌 Let’s talk about why controlling our emotions in relationships is so important 

### Debug
If you still encounter issues with empty content, print the page_content to inspect the exact structure returned by the Notion API:

In [12]:
print(json.dumps(page_content, indent=2))

NameError: name 'page_content' is not defined