In [109]:
from notion_client import Client
import os
from dotenv import load_dotenv
load_dotenv()
NOTION_API_KEY=os.getenv("NOTION_API_KEY")
notion = Client(auth=NOTION_API_KEY)
import json

In [110]:
def extract_text(block):
    """Extract text content from a block."""
    if "text" in block:
        return "".join([text["plain_text"] for text in block["text"]])
    return ""

def fetch_block_children(block_id, notion, depth=0):
    """Fetch all children blocks of a given block recursively."""
    block_children = notion.blocks.children.list(block_id=block_id)["results"]
    content = []

    for block in block_children:
        block_type = block["type"]
        block_data = block[block_type]
        block_content = {"type": block_type, "content": []}

        # Extract text content for various block types
        if block_type in ["paragraph", "heading_1", "heading_2", "heading_3", "bulleted_list_item", "numbered_list_item", "to_do", "toggle", "quote"]:
            block_content["content"].append(extract_text(block_data))

        elif block_type == "image":
            block_content["content"].append(block_data["image"]["file"]["url"])

        elif block_type == "child_page":
            block_content["content"].append({"title": block_data["title"]})

        # Recursively fetch children for blocks that can contain other blocks
        if "has_children" in block and block["has_children"]:
            block_content["children"] = fetch_block_children(block["id"], notion, depth + 1)

        content.append(block_content)

    return content

def fetch_all_pages(notion):
    """Fetch all standalone pages in the workspace."""
    pages = []
    query_results = notion.search(filter={"value": "page", "property": "object"})["results"]

    for page in query_results:
        
        page_id = page["id"]
        print(page_id)
        print(page)
        page_content = fetch_block_children(page_id, notion)
        pages.append({"page_id": page_id, "content": page_content})

    return pages

def create_corpus(notion):
    """Create a corpus from all standalone pages in the workspace."""
    corpus = fetch_all_pages(notion)
    return corpus

In [111]:
corpus = create_corpus(notion)

# Save the corpus to a JSON file
with open("notion_corpus.json", "w", encoding="utf-8") as f:
    json.dump(corpus, f, ensure_ascii=False, indent=4)

4bb67a2f-b3d5-450c-aa10-65fa3edbe1db
{'object': 'page', 'id': '4bb67a2f-b3d5-450c-aa10-65fa3edbe1db', 'created_time': '2024-04-05T09:57:00.000Z', 'last_edited_time': '2024-04-05T17:03:00.000Z', 'created_by': {'object': 'user', 'id': 'ee0f78bc-69b2-4c8b-8b45-f90b287fb803'}, 'last_edited_by': {'object': 'user', 'id': 'ee0f78bc-69b2-4c8b-8b45-f90b287fb803'}, 'cover': {'type': 'external', 'external': {'url': 'https://images.unsplash.com/photo-1527345931282-806d3b11967f?ixlib=rb-4.0.3&q=85&fm=jpg&crop=entropy&cs=srgb'}}, 'icon': {'type': 'external', 'external': {'url': 'https://www.notion.so/icons/list_lightgray.svg'}}, 'parent': {'type': 'workspace', 'workspace': True}, 'archived': False, 'properties': {'title': {'id': 'title', 'type': 'title', 'title': [{'type': 'text', 'text': {'content': 'Weekly To-do List', 'link': None}, 'annotations': {'bold': False, 'italic': False, 'strikethrough': False, 'underline': False, 'code': False, 'color': 'default'}, 'plain_text': 'Weekly To-do List', 'hr