In [1]:
# All imports live together at the top of the cell: stdlib first, then third-party.
import json
import os

from dotenv import load_dotenv
from notion_client import Client

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
NOTION_API_KEY = os.getenv("NOTION_API_KEY")
if not NOTION_API_KEY:
    # Fail here with a clear message rather than on the first API request.
    raise RuntimeError(
        "NOTION_API_KEY is not set; define it in the environment or in a .env file."
    )
notion = Client(auth=NOTION_API_KEY)

In [2]:
def fetch_block_children(block_id, notion):
    """Fetch every child block of ``block_id``, recursing into nested children.

    The listing endpoint is paginated: each response carries ``results`` plus
    ``has_more`` / ``next_cursor``. All pages are followed so that parents with
    more children than fit in one response are not silently truncated.

    Args:
        block_id: ID of the block (or page) whose children are listed.
        notion: Client object exposing ``blocks.children.list(...)``.

    Returns:
        A list of block dicts exactly as returned by the API. Each block whose
        ``has_children`` flag is truthy gets an extra ``"children"`` key holding
        the recursively fetched list of its own children.
    """
    content = []
    cursor = None
    while True:
        request = {"block_id": block_id}
        if cursor:
            # Only send start_cursor when continuing from a previous page.
            request["start_cursor"] = cursor
        response = notion.blocks.children.list(**request)

        for block in response["results"]:
            # Keep the entire block payload.
            content.append(block)
            if block.get("has_children"):
                # Attach nested blocks in place on the parent block.
                block["children"] = fetch_block_children(block["id"], notion)

        cursor = response.get("next_cursor") if response.get("has_more") else None
        if not cursor:
            break
    return content

def fetch_all_pages(notion):
    """Fetch every page returned by the search endpoint, with its block tree.

    The search endpoint is paginated (``has_more`` / ``next_cursor``); all
    result pages are followed so that the corpus is not limited to the first
    batch of search results.

    Args:
        notion: Client object exposing ``search(...)`` and
            ``blocks.children.list(...)``.

    Returns:
        A list of dicts, one per page, each with two keys: ``"page_data"``
        (the page object as returned by search) and ``"content"`` (the result
        of ``fetch_block_children`` for that page's ID).
    """
    pages = []
    cursor = None
    while True:
        request = {"filter": {"value": "page", "property": "object"}}
        if cursor:
            # Only send start_cursor when continuing from a previous page.
            request["start_cursor"] = cursor
        response = notion.search(**request)

        for page in response["results"]:
            pages.append({
                "page_data": page,  # the entire page object
                "content": fetch_block_children(page["id"], notion),  # all block data
            })

        cursor = response.get("next_cursor") if response.get("has_more") else None
        if not cursor:
            break
    return pages

def create_corpus(notion):
    """Build the corpus: the list produced by ``fetch_all_pages`` for this client.

    Args:
        notion: Client object forwarded unchanged to ``fetch_all_pages``.

    Returns:
        Whatever ``fetch_all_pages(notion)`` returns.
    """
    return fetch_all_pages(notion)

In [3]:
# Pull the full corpus through the Notion client.
corpus = create_corpus(notion)

# Serialise to an indented, non-ASCII-escaped JSON string and persist it as UTF-8.
with open("notion_corpus.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(corpus, ensure_ascii=False, indent=4))

In [3]:
def load_corpus(file_path='notion_corpus.json'):
    """Read a UTF-8 JSON file and return the decoded Python object.

    Args:
        file_path: Path of the JSON file; defaults to ``notion_corpus.json``.

    Returns:
        The object obtained by JSON-decoding the file's contents.
    """
    with open(file_path, mode='r', encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
    
def remove_keys_from_dict(d, keys):
    """Return a copy of ``d`` with the given keys stripped at every nesting level.

    Dicts are rebuilt without any key found in ``keys``; lists are rebuilt
    element by element; any other value is returned unchanged.

    Args:
        d: A dict, a list, or any other value.
        keys: Container of keys to drop (tested with ``in``).

    Returns:
        The pruned structure; dicts and lists are new objects.
    """
    if isinstance(d, list):
        return [remove_keys_from_dict(element, keys) for element in d]
    if not isinstance(d, dict):
        # Scalars and other non-container values pass through untouched.
        return d

    pruned = {}
    for name, value in d.items():
        if name in keys:
            continue
        pruned[name] = remove_keys_from_dict(value, keys)
    return pruned

# Load the corpus from load_corpus's default path, notion_corpus.json.
corpus = load_corpus()

In [4]:
# Keys dropped from every nesting level of the corpus before it is flattened
# to text (presumably metadata/formatting fields not wanted in the output).
keys_to_remove = [
    "id", "color", "type", "link", "href", "public_url", "object",
    "database_id", "icon", "cover", "bold", "italic", "strikethrough",
    "underline", "code", "archived", "last_edited_time", "created_by",
    "parent", "relation", "has_more", "Sub-item","has_children","last_edited_by","annotations","is_toggleable",
    "url","divider","toggle","file",'created_time',"unsupported"
]

cleaned_data = remove_keys_from_dict(corpus, keys_to_remove)
cleaned_file_path = 'cleaned_notion_corpus.json'
# Write explicitly as UTF-8 with ensure_ascii=False: this matches how
# notion_corpus.json is written and how load_corpus reads files, and keeps
# non-ASCII text readable instead of \uXXXX-escaped.
with open(cleaned_file_path, 'w', encoding="utf-8") as file:
    json.dump(cleaned_data, file, ensure_ascii=False, indent=4)

In [5]:
# Rebind `corpus` to the cleaned corpus read back from cleaned_notion_corpus.json.
corpus=load_corpus('cleaned_notion_corpus.json')

In [9]:
def parse_value(v, new_key):
    """Render one value as a flat ``key:value`` text fragment.

    * Non-empty dicts are delegated to ``parse_dict`` with ``new_key`` as the
      parent key.
    * Lists are rendered as ``key:[item,item,...]``; each element is parsed
      with an empty key and empty results are dropped.
    * Any other truthy value is rendered as ``key:value``.

    When ``new_key`` is the empty string (as for list elements) no ``key:``
    prefix is emitted, so scalar or nested-list elements do not get a dangling
    leading colon (``tags:[a,b]`` rather than ``tags:[:a,:b]``).

    Args:
        v: The value to render (dict, list, or scalar).
        new_key: Fully qualified key for this value; ``''`` for list elements.

    Returns:
        The rendered string, or ``None`` when there is nothing to show.
        Note that every falsy scalar is dropped, not only the empty string:
        ``0``, ``False`` and ``None`` also yield ``None``.
    """
    # Prefix is omitted entirely for an empty key.
    label = '' if new_key == '' else f"{new_key}:"

    if isinstance(v, dict):
        # Only parse non-empty dictionaries.
        return parse_dict(v, new_key) if v else None

    if isinstance(v, list):
        # Parse each element without a key and discard empty results.
        rendered = [item for item in (parse_value(element, '') for element in v) if item]
        return f"{label}[{','.join(rendered)}]" if rendered else None

    if v:
        return f"{label}{v}"

    # Empty strings, empty containers and other falsy scalars are excluded.
    return None

def parse_dict(d, parent_key=''):
    """Flatten a dict into a comma-separated string of ``key:value`` fragments.

    Each key is qualified as ``parent_key:key`` when ``parent_key`` is truthy,
    otherwise the key is used as-is. Values are rendered by ``parse_value``;
    fragments that come back empty are left out.

    Args:
        d: The dict to flatten.
        parent_key: Qualified key of the enclosing value, if any.

    Returns:
        The fragments joined with ``,`` (an empty string if none remain).
    """
    def qualify(name):
        # Top-level keys stay bare; nested keys get the parent path prepended.
        return name if not parent_key else f"{parent_key}:{name}"

    fragments = (parse_value(value, qualify(name)) for name, value in d.items())
    return ','.join(fragment for fragment in fragments if fragment)

In [12]:
# Flatten every page of the corpus into a single-line string.
formatted_data = list(map(parse_dict, corpus))

# Collect one newline-terminated line per page, then stitch them together.
lines = []
for data in formatted_data:
    lines.append(data + "\n")
text = "".join(lines)

page_data:properties:title:title:[text:content:Testing,plain_text:Testing],content:[heading_1:rich_text:[text:content:What is Lorem Ipsum?,plain_text:What is Lorem Ipsum?],paragraph:rich_text:[text:content:Lorem ispum lorem ipsum,plain_text:Lorem ispum lorem ipsum]]
page_data:properties:title:title:[text:content:Monthly Budget,plain_text:Monthly Budget],content:[paragraph:rich_text:[text:content:Use this template to figure out how much you make and spend in a given month. List your ,plain_text:Use this template to figure out how much you make and spend in a given month. List your ,text:content:income,plain_text:income,text:content: sources for the month and their amount in the ,plain_text: sources for the month and their amount in the ,text:content:Income,plain_text:Income,text:content: table and your ,plain_text: table and your ,text:content:expenses,plain_text:expenses,text:content: in the ,plain_text: in the ,text:content:Expenses,plain_text:Expenses,text:content: table. Find you to