In [56]:
#Initiate Notion client

import os
from dotenv import load_dotenv
from notion_client import Client

load_dotenv()

NOTION_API_TOKEN = os.getenv('NOTION_SENSENETS_TOKEN')
URLS_DATABASE_ID = os.getenv('NOTION_URLS_DATABASE_ID')
POSTS_DATABASE_ID = os.getenv('NOTION_POSTS_DATABASE_ID')
PROFILES_DATABASE_ID =os.getenv('NOTION_PROFILES_DATABASE_ID')
KEYWORDS_DATABASE_ID = os.getenv("NOTION_KEYWORDS_DATABASE_ID")
HEADERS = {
    'Authorization': f'Bearer {NOTION_API_TOKEN}',
    'Content-Type': 'application/json',
    'Notion-Version': '2022-06-28'
}
notion = Client(auth=NOTION_API_TOKEN)

In [2]:
#Initiate firebase client
import firebase_admin
from firebase_admin import credentials, firestore

# Path to your service account key
cred = credentials.Certificate('creds/sensenets-dataset-firebase-adminsdk-rpero-9c552cac56.json')
firebase_admin.initialize_app(cred)

# Initialize Firestore DB
db = firestore.client()


In [217]:
#functions for mapping json to Notion profile table
def lookup_notion_profile(firebase_id):
    response = notion.databases.query(
        **{
            "database_id": PROFILES_DATABASE_ID,
            "filter": {
                "property": "firebase_id",  # The property name in Notion
                "rich_text": {
                    "equals": firebase_id  # Use 'equals' to match the authorId exactly
                }
            }
        }
    )
    
    # Check if any results are found
    if response["results"]:
        print(f"Profile found for authorId: {firebase_id}")
        return response["results"][0]["id"]  # Return the first matched profile ID if it exists
    else:
        print(f"No profile found for authorId: {firebase_id}")
        return None
    
def build_base_notion_profile(profile_doc:dict,profile_id:str):
    print("in build profile page")
    platform = profile_doc["platformIds"][0].split(':')[0]
    profile = profile_doc[platform][0]["profile"]
    #make handle modular to include mastodon
    handle = profile["username"]
    if platform == 'mastodon':
        handle_field = "Mastodon Handle"
        platformId_field = "mastodon_id"
        name = profile['displayName']
        server = profile['mastodonServer']

    else:
        handle_field = "Twitter Handle"
        platformId_field = "twitter_id"
        name = profile['name']
        server = ''

    platform_id = profile['id']
    firebase_id = profile_id
    response = notion.pages.create(
        **{
            "parent": {"database_id": PROFILES_DATABASE_ID},
            "properties": {
                "Name": {
                    "title": [
                        {
                            "text": {"content": name}
                        }
                    ]
                },
                handle_field: {
                     "rich_text": [
                        {
                            "text": {"content": handle}
                        }
                    ]
                },
                "firebase_id": {
                     "rich_text": [
                        {
                            "text": {"content": firebase_id}
                        }
                    ]
                },
                "Server": {
                     "rich_text": [
                        {
                            "text": {"content": server}
                        }
                    ]
                },
                platformId_field: {
                     "rich_text": [
                        {
                            "text": {"content": platform_id}
                        }
                    ]
                },
                 "Platforms": {
                    "multi_select": [
                        {"name": platform} 
                    ]
                }
            }
        }
    )
    
    print(f"Created new page for profile with name: {name}")
    return response["id"]  # Return the page ID of the created URL page
def get_firebase_profile_doc(firebase_id):
    # Query the profile collection by postId
    profile_ref = db.collection('users').document(firebase_id)

    # Execute the query
    doc = profile_ref.get()
    if doc.exists:
        print(f'Document data: {doc.to_dict()}')
        return doc.to_dict()
    else:
        print('No such document!')
        return None
    
def load_profile_to_notion(firebase_id:str):
    notion_id = lookup_notion_profile(firebase_id)
    if  not notion_id:
        profile_dict = get_firebase_profile_doc(firebase_id)
        notion_id = build_base_notion_profile(profile_dict,firebase_id)
    return notion_id
        

In [208]:
load_profile_to_notion("twitter:550845228")

No profile found for authorId: twitter:550845228
Document data: {'settings': {'notificationFreq': 'daily', 'autopost': {'nanopub': {'value': 'MANUAL'}}}, 'platformIds': ['twitter:550845228'], 'twitter': [{'fetched': {'oldest_id': '1813496335233900907', 'newest_id': '1833265798137397612'}, 'read': {'refreshToken': 'AAAAAAAAAAAAAAAAAAAAACq0sAEAAAAAuw3FkbTK0Lf9qLbwqCqGssSvZWs%3Duvvst2vzOuEcfGYhHisS4oLUsnGE9cPep8Q5j2gjpAbZ88GRph', 'expiresIn': 315360000, 'accessToken': 'AAAAAAAAAAAAAAAAAAAAACq0sAEAAAAAuw3FkbTK0Lf9qLbwqCqGssSvZWs%3Duvvst2vzOuEcfGYhHisS4oLUsnGE9cPep8Q5j2gjpAbZ88GRph', 'expiresAtMs': 2041712750000}, 'user_id': '550845228', 'profile': {'id': '550845228', 'profile_image_url': 'https://pbs.twimg.com/profile_images/1736673697325281280/P9p1RCqB_normal.jpg', 'username': 'AnnaLeptikon', 'name': 'Anna Riedl'}, 'signupDate': 0}], 'signupDate': 1726518252489}
in build profile page
Created new page for profile with name: Anna Riedl


'10a96ae9-0655-810e-b338-f90e09b8e53d'

In [73]:
# Query the latest 10 users, ordered by the 'created_at' timestamp in descending order
users_ref = db.collection('users')

# Execute the query
docs = users_ref.stream()
profiles = []
# Print the results
for doc in docs:
    profiles.append((doc.to_dict(),doc.id))
    print(f'{doc.id} => {doc.to_dict()}')
print(f'query resulted with {len(profiles)} profiles')

mastodon:107714271376772838 => {'mastodon': [{'fetched': {'oldest_id': '110573206339687503', 'newest_id': '112903729932000692'}, 'read': {'accessToken': 'b2DpTiV5xxYjMXyf71D2WBtgmTZ7NNAdLciup7LBawY'}, 'user_id': '107714271376772838', 'profile': {'avatar': 'https://files.mastodon.social/accounts/avatars/107/714/271/376/772/838/original/9331699b5c120ed4.jpg', 'mastodonServer': 'mastodon.social', 'displayName': 'Ronen Tamari', 'id': '107714271376772838', 'username': 'ronent'}, 'signupDate': 1726521428364}], 'settings': {'notificationFreq': 'daily', 'autopost': {'nanopub': {'value': 'MANUAL'}}}, 'platformIds': ['mastodon:107714271376772838'], 'signupDate': 1726521428679}
mastodon:109301127580127072 => {'mastodon': [{'fetched': {'oldest_id': '112920697751318393', 'newest_id': '113127425695271858'}, 'read': {'accessToken': 'b2DpTiV5xxYjMXyf71D2WBtgmTZ7NNAdLciup7LBawY'}, 'user_id': '109301127580127072', 'profile': {'avatar': 'https://fediscience.org/system/accounts/avatars/109/301/127/580/127

In [91]:
build_base_notion_profile(profiles[0][0],profiles[0][1])

in build profile page
Created new page for profile with name: Ronen Tamari


'10596ae9-0655-8147-a994-f8dccdaab681'

In [88]:
print(profiles[0][0])

{'mastodon': [{'fetched': {'oldest_id': '110573206339687503', 'newest_id': '112903729932000692'}, 'read': {'accessToken': 'b2DpTiV5xxYjMXyf71D2WBtgmTZ7NNAdLciup7LBawY'}, 'user_id': '107714271376772838', 'profile': {'avatar': 'https://files.mastodon.social/accounts/avatars/107/714/271/376/772/838/original/9331699b5c120ed4.jpg', 'mastodonServer': 'mastodon.social', 'displayName': 'Ronen Tamari', 'id': '107714271376772838', 'username': 'ronent'}, 'signupDate': 1726521428364}], 'settings': {'notificationFreq': 'daily', 'autopost': {'nanopub': {'value': 'MANUAL'}}}, 'platformIds': ['mastodon:107714271376772838'], 'signupDate': 1726521428679}


In [83]:
def load_profiles_to_notion(profile_docs:list):
    for profile_doc,profile_id in profile_docs:
      if search_profile_in_notion(profile_id):
        print(f'page with {profile_id} exist in notion profile database')
      else:
        try:
          build_base_notion_profile(profile_doc,profile_id)
        except Exception as e:
          print(f"An error occurred while building the base Notion profile: {e}")


          

In [92]:
get_firebase_profile
load_profiles_to_notion(profiles)


Profile found for authorId: mastodon:107714271376772838
page with mastodon:107714271376772838 exist in notion profile database
No profile found for authorId: mastodon:109301127580127072
in build profile page
Created new page for profile with name: Ulrike Hahn
No profile found for authorId: twitter:550845228
in build profile page
Created new page for profile with name: Anna Riedl


In [47]:
#query for specific profile
profile_id = "twitter:550845228"

# Query the profile collection by postId
profile_ref = db.collection('profiles').document(profile_id)

# Execute the query
doc = profile_ref.get()
# Print the results
for doc in docs:
    print(f'{doc.id} => {doc.to_dict()}')

  return query.where(field_path, op_string, value)


twitter:550845228-twitter-550845228 => {'user_id': '550845228', 'platformId': 'twitter', 'userId': 'twitter:550845228', 'profile': {'id': '550845228', 'profile_image_url': 'https://pbs.twimg.com/profile_images/1736673697325281280/P9p1RCqB_normal.jpg', 'username': 'AnnaLeptikon', 'name': 'Anna Riedl'}}


In [150]:
#Updating Zotero Item types from triples when creating the url object
def get_url_zotero_triples(url:str):
    triples_ref = db.collection('triples').where('subject','==',url).where('predicate','==','https://sense-nets.xyz/hasZoteroItemType')

    # Execute the query
    docs = triples_ref.stream()
    triples = []
    # Print the results
    for doc in docs:
        triples.append(doc.to_dict())
        #print(f'{doc.id} => {doc.to_dict()}')
    print(f'query resulted with {len(triples)} triples')
    return triples

def get_zotero_types_from_triples(triples:list):
    types = set(triple["object"] for triple in triples)
    return types

def get_zotero_types_from_url(url:str):
    return get_zotero_types_from_triples(get_url_zotero_triples(url))
    

In [154]:
#Creating URL pages
def url_notion_lookup(url:str):
    response = notion.databases.query(
        **{
            "database_id": URLS_DATABASE_ID,
            "filter": {
                "property": "URL",  # The property name in Notion
                "url": {
                    "equals": url  # Use 'equals' to match the authorId exactly
                }
            }
        }
    )
    
    # Check if any results are found
    if response["results"]:
        print(f"page found for url: {url}")
        return response["results"][0]["id"]  # Return the first matched profile ID if it exists
    else:
        print(f"No page found for url: {url}")
        return None
    
def create_url_notion_page(url:str):
    print("in create url page")
    types = get_zotero_types_from_url(url)
    response = notion.pages.create(
        **{
            "parent": {"database_id": URLS_DATABASE_ID},
            "properties": {
                "Name": {
                    "title": [
                        {
                            "text": {"content": url}
                        }
                    ]
                },
                "URL": {
                    "url": url
                },
                "hasZoteroItemType":{
                    "multi_select": [{'name':typ} for typ in types]
                }
            }
        }
    )
    
    print(f"Created new page for URL: {url}")
    return response["id"]  # Return the page ID of the created URL page


In [54]:
url_notion_lookup("https://example.com/genai1")

page found for url: https://example.com/genai1


'0ff96ae9-0655-81d8-b72d-f751bdc605b8'

In [153]:
types = get_zotero_types_from_url('http://arxiv.org/abs/2408.14508')
create_url_notion_page('http://arxiv.org/abs/2408.14508',types)

  return query.where(field_path, op_string, value)
  triples_ref = db.collection('triples').where('subject','==',url).where('predicate','==','https://sense-nets.xyz/hasZoteroItemType')


query resulted with 1 triples
in create url page
Created new page for URL: http://arxiv.org/abs/2408.14508


'10796ae9-0655-81c6-ad3e-dd881d8ef9d6'

In [61]:
#Create Keyword page
def keyword_notion_lookup(keyword:str):
    response = notion.databases.query(
        **{
            "database_id": KEYWORDS_DATABASE_ID,
            "filter": {
                "property": "Name",  # The property name in Notion
                "title": {
                    "equals": keyword  # Use 'equals' to match the authorId exactly
                }
            }
        }
    )
    
    # Check if any results are found
    if response["results"]:
        print(f"page found for keyword: {keyword}")
        return response["results"][0]["id"]  # Return the first matched profile ID if it exists
    else:
        print(f"No page found for keyword: {keyword}")
        return None

def create_keyword_notion_page(keyword:str):
    print("in create page")
    response = notion.pages.create(
        **{
            "parent": {"database_id": KEYWORDS_DATABASE_ID},
            "properties": {
                "Name": {
                    "title": [
                        {
                            "text": {"content": keyword}
                        }
                    ]
                }
            }
        }
    )
    
    print(f"Created new page for keyword: {keyword}")
    return response["id"]  # Return the page ID of the created URL page


In [58]:
keyword_notion_lookup('Art history')

page found for keyword: Art history


'2b21e135-6035-4022-8145-b3a494cf4216'

In [62]:
create_keyword_notion_page('cats')

in create page
Created new page for keyword: cats


'10496ae9-0655-8194-970f-c1787835af3b'

In [225]:
#Query firebase for posts
# Query the latest 10 users, ordered by the 'created_at' timestamp in descending order
posts_ref = db.collection('posts').where("createdAtMs",'>',0).limit(None)


# Execute the query
docs = posts_ref.stream()
posts = []
# Print the results
for doc in docs:
    posts.append((doc.to_dict(),doc.id))
    #print(f'{doc.id} => {doc.to_dict()}')
print(f'query resulted with {len(posts)} posts')

  return query.where(field_path, op_string, value)


query resulted with 130 posts


In [93]:
#utility functions
def get_thread(thread_list:list):
    full_text = ''
    url = thread_list[0]["url"]
    for post_dict in thread_list:
        full_text = f'{full_text} {post_dict["content"]}'
    return full_text,url


In [97]:
get_thread(posts[0][0]["generic"]["thread"])

(' Rediscovering that statistics was originally „the science concerned with states“, German „Staat“.\n\nAnother field were the application (administration and accounting) emerged clearly before the more theoretical approaches.\n\nPorter, T. M. (2020): The rise of statistical thinking, 1820-1900 The feeling of learning that statistics was originally about „matters of states“ was the same as learning that wash-bears (the English translation of the German word for raccoon) are called that way because they wash their food.\n\nSome call it „Turing surprised“: not being surprised by new information but by finally integrating information one already possessed.',
 'https://x.com/AnnaLeptikon/status/1833265798137397612')

In [128]:
def get_post_triples(post_firebase_id:str):
    # Query the latest 10 users, ordered by the 'created_at' timestamp in descending order
    triples_ref = db.collection('triples').where('postId','==',post_firebase_id).where('predicate','in',props)

    # Execute the query
    docs = triples_ref.stream()
    triples = []
    # Print the results
    for doc in docs:
        triples.append(doc.to_dict())
        #print(f'{doc.id} => {doc.to_dict()}')
    print(f'query resulted with {len(triples)} triples')
    return triples

#RDF properties to filter triples by
props = ['https://sense-nets.xyz/summarizes',
'https://sense-nets.xyz/mentionsCallForPapers',
'https://sense-nets.xyz/endorses',
'http://purl.org/spar/cito/disagreesWith',
'http://purl.org/spar/cito/agreesWith',
'https://sense-nets.xyz/indicatesInterest',
'https://sense-nets.xyz/mentionsFundingOpportunity',
'https://sense-nets.xyz/mentionsWatchingStatus',
'https://sense-nets.xyz/mentionsReadingStatus',
'https://sense-nets.xyz/mentionsListeningStatus',
'http://purl.org/spar/cito/reviews',
'https://sense-nets.xyz/recommends',
'https://sense-nets.xyz/asksQuestionAbout',
'http://purl.org/spar/cito/includesQuotationFrom',
'http://purl.org/spar/cito/discusses',
'https://sense-nets.xyz/announcesEvent',
'https://sense-nets.xyz/announcesJob',
'https://sense-nets.xyz/announcesResource',
'https://sense-nets.xyz/possibleMissingReference',
'https://schema.org/Observation',
'https://schema.org/Claim',
'https://schema.org/Question',
'https://schema.org/keywords']
def get_uri_property_name(props:list):
    return [uri.split('/')[-1] for uri in props]

In [220]:
import datetime

def post_notion_lookup(postDocId:str):
    response = notion.databases.query(
        **{
            "database_id": POSTS_DATABASE_ID,
            "filter": {
                "property": "firebase_id",  # The property name in Notion
                "rich_text": {
                    "equals": postDocId  # Use 'equals' to match the authorId exactly
                }
            }
        }
    )
    
    # Check if any results are found
    if response["results"]:
        print(f"page found for post with id: {postDocId}")
        return response["results"][0]["id"]  # Return the first matched profile ID if it exists
    else:
        print(f"No page found for post with id: {postDocId}")
        return None

def convert_timestamp_to_notion_date(createdAtMs):
    # Convert milliseconds to seconds
    timestamp_s = createdAtMs / 1000
    # Convert to a Python datetime object (in UTC)
    dt = datetime.datetime.utcfromtimestamp(timestamp_s)
    # Convert to ISO 8601 format
    return dt.isoformat()  # Notion expects ISO 8601 format

def build_dynamic_postpage_properties(post_dict: dict, post_firebase_id: str):
    post_text, post_url = get_thread(post_dict["generic"]["thread"])
    author_handle = post_dict["generic"]["author"]["username"]
    post_title = create_post_name(post_text, author_handle)
    profile_notion_id = load_profile_to_notion(post_dict["authorId"])
    post_Ms = post_dict["createdAtMs"]
    creation_time =  convert_timestamp_to_notion_date(post_Ms)
    properties = {
        "Name": {
            "title": [
                {
                    "text": {"content": post_title}
                }
            ]
        },
        "Text": {
            "rich_text": [
                {
                    "text": {"content": post_text[:1900]}
                }
            ]
        },
        "Creators handle": {
            "rich_text": [
                {
                    "text": {"content": author_handle}
                }
            ]
        },
        "firebase_id": {
            "rich_text": [
                {
                    "text": {"content": post_firebase_id}
                }
            ]
        },
        "createdAtMs": {
            "rich_text": [
                {
                    "text": {"content": str(post_Ms)}
                }
            ]
        },
        "Author": {
            "relation": [{"id": profile_notion_id}]
        },
        "Post URL": {
            "url": post_url
            },
        "publish date": {
            "date": {
                "start": creation_time  # Notion date field
            }
        }
    }
    

    # Initialize a dictionary to collect multiple relations for each prop_name
    relations = {}
    # Initialize a list for all urls to add under linksTo 
    urls = []
    prop_name_set = set()
    # Get triples from the post
    triples = get_post_triples(post_firebase_id)
    print("got triplets")
    for triple in triples:
        prop_name = triple["predicate"].split('/')[-1]
        
        # Initialize the relation list for each property if it doesn't exist
        if prop_name not in relations:
            relations[prop_name] = []


        # Handle keywords as an example
        if prop_name == "keywords":
            keyword_id = keyword_notion_lookup(triple["object"])
            if not keyword_id:
                keyword_id = create_keyword_notion_page(triple["object"])

            # Append keyword_id to the relations dictionary under the "keywords" key
            relations[prop_name].append({"id": keyword_id})

        else:
            # Handle other relation properties dynamically (e.g., URLs)
            url_id = url_notion_lookup(triple["object"])
            if not url_id:
                url_id = create_url_notion_page(triple["object"])

            # Append the new relation to the corresponding prop_name
            relations[prop_name].append({"id": url_id})
            urls.append({"id": url_id})
            prop_name_set.add(prop_name)
            print(f'prop name set is {prop_name_set}')

    # After processing all triples, update the properties with collected relations
    for prop_name, relation_list in relations.items():
        properties[prop_name] = {"relation": relation_list}
    properties["linksTo"] = {"relation":urls}
    properties["relations tags"] = {"multi_select":[{"name":p} for p in prop_name_set]}
    return properties

def create_notion_post_page(post_dict:dict,post_firebase_id:str):
    post_text, post_url = get_thread(post_dict["generic"]["thread"])
    author_handle = post_dict["generic"]["author"]["username"]
    post_title = create_post_name(post_text,author_handle)
    response = notion.pages.create(
        **{
            "parent": {"database_id": POSTS_DATABASE_ID},
            "properties": build_dynamic_postpage_properties(post_dict,post_firebase_id)
        }
    )
    print(f"Created new post page for '{post_title}'.")
    return response["id"]  # Return the page ID of the created post page


def create_post_name(post_text,post_handle):
    # Create the post name as "User name : First 10 characters of content"
    return f"{post_handle}: {post_text[:60]}"


In [189]:
create_notion_post_page(posts[1][0],posts[1][1])

Profile found for authorId: twitter:550845228
Profile found for authorId: twitter:550845228
query resulted with 6 triples
got triplets


  triples_ref = db.collection('triples').where('postId','==',post_firebase_id).where('predicate','in',props)


page found for keyword: infinite
page found for keyword: beauty
page found for keyword: Augustine
page found for keyword: history
page found for keyword: cognitive-science
page found for keyword: fractals
Created new post page for 'AnnaLeptikon:  Visited Stift Heiligenkreuz with @AnalogWhole

Contemplatin'.


'10a96ae9-0655-8158-b0d8-c039726a31f2'

In [221]:
def load_posts_to_notion(post_tuple_list:list):
    for post_dict, post_id in post_tuple_list:
        create_notion_post_page(post_dict,post_id)
        print(f'post page with id {post_id} was loaded to Notion')

In [222]:
load_posts_to_notion(posts[18:23])

Profile found for authorId: twitter:550845228


  return query.where(field_path, op_string, value)
  triples_ref = db.collection('triples').where('postId','==',post_firebase_id).where('predicate','in',props)


query resulted with 7 triples
got triplets
page found for keyword: dubbing
page found for keyword: German
page found for keyword: media-studies
page found for keyword: parody
page found for url: https://www.instagram.com/p/C86-gClq7k6/
prop name set is {'indicatesInterest'}
page found for keyword: linguistics
page found for keyword: media
Created new post page for 'AnnaLeptikon:  why is dubbing in German so bad? After I had seen so many v'.
post page with id U0xqpacG2HvjgBbvbWsr was loaded to Notion
Profile found for authorId: twitter:550845228
query resulted with 6 triples
got triplets
page found for keyword: extremism
page found for url: https://podcast.clearerthinking.org/episode/103/jesse-morton-a-former-al-qaeda-recruiter-speaks/
prop name set is {'linksTo'}
page found for keyword: ideology
page found for keyword: brainwashing
page found for keyword: deradicalization
page found for keyword: notorious-jihadist-preachers
Created new post page for 'AnnaLeptikon:  I keep recommending 

In [None]:
# Function to query Firestore by document ID
def get_document_by_id(collection_name, doc_id):
    doc_ref = db.collection(collection_name).document(doc_id)
    doc = doc_ref.get()

    if doc.exists:
        print(f'Document data: {doc.to_dict()}')
        return doc.to_dict()
    else:
        print('No document found!')
        return None

post = get_document_by_id("posts",'r2tcQ5UKRxXrNdPa7poH')


In [None]:
create_notion_post_page(post,'r2tcQ5UKRxXrNdPa7poH')

In [None]:
get_post_triples('3PWK7jWTHWzR5XQszG1J')

In [151]:
get_zotero_types_from_url('http://arxiv.org/abs/2408.14508')

query resulted with 1 triples


  triples_ref = db.collection('triples').where('subject','==',url).where('predicate','==','https://sense-nets.xyz/hasZoteroItemType')


{'preprint'}