In [48]:
import sys
import os
import pandas as pd
import os
import re

sys.path.append('..')

from src.data.data_processing import (
    load_gpt_conversation_data,
    extract_message_history
)

In [None]:
raw_data_dir = # Replace with your actual directory

output_file = # Replace with your actual file path
email_value = # Replace with your actual email value

notion_api_key = # Replace with your actual API key
page_id =  # Replace with your actual page ID

csv_file_path = output_file
parent_page_id = page_id

raw_data_path = os.path.join(raw_data_dir, "conversations.json")
conversations = load_gpt_conversation_data(raw_data_path)

In [49]:

def sanitize_filename(filename):
    # Replace any character that is not a letter, number, space, or underscore
    return re.sub(r'[\\/*?:"<>|]', "", filename)

def create_markdown_from_dataframe(df, output_dir):
    if df.empty:
        print("Dataframe is empty")
        return
    
    # Extract date from the first entry of the dataframe and convert to string
    date_str = pd.to_datetime(df['create_time'].iloc[0]).strftime('%Y-%m-%d')
    
    # Extract title from the dataframe
    title = df['title'].iloc[0]
    
    # Sanitize the title to remove any invalid characters
    sanitized_title = sanitize_filename(title)
    
    # Construct the filename with date and sanitized title
    filename = f"{date_str} {sanitized_title}.md"
    filepath = os.path.join(output_dir, filename)

    # Open and write to the markdown file
    with open(filepath, 'w', encoding='utf-8') as md_file:
        for _, row in df.iterrows():
            # Write "## USER" or "## ASSISTANT" based on the author
            if row['author'] == 'user':
                md_file.write("## USER:\n")
            elif row['author'] == 'assistant':
                md_file.write("## ASSISTANT:\n")
            
            # Write the content of the message
            md_file.write(f"{row['content']}\n\n")
    
    print(f"Markdown file created: {filepath}")


In [50]:


# Specify output directory
# output_dir = './markdown_conversations'
# os.makedirs(output_dir, exist_ok=True)

# for conversation_id in conversations["conversation_id"].values:
#     message_history = extract_message_history(conversations, conversation_id)
#     create_markdown_from_dataframe(message_history, output_dir)


In [51]:
import pandas as pd
import os
import csv

def create_single_csv_with_all_chats(conversations, output_file, email_value):
    # Open the CSV file for writing
    with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        # Write the header
        csv_writer.writerow(['Date', 'Title', 'Chat Content', 'Email'])

        # Loop through each conversation
        for conversation_id in conversations["conversation_id"].unique():
            # Extract message history using your extract_message_history function
            message_history = extract_message_history(conversations, conversation_id)
            
            # Get the conversation date from the first message
            if len(message_history) == 0:
                continue
            
            conversation_date = pd.to_datetime(message_history['create_time'].iloc[0]).strftime('%Y-%m-%d')

            # Try to get the conversation title, use a fallback if missing
            conversation_title = message_history['title'].iloc[0] 
            
            # Initialize a variable to hold all the chat content for this conversation
            chat_content = ""

            # Iterate through each message and append it to chat_content
            for _, row in message_history.iterrows():
                author = row['author'].upper()  # Format as USER or ASSISTANT
                content = row['content']
                chat_content += f"## {author}: \n{content}\n\n"  # Format message with "USER:" or "ASSISTANT:"
            
            # Write the conversation details to the CSV
            csv_writer.writerow([conversation_date, conversation_title, chat_content.strip(), email_value])

    print(f"CSV file created: {output_file}")

# Example usage
# Assuming 'conversations' is your DataFrame with all conversations

# Specify the output CSV file


# Create the CSV with all chats
create_single_csv_with_all_chats(conversations, output_file, email_value)


CSV file created: ./../data/zak99.csv


In [52]:
from notion_client import Client


# Initialize the Notion client with your API key
notion = Client(auth=notion_api_key)  # Replace with your actual API key



In [53]:
import csv
from tqdm import tqdm
# Set CSV field size limit to a large value (1 billion characters)
csv.field_size_limit(10**9)

def find_or_create_database(parent_page_id):
    try:
        # Retrieve blocks within the page to search for a database
        blocks = notion.blocks.children.list(block_id=parent_page_id)

        for block in blocks['results']:
            if block['type'] == 'child_database':
                print(f"Found existing database with ID: {block['id']}")
                return block['id']  # Return the existing database ID

        # If no database is found, create a new one
        print("No existing database found. Creating a new database...")

        # Create a new database within the page
        response = notion.databases.create(
            parent={"page_id": parent_page_id},
            title=[
                {
                    "type": "text",
                    "text": {"content": "Chat Database"}
                }
            ],
            properties={
                "Title": {"title": {}},
                "Date": {"date": {}},
                "Email": {"rich_text": {}}
            }
        )
        print(f"New database created with ID: {response['id']}")
        return response['id']

    except Exception as e:
        print(f"An error occurred while finding or creating the database: {e}")
        return None

# Function to split long text into chunks of 2000 characters
def split_text_into_chunks(text, chunk_size=1000):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

def replace_user_and_assistant(text):
    # Replace ## USER with 👤 USER, ## ASSISTANT with 🤖 ASSISTANT, and others
    text = text.replace("## USER", "👤 USER")
    text = text.replace("## ASSISTANT", "🤖 ASSISTANT")
    text = text.replace("## TOOL", "🛠 TOOL")  # Add tool replacement
    text = text.replace("## SYSTEM", "💻 SYSTEM")  # Add system replacement
    return text


def add_entry_to_database(database_id, row):
    try:
        # Prepare the chat content by replacing markers and splitting into chunks
        chat_content = replace_user_and_assistant(row["Chat Content"])
        chat_content_chunks = split_text_into_chunks(chat_content)

        # Create the main properties (Title, Date, Email)
        properties = {
            "Title": {
                "title": [
                    {
                        "text": {
                            "content": row["Title"]  # Using 'Title' as the page title
                        }
                    }
                ]
            },
            "Date": {
                "date": {
                    "start": row["Date"]  # Date property
                }
            },
            "Email": {
                "rich_text": [
                    {
                        "text": {
                            "content": row["Email"]  # Email property
                        }
                    }
                ]
            }
        }

        # Create a new page in the Notion database with the initial content and properties
        response = notion.pages.create(
            parent={"database_id": database_id},
            properties=properties,
            children=[]  # We'll add children later for chat content
        )

        # Now that the page is created, add chat content as children blocks
        page_id = response["id"]
        children_blocks = []
        for chunk in chat_content_chunks:
            children_blocks.append({
                "object": "block",
                "type": "paragraph",
                "paragraph": {
                    "rich_text": [
                        {
                            "text": {
                                "content": chunk
                            }
                        }
                    ]
                }
            })

        # Append the chat content as children to the created page
        if children_blocks:
            notion.blocks.children.append(block_id=page_id, children=children_blocks)

        #print(f"Entry added: {row['Title']}")
    except Exception as e:
        print(f"An error occurred while adding entry: {e}")


# Path to your CSV file

# Step 1: Find or create a database inside the specified page
database_id = find_or_create_database(parent_page_id)

if database_id:
    # Open the CSV and create a page for each row in the database
    with open(csv_file_path, mode='r', encoding='utf-8') as file:
        csv_reader = list(csv.DictReader(file))  # Convert csv_reader to a list to calculate length
        total_rows = len(csv_reader)  # Get total number of rows
        
        # Use tqdm with the total number of rows for progress tracking
        for row in tqdm(csv_reader, desc="Adding entries to database", total=total_rows):
            add_entry_to_database(database_id, row)

print("CSV data import completed!")


Found existing database with ID: 6cb1e54f-6de6-429a-8794-5fd643817197


Adding entries to database:   5%|▍         | 28/583 [01:10<21:54,  2.37s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `120`.


Adding entries to database:  11%|█         | 64/583 [02:52<16:13,  1.87s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `107`.


Adding entries to database:  14%|█▎        | 79/583 [03:24<14:02,  1.67s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `109`.


Adding entries to database:  16%|█▌        | 93/583 [03:55<21:24,  2.62s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `128`.


Adding entries to database:  17%|█▋        | 98/583 [04:08<19:10,  2.37s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `190`.


Adding entries to database:  17%|█▋        | 101/583 [04:12<13:58,  1.74s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `185`.


Adding entries to database:  17%|█▋        | 102/583 [04:14<13:30,  1.69s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `131`.


Adding entries to database:  23%|██▎       | 134/583 [05:38<12:06,  1.62s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `212`.


Adding entries to database:  24%|██▍       | 141/583 [05:52<12:19,  1.67s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `102`.


Adding entries to database:  25%|██▍       | 144/583 [05:58<13:01,  1.78s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `144`.


Adding entries to database:  25%|██▌       | 146/583 [06:03<16:19,  2.24s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `164`.


Adding entries to database:  28%|██▊       | 161/583 [06:45<18:46,  2.67s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `165`.


Adding entries to database:  32%|███▏      | 188/583 [07:56<24:29,  3.72s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `224`.


Adding entries to database:  33%|███▎      | 193/583 [08:09<16:11,  2.49s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `143`.


Adding entries to database:  35%|███▌      | 206/583 [08:36<13:46,  2.19s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `224`.


Adding entries to database:  39%|███▉      | 226/583 [09:31<17:42,  2.98s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `148`.


Adding entries to database:  39%|███▉      | 230/583 [09:43<18:11,  3.09s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `162`.


Adding entries to database:  40%|███▉      | 231/583 [09:45<15:55,  2.71s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `158`.


Adding entries to database:  40%|███▉      | 232/583 [09:47<14:34,  2.49s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `135`.


Adding entries to database:  42%|████▏     | 243/583 [10:26<12:39,  2.23s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `176`.


Adding entries to database:  47%|████▋     | 275/583 [11:53<18:29,  3.60s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `134`.


Adding entries to database:  52%|█████▏    | 305/583 [13:15<13:38,  2.94s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `160`.


Adding entries to database:  53%|█████▎    | 310/583 [13:29<11:38,  2.56s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `123`.


Adding entries to database:  54%|█████▍    | 314/583 [13:39<11:14,  2.51s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `149`.


Adding entries to database:  55%|█████▌    | 321/583 [14:00<09:45,  2.23s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `107`.


Adding entries to database:  60%|█████▉    | 348/583 [15:44<15:43,  4.02s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `155`.


Adding entries to database:  63%|██████▎   | 369/583 [16:55<13:23,  3.76s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `106`.


Adding entries to database:  68%|██████▊   | 394/583 [17:51<05:02,  1.60s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `150`.


Adding entries to database:  75%|███████▌  | 439/583 [21:14<06:48,  2.83s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `214`.


Adding entries to database:  97%|█████████▋| 563/583 [27:12<00:45,  2.28s/it]

An error occurred while adding entry: body failed validation: body.children.length should be ≤ `100`, instead was `112`.


Adding entries to database: 100%|██████████| 583/583 [28:16<00:00,  2.91s/it]

CSV data import completed!



