In [16]:
from langchain_community.document_loaders import JSONLoader
from typing import List, Dict

def extract_metadata(record: dict, metadata: dict) -> dict:
    """Copy selected fields of one JSON record into the document metadata.

    Passed to the loader as ``metadata_func``; it is called with the raw
    record and the metadata dict built so far, and returns that same dict
    after adding the keys below.

    Args:
        record: The JSON object the document is built from.
        metadata: Metadata dict to extend. It is mutated in place.

    Returns:
        The same ``metadata`` dict with ``id``, ``parentId``, ``role``,
        ``modelName`` and ``timestamp`` set. Missing fields default to ``""``.
    """
    # No trailing commas here: `x = value,` would store a 1-tuple, not the value.
    metadata["id"] = record.get("id", "")
    metadata["parentId"] = record.get("parentId", "")
    metadata["role"] = record.get("role", "")
    # Prefer "modelName"; fall back to "model" when it is absent or empty.
    metadata["modelName"] = record.get("modelName") or record.get("model", "")
    metadata["timestamp"] = record.get("timestamp", "")

    return metadata

# Configure the loader: the jq schema selects each entry under
# `.chat.messages` and `.content` supplies the page text.
loader = JSONLoader(
    file_path='../data/open-webui/open-webui-chat-export.json',
    jq_schema='.[].chat.messages[]',
    content_key='.content',
    is_content_key_jq_parsable=True,
    json_lines=True,
    text_content=False,
    metadata_func=extract_metadata,
)

# Read the export into a list of documents
docs = loader.load()

# Sum the whitespace-separated word counts across every document
total_words = sum(map(lambda d: len(d.page_content.split()), docs))

print(f"{len(docs)} docs loaded with a total of {total_words:,} words.")

1639 docs loaded with a total of 627,363 words.


In [17]:
from datetime import datetime
from collections import defaultdict
import os

def get_date_from_timestamp(timestamp):
    """Return the local calendar date of a Unix timestamp as ``YYYY-MM-DD``.

    Args:
        timestamp: Seconds since the epoch (int or float), as accepted by
            ``datetime.fromtimestamp``.

    Returns:
        The date formatted as a ``'%Y-%m-%d'`` string.
    """
    local_dt = datetime.fromtimestamp(timestamp)
    return f"{local_dt:%Y-%m-%d}"

def get_folder_structure_and_filename(date):
    """Derive the journal sub-folder and file name for a date string.

    Args:
        date: A date in ``'%Y-%m-%d'`` form, e.g. ``"2024-03-15"``.

    Returns:
        A ``(folder_path, filename)`` tuple: the folder is
        ``<year>/<month number>-<month name>`` and the file name is
        ``<date>-<weekday name>.md``, using the input string as given.

    Raises:
        ValueError: If ``date`` does not match ``'%Y-%m-%d'``.
    """
    parsed = datetime.strptime(date, '%Y-%m-%d')

    month_folder = parsed.strftime('%m-%B')
    weekday = parsed.strftime('%A')

    return (
        os.path.join(parsed.strftime('%Y'), month_folder),
        f"{date}-{weekday}.md",
    )

def format_daily_content(docs_for_date):
    """Render a sequence of documents as one Markdown string.

    Each document contributes a time heading, its role, its content and a
    horizontal-rule separator; all pieces are joined with newlines.

    Args:
        docs_for_date: Iterable of objects exposing ``metadata`` (a dict with
            ``'timestamp'`` and ``'role'``) and ``page_content``.

    Returns:
        The concatenated Markdown text, or ``""`` when there are no documents.
    """
    def render(entry):
        # Time of day comes from the entry's own timestamp (local time).
        clock = datetime.fromtimestamp(entry.metadata.get('timestamp')).strftime('%H:%M:%S')
        return "\n".join((
            f"### Time: {clock}",
            f"**Role:** {entry.metadata['role']}",
            f"**Content:** {entry.page_content}",
            "\n---\n",  # Markdown separator
        ))

    return "\n".join(render(entry) for entry in docs_for_date)

def save_daily_journal(docs, base_folder="/home/codyt/Documents/Personal/Journal"):
    """Append documents to per-day Markdown journal files.

    Documents are grouped by the local calendar date of their
    ``metadata['timestamp']``. For each date one file is written at
    ``<base_folder>/<year>/<month number>-<month name>/<date>-<weekday>.md``.

    Args:
        docs: Iterable of documents exposing ``metadata`` and ``page_content``.
        base_folder: Root directory of the journal. Defaults to the original
            hard-coded location so existing calls behave the same.

    Notes:
        Files are opened in append mode, so calling this twice with the same
        documents writes their entries twice.
    """
    daily_groups = defaultdict(list)
    for doc in docs:
        timestamp = doc.metadata.get('timestamp')
        # Skip documents with a missing, zero or non-numeric timestamp.
        if timestamp and isinstance(timestamp, (int, float)):
            date = get_date_from_timestamp(timestamp)
            daily_groups[date].append(doc)

    for date, docs_for_date in daily_groups.items():
        folder_structure, filename = get_folder_structure_and_filename(date)
        full_folder_path = os.path.join(base_folder, folder_structure)
        os.makedirs(full_folder_path, exist_ok=True)

        file_path = os.path.join(full_folder_path, filename)
        content = format_daily_content(docs_for_date)

        # Explicit UTF-8 so non-ASCII chat text does not depend on the
        # platform's locale encoding.
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(f"\n---\n\n# Chat History {date}\n\n")
            file.write(content)
            file.write("\n")


In [18]:
# Save the docs into the per-day journal files.
# NOTE: the files are opened in append mode, so re-running this cell adds the
# same entries to them again.
save_daily_journal(docs)