In [5]:
from langchain_community.document_loaders import JSONLoader
from typing import List, Dict

def extract_metadata(record: dict, metadata: dict) -> dict:

        metadata["id"] = record.get("id", ""),
        metadata["parentId"] = record.get("parentId", ""),
        metadata["role"] = record.get("role", ""),
        metadata["modelName"] = record.get("modelName") or record.get("model", "")
        metadata["timestamp"] = record.get("timestamp", "")

        return metadata

# Create the loader
loader = JSONLoader(
    file_path='../data/open-webui/open-webui-chat-export.json',
    text_content=False,
    json_lines=True,
    is_content_key_jq_parsable=True,
    content_key='.content',
    jq_schema='.[].chat.messages[]',
    metadata_func=extract_metadata
)

# Load the documents
docs = loader.load()

# Calculate the number of words total for all docs
total_words = sum(len(doc.page_content.split()) for doc in docs)

print(f"{len(docs)} docs loaded with a total of {total_words:,} words.")

1639 docs loaded with a total of 627,363 words.
{'source': '/home/codyt/Documents/Projects/qdrant-upload/data/open-webui/open-webui-chat-export.json', 'seq_num': 1, 'id': ('d8e6f8ef-12a8-4855-8921-7718bc0df4c3',), 'parentId': (None,), 'role': ('user',), 'modelName': '', 'timestamp': 1728660597}


In [27]:
from datetime import datetime
from collections import defaultdict

def get_day_from_timestamp(timestamp):
    date_obj = datetime.fromtimestamp(timestamp)
    return date_obj.strftime('%A')  # Get day name (Monday, Tuesday, etc.)

def get_weekly_date_from_timestamp(timestamp):
    date_obj = datetime.fromtimestamp(timestamp)
    return date_obj.isocalendar()[:2]

# Group documents by week and then by day
weekly_groups = {}
for doc in docs:
    timestamp = doc.metadata.get('timestamp')
    if timestamp and isinstance(timestamp, (int, float)):
        weekly_date = get_weekly_date_from_timestamp(timestamp)
        day_of_week = get_day_from_timestamp(timestamp)
        
        if weekly_date not in weekly_groups:
            weekly_groups[weekly_date] = defaultdict(list)
        
        weekly_groups[weekly_date][day_of_week].append(doc)

# Print available weeks
print("\nAvailable weeks for sampling:")
for week in sorted(weekly_groups.keys()):
    total_messages = sum(len(days) for days in weekly_groups[week].values())
    print(f"- Year {week[0]}, Week {week[1]}: {total_messages} messages")

# Function to print sample from specific week
def print_weekly_sample(week):
    if week in weekly_groups:
        year, week_num = week
        print(f"\nYear {year}, Week {week_num}")
        
        # Print messages grouped by day
        for day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']:
            if day in weekly_groups[week]:
                print(f"\n{datetime.fromtimestamp(timestamp)} {day}:")
                print(f"Number of messages: {len(weekly_groups[week][day])}")
                
                for doc in weekly_groups[week][day]:
                    print(f"- Role: {doc.metadata['role']}")
                    print(f"  Content: {doc.page_content}\n")
    else:
        print(f"Week {week} not found in the data")



Available weeks for sampling:
- Year 2024, Week 33: 2 messages
- Year 2024, Week 35: 20 messages
- Year 2024, Week 36: 6 messages
- Year 2024, Week 38: 130 messages
- Year 2024, Week 39: 125 messages
- Year 2024, Week 40: 23 messages
- Year 2024, Week 41: 210 messages
- Year 2024, Week 42: 124 messages
- Year 2024, Week 43: 219 messages
- Year 2024, Week 44: 82 messages
- Year 2024, Week 45: 92 messages
- Year 2024, Week 46: 71 messages
- Year 2024, Week 47: 6 messages
- Year 2024, Week 48: 40 messages
- Year 2024, Week 49: 166 messages
- Year 2024, Week 50: 105 messages
- Year 2024, Week 51: 106 messages
- Year 2024, Week 52: 22 messages


In [None]:
# Example: view a specific week
sample_week = (2024, 35)  # Change these numbers to view different weeks
print_weekly_sample(sample_week)

In [None]:
from datetime import datetime
from collections import defaultdict

def get_date_from_timestamp(timestamp):
    date_obj = datetime.fromtimestamp(timestamp)
    return date_obj.strftime('%Y-%m-%d')  # Get date in YYYY-MM-DD format

# Group documents by date
daily_groups = defaultdict(list)
for doc in docs:
    timestamp = doc.metadata.get('timestamp')
    if timestamp and isinstance(timestamp, (int, float)):
        date = get_date_from_timestamp(timestamp)
        daily_groups[date].append(doc)

# Print available dates
print("\nAvailable dates for sampling:")
for date in sorted(daily_groups.keys()):
    print(f"- {date}: {len(daily_groups[date])} messages")

# Function to print sample from specific date
def print_daily_sample(date):
    if date in daily_groups:
        print(f"\nDate: {date}")
        print(f"Number of messages: {len(daily_groups[date])}")
        
        for doc in daily_groups[date]:
            timestamp = doc.metadata.get('timestamp')
            time_str = datetime.fromtimestamp(timestamp).strftime('%H:%M:%S')
            print(f"\n- Time: {time_str}")
            print(f"  Role: {doc.metadata['role']}")
            print(f"  Content: {doc.page_content}")
    else:
        print(f"Date {date} not found in the data")
