In [36]:
from langchain_community.document_loaders import JSONLoader
from typing import List, Dict

def extract_metadata(record: dict, metadata: dict) -> dict:
    """Copy selected message fields from ``record`` into ``metadata``.

    Used as the ``metadata_func`` callback of the ``JSONLoader`` below.
    ``metadata`` is updated in place and the same dict is returned.

    Args:
        record: One message object selected from the chat export.
        metadata: Metadata dict to enrich.

    Returns:
        ``metadata`` with ``id``, ``parentId``, ``role``, ``modelName`` and
        ``timestamp`` set.

    NOTE(review): ``id``, ``parentId`` and ``role`` are stored as
    one-element tuples, not bare values. The DataFrame-building cell of this
    notebook relies on that shape, so it is kept here; flatten both places
    together if plain values are wanted.
    """
    # Wrapped in 1-tuples on purpose (see NOTE above); missing keys become "".
    for key in ("id", "parentId", "role"):
        metadata[key] = (record.get(key, ""),)

    # Prefer "modelName"; fall back to "model" when it is absent or falsy.
    metadata["modelName"] = record.get("modelName") or record.get("model", "")
    metadata["timestamp"] = record.get("timestamp", "")
    return metadata

# Configure the loader: the jq schema selects every entry under
# `.chat.messages` of each exported chat, and `.content` of each entry is
# used as the document text. Extra fields are attached by `extract_metadata`.
loader = JSONLoader(
    file_path='../data/open-webui/open-webui-chat-export.json',
    jq_schema='.[].chat.messages[]',
    content_key='.content',
    is_content_key_jq_parsable=True,
    json_lines=True,
    text_content=False,
    metadata_func=extract_metadata,
)

# Read all documents into memory
docs = loader.load()

# Whitespace-separated word count per document, then the grand total
words_per_doc = [len(doc.page_content.split()) for doc in docs]
total_words = sum(words_per_doc)

print(f"{len(docs)} docs loaded with a total of {total_words:,} words.")

1639 docs loaded with a total of 627,363 words.


In [37]:
import pandas as pd

# Rows are kept only when their content is strictly longer than this many
# characters.
MIN_CONTENT_LENGTH = 75


def _unwrap(value):
    """Return the element of a one-element tuple, or ``value`` unchanged.

    The loader's metadata stores ``id``, ``role`` and ``parentId`` wrapped in
    1-tuples. Unwrapping through this helper (instead of a bare ``[0]``)
    keeps this cell correct if those fields are ever stored as plain values:
    ``"abc"[0]`` would silently yield ``"a"`` and ``None[0]`` would raise.
    """
    if isinstance(value, tuple) and len(value) == 1:
        return value[0]
    return value


# Convert the loaded docs to a DataFrame (one row per document)
df = pd.DataFrame([{
    'content': doc.page_content,
    'id': _unwrap(doc.metadata['id']),
    'role': _unwrap(doc.metadata['role']),
    'parent_id': _unwrap(doc.metadata['parentId']),
    'model': doc.metadata['modelName'],
    'timestamp': doc.metadata['timestamp']
} for doc in docs])

# Convert the timestamp to datetime, assuming Unix epoch seconds
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')

# Calendar date only (no time component)
df['date'] = df['timestamp'].dt.date

# Content length in characters
df['content_length'] = df['content'].str.len()

# Keep only rows whose content is longer than MIN_CONTENT_LENGTH characters.
# The original index is preserved; `.copy()` makes the result an independent
# frame so later column assignments do not raise SettingWithCopyWarning.
df = df[df['content_length'] > MIN_CONTENT_LENGTH].copy()

# Summarise the resulting DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1375 entries, 1 to 1638
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   content         1375 non-null   object        
 1   id              1375 non-null   object        
 2   role            1375 non-null   object        
 3   parent_id       1237 non-null   object        
 4   model           1375 non-null   object        
 5   timestamp       1301 non-null   datetime64[ns]
 6   date            1301 non-null   object        
 7   content_length  1375 non-null   int64         
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 96.7+ KB
