# Web App Event Logs - Exploratory Data Analysis

This notebook analyzes web app event logs stored as JSON files in a `reports/monitor-logs/YYYY/MM/DD/` directory structure.

Each JSON file contains:
```json
{
  "sessionId": "session_1754052978056_4i3x2c3bc1g3g4j2",
  "timestamp": "20250801T125618127Z",
  "eventType": "sessionStart",
  "payload": { ... },
  "server_context": { ... }
}
```

In [None]:
import glob
import json
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Root of the YYYY/MM/DD/ log tree (a relative path, so it depends on the working directory)
log_base_path = Path("monitor-logs")
print(
    f"Base log path: {log_base_path}",
    f"Path exists: {log_base_path.exists()}",
    sep="\n",
)

## 1. Load All JSON Files and Create DataFrame

Find and load all JSON files, then immediately convert them to a pandas DataFrame.

In [None]:
def load_all_log_files(base_path="monitor-logs", min_date="20250705"):
    """
    Load all valid JSON event files from the log directory structure.

    Handles the recursive YYYY/MM/DD/ folder structure.
    Each JSON file contains: sessionId, timestamp, eventType, payload, server_context.
    Progress and summary counts are printed as a side effect.

    Args:
        base_path: Path to monitor-logs directory
        min_date: Only load files from this date onwards (format: YYYYMMDD,
            e.g., '20250705'). Pass None or '' to load every file.

    Returns:
        A list with the parsed content of every file that holds a JSON object
        containing sessionId, timestamp and eventType. Files that cannot be
        read, are not valid JSON, or lack a required field are counted as
        errors and left out of the result.
    """
    required_keys = ("sessionId", "timestamp", "eventType")
    all_events = []
    error_count = 0
    skipped_count = 0

    # Use glob to find all JSON files recursively
    json_pattern = os.path.join(base_path, "**/*.json")
    json_files = glob.glob(json_pattern, recursive=True)

    print(f"Found {len(json_files)} JSON files total\n")
    if min_date:
        print(f"Filtering for files from {min_date} onwards\n")

    for file_path in json_files:
        if min_date:
            # The three directories directly above the file are read as YYYY/MM/DD
            path_parts = file_path.split(os.sep)
            if len(path_parts) >= 4:
                file_date = "".join(path_parts[-4:-1])
                # Plain string comparison: zero-padded YYYYMMDD values sort chronologically
                if file_date < min_date:
                    skipped_count += 1
                    continue

        try:
            with open(file_path, encoding="utf-8") as f:
                event_data = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
            error_count += 1
            continue

        # A file may hold valid JSON that is not an object (e.g. a number or a
        # list); such content cannot be an event, so it counts as an error.
        if isinstance(event_data, dict) and all(
            key in event_data for key in required_keys
        ):
            all_events.append(event_data)
        else:
            error_count += 1

    print(f"Successfully loaded: {len(all_events)} files")
    print(f"Skipped (before cutoff): {skipped_count} files")
    if error_count > 0:
        print(f"Errors encountered: {error_count} files\n")

    return all_events


# Load all events (from July 5, 2025 onwards)
all_events = load_all_log_files(min_date="20250705")

print(f"\nTotal events loaded: {len(all_events)}")

# Create DataFrame immediately. With no events, pd.DataFrame([]) would have no
# columns at all and the timestamp lookup below would raise KeyError, so fall
# back to an empty frame that still carries the required columns.
if all_events:
    events_df = pd.DataFrame(all_events)
else:
    events_df = pd.DataFrame(columns=["sessionId", "timestamp", "eventType"])

# Parse timestamp to datetime (compact form such as 20250801T125618127Z).
# The trailing "Z" is matched as a literal, so the parsed values are timezone-naive.
events_df["timestamp"] = pd.to_datetime(
    events_df["timestamp"], format="%Y%m%dT%H%M%S%fZ"
)

# Explode server_context into separate columns
if "server_context" in events_df.columns:
    # Events without a server_context show up as NaN/None in the column; turn
    # them into empty dicts so the result does not depend on how
    # json_normalize treats non-dict records.
    contexts = [
        ctx if isinstance(ctx, dict) else {} for ctx in events_df["server_context"]
    ]
    server_context_df = pd.json_normalize(contexts)
    # Prefix columns to avoid conflicts
    server_context_df.columns = ["server_" + col for col in server_context_df.columns]
    events_df = pd.concat([events_df, server_context_df], axis=1)
    print(f"\nExpanded {len(server_context_df.columns)} server_context fields")

print(f"\nDataFrame created with shape: {events_df.shape}")
print(f"Columns: {events_df.columns.tolist()}")

## 2. Summary Statistics

Get key statistics on the number of events and unique sessions.

In [None]:
print("\n" + "=" * 60)
print("SUMMARY STATISTICS")
print("=" * 60)

total_events = len(events_df)
# nunique() ignores missing ids, so it can be 0 even when rows exist
unique_sessions = events_df["sessionId"].nunique()
avg_events_per_session = total_events / unique_sessions if unique_sessions else 0.0

print(f"Total events:          {total_events:,}")
print(f"Total unique sessions: {unique_sessions:,}")
print(f"Avg events per session: {avg_events_per_session:.2f}")
print(f"\nDate range: {events_df['timestamp'].min()} to {events_df['timestamp'].max()}")

# Show server context fields if available. The raw "server_context" column
# shares the "server_" prefix but is not an expanded field, so it is excluded.
server_cols = [
    col
    for col in events_df.columns
    if col.startswith("server_") and col != "server_context"
]
if server_cols:
    # Slice off only the leading prefix; str.replace would also remove any
    # later "server_" occurrence inside the field name.
    field_names = [col[len("server_"):] for col in server_cols]
    print(f"\nServer context fields: {', '.join(field_names)}")

## 3. Events by Type

Breakdown of event types in the dataset.

In [None]:
print("\n" + "=" * 60, "EVENTS BY TYPE", "=" * 60, sep="\n")

# One row per event type, most frequent first, with its share of all events
event_type_df = (
    events_df["eventType"]
    .value_counts()
    .rename_axis("eventType")
    .reset_index(name="count")
)
total_count = event_type_df["count"].sum()
event_type_df["percentage"] = event_type_df["count"].div(total_count).mul(100).round(2)

print(event_type_df.to_string(index=False))

## 4. Session Statistics

Analyze the distribution of events across sessions.

In [None]:
print("\n" + "=" * 60, "SESSION EVENT DISTRIBUTION", "=" * 60, sep="\n")

# One row per session holding the number of events recorded for it
events_per_session = events_df.groupby("sessionId").size()
session_stats_df = events_per_session.rename("event_count").reset_index()

print(session_stats_df["event_count"].describe())

# Histogram of events per session; every session with more than 20 events
# is collected in a single final bar.
overflow_bucket = 21

# Anything >= 21 is counted as 21 so it lands in the last bar
capped_counts = session_stats_df["event_count"].clip(upper=overflow_bucket)

# Tick positions 1..21, with the last tick labelled "21+"
xticks = list(range(1, overflow_bucket + 1))
labels = [str(tick) for tick in xticks[:-1]] + [f"{overflow_bucket}+"]

plt.figure(figsize=(10, 6))

# Bin edges run from 1 to 22; align="left" centres each bar on its left edge
plt.hist(
    capped_counts,
    bins=range(1, overflow_bucket + 2),
    align="left",
    color="skyblue",
)

plt.title("Distribution of Events per Session")
plt.xlabel("Number of Events")
plt.ylabel("Number of Sessions")
plt.xticks(xticks, labels)

plt.grid(axis="y")
plt.show()

## 5. Sample Events

Peek at sample events from each type to understand data structure.

In [None]:
print("\n" + "=" * 60, "SAMPLE EVENTS BY TYPE", "=" * 60, sep="\n")

# For every event type (alphabetical order) show its first row as JSON;
# only the first 600 characters of the dump are printed.
for event_type in sorted(events_df["eventType"].unique()):
    rows_of_type = events_df.loc[events_df["eventType"] == event_type]
    sample = rows_of_type.iloc[0].to_dict()
    # default=str renders values json cannot encode natively (e.g. timestamps)
    rendered = json.dumps(sample, indent=2, default=str)
    print(f"\n{event_type.upper()}:", rendered[:600], "-" * 60, sep="\n")

## 6. DataFrame Overview

Display key information about the DataFrame structure.

In [None]:
print(f"\nDataFrame shape: {events_df.shape}")
print(f"\nColumns: {events_df.columns.tolist()}")
print("\nData types:")
print(events_df.dtypes)
print("\nFirst few rows:")
display_cols = ["sessionId", "timestamp", "eventType"]
# Add expanded server context columns if available. The raw "server_context"
# column shares the "server_" prefix but holds the unexpanded dicts, so it is
# left out to keep all three slots for real fields.
server_cols = [
    col
    for col in events_df.columns
    if col.startswith("server_") and col != "server_context"
]
if server_cols:
    display_cols.extend(server_cols[:3])  # Show first 3 server fields
print(events_df[display_cols].head(10))

## 6b. Server Context Analysis

Analyze server-side context information (IP addresses, user agents, etc.).

In [None]:
# Expanded server_context fields only. The raw "server_context" column also
# starts with "server_" but holds dicts, which are unhashable and would make
# nunique() / value_counts() raise TypeError.
server_cols = [
    col
    for col in events_df.columns
    if col.startswith("server_") and col != "server_context"
]

if server_cols:
    print("\n" + "=" * 60)
    print("SERVER CONTEXT FIELDS")
    print("=" * 60)

    for col in server_cols:
        # Lists or dicts left inside a field are unhashable as well; compare
        # them through their text form. Other values are left untouched.
        values = events_df[col].map(
            lambda v: str(v) if isinstance(v, (list, dict)) else v
        )
        unique_count = values.nunique()
        # Slice off only the leading prefix (str.replace would strip every occurrence)
        field_name = col[len("server_"):]
        print(f"\n{field_name}:")
        print(f"  Unique values: {unique_count}")

        # Show top values for fields with reasonable cardinality
        if unique_count <= 20:
            print(f"  Values: {values.value_counts().to_dict()}")
        elif unique_count < len(events_df) / 2:
            print("  Top 5 values:")
            for val, count in values.value_counts().head(5).items():
                print(f"    {val}: {count}")
else:
    print("No server_context fields found in data")

## 7. Payload Structure Analysis

Analyze the structure of payload fields for each event type.

In [None]:
print("\n" + "=" * 60, "PAYLOAD STRUCTURE BY EVENT TYPE", "=" * 60, sep="\n")

# For each event type, collect the union of keys seen in its dict payloads
payload_structure = []
for event_type in sorted(events_df["eventType"].unique()):
    type_df = events_df[events_df["eventType"] == event_type]
    # Non-dict payloads (e.g. missing values) contribute no keys
    dict_payloads = (item for item in type_df["payload"] if isinstance(item, dict))
    all_keys = set().union(*dict_payloads)

    payload_structure.append(
        dict(
            eventType=event_type,
            count=len(type_df),
            payload_keys=sorted(all_keys),
        )
    )

payload_df = pd.DataFrame(payload_structure)
for record in payload_df.to_dict("records"):
    print(f"\n{record['eventType']} ({record['count']} events):")
    print(f"  Keys: {', '.join(record['payload_keys'])}")

## 8. Session Timeline Analysis

Analyze event sequences and timing within sessions.

In [None]:
print("\n" + "=" * 60, "EVENT COUNTS BY SESSION AND TYPE", "=" * 60, sep="\n")

# Events per (session, type) pair, ordered by session and then by
# descending count within each session
session_event_counts = (
    events_df.groupby(["sessionId", "eventType"])
    .size()
    .reset_index(name="count")
    .sort_values(["sessionId", "count"], ascending=[True, False])
)

print(session_event_counts.head(20).to_string(index=False))

print("\n" + "=" * 60)
print("SESSION DURATION AND EVENT TIMING")
print("=" * 60)

# Per session: first and last event time plus the number of timestamps
session_timing = (
    events_df.groupby("sessionId")
    .agg(
        first_event=("timestamp", "min"),
        last_event=("timestamp", "max"),
        event_count=("timestamp", "count"),
    )
    .reset_index()
)
session_timing["duration_seconds"] = (
    session_timing["last_event"] - session_timing["first_event"]
).dt.total_seconds()

# A session whose first and last event coincide (e.g. a single event) has zero
# duration; dividing by it would give inf. Mask those durations so the rate is
# NaN (undefined) instead.
duration_minutes = session_timing["duration_seconds"] / 60
session_timing["events_per_minute"] = (
    session_timing["event_count"] / duration_minutes.where(duration_minutes > 0)
).round(2)

print(session_timing.head(10).to_string(index=False))