# NiFi Processor Analysis Dashboard

Generates an interactive HTML dashboard that analyzes processor activity across NiFi flows.

- **Purpose**: Identify unused processors as candidates for cleanup or removal
- **Data Source**: Delta table `nifi_processor_snapshots_full_attributes`
- **Time Range**: Last 30 days (configurable via `CONFIG['days_back']`)
- **Output**: Interactive HTML dashboard with Plotly visualizations

In [None]:
# Cell 1: Setup & Imports

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum as spark_sum, max as spark_max, min as spark_min, when, datediff, current_timestamp, lit

# Confirm that every import above resolved. The check mark is written as the
# real Unicode character rather than its mis-decoded byte sequence.
print("✓ Libraries imported successfully!")

In [None]:
# Cell 2: Configuration

# Central settings for the notebook; later cells read these keys directly.
CONFIG = {
    # Fully qualified Delta table holding the processor snapshots.
    'table': '1dp_mfg_sbx.validation_test_eric.nifi_processor_snapshots_full_attributes',
    # Size of the look-back window, in days, applied to snapshot_timestamp.
    'days_back': 30,
    # Destination paths for the generated HTML.
    'output_file': '/Volumes/1dp_mfg_sbx/validation_test_eric/files/processor_analysis_summary.html',
    'output_dir': '/Volumes/1dp_mfg_sbx/validation_test_eric/files/',
    'flow_reports_dir': '/Volumes/1dp_mfg_sbx/validation_test_eric/files/flow_reports/',
    'inactive_threshold_pct': 1.0,  # <1% activity = inactive
    'servers': None,  # None = all servers, or list like ['prod', 'thailand']
    # Server-specific NiFi canvas base URLs
    'server_urls': {
        'thailand': 'https://thbnk01hdpnp002.th-bnk01.nxp.com:8443/nifi/',
        'prod': 'https://us-chd01-prod-nifi.us-chd01.nxp.com:8443/nifi/'
    }
}

# Echo the effective settings so a run log shows what was analyzed.
print("✓ Configuration loaded")
print(f"  Table: {CONFIG['table']}")
print(f"  Analysis period: Last {CONFIG['days_back']} days")
print(f"  Summary output: {CONFIG['output_file']}")
print(f"  Flow reports: {CONFIG['flow_reports_dir']}")

In [None]:
# Cell 3: Load Data from Delta Table (Optimized - Stats Query Skipped for Performance)

# This cell runs no query. It only announces that the stats query is skipped
# and prepares the WHERE clause that later cells interpolate into their SQL.
print("⚠️  PERFORMANCE OPTIMIZATION: Skipping detailed stats query")
print("   Reason: Table is not partitioned, causing 7+ minute full table scans")
print("   Solution: Run table optimization (see plan) or continue without stats\n")
print("📊 Loading processor metrics directly from Cell 4...")
print("   Dashboard will still generate successfully!")
print("\n" + "="*60)

# Calculate cutoff date
cutoff_date = (datetime.now() - timedelta(days=CONFIG['days_back'])).strftime('%Y-%m-%d')
print(f"\n✓ Cutoff date: {cutoff_date}")

# Build query with server filter if needed (used by Cell 4)
where_clause = f"WHERE snapshot_timestamp >= '{cutoff_date}'"
if CONFIG['servers']:
    # Produces: AND server IN ('a','b',...)
    servers_filter = "','".join(CONFIG['servers'])
    where_clause += f" AND server IN ('{servers_filter}')"

print("✓ Filter clause prepared for Cell 4")
print("\n⚠️  Skipping stats query - proceeding to processor metrics calculation...")

In [None]:
# Cell 4: Calculate Processor Activity Metrics in Spark SQL

print("Calculating processor activity metrics in Spark SQL...")

# Calculate all processor metrics in a single Spark SQL query.
#   processor_activity - one row per snapshot, flagged 1 when the snapshot
#                        shows flow files out or executed tasks.
#   processor_metrics  - one row per processor with totals over the window.
#   final SELECT       - derives activity rate, status, recommendation and
#                        the histogram bucket.
# last_run_status uses MAX_BY(run_status, snapshot_timestamp) so it reflects
# the most recent snapshot. A plain MAX(run_status) would return the
# alphabetically greatest status seen anywhere in the window.
metrics_query = f"""
WITH processor_activity AS (
    SELECT
        server,
        flow_name,
        processor_id,
        processor_name,
        processor_type,
        parent_group_id,
        parent_group_name,
        snapshot_timestamp,
        flow_files_in,
        flow_files_out,
        bytes_in,
        bytes_out,
        tasks,
        run_status,
        CASE
            WHEN flow_files_out > 0 OR tasks > 0 THEN 1
            ELSE 0
        END as has_activity
    FROM {CONFIG['table']}
    {where_clause}
),
processor_metrics AS (
    SELECT
        server,
        flow_name,
        processor_id,
        processor_name,
        processor_type,
        parent_group_id,
        parent_group_name,
        COUNT(*) as total_snapshots,
        SUM(has_activity) as snapshots_with_activity,
        SUM(flow_files_in) as total_flowfiles_in,
        SUM(flow_files_out) as total_flowfiles_out,
        SUM(bytes_in) as total_bytes_in,
        SUM(bytes_out) as total_bytes_out,
        SUM(tasks) as total_tasks,
        MAX(CASE WHEN has_activity = 1 THEN snapshot_timestamp END) as last_active_time,
        MAX_BY(run_status, snapshot_timestamp) as last_run_status
    FROM processor_activity
    GROUP BY server, flow_name, processor_id, processor_name, processor_type, parent_group_id, parent_group_name
)
SELECT
    server,
    flow_name,
    processor_id,
    processor_name,
    processor_type,
    parent_group_id,
    parent_group_name,
    total_snapshots,
    snapshots_with_activity,
    total_flowfiles_in,
    total_flowfiles_out,
    total_bytes_in,
    total_bytes_out,
    total_tasks,
    last_active_time,
    last_run_status,
    ROUND((snapshots_with_activity * 100.0 / total_snapshots), 2) as activity_rate_pct,
    COALESCE(DATEDIFF(CURRENT_TIMESTAMP(), last_active_time), {CONFIG['days_back']}) as days_since_active,
    CASE
        WHEN last_run_status != 'Running' THEN 'Stopped'
        WHEN snapshots_with_activity = 0 THEN 'Inactive'
        WHEN (snapshots_with_activity * 100.0 / total_snapshots) < {CONFIG['inactive_threshold_pct']} THEN 'Low Activity'
        ELSE 'Active'
    END as activity_status,
    CASE
        WHEN snapshots_with_activity = 0 THEN 'Remove'
        WHEN (snapshots_with_activity * 100.0 / total_snapshots) < {CONFIG['inactive_threshold_pct']} THEN 'Review'
        ELSE 'Keep'
    END as recommendation,
    CASE
        WHEN snapshots_with_activity = 0 THEN '0 (Never)'
        WHEN snapshots_with_activity <= 100 THEN '1-100 (Rarely)'
        WHEN snapshots_with_activity <= 1000 THEN '101-1000 (Occasionally)'
        WHEN snapshots_with_activity <= 4000 THEN '1001-4000 (Regularly)'
        ELSE '4000+ (Constantly)'
    END as activity_bucket
FROM processor_metrics
ORDER BY server, flow_name, activity_rate_pct
"""

# Execute query and convert only the aggregated results to Pandas
processor_metrics = spark.sql(metrics_query).toPandas()

print(f"✓ Calculated metrics for {len(processor_metrics)} processors")
print("\nActivity Status Distribution:")
print(processor_metrics['activity_status'].value_counts())
print("\nRecommendations:")
print(processor_metrics['recommendation'].value_counts())

In [None]:
# Cell 5: Helper Functions for Visualizations

def create_summary_cards(metrics_df):
    """Compute the headline numbers shown on the executive summary page.

    Args:
        metrics_df: DataFrame with the columns ``activity_status``,
            ``recommendation``, ``flow_name`` and ``server``.

    Returns:
        dict with processor counts, the number of distinct flows, the list of
        distinct servers, the share of processors recommended for removal
        (percent, one decimal) and the generation timestamp.
    """
    status_col = metrics_df['activity_status']
    rec_col = metrics_df['recommendation']

    n_total = len(metrics_df)
    n_inactive = int(status_col.isin(['Inactive', 'Stopped']).sum())
    n_remove = int((rec_col == 'Remove').sum())
    n_review = int((rec_col == 'Review').sum())

    # Guard against an empty frame so the percentage never divides by zero.
    if n_total > 0:
        removal_share = n_remove / n_total * 100
    else:
        removal_share = 0

    cards = {}
    cards['total_processors'] = n_total
    cards['inactive_processors'] = n_inactive
    cards['cleanup_candidates'] = n_remove
    cards['review_required'] = n_review
    cards['flows_analyzed'] = metrics_df['flow_name'].nunique()
    cards['servers'] = metrics_df['server'].unique().tolist()
    cards['cleanup_impact_pct'] = round(removal_share, 1)
    cards['last_update'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    return cards

def create_status_badge(status):
    """Create HTML status badge.

    Args:
        status: Activity status label ('Active', 'Low Activity', 'Inactive'
            or 'Stopped'). Any other value gets the grey 'Stopped' styling.

    Returns:
        HTML fragment: a coloured-circle icon followed by the status text in a
        bold, coloured ``<span>``.
    """
    # Icons are HTML numeric character references, so the markup stays pure
    # ASCII and cannot be corrupted by an encoding mismatch.
    styles = {
        'Active': ('#28a745', '&#x1F7E2;'),        # green circle
        'Low Activity': ('#ffc107', '&#x1F7E1;'),  # yellow circle
        'Inactive': ('#dc3545', '&#x1F534;'),      # red circle
        'Stopped': ('#6c757d', '&#x26AB;'),        # black circle
    }
    color, icon = styles.get(status, styles['Stopped'])
    return f'{icon} <span style="color:{color};font-weight:bold;">{status}</span>'

def create_recommendation_badge(rec):
    """Create HTML recommendation badge.

    Args:
        rec: Recommendation label ('Keep', 'Review' or 'Remove'). Any other
            value is rendered grey with a '?' symbol.

    Returns:
        HTML fragment: a symbol followed by the recommendation text in a bold,
        coloured ``<span>``.
    """
    # Symbols are HTML numeric character references, so the markup stays pure
    # ASCII and cannot be corrupted by an encoding mismatch.
    styles = {
        'Keep': ('#28a745', '&#x2713;'),             # check mark
        'Review': ('#ffc107', '&#x26A0;&#xFE0F;'),   # warning sign
        'Remove': ('#dc3545', '&#x274C;'),           # cross mark
    }
    color, symbol = styles.get(rec, ('#6c757d', '?'))
    return f'{symbol} <span style="color:{color};font-weight:bold;">{rec}</span>'

def create_canvas_link(server, parent_group_id, processor_id):
    """
    Create a clickable link to open the processor in NiFi canvas.

    Args:
        server: Server name; must be a key of CONFIG['server_urls']
            ('prod', 'thailand')
        parent_group_id: Parent process group UUID
        processor_id: Processor UUID

    Returns:
        HTML link element or "N/A" if data is missing
    """
    # Local import keeps this helper self-contained.
    from html import escape

    # Get base URL for server
    base_url = CONFIG['server_urls'].get(server)

    # Handle missing data (unknown server or empty/None identifiers)
    if not base_url or not parent_group_id or not processor_id:
        return 'N/A'

    # Construct canvas URL
    canvas_url = f"{base_url}?processGroupId={parent_group_id}&componentIds={processor_id}"

    # The URL goes into an HTML attribute, so '&' and quotes are escaped there.
    # The link icon is a numeric character reference to keep the markup ASCII.
    return (
        f'<a href="{escape(canvas_url, quote=True)}" target="_blank" '
        'class="canvas-link" title="Open in NiFi Canvas">&#x1F517; View</a>'
    )

# Check mark written as the real Unicode character (was mis-decoded bytes).
print("✓ Helper functions defined")

In [None]:
# Cell 6: Create Executive Summary Visualizations

print("Creating executive summary visualizations...")

summary = create_summary_cards(processor_metrics)

# Pie chart: Active vs Inactive
# value_counts() orders labels by frequency, so colours are looked up per
# label. A positional colour list would paint whichever status is most common
# green.
status_colors = {
    'Active': '#28a745',
    'Low Activity': '#ffc107',
    'Inactive': '#dc3545',
    'Stopped': '#6c757d',
}
status_counts = processor_metrics['activity_status'].value_counts()
fig_pie = go.Figure(data=[go.Pie(
    labels=status_counts.index,
    values=status_counts.values,
    marker=dict(colors=[status_colors.get(label, '#6c757d') for label in status_counts.index]),
    hole=0.4
)])
fig_pie.update_layout(
    title="Processor Status Distribution",
    height=400
)

# Bar chart: Top flows by processor count
# 'Inactive' and 'Stopped' count as inactive; everything else counts as active.
flow_summary = processor_metrics.groupby('flow_name').agg({
    'processor_id': 'count',
    'activity_status': lambda x: (x.isin(['Inactive', 'Stopped'])).sum()
}).reset_index()
flow_summary.columns = ['flow_name', 'total_processors', 'inactive_processors']
flow_summary['active_processors'] = flow_summary['total_processors'] - flow_summary['inactive_processors']
flow_summary = flow_summary.sort_values('total_processors', ascending=False).head(10)

fig_bar = go.Figure()
fig_bar.add_trace(go.Bar(
    name='Active',
    x=flow_summary['flow_name'],
    y=flow_summary['active_processors'],
    marker_color='#28a745'
))
fig_bar.add_trace(go.Bar(
    name='Inactive',
    x=flow_summary['flow_name'],
    y=flow_summary['inactive_processors'],
    marker_color='#dc3545'
))
fig_bar.update_layout(
    title="Top 10 Flows by Processor Count",
    barmode='stack',
    xaxis_title="Flow Name",
    yaxis_title="Processor Count",
    height=400,
    xaxis_tickangle=-45
)

print("✓ Executive summary visualizations created")

In [None]:
# Cell 7: Create Per-Flow Visualizations

def create_flow_histogram(flow_data):
    """Build a bar chart of how many processors fall into each activity bucket.

    Args:
        flow_data: DataFrame with an ``activity_bucket`` column.

    Returns:
        Plotly figure with one bar per bucket, in fixed order from
        '0 (Never)' to '4000+ (Constantly)'; buckets with no processors are
        shown as 0.
    """
    # Bucket label paired with its bar colour, in display order.
    bucket_palette = [
        ('0 (Never)', '#dc3545'),
        ('1-100 (Rarely)', '#fd7e14'),
        ('101-1000 (Occasionally)', '#ffc107'),
        ('1001-4000 (Regularly)', '#90EE90'),
        ('4000+ (Constantly)', '#28a745'),
    ]
    ordered_labels = [label for label, _ in bucket_palette]
    bar_colors = [shade for _, shade in bucket_palette]

    # reindex enforces the display order and fills absent buckets with 0.
    per_bucket = flow_data['activity_bucket'].value_counts()
    per_bucket = per_bucket.reindex(ordered_labels, fill_value=0)

    bars = go.Bar(
        x=per_bucket.index,
        y=per_bucket.values,
        marker_color=bar_colors,
        text=per_bucket.values,
        textposition='auto',
    )
    histogram = go.Figure(data=[bars])
    histogram.update_layout(
        title="Processor Activity Frequency",
        xaxis_title="Activity Level",
        yaxis_title="Number of Processors",
        height=350,
        showlegend=False,
    )
    return histogram

def create_flow_timeline(flow_name, flow_server):
    """Create activity timeline for a flow - loads data on-demand.

    Runs one Spark SQL query for the given flow and server, aggregated per
    processor name and day, and renders the daily flow-file output as a
    heatmap.

    Args:
        flow_name: Flow to plot.
        flow_server: Server the flow runs on.

    Returns:
        Plotly heatmap figure (processor name x day), or a small placeholder
        figure titled "No activity data available" when the query returns no
        rows.
    """
    def _sql_literal(value):
        """Quote *value* as a Spark SQL string literal."""
        # Backslash is Spark's escape character inside string literals, so it
        # is doubled first, then single quotes are escaped. Without this a
        # flow name such as "Eric's flow" would break the query.
        escaped = str(value).replace('\\', '\\\\').replace("'", "\\'")
        return f"'{escaped}'"

    # Query only this flow's daily aggregated data
    timeline_query = f"""
    SELECT
        processor_name,
        DATE_TRUNC('day', snapshot_timestamp) as day,
        SUM(flow_files_out) as daily_flowfiles,
        SUM(tasks) as daily_tasks
    FROM {CONFIG['table']}
    {where_clause}
        AND flow_name = {_sql_literal(flow_name)}
        AND server = {_sql_literal(flow_server)}
    GROUP BY processor_name, DATE_TRUNC('day', snapshot_timestamp)
    ORDER BY processor_name, day
    """

    daily = spark.sql(timeline_query).toPandas()

    if daily.empty:
        # Return empty figure if no data
        fig = go.Figure()
        fig.update_layout(title="No activity data available", height=200)
        return fig

    # Convert day to date for pivoting
    daily['day'] = pd.to_datetime(daily['day']).dt.date

    # Create heatmap; days on which a processor has no row become 0
    pivot = daily.pivot(index='processor_name', columns='day', values='daily_flowfiles').fillna(0)

    fig = go.Figure(data=go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale='YlGnBu',
        hovertemplate='Processor: %{y}<br>Date: %{x}<br>FlowFiles: %{z}<extra></extra>'
    ))

    fig.update_layout(
        title="Processor Activity Timeline (Daily)",
        xaxis_title="Date",
        yaxis_title="Processor",
        height=max(400, len(pivot) * 20)  # Scale height with processor count
    )

    return fig

def create_processor_type_chart(flow_data):
    """Build a stacked bar chart of processor counts per processor type.

    Processors with status 'Active' form the green series. 'Inactive',
    'Stopped' and 'Low Activity' are merged into a single red 'Inactive'
    series. A series with no rows is omitted.

    Args:
        flow_data: DataFrame with ``processor_type`` and ``activity_status``
            columns.

    Returns:
        Plotly figure with up to two stacked bar traces.
    """
    counts = (
        flow_data
        .groupby(['processor_type', 'activity_status'])
        .size()
        .reset_index(name='count')
    )

    status = counts['activity_status']
    active_rows = counts[status == 'Active']
    idle_rows = counts[status.isin(['Inactive', 'Stopped', 'Low Activity'])]

    chart = go.Figure()

    if not active_rows.empty:
        chart.add_trace(go.Bar(
            name='Active',
            x=active_rows['processor_type'],
            y=active_rows['count'],
            marker_color='#28a745',
        ))

    if not idle_rows.empty:
        # Collapse the three non-active statuses into one total per type.
        idle_by_type = idle_rows.groupby('processor_type', as_index=False)['count'].sum()
        chart.add_trace(go.Bar(
            name='Inactive',
            x=idle_by_type['processor_type'],
            y=idle_by_type['count'],
            marker_color='#dc3545',
        ))

    layout_options = dict(
        title="Processor Type Distribution",
        barmode='stack',
        xaxis_title="Processor Type",
        yaxis_title="Count",
        height=350,
        xaxis_tickangle=-45,
    )
    chart.update_layout(**layout_options)

    return chart

# Check mark written as the real Unicode character (was mis-decoded bytes).
print("✓ Per-flow visualization functions defined")

In [None]:
# Cell 8: Generate Summary HTML Dashboard

print("Generating summary HTML dashboard...")

# Start building summary HTML. html_summary is a list of fragments that later
# cells keep appending to.
html_summary = []

# HTML header for summary: document head with inline CSS, then the page title
# and the two timestamp lines. The title icon is a numeric character reference
# so the markup stays ASCII regardless of how the file is written.
html_summary.append("""
<!DOCTYPE html>
<html>
<head>
    <title>NiFi Processor Analysis - Executive Summary</title>
    <meta charset="utf-8">
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1400px;
            margin: 0 auto;
            background-color: white;
            padding: 30px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        h1 {
            color: #333;
            border-bottom: 3px solid #007bff;
            padding-bottom: 10px;
        }
        h2 {
            color: #0056b3;
            margin-top: 30px;
            border-left: 4px solid #007bff;
            padding-left: 10px;
        }
        h3 {
            color: #495057;
            margin-top: 20px;
        }
        .summary-cards {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
            margin: 20px 0;
        }
        .card {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        }
        .card-title {
            font-size: 14px;
            opacity: 0.9;
            margin-bottom: 10px;
        }
        .card-value {
            font-size: 32px;
            font-weight: bold;
        }
        .card.red {
            background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        }
        .card.green {
            background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
        }
        .card.yellow {
            background: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
        }
        .chart-container {
            margin: 20px 0;
        }
        .timestamp {
            color: #6c757d;
            font-size: 14px;
            margin-top: 10px;
        }
        .flow-list {
            margin: 30px 0;
        }
        .flow-card {
            background-color: #f8f9fa;
            border: 1px solid #dee2e6;
            border-radius: 8px;
            padding: 20px;
            margin: 15px 0;
            transition: box-shadow 0.3s;
        }
        .flow-card:hover {
            box-shadow: 0 4px 12px rgba(0,0,0,0.15);
        }
        .flow-title {
            font-size: 18px;
            font-weight: bold;
            color: #333;
            margin-bottom: 10px;
        }
        .flow-stats {
            display: flex;
            gap: 30px;
            margin: 10px 0;
            font-size: 14px;
            color: #6c757d;
        }
        .flow-link {
            display: inline-block;
            margin-top: 10px;
            padding: 8px 16px;
            background-color: #007bff;
            color: white;
            text-decoration: none;
            border-radius: 4px;
            transition: background-color 0.3s;
        }
        .flow-link:hover {
            background-color: #0056b3;
        }
        .stat-label {
            font-weight: bold;
        }
        .stat-good {
            color: #28a745;
        }
        .stat-warn {
            color: #ffc107;
        }
        .stat-bad {
            color: #dc3545;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            background-color: white;
            font-size: 14px;
        }
        th, td {
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid #dee2e6;
        }
        th {
            background-color: #007bff;
            color: white;
            font-weight: bold;
            position: sticky;
            top: 0;
        }
        tr:hover {
            background-color: #f8f9fa;
        }
        .processor-id {
            font-family: monospace;
            font-size: 11px;
            color: #6c757d;
        }
        .canvas-link {
            display: inline-block;
            padding: 4px 8px;
            background-color: #17a2b8;
            color: white;
            text-decoration: none;
            border-radius: 4px;
            font-size: 12px;
            transition: background-color 0.3s;
        }
        .canvas-link:hover {
            background-color: #138496;
            text-decoration: none;
        }
    </style>
</head>
<body>
<div class="container">
    <h1>&#x1F4CA; NiFi Processor Analysis - Executive Summary</h1>
    <p class="timestamp">Generated: """ + summary['last_update'] + """</p>
    <p class="timestamp">Analysis Period: Last """ + str(CONFIG['days_back']) + """ days</p>
""")

# Executive Summary Cards
# Each entry: (CSS classes, card title, already formatted value). The cards
# are rendered in this order inside a single "summary-cards" grid.
overview_cards = [
    ('card green', 'Total Processors', str(summary['total_processors'])),
    ('card red', 'Inactive Processors', str(summary['inactive_processors'])),
    ('card red', 'Cleanup Candidates', str(summary['cleanup_candidates'])),
    ('card yellow', 'Review Required', str(summary['review_required'])),
    ('card green', 'Flows Analyzed', str(summary['flows_analyzed'])),
    ('card', 'Cleanup Impact', str(summary['cleanup_impact_pct']) + '%'),
]
card_markup = ''.join(
    f'        <div class="{css_classes}">\n'
    f'            <div class="card-title">{card_title}</div>\n'
    f'            <div class="card-value">{card_value}</div>\n'
    '        </div>\n'
    for css_classes, card_title, card_value in overview_cards
)
# Appended as one fragment: heading, grid opener, the six cards, grid closer.
html_summary.append(
    '\n'
    '    <h2>Overview</h2>\n'
    '    <div class="summary-cards">\n'
    + card_markup
    + '    </div>\n'
)

# Add executive summary charts
# The first chart pulls plotly.js from the CDN; the second reuses that script,
# so it is emitted without its own copy.
html_summary.append('<div class="chart-container">')
html_summary.append(fig_pie.to_html(full_html=False, include_plotlyjs='cdn'))
html_summary.append('</div>')

html_summary.append('<div class="chart-container">')
html_summary.append(fig_bar.to_html(full_html=False, include_plotlyjs=False))
html_summary.append('</div>')

# Add flow list section (will be populated in next cell)
html_summary.append('<h2>Individual Flow Reports</h2>')
html_summary.append('<p>Click on any flow below to view its detailed analysis:</p>')
html_summary.append('<div class="flow-list">')

print("✓ Summary HTML header created")

In [None]:
# Cell 9: Generate Individual Flow HTML Files
#
# For every (server, flow) pair this cell writes a standalone HTML report with
# summary cards, three charts and a processor table, then appends a card
# linking to that report onto the summary page being built in html_summary.

print("Generating individual flow HTML files...")

# Cell-local imports
import os
import re
from html import escape as html_escape

# Create output directory if it doesn't exist
os.makedirs(CONFIG['flow_reports_dir'], exist_ok=True)


def _html_text(value):
    """Return *value* as text that is safe to place inside an HTML element."""
    # Names come from the snapshot table and may contain '<', '>' or '&'.
    return html_escape(str(value))


def _format_count(value):
    """Format a numeric total with thousands separators; missing totals show as 0."""
    # SUM() over only-NULL inputs is NULL, which int() cannot convert.
    return '0' if pd.isna(value) else f'{int(value):,}'


period_days = CONFIG['days_back']
flows = processor_metrics[['server', 'flow_name']].drop_duplicates().values

for i, (flow_server, flow_name) in enumerate(flows, 1):
    print(f"  Processing {i}/{len(flows)}: {flow_name} ({flow_server})")

    flow_data = processor_metrics[(processor_metrics['flow_name'] == flow_name) &
                                   (processor_metrics['server'] == flow_server)].copy()

    total_procs = len(flow_data)
    inactive_procs = len(flow_data[flow_data['activity_status'].isin(['Inactive', 'Stopped'])])
    active_procs = total_procs - inactive_procs

    # Create safe filename: every character other than a word character, '.'
    # or '-' becomes '_'. This matches the previous handling of spaces and
    # slashes and also covers characters such as '?', '#' or ':' that would
    # break the file path or the link on the summary page.
    safe_filename = re.sub(r'[^\w.-]', '_', f"{flow_server}_{flow_name}")
    flow_html_file = f"{CONFIG['flow_reports_dir']}{safe_filename}.html"

    # Escaped once, reused in every HTML fragment below.
    flow_title = _html_text(flow_name)
    server_label = _html_text(flow_server)

    # Build individual flow HTML
    flow_html = []

    # HTML header for flow
    flow_html.append(f"""
<!DOCTYPE html>
<html>
<head>
    <title>NiFi Flow Analysis - {flow_title}</title>
    <meta charset="utf-8">
    <style>
        body {{
            font-family: Arial, sans-serif;
            margin: 0;
            background-color: #f5f5f5;
        }}
        .container {{
            max-width: 1400px;
            margin: 0 auto;
            background-color: white;
            padding: 30px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }}
        h1 {{
            color: #333;
            border-bottom: 3px solid #007bff;
            padding-bottom: 10px;
        }}
        h2 {{
            color: #0056b3;
            margin-top: 30px;
            border-left: 4px solid #007bff;
            padding-left: 10px;
        }}
        h3 {{
            color: #495057;
            margin-top: 20px;
        }}
        .summary-cards {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
            gap: 15px;
            margin: 20px 0;
        }}
        .card {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 15px;
            border-radius: 8px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        }}
        .card-title {{
            font-size: 13px;
            opacity: 0.9;
            margin-bottom: 8px;
        }}
        .card-value {{
            font-size: 24px;
            font-weight: bold;
        }}
        .card.green {{
            background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
        }}
        .card.red {{
            background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        }}
        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            background-color: white;
            font-size: 13px;
        }}
        th, td {{
            padding: 10px;
            text-align: left;
            border-bottom: 1px solid #dee2e6;
        }}
        th {{
            background-color: #007bff;
            color: white;
            font-weight: bold;
            position: sticky;
            top: 0;
        }}
        tr:hover {{
            background-color: #f8f9fa;
        }}
        .chart-container {{
            margin: 20px 0;
        }}
        .timestamp {{
            color: #6c757d;
            font-size: 14px;
            margin-top: 10px;
        }}
        .processor-id {{
            font-family: monospace;
            font-size: 11px;
            color: #6c757d;
        }}
        .canvas-link {{
            display: inline-block;
            padding: 4px 8px;
            background-color: #17a2b8;
            color: white;
            text-decoration: none;
            border-radius: 4px;
            font-size: 12px;
            transition: background-color 0.3s;
        }}
        .canvas-link:hover {{
            background-color: #138496;
            text-decoration: none;
        }}
    </style>
</head>
<body>
<div class="container">
    <h1>&#x1F4C1; {flow_title}</h1>
    <p class="timestamp">Server: {server_label}</p>
    <p class="timestamp">Analysis Period: Last {period_days} days</p>
""")

    # Flow summary cards
    flow_html.append(f"""
    <div class="summary-cards">
        <div class="card green">
            <div class="card-title">Total Processors</div>
            <div class="card-value">{total_procs}</div>
        </div>
        <div class="card green">
            <div class="card-title">Active</div>
            <div class="card-value">{active_procs}</div>
        </div>
        <div class="card red">
            <div class="card-title">Inactive</div>
            <div class="card-value">{inactive_procs}</div>
        </div>
    </div>
""")

    # Activity histogram (first chart on the page, so it loads plotly.js)
    flow_html.append('<h2>Activity Frequency Distribution</h2>')
    fig_hist = create_flow_histogram(flow_data)
    flow_html.append('<div class="chart-container">')
    flow_html.append(fig_hist.to_html(full_html=False, include_plotlyjs='cdn'))
    flow_html.append('</div>')

    # Activity timeline
    flow_html.append('<h2>Activity Timeline</h2>')
    fig_timeline = create_flow_timeline(flow_name, flow_server)
    flow_html.append('<div class="chart-container">')
    flow_html.append(fig_timeline.to_html(full_html=False, include_plotlyjs=False))
    flow_html.append('</div>')

    # Processor type distribution
    flow_html.append('<h2>Processor Type Distribution</h2>')
    fig_types = create_processor_type_chart(flow_data)
    flow_html.append('<div class="chart-container">')
    flow_html.append(fig_types.to_html(full_html=False, include_plotlyjs=False))
    flow_html.append('</div>')

    # Processor details table - sorted by parent group (no header rows).
    # The window length in the column titles follows CONFIG['days_back'].
    column_titles = [
        'Processor ID',
        'Processor Name',
        'Type',
        'Parent Group Name',
        'Parent Group ID',
        'Canvas Link',
        'Status',
        'Activity %',
        f'FlowFiles ({period_days}d)',
        f'Tasks ({period_days}d)',
        'Days Since Active',
        'Recommendation',
    ]
    flow_html.append('<h2>Processor Details</h2>')
    flow_html.append('<table>')
    flow_html.append('<tr>')
    flow_html.extend(f'<th>{title}</th>' for title in column_titles)
    flow_html.append('</tr>')

    # Sort by parent group name first, then by activity rate within each group
    flow_data_sorted = flow_data.sort_values(['parent_group_name', 'activity_rate_pct'])

    # NO group header logic - just iterate through sorted rows
    for _, row in flow_data_sorted.iterrows():
        cells = [
            f'<span class="processor-id">{_html_text(row["processor_id"])}</span>',
            _html_text(row["processor_name"]),
            _html_text(row["processor_type"]),
            _html_text(row["parent_group_name"]),
            f'<span class="processor-id">{_html_text(row["parent_group_id"])}</span>',
            create_canvas_link(flow_server, row["parent_group_id"], row["processor_id"]),
            create_status_badge(row["activity_status"]),
            f'{row["activity_rate_pct"]:.2f}%',
            _format_count(row["total_flowfiles_out"]),
            _format_count(row["total_tasks"]),
            str(int(row["days_since_active"])),
            create_recommendation_badge(row["recommendation"]),
        ]
        flow_html.append('<tr>')
        flow_html.extend(f'<td>{cell}</td>' for cell in cells)
        flow_html.append('</tr>')

    flow_html.append('</table>')
    flow_html.append('</div></body></html>')

    # Write individual flow HTML file
    with open(flow_html_file, 'w', encoding='utf-8') as f:
        f.write(''.join(flow_html))

    # Add link to summary page (path is relative to the summary file)
    html_summary.append(f"""
    <div class="flow-card">
        <div class="flow-title">&#x1F4C1; {flow_title}</div>
        <div class="flow-stats">
            <div><span class="stat-label">Server:</span> {server_label}</div>
            <div><span class="stat-label">Total:</span> {total_procs} processors</div>
            <div><span class="stat-label stat-good">Active:</span> {active_procs}</div>
            <div><span class="stat-label stat-bad">Inactive:</span> {inactive_procs}</div>
        </div>
        <a href="flow_reports/{safe_filename}.html" class="flow-link">View Detailed Report &#x2192;</a>
    </div>
""")

# Close flow list and summary HTML
html_summary.append('</div>')  # Close flow-list
html_summary.append('</div></body></html>')

print(f"✓ Generated {len(flows)} individual flow HTML files")

In [None]:
# Cell 10: Add Cleanup Recommendations to Summary

# Cell-local import
from html import escape as html_escape

print("Adding cleanup recommendations to summary page...")


def _html_text(value):
    """Return *value* as text that is safe to place inside an HTML element."""
    # Names come from the snapshot table and may contain '<', '>' or '&'.
    return html_escape(str(value))


def _format_count(value):
    """Format a numeric total with thousands separators; missing totals show as 0."""
    # SUM() over only-NULL inputs is NULL, which int() cannot convert.
    return '0' if pd.isna(value) else f'{int(value):,}'


# Row limits for the two tables shown on the summary page.
MAX_REMOVE_ROWS = 50
MAX_REVIEW_ROWS = 20

# High-priority removals
remove_list = processor_metrics[processor_metrics['recommendation'] == 'Remove'].copy()
review_list = processor_metrics[processor_metrics['recommendation'] == 'Review'].copy()
keep_list = processor_metrics[processor_metrics['recommendation'] == 'Keep']

# Add cleanup section to summary HTML (before closing the flow-list div)
cleanup_html = []

cleanup_html.append("""
    <h2>&#x1F9F9; Cleanup Recommendations</h2>
""")

cleanup_html.append(f'<h3>High-Priority Removals ({len(remove_list)} processors)</h3>')
cleanup_html.append('<p>These processors have zero activity and can be safely removed:</p>')

if len(remove_list) > 0:
    cleanup_html.append('<table>')
    cleanup_html.append('<tr><th>Processor ID</th><th>Flow</th><th>Server</th><th>Processor Name</th><th>Type</th><th>Parent Group</th><th>Canvas Link</th><th>Run Status</th><th>Days Inactive</th></tr>')
    for _, row in remove_list.head(MAX_REMOVE_ROWS).iterrows():  # Limit to 50 for summary
        cells = [
            f'<span class="processor-id">{_html_text(row["processor_id"])}</span>',
            _html_text(row["flow_name"]),
            _html_text(row["server"]),
            _html_text(row["processor_name"]),
            _html_text(row["processor_type"]),
            _html_text(row["parent_group_name"]),
            create_canvas_link(row["server"], row["parent_group_id"], row["processor_id"]),
            _html_text(row["last_run_status"]),
            str(int(row["days_since_active"])),
        ]
        cleanup_html.append('<tr>')
        cleanup_html.extend(f'<td>{cell}</td>' for cell in cells)
        cleanup_html.append('</tr>')
    cleanup_html.append('</table>')
    if len(remove_list) > MAX_REMOVE_ROWS:
        cleanup_html.append(f'<p><em>Showing top {MAX_REMOVE_ROWS} of {len(remove_list)} processors recommended for removal. See cleanup CSV for full list.</em></p>')
else:
    cleanup_html.append('<p>&#x2713; No processors recommended for removal!</p>')

# Review required
cleanup_html.append(f'<h3>Review Required ({len(review_list)} processors)</h3>')
cleanup_html.append('<p>These processors have low activity and should be reviewed manually:</p>')

if len(review_list) > 0:
    cleanup_html.append('<table>')
    cleanup_html.append('<tr><th>Processor ID</th><th>Flow</th><th>Server</th><th>Processor Name</th><th>Type</th><th>Parent Group</th><th>Canvas Link</th><th>Activity %</th><th>Total Tasks</th></tr>')
    for _, row in review_list.head(MAX_REVIEW_ROWS).iterrows():  # Limit to 20
        cells = [
            f'<span class="processor-id">{_html_text(row["processor_id"])}</span>',
            _html_text(row["flow_name"]),
            _html_text(row["server"]),
            _html_text(row["processor_name"]),
            _html_text(row["processor_type"]),
            _html_text(row["parent_group_name"]),
            create_canvas_link(row["server"], row["parent_group_id"], row["processor_id"]),
            f'{row["activity_rate_pct"]:.2f}%',
            _format_count(row["total_tasks"]),
        ]
        cleanup_html.append('<tr>')
        cleanup_html.extend(f'<td>{cell}</td>' for cell in cells)
        cleanup_html.append('</tr>')
    cleanup_html.append('</table>')
    if len(review_list) > MAX_REVIEW_ROWS:
        cleanup_html.append(f'<p><em>Showing top {MAX_REVIEW_ROWS} of {len(review_list)} processors requiring review. See cleanup CSV for full list.</em></p>')
else:
    cleanup_html.append('<p>&#x2713; No processors require review!</p>')

# Active processors summary
cleanup_html.append(f'<h3>Active Processors ({len(keep_list)} processors)</h3>')
cleanup_html.append(f'<p>‚úì {len(keep_list)} processors are active and should be kept.</p>')
cleanup_html.append(f'<p>Average activity rate: {keep_list["activity_rate_pct"].mean():.1f}%</p>')
cleanup_html.append(f'<p>Total throughput (30d): {int(keep_list["total_flowfiles_out"].sum()):,} flowfiles</p>')

# Store cleanup HTML for later insertion into summary
cleanup_html_content = ''.join(cleanup_html)

print("‚úì Cleanup recommendations section created")

In [None]:
# Cell 11: Write Summary HTML File
#
# Merges the cleanup recommendations fragment into the summary page and
# writes the finished page to CONFIG['output_file'].

print("Writing summary HTML file...")

# html_summary already ends with the document's closing tags, so simply
# appending the cleanup section would place it after </html>. Splice it in
# just before those tags instead. html_summary itself is left unmodified so
# re-running this cell does not duplicate the section.
closing_tags = '</div></body></html>'
html_content = ''.join(html_summary)

if html_content.endswith(closing_tags):
    html_content = (
        html_content[:-len(closing_tags)] + cleanup_html_content + closing_tags
    )
else:
    # Closing tags not present at the end: fall back to a plain append.
    html_content += cleanup_html_content

# Write summary HTML file
with open(CONFIG['output_file'], 'w', encoding='utf-8') as f:
    f.write(html_content)

print(f"‚úì Summary HTML saved to: {CONFIG['output_file']}")

In [None]:
# Cell 12: Export Summary Files
#
# Writes the summary statistics (JSON) and the per-processor cleanup
# recommendations (CSV) into CONFIG['output_dir'], then prints a recap of
# every artifact produced by the notebook.

import os  # stdlib; imported here so the cell is self-contained

print("Exporting additional summary files...")

# Export summary JSON
# os.path.join works whether or not output_dir ends with a path separator.
summary_file = os.path.join(CONFIG['output_dir'], 'summary_stats.json')
with open(summary_file, 'w', encoding='utf-8') as f:
    # default=str: values json cannot serialize natively are written as str(value)
    json.dump(summary, f, indent=2, default=str)
print(f"‚úì Summary stats saved to: {summary_file}")

# Export cleanup CSV (all processors, every recommendation category)
cleanup_file = os.path.join(CONFIG['output_dir'], 'cleanup_recommendations_all_flows.csv')
cleanup_cols = ['server', 'flow_name', 'processor_id', 'processor_name', 'processor_type',
                'recommendation', 'activity_rate_pct', 'days_since_active', 'last_run_status']
processor_metrics[cleanup_cols].to_csv(cleanup_file, index=False)
print(f"‚úì Cleanup recommendations saved to: {cleanup_file}")

# Number of processors whose recommendation is 'Keep'
active_count = int((processor_metrics['recommendation'] == 'Keep').sum())

print("\n" + "="*60)
print("‚úÖ DASHBOARD GENERATION COMPLETE!")
print("="*60)
print(f"\nüìä Summary dashboard: {CONFIG['output_file']}")
print(f"üìÅ Individual flow reports: {CONFIG['flow_reports_dir']}")
print(f"üìÑ Summary stats: {summary_file}")
print(f"üìÑ Cleanup CSV: {cleanup_file}")
print(f"\nüìà Statistics:")
print(f"  Total processors: {summary['total_processors']}")
print(f"  Flows analyzed: {summary['flows_analyzed']}")
print(f"  üî¥ Cleanup candidates: {summary['cleanup_candidates']}")
print(f"  ‚ö†Ô∏è  Review required: {summary['review_required']}")
print(f"  üü¢ Active: {active_count}")
# Name the dashboard file from the configuration so the hint stays correct
# if output_file is changed.
print(f"\nOpen {os.path.basename(CONFIG['output_file'])} to view the dashboard!")
print("Click on any flow to view its detailed report.")