# NiFi Processor Analysis Dashboard

Generates an interactive HTML dashboard that analyzes processor activity across NiFi flows.

**Purpose**: Identify unused processors for cleanup/removal
**Data Source**: Delta table `nifi_processor_snapshots_full_attributes`
**Time Range**: Last 30 days
**Output**: Interactive HTML dashboard with Plotly visualizations

In [None]:
# Cell 1: Setup & Imports
#
# Third-party stack: pandas for the aggregated result frames, Plotly for the
# charts, PySpark for querying the Delta table.
#
# NOTE(review): later cells call `spark.sql(...)`, but no cell shown in this
# file creates `spark` -- presumably the notebook runtime injects the session;
# confirm before running this outside that environment.
# NOTE(review): `px`, `make_subplots`, `SparkSession` and the
# `pyspark.sql.functions` names are not referenced by the cells shown here.

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum as spark_sum, max as spark_max, min as spark_min, when, datediff, current_timestamp, lit

print("‚úì Libraries imported successfully!")

In [None]:
# Cell 2: Configuration
#
# Single source of settings for the rest of the notebook: which table to read,
# how far back to look, where the artefacts are written, and the activity-rate
# cut-off used when classifying processors.

CONFIG = dict(
    table='1dp_mfg_sbx.validation_test_eric.nifi_processor_snapshots_full_attributes',
    days_back=30,
    output_file='/Volumes/1dp_mfg_sbx/validation_test_eric/files/processor_analysis_dashboard.html',
    output_dir='/Volumes/1dp_mfg_sbx/validation_test_eric/files/',
    # Activity rate (in percent) below which a processor is flagged for review.
    inactive_threshold_pct=1.0,
    # None selects every server; otherwise pass a list such as ['prod', 'thailand'].
    servers=None,
)

# One print call, one line per setting -- same console output as separate prints.
print(
    "‚úì Configuration loaded",
    f"  Table: {CONFIG['table']}",
    f"  Analysis period: Last {CONFIG['days_back']} days",
    f"  Output: {CONFIG['output_file']}",
    sep="\n",
)

In [None]:
# Cell 3: Load Data from Delta Table (Optimized)
#
# Builds the WHERE clause shared by every later query and prints a cheap,
# aggregate-only overview of the rows inside the analysis window. No raw
# snapshot rows are pulled to the driver here.

print("Loading data from Delta table...")

# First calendar day of the window, formatted so it can be used as a SQL literal.
window_start = datetime.now() - timedelta(days=CONFIG['days_back'])
cutoff_date = window_start.strftime('%Y-%m-%d')
print(f"  Cutoff date: {cutoff_date}")

# Collect the individual predicates, then join them into one clause.
# `where_clause` is interpolated verbatim into the queries of later cells.
conditions = [f"snapshot_timestamp >= '{cutoff_date}'"]
if CONFIG['servers']:
    servers_filter = "','".join(CONFIG['servers'])
    conditions.append(f"server IN ('{servers_filter}')")
where_clause = "WHERE " + " AND ".join(conditions)

# Overview statistics computed entirely in Spark; the result is a single row.
stats_query = f"""
    SELECT 
        COUNT(*) as total_records,
        COUNT(DISTINCT processor_id) as unique_processors,
        COUNT(DISTINCT flow_name) as unique_flows,
        COUNT(DISTINCT server) as unique_servers,
        MIN(snapshot_timestamp) as earliest_snapshot,
        MAX(snapshot_timestamp) as latest_snapshot
    FROM {CONFIG['table']}
    {where_clause}
"""

stats = spark.sql(stats_query).toPandas().iloc[0]

overview_lines = [
    "‚úì Data overview:",
    f"  Total records: {stats['total_records']:,}",
    f"  Unique processors: {stats['unique_processors']}",
    f"  Flows: {stats['unique_flows']}",
    f"  Servers: {stats['unique_servers']}",
    f"  Date range: {stats['earliest_snapshot']} to {stats['latest_snapshot']}",
]
print("\n".join(overview_lines))

print("\n‚ö†Ô∏è  Optimizing: Aggregating metrics in Spark SQL to avoid memory issues...")
print("   (This may take a few minutes for large datasets)")

In [None]:
# Cell 4: Calculate Processor Activity Metrics in Spark SQL
#
# One query produces one row per (server, flow, processor) with:
#   * snapshot counts and summed throughput over the analysis window,
#   * activity_rate_pct  - share of snapshots where flow_files_out > 0 or tasks > 0,
#   * days_since_active  - days since the last active snapshot, falling back to
#                          CONFIG['days_back'] when the processor was never active,
#   * activity_status / recommendation / activity_bucket - derived labels used
#     by the dashboard cells.
# Only this aggregated result is converted to pandas.
#
# Notes on the aggregation:
#   * last_run_status uses MAX_BY(run_status, snapshot_timestamp), i.e. the
#     status recorded in the most recent snapshot. A plain MAX(run_status)
#     would return the alphabetically greatest status ever seen instead.
#   * Summed metrics are wrapped in COALESCE(..., 0) because SUM over all-NULL
#     input yields NULL, which would surface as NaN in pandas and break the
#     int() formatting in the rendering cells.

print("Calculating processor activity metrics in Spark SQL...")

# Calculate all processor metrics in a single Spark SQL query
metrics_query = f"""
WITH processor_activity AS (
    SELECT
        server,
        flow_name,
        processor_id,
        processor_name,
        processor_type,
        snapshot_timestamp,
        flow_files_in,
        flow_files_out,
        bytes_in,
        bytes_out,
        tasks,
        run_status,
        CASE
            WHEN flow_files_out > 0 OR tasks > 0 THEN 1
            ELSE 0
        END as has_activity
    FROM {CONFIG['table']}
    {where_clause}
),
processor_metrics AS (
    SELECT
        server,
        flow_name,
        processor_id,
        processor_name,
        processor_type,
        COUNT(*) as total_snapshots,
        SUM(has_activity) as snapshots_with_activity,
        COALESCE(SUM(flow_files_in), 0) as total_flowfiles_in,
        COALESCE(SUM(flow_files_out), 0) as total_flowfiles_out,
        COALESCE(SUM(bytes_in), 0) as total_bytes_in,
        COALESCE(SUM(bytes_out), 0) as total_bytes_out,
        COALESCE(SUM(tasks), 0) as total_tasks,
        MAX(CASE WHEN has_activity = 1 THEN snapshot_timestamp END) as last_active_time,
        MAX_BY(run_status, snapshot_timestamp) as last_run_status
    FROM processor_activity
    GROUP BY server, flow_name, processor_id, processor_name, processor_type
)
SELECT
    server,
    flow_name,
    processor_id,
    processor_name,
    processor_type,
    total_snapshots,
    snapshots_with_activity,
    total_flowfiles_in,
    total_flowfiles_out,
    total_bytes_in,
    total_bytes_out,
    total_tasks,
    last_active_time,
    last_run_status,
    ROUND((snapshots_with_activity * 100.0 / total_snapshots), 2) as activity_rate_pct,
    COALESCE(DATEDIFF(CURRENT_TIMESTAMP(), last_active_time), {CONFIG['days_back']}) as days_since_active,
    CASE
        WHEN last_run_status != 'Running' THEN 'Stopped'
        WHEN snapshots_with_activity = 0 THEN 'Inactive'
        WHEN (snapshots_with_activity * 100.0 / total_snapshots) < {CONFIG['inactive_threshold_pct']} THEN 'Low Activity'
        ELSE 'Active'
    END as activity_status,
    CASE
        WHEN snapshots_with_activity = 0 THEN 'Remove'
        WHEN (snapshots_with_activity * 100.0 / total_snapshots) < {CONFIG['inactive_threshold_pct']} THEN 'Review'
        ELSE 'Keep'
    END as recommendation,
    CASE
        WHEN snapshots_with_activity = 0 THEN '0 (Never)'
        WHEN snapshots_with_activity <= 100 THEN '1-100 (Rarely)'
        WHEN snapshots_with_activity <= 1000 THEN '101-1000 (Occasionally)'
        WHEN snapshots_with_activity <= 4000 THEN '1001-4000 (Regularly)'
        ELSE '4000+ (Constantly)'
    END as activity_bucket
FROM processor_metrics
ORDER BY server, flow_name, activity_rate_pct
"""

# Execute query and convert only the aggregated results to Pandas
processor_metrics = spark.sql(metrics_query).toPandas()

print(f"‚úì Calculated metrics for {len(processor_metrics)} processors")
print("\nActivity Status Distribution:")
print(processor_metrics['activity_status'].value_counts())
print("\nRecommendations:")
print(processor_metrics['recommendation'].value_counts())

In [None]:
# Cell 5: Helper Functions for Visualizations

def create_summary_cards(metrics_df):
    """Compute the headline numbers shown in the executive summary.

    Args:
        metrics_df: Per-processor metrics frame; the columns read are
            ``activity_status``, ``recommendation``, ``flow_name`` and
            ``server``.

    Returns:
        dict with processor counts (total, inactive, removal candidates,
        review candidates), the number of distinct flows, the list of
        distinct servers, the removal share in percent (one decimal) and
        the generation timestamp as ``YYYY-MM-DD HH:MM:SS``.
    """
    status = metrics_df['activity_status']
    recommendation = metrics_df['recommendation']

    n_total = len(metrics_df)
    # int(...) keeps the counts as plain Python ints, as len() would return.
    n_remove = int((recommendation == 'Remove').sum())

    # Guard the division so an empty frame reports 0 instead of raising.
    if n_total > 0:
        impact = n_remove / n_total * 100
    else:
        impact = 0

    return {
        'total_processors': n_total,
        'inactive_processors': int(status.isin(['Inactive', 'Stopped']).sum()),
        'cleanup_candidates': n_remove,
        'review_required': int((recommendation == 'Review').sum()),
        'flows_analyzed': metrics_df['flow_name'].nunique(),
        'servers': metrics_df['server'].unique().tolist(),
        'cleanup_impact_pct': round(impact, 1),
        'last_update': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }

def create_status_badge(status):
    """Return an HTML snippet (marker glyph + coloured bold label) for a status.

    Args:
        status: Activity status label. 'Active', 'Low Activity', 'Inactive'
            and 'Stopped' have dedicated styling; any other value is
            rendered with the same glyph and grey colour as 'Stopped'.

    Returns:
        str: ``<glyph> <span style="color:...;font-weight:bold;">status</span>``.
    """
    fallback = ('‚ö´', '#6c757d')
    # status -> (glyph, text colour)
    palette = {
        'Active': ('üü¢', '#28a745'),
        'Low Activity': ('üü°', '#ffc107'),
        'Inactive': ('üî¥', '#dc3545'),
        'Stopped': fallback,
    }
    emoji, color = palette.get(status, fallback)
    return f'{emoji} <span style="color:{color};font-weight:bold;">{status}</span>'

def create_recommendation_badge(rec):
    """Return an HTML snippet (symbol + coloured bold label) for a recommendation.

    Args:
        rec: Recommendation label. 'Keep', 'Review' and 'Remove' have
            dedicated styling; any other value is rendered with '?' in grey.

    Returns:
        str: ``<symbol> <span style="color:...;font-weight:bold;">rec</span>``.
    """
    # recommendation -> (symbol, text colour)
    styles = {
        'Keep': ('‚úì', '#28a745'),
        'Review': ('‚ö†Ô∏è', '#ffc107'),
        'Remove': ('‚ùå', '#dc3545'),
    }
    symbol, color = styles.get(rec, ('?', '#6c757d'))
    return f'{symbol} <span style="color:{color};font-weight:bold;">{rec}</span>'

print("‚úì Helper functions defined")

In [None]:
# Cell 6: Create Executive Summary Visualizations
#
# Produces the objects consumed by the HTML cells:
#   summary - dict of headline numbers,
#   fig_pie - donut chart of processors per activity status,
#   fig_bar - stacked bar chart of the 10 flows with the most processors.

print("Creating executive summary visualizations...")

summary = create_summary_cards(processor_metrics)

# Pie chart: Active vs Inactive
status_counts = processor_metrics['activity_status'].value_counts()

# value_counts() orders labels by frequency, so colours must be looked up per
# label; a fixed positional colour list would paint whichever status happens
# to be most common in the first colour.
status_palette = {
    'Active': '#28a745',
    'Low Activity': '#ffc107',
    'Inactive': '#dc3545',
    'Stopped': '#6c757d',
}
pie_colors = [status_palette.get(label, '#6c757d') for label in status_counts.index]

fig_pie = go.Figure(data=[go.Pie(
    labels=status_counts.index,
    values=status_counts.values,
    marker=dict(colors=pie_colors),
    hole=0.4
)])
fig_pie.update_layout(
    title="Processor Status Distribution",
    height=400
)

# Bar chart: Top flows by processor count.
# "Inactive" counts processors whose status is Inactive or Stopped; "Active"
# is everything else in the flow, which therefore includes Low Activity.
flow_summary = processor_metrics.groupby('flow_name').agg({
    'processor_id': 'count',
    'activity_status': lambda x: (x.isin(['Inactive', 'Stopped'])).sum()
}).reset_index()
flow_summary.columns = ['flow_name', 'total_processors', 'inactive_processors']
flow_summary['active_processors'] = flow_summary['total_processors'] - flow_summary['inactive_processors']
flow_summary = flow_summary.sort_values('total_processors', ascending=False).head(10)

fig_bar = go.Figure()
fig_bar.add_trace(go.Bar(
    name='Active',
    x=flow_summary['flow_name'],
    y=flow_summary['active_processors'],
    marker_color='#28a745'
))
fig_bar.add_trace(go.Bar(
    name='Inactive',
    x=flow_summary['flow_name'],
    y=flow_summary['inactive_processors'],
    marker_color='#dc3545'
))
fig_bar.update_layout(
    title="Top 10 Flows by Processor Count",
    barmode='stack',
    xaxis_title="Flow Name",
    yaxis_title="Processor Count",
    height=400,
    xaxis_tickangle=-45
)

print("‚úì Executive summary visualizations created")

In [None]:
# Cell 7: Create Per-Flow Visualizations

def create_flow_histogram(flow_data):
    """Build a bar chart of how many processors fall into each activity bucket.

    Args:
        flow_data: Per-processor metrics for one flow; only the
            ``activity_bucket`` column is read.

    Returns:
        plotly Figure with one bar per bucket, always in the fixed order
        from "never" to "constantly"; buckets without processors show 0.
    """
    # Bucket label -> bar colour, listed in display order (red to green).
    bucket_colors = {
        '0 (Never)': '#dc3545',
        '1-100 (Rarely)': '#fd7e14',
        '101-1000 (Occasionally)': '#ffc107',
        '1001-4000 (Regularly)': '#90EE90',
        '4000+ (Constantly)': '#28a745',
    }

    # Reindexing pins the bucket order and fills absent buckets with zero.
    bucket_counts = (
        flow_data['activity_bucket']
        .value_counts()
        .reindex(list(bucket_colors), fill_value=0)
    )

    bars = go.Bar(
        x=bucket_counts.index,
        y=bucket_counts.values,
        marker_color=list(bucket_colors.values()),
        text=bucket_counts.values,
        textposition='auto',
    )
    fig = go.Figure(data=[bars])
    fig.update_layout(
        title="Processor Activity Frequency",
        xaxis_title="Activity Level",
        yaxis_title="Number of Processors",
        height=350,
        showlegend=False,
    )
    return fig

def _sql_string_literal(value):
    """Render *value* as a single-quoted Spark SQL string literal.

    Backslashes and single quotes are backslash-escaped so that a name such
    as ``Bob's flow`` cannot terminate the literal early.
    NOTE(review): assumes Spark's default string-literal parsing
    (``spark.sql.parser.escapedStringLiterals`` = false) -- confirm for this
    cluster.
    """
    escaped = str(value).replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def create_flow_timeline(flow_name, flow_server):
    """Create a daily activity heatmap for one flow, querying Spark on demand.

    Only this flow's rows are aggregated (per processor and day), so the raw
    snapshots never have to be held in pandas.

    Args:
        flow_name: Flow to plot; matched against the ``flow_name`` column.
        flow_server: Server the flow runs on; matched against ``server``.

    Returns:
        plotly Figure: a heatmap of summed ``flow_files_out`` per processor
        (rows) and day (columns), or an empty 200px figure titled
        "No activity data available" when the query returns no rows.
        ``daily_tasks`` is selected by the query but not plotted.
    """
    # Query only this flow's daily aggregated data. The names are quoted via
    # _sql_string_literal because they come from the snapshot data.
    timeline_query = f"""
    SELECT
        processor_name,
        DATE_TRUNC('day', snapshot_timestamp) as day,
        SUM(flow_files_out) as daily_flowfiles,
        SUM(tasks) as daily_tasks
    FROM {CONFIG['table']}
    {where_clause}
        AND flow_name = {_sql_string_literal(flow_name)}
        AND server = {_sql_string_literal(flow_server)}
    GROUP BY processor_name, DATE_TRUNC('day', snapshot_timestamp)
    ORDER BY processor_name, day
    """

    daily = spark.sql(timeline_query).toPandas()

    if daily.empty:
        # Return empty figure if no data
        fig = go.Figure()
        fig.update_layout(title="No activity data available", height=200)
        return fig

    # Convert day to date for pivoting
    daily['day'] = pd.to_datetime(daily['day']).dt.date

    # Rows = processors, columns = days; days without a row count as 0 flowfiles.
    pivot = daily.pivot(index='processor_name', columns='day', values='daily_flowfiles').fillna(0)

    fig = go.Figure(data=go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale='YlGnBu',
        hovertemplate='Processor: %{y}<br>Date: %{x}<br>FlowFiles: %{z}<extra></extra>'
    ))

    fig.update_layout(
        title="Processor Activity Timeline (Daily)",
        xaxis_title="Date",
        yaxis_title="Processor",
        height=max(400, len(pivot) * 20)  # Scale height with processor count
    )

    return fig

def create_processor_type_chart(flow_data):
    """Build a stacked bar chart of processor counts per processor type.

    Args:
        flow_data: Per-processor metrics for one flow; the columns read are
            ``processor_type`` and ``activity_status``.

    Returns:
        plotly Figure with up to two stacked traces: 'Active' (status
        'Active') and 'Inactive' (statuses 'Inactive', 'Stopped' and
        'Low Activity' summed per type). A trace is omitted when it would
        have no rows.
    """
    counts = (
        flow_data
        .groupby(['processor_type', 'activity_status'])
        .size()
        .reset_index(name='count')
    )

    active_rows = counts[counts['activity_status'] == 'Active']
    idle_rows = counts[counts['activity_status'].isin(['Inactive', 'Stopped', 'Low Activity'])]

    fig = go.Figure()

    if len(active_rows) > 0:
        fig.add_trace(go.Bar(
            name='Active',
            x=active_rows['processor_type'],
            y=active_rows['count'],
            marker_color='#28a745',
        ))

    if len(idle_rows) > 0:
        # Several non-active statuses can share a type; collapse them to one bar.
        idle_by_type = idle_rows.groupby('processor_type')['count'].sum().reset_index()
        fig.add_trace(go.Bar(
            name='Inactive',
            x=idle_by_type['processor_type'],
            y=idle_by_type['count'],
            marker_color='#dc3545',
        ))

    fig.update_layout(
        title="Processor Type Distribution",
        barmode='stack',
        xaxis_title="Processor Type",
        yaxis_title="Count",
        height=350,
        xaxis_tickangle=-45,
    )
    return fig

print("‚úì Per-flow visualization functions defined")

In [None]:
# Cell 8: Generate HTML Dashboard
#
# Starts the page. `html_parts` collects markup fragments in document order;
# later cells keep appending to it and a final cell joins the fragments into
# one string. Re-running this cell resets the list.

print("Generating HTML dashboard...")

# Start building HTML
html_parts = []

# HTML header: doctype, inline stylesheet, and the opening of the
# `.container` div (closed by the footer fragment appended in a later cell).
# The literal is assembled by concatenation rather than an f-string because
# the CSS is full of braces. It ends with the title, the generation
# timestamp from `summary` and the analysis window from CONFIG.
html_parts.append("""
<!DOCTYPE html>
<html>
<head>
    <title>NiFi Processor Analysis Dashboard</title>
    <meta charset="utf-8">
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1400px;
            margin: 0 auto;
            background-color: white;
            padding: 30px;
            box-shadow: 0 0 10px rgba(0,0,0,0.1);
        }
        h1 {
            color: #333;
            border-bottom: 3px solid #007bff;
            padding-bottom: 10px;
        }
        h2 {
            color: #0056b3;
            margin-top: 30px;
            border-left: 4px solid #007bff;
            padding-left: 10px;
        }
        h3 {
            color: #495057;
            margin-top: 20px;
        }
        .summary-cards {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
            margin: 20px 0;
        }
        .card {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            border-radius: 10px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        }
        .card-title {
            font-size: 14px;
            opacity: 0.9;
            margin-bottom: 10px;
        }
        .card-value {
            font-size: 32px;
            font-weight: bold;
        }
        .card.red {
            background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        }
        .card.green {
            background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
        }
        .card.yellow {
            background: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
        }
        .flow-section {
            margin: 40px 0;
            border: 1px solid #dee2e6;
            border-radius: 8px;
            padding: 20px;
            background-color: #f8f9fa;
        }
        .flow-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            cursor: pointer;
            padding: 10px;
            background-color: #e9ecef;
            border-radius: 5px;
            margin-bottom: 15px;
        }
        .flow-header:hover {
            background-color: #dee2e6;
        }
        .flow-content {
            display: block;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
            background-color: white;
        }
        th, td {
            padding: 12px;
            text-align: left;
            border-bottom: 1px solid #dee2e6;
        }
        th {
            background-color: #007bff;
            color: white;
            font-weight: bold;
            position: sticky;
            top: 0;
        }
        tr:hover {
            background-color: #f8f9fa;
        }
        .chart-container {
            margin: 20px 0;
        }
        .timestamp {
            color: #6c757d;
            font-size: 14px;
            margin-top: 10px;
        }
        .nav-menu {
            background-color: #343a40;
            padding: 15px;
            border-radius: 5px;
            margin-bottom: 20px;
        }
        .nav-menu a {
            color: white;
            text-decoration: none;
            padding: 8px 15px;
            margin: 0 5px;
            border-radius: 3px;
            display: inline-block;
        }
        .nav-menu a:hover {
            background-color: #495057;
        }
    </style>
</head>
<body>
<div class="container">
    <h1>üìä NiFi Processor Analysis Dashboard</h1>
    <p class="timestamp">Generated: """ + summary['last_update'] + """</p>
    <p class="timestamp">Analysis Period: Last """ + str(CONFIG['days_back']) + """ days</p>
""")

# Executive Summary Cards
# Each card is (css classes, title, displayed value); the markup for all six
# is generated from this table and appended as a single fragment.
card_specs = [
    ('card green', 'Total Processors', str(summary['total_processors'])),
    ('card red', 'Inactive Processors', str(summary['inactive_processors'])),
    ('card red', 'Cleanup Candidates', str(summary['cleanup_candidates'])),
    ('card yellow', 'Review Required', str(summary['review_required'])),
    ('card green', 'Flows Analyzed', str(summary['flows_analyzed'])),
    ('card', 'Cleanup Impact', str(summary['cleanup_impact_pct']) + '%'),
]
card_markup = ''.join(
    f'        <div class="{css}">\n'
    f'            <div class="card-title">{title}</div>\n'
    f'            <div class="card-value">{value}</div>\n'
    f'        </div>\n'
    for css, title, value in card_specs
)
html_parts.append(
    '\n'
    '    <h2>Executive Summary</h2>\n'
    '    <div class="summary-cards">\n'
    + card_markup +
    '    </div>\n'
)

# Add executive summary charts.
# Only the first figure pulls in plotly.js (from the CDN); the second one
# reuses the script already on the page.
for figure, plotly_js in ((fig_pie, 'cdn'), (fig_bar, False)):
    html_parts.append('<div class="chart-container">')
    html_parts.append(figure.to_html(full_html=False, include_plotlyjs=plotly_js))
    html_parts.append('</div>')

print("‚úì Executive summary section created")

In [None]:
# Cell 9: Generate Per-Flow Sections
#
# Appends one collapsible section per (server, flow) pair to `html_parts`:
# a header with processor counts, three charts (activity histogram, daily
# timeline heatmap, processor-type distribution) and a per-processor table
# sorted with the least active processors first.
#
# Flow, server and processor names come from the snapshot data, so they are
# HTML-escaped before being embedded in markup. The badge helpers return
# markup on purpose and are inserted as-is.

import html  # imported here so this cell does not depend on edits to the imports cell

print("Generating per-flow sections...")


def _fmt_count(value):
    """Format a numeric total with thousands separators; missing values (NaN/None) render as 0."""
    number = 0 if pd.isna(value) else int(value)
    return f"{number:,}"


# Column headers follow the configured window instead of assuming 30 days.
window_label = f"{CONFIG['days_back']}d"

flows = processor_metrics[['server', 'flow_name']].drop_duplicates().values

for i, (flow_server, flow_name) in enumerate(flows, 1):
    print(f"  Processing {i}/{len(flows)}: {flow_name} ({flow_server})")

    flow_data = processor_metrics[(processor_metrics['flow_name'] == flow_name) &
                                   (processor_metrics['server'] == flow_server)].copy()

    total_procs = len(flow_data)
    inactive_procs = len(flow_data[flow_data['activity_status'].isin(['Inactive', 'Stopped'])])

    safe_flow = html.escape(str(flow_name))
    safe_server = html.escape(str(flow_server))

    # Flow section header; the ids tie into toggleFlow() defined in the page footer.
    html_parts.append(f"""
    <div class="flow-section" id="flow-{i}">
        <div class="flow-header" onclick="toggleFlow({i})">
            <h3>üìÅ {safe_flow} ({safe_server})</h3>
            <div>
                <span style="margin-right:20px;">Total: {total_procs} | Inactive: {inactive_procs}</span>
                <span id="toggle-{i}">‚ñº</span>
            </div>
        </div>
        <div class="flow-content" id="content-{i}">
    """)

    # Activity histogram
    fig_hist = create_flow_histogram(flow_data)
    html_parts.append('<div class="chart-container">')
    html_parts.append(fig_hist.to_html(full_html=False, include_plotlyjs=False))
    html_parts.append('</div>')

    # Activity timeline (loads data on-demand to save memory)
    fig_timeline = create_flow_timeline(flow_name, flow_server)
    html_parts.append('<div class="chart-container">')
    html_parts.append(fig_timeline.to_html(full_html=False, include_plotlyjs=False))
    html_parts.append('</div>')

    # Processor type distribution
    fig_types = create_processor_type_chart(flow_data)
    html_parts.append('<div class="chart-container">')
    html_parts.append(fig_types.to_html(full_html=False, include_plotlyjs=False))
    html_parts.append('</div>')

    # Processor details table
    html_parts.append('<h4>Processor Details</h4>')
    html_parts.append('<table>')
    html_parts.append('<tr>')
    html_parts.append('<th>Processor Name</th>')
    html_parts.append('<th>Type</th>')
    html_parts.append('<th>Status</th>')
    html_parts.append('<th>Activity %</th>')
    html_parts.append(f'<th>FlowFiles ({window_label})</th>')
    html_parts.append(f'<th>Tasks ({window_label})</th>')
    html_parts.append('<th>Days Since Active</th>')
    html_parts.append('<th>Recommendation</th>')
    html_parts.append('</tr>')

    # Sort by activity rate (inactive first)
    flow_data_sorted = flow_data.sort_values('activity_rate_pct')

    for _, row in flow_data_sorted.iterrows():
        html_parts.append('<tr>')
        html_parts.append(f'<td>{html.escape(str(row["processor_name"]))}</td>')
        html_parts.append(f'<td>{html.escape(str(row["processor_type"]))}</td>')
        html_parts.append(f'<td>{create_status_badge(row["activity_status"])}</td>')
        html_parts.append(f'<td>{row["activity_rate_pct"]:.2f}%</td>')
        html_parts.append(f'<td>{_fmt_count(row["total_flowfiles_out"])}</td>')
        html_parts.append(f'<td>{_fmt_count(row["total_tasks"])}</td>')
        html_parts.append(f'<td>{int(row["days_since_active"])}</td>')
        html_parts.append(f'<td>{create_recommendation_badge(row["recommendation"])}</td>')
        html_parts.append('</tr>')

    html_parts.append('</table>')
    html_parts.append('</div></div>')  # Close flow-content and flow-section

print("‚úì Per-flow sections created")

In [None]:
# Cell 10: Add Cleanup Recommendations Section
#
# Appends three summaries to `html_parts`, split by the `recommendation`
# column: a table of 'Remove' processors, a table of the first 20 'Review'
# processors, and aggregate figures for 'Keep' processors.
# `keep_list` is left defined at notebook level after this cell runs.
#
# Names come from the snapshot data and are HTML-escaped before embedding.

import html  # imported here so this cell does not depend on edits to the imports cell

print("Creating cleanup recommendations section...")

html_parts.append("""
    <h2>üßπ Cleanup Recommendations</h2>
""")

# High-priority removals
remove_list = processor_metrics[processor_metrics['recommendation'] == 'Remove'].copy()
html_parts.append(f'<h3>High-Priority Removals ({len(remove_list)} processors)</h3>')
html_parts.append('<p>These processors have zero activity and can be safely removed:</p>')

if len(remove_list) > 0:
    html_parts.append('<table>')
    html_parts.append('<tr><th>Flow</th><th>Processor Name</th><th>Type</th><th>Run Status</th><th>Days Inactive</th></tr>')
    for _, row in remove_list.iterrows():
        html_parts.append('<tr>')
        html_parts.append(f'<td>{html.escape(str(row["flow_name"]))}</td>')
        html_parts.append(f'<td>{html.escape(str(row["processor_name"]))}</td>')
        html_parts.append(f'<td>{html.escape(str(row["processor_type"]))}</td>')
        html_parts.append(f'<td>{html.escape(str(row["last_run_status"]))}</td>')
        html_parts.append(f'<td>{int(row["days_since_active"])}</td>')
        html_parts.append('</tr>')
    html_parts.append('</table>')
else:
    html_parts.append('<p>‚úì No processors recommended for removal!</p>')

# Review required
review_list = processor_metrics[processor_metrics['recommendation'] == 'Review'].copy()
html_parts.append(f'<h3>Review Required ({len(review_list)} processors)</h3>')
html_parts.append('<p>These processors have low activity and should be reviewed manually:</p>')

if len(review_list) > 0:
    html_parts.append('<table>')
    html_parts.append('<tr><th>Flow</th><th>Processor Name</th><th>Type</th><th>Activity %</th><th>Total Tasks</th></tr>')
    for _, row in review_list.head(20).iterrows():  # Limit to 20
        # A missing task total (NaN) is shown as 0 instead of crashing int().
        task_total = 0 if pd.isna(row["total_tasks"]) else int(row["total_tasks"])
        html_parts.append('<tr>')
        html_parts.append(f'<td>{html.escape(str(row["flow_name"]))}</td>')
        html_parts.append(f'<td>{html.escape(str(row["processor_name"]))}</td>')
        html_parts.append(f'<td>{html.escape(str(row["processor_type"]))}</td>')
        html_parts.append(f'<td>{row["activity_rate_pct"]:.2f}%</td>')
        html_parts.append(f'<td>{task_total:,}</td>')
        html_parts.append('</tr>')
    html_parts.append('</table>')
    if len(review_list) > 20:
        html_parts.append(f'<p><em>Showing top 20 of {len(review_list)} processors requiring review.</em></p>')
else:
    html_parts.append('<p>‚úì No processors require review!</p>')

# Active processors summary
keep_list = processor_metrics[processor_metrics['recommendation'] == 'Keep']
html_parts.append(f'<h3>Active Processors ({len(keep_list)} processors)</h3>')
html_parts.append(f'<p>‚úì {len(keep_list)} processors are active and should be kept.</p>')

# The mean of an empty selection is NaN; show "n/a" rather than "nan%".
if keep_list.empty:
    avg_rate_text = 'n/a'
else:
    avg_rate_text = f'{keep_list["activity_rate_pct"].mean():.1f}%'
html_parts.append(f'<p>Average activity rate: {avg_rate_text}</p>')

# Label the throughput with the configured window instead of assuming 30 days.
html_parts.append(
    f'<p>Total throughput ({CONFIG["days_back"]}d): '
    f'{int(keep_list["total_flowfiles_out"].sum()):,} flowfiles</p>'
)

print("‚úì Cleanup recommendations section created")

In [None]:
# Cell 11: Close HTML and Add JavaScript
#
# The footer carries the toggleFlow() script that the per-flow headers call
# from their onclick handlers, then closes the container div, body and
# document.

page_footer = """
    <script>
        function toggleFlow(id) {
            var content = document.getElementById('content-' + id);
            var toggle = document.getElementById('toggle-' + id);
            if (content.style.display === 'none') {
                content.style.display = 'block';
                toggle.textContent = '‚ñº';
            } else {
                content.style.display = 'none';
                toggle.textContent = '‚ñ∂';
            }
        }
    </script>
</div>
</body>
</html>
"""
html_parts.append(page_footer)

# Fragments were appended in document order, so a plain join yields the page.
html_content = "".join(html_parts)

print("‚úì HTML dashboard assembled")

In [None]:
# Cell 12: Export HTML and Summary Files
#
# Writes three artefacts:
#   * the assembled HTML dashboard        -> CONFIG['output_file']
#   * the executive-summary dict as JSON  -> <output_dir>/summary_stats.json
#   * per-processor recommendations CSV   -> <output_dir>/cleanup_recommendations_all_flows.csv
#
# Paths are built with os.path.join so the result is correct whether or not
# CONFIG['output_dir'] ends with a slash.

import os  # imported here so this cell does not depend on edits to the imports cell

print("Exporting files...")

# Write HTML dashboard
with open(CONFIG['output_file'], 'w', encoding='utf-8') as f:
    f.write(html_content)
print(f"‚úì HTML dashboard saved to: {CONFIG['output_file']}")

# Export summary JSON (default=str stringifies any value json cannot encode natively)
summary_file = os.path.join(CONFIG['output_dir'], 'summary_stats.json')
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, default=str)
print(f"‚úì Summary stats saved to: {summary_file}")

# Export cleanup CSV
cleanup_file = os.path.join(CONFIG['output_dir'], 'cleanup_recommendations_all_flows.csv')
cleanup_cols = ['server', 'flow_name', 'processor_id', 'processor_name', 'processor_type',
                'recommendation', 'activity_rate_pct', 'days_since_active', 'last_run_status']
processor_metrics[cleanup_cols].to_csv(cleanup_file, index=False)
print(f"‚úì Cleanup recommendations saved to: {cleanup_file}")

# Counted directly from the metrics frame so this cell does not rely on a
# variable left behind by the recommendations cell.
active_count = int((processor_metrics['recommendation'] == 'Keep').sum())

print("\n" + "="*60)
print("‚úÖ DASHBOARD GENERATION COMPLETE!")
print("="*60)
print(f"\nMain dashboard: {CONFIG['output_file']}")
print(f"Summary stats: {summary_file}")
print(f"Cleanup CSV: {cleanup_file}")
print(f"\nüìä Total processors: {summary['total_processors']}")
print(f"üî¥ Cleanup candidates: {summary['cleanup_candidates']}")
print(f"‚ö†Ô∏è  Review required: {summary['review_required']}")
print(f"üü¢ Active: {active_count}")
print("\nOpen the HTML file in your browser to view the interactive dashboard!")