In [None]:
# 1. Setup & Configuration
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from pyspark.sql.functions import col, to_timestamp, unix_timestamp, coalesce, initcap, regexp_replace, element_at, split, when, lit, count, avg, max as max_, date_trunc

# --- Plot appearance ---------------------------------------------------
# Apply the 'seaborn-v0_8-whitegrid' style sheet first, then let seaborn
# scale fonts for the "notebook" context by a factor of 1.2.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_context(context="notebook", font_scale=1.2)

# --- Data location -----------------------------------------------------
# Relative 'Files/...' path, which maps correctly in Fabric Lakehouses and
# avoids the 'Spark_Ambiguous_MsSparkUtils_UseMountedPathFailure' error.
CSV_PATH = "Files/monitor_hub_analysis"
print(f"üìÇ Data Source Path: {CSV_PATH}")

In [None]:
# 2. Load Data
# On success `raw_df` holds the Spark DataFrame read from the CSV export(s);
# if anything in the load step raises, the error is printed and `raw_df`
# is set to None so that later cells can skip their work.
try:
    # Glob pattern for the master activities report file(s)
    path_pattern = f"{CSV_PATH}/activities_master_*.csv"
    print(f"‚è≥ Loading data from {path_pattern}...")

    # Header row supplies the column names; inferSchema is important to
    # correctly detect Integers and Timestamps.
    csv_reader = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
    )
    raw_df = csv_reader.csv(path_pattern)

    print(f"‚úÖ Loaded {raw_df.count()} rows.")

except Exception as load_error:
    print(f"‚ùå Error loading data: {load_error}")
    print("   Ensure that 'Monitor_Hub_Analysis.ipynb' has been run at least once to generate the CSVs.")
    raw_df = None

In [None]:
# 3. Data Standardization
# Map raw CSV columns to a clean schema for analysis.
# Each output column is the first non-null value among the known column-name
# variants (coalesce), which handles potential naming differences between versions.

# `raw_df` is either a Spark DataFrame or None (set by the load cell on failure).
# Compare against None explicitly instead of relying on the truthiness of a
# DataFrame object.
if raw_df is not None:
    def safe_col(c):
        """Return the column named `c` if `raw_df` has it, otherwise a NULL literal.

        The membership test against `raw_df.columns` is an exact
        (case-sensitive) string match.
        """
        return col(c) if c in raw_df.columns else lit(None)

    df = raw_df.select(
        coalesce(safe_col("workspace_name"), safe_col("WorkSpaceName"), safe_col("workspace_id")).alias("Workspace"),
        coalesce(safe_col("item_name"), safe_col("ItemName")).alias("Item_Name"),
        coalesce(safe_col("item_type"), safe_col("ItemType")).alias("Item_Type"),
        coalesce(safe_col("activity_type"), safe_col("Operation")).alias("Operation"),
        coalesce(safe_col("status"), safe_col("Status")).alias("Status"),
        coalesce(safe_col("start_time"), safe_col("CreationTime")).alias("Start_Time"),
        coalesce(safe_col("end_time"), safe_col("EndTime")).alias("End_Time"),
        # Duration is forced to double so it can be sorted numerically later
        coalesce(safe_col("duration_seconds"), safe_col("Duration")).cast("double").alias("Duration_Sec"),
        coalesce(safe_col("submitted_by"), safe_col("UserId")).alias("User_ID")
    )

    # Cache for performance since we'll query this multiple times
    df.cache()
    # Also expose the standardized frame to SQL as `fabric_activities`
    df.createOrReplaceTempView("fabric_activities")

    df.show(5)
else:
    print("‚ö†Ô∏è No data to process.")

In [None]:
# 4. High-Level KPIs
# Prints overall counts computed from the standardized `df` frame.

# Explicit None check: `raw_df` is None when loading failed.
if raw_df is not None:
    total_activities = df.count()
    failed_activities = df.filter(col("Status") == "Failed").count()
    # Guard against division by zero when the export contains no rows
    failure_rate = (failed_activities / total_activities) * 100 if total_activities > 0 else 0

    # distinct() keeps one row for NULL, which would count "no user" /
    # "no workspace" as an extra active entity. Drop missing values first.
    unique_users = df.select("User_ID").dropna().distinct().count()
    unique_workspaces = df.select("Workspace").dropna().distinct().count()

    print("="*40)
    print("üìä EXECUTIVE SUMMARY")
    print("="*40)
    print(f"Total Activities:    {total_activities:,}")
    print(f"Total Failures:      {failed_activities:,}")
    print(f"Failure Rate:        {failure_rate:.2f}%")
    print(f"Active Users:        {unique_users:,}")
    print(f"Active Workspaces:   {unique_workspaces:,}")
    print("="*40)

In [None]:
# 5. Trend Analysis (Daily Volume)
# Stacked bar chart of activity counts per day, split by Status.

# Explicit None check: `raw_df` is None when loading failed.
if raw_df is not None:
    # Aggregate by Day: truncate Start_Time to the day, count per (Date, Status),
    # then bring the small aggregated result to pandas for plotting.
    daily_trend = (
        df.withColumn("Date", date_trunc("day", col("Start_Time")))
        .groupBy("Date", "Status")
        .count()
        .orderBy("Date")
        .toPandas()
    )

    if daily_trend.empty:
        # Nothing to pivot or plot (e.g. the export has only a header row);
        # plotting an empty frame would raise, so report and skip instead.
        print("No activity rows available - skipping the daily trend chart.")
    else:
        # Pivot for plotting: one row per Date, one column per Status;
        # day/status combinations that never occurred become 0.
        pivot_trend = daily_trend.pivot(index='Date', columns='Status', values='count').fillna(0)

        # Plot on an explicit Axes rather than the implicit "current" one
        fig, ax = plt.subplots(figsize=(14, 6))
        pivot_trend.plot(kind='bar', stacked=True, ax=ax, colormap='viridis')
        ax.set_title('Daily Activity Volume by Status')
        ax.set_xlabel('Date')
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', labelrotation=45)
        fig.tight_layout()
        plt.show()

In [None]:
# 6. Top Failing Workspaces
# Horizontal bar chart of the ten workspaces with the most "Failed" activities.

# Explicit None check: `raw_df` is None when loading failed.
if raw_df is not None:
    # Count failures per workspace, keep the ten largest, and move the
    # small result to pandas for plotting.
    top_fail_workspaces = (
        df.filter(col("Status") == "Failed")
        .groupBy("Workspace")
        .count()
        .orderBy(col("count").desc())
        .limit(10)
        .toPandas()
    )

    if top_fail_workspaces.empty:
        # No failed rows at all: there is nothing meaningful to chart.
        print("No failed activities found - skipping the failure chart.")
    else:
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(data=top_fail_workspaces, x='count', y='Workspace', palette='Reds_r', ax=ax)
        ax.set_title('Top 10 Workspaces by Failure Count')
        ax.set_xlabel('Failures')
        # Same layout step as the daily trend chart
        fig.tight_layout()
        plt.show()

In [None]:
# 7. Longest Running Items (Performance Bottlenecks)
# Table of the ten successful operations with the largest Duration_Sec.

# Explicit None check: `raw_df` is None when loading failed.
if raw_df is not None:
    # Only rows with a known duration can be ranked; without this filter,
    # rows whose Duration_Sec is NULL could pad the list when fewer than
    # ten successful rows carry a duration.
    slowest_items = (
        df.filter((col("Status") == "Succeeded") & col("Duration_Sec").isNotNull())
        .select("Workspace", "Item_Name", "Operation", "Duration_Sec")
        .orderBy(col("Duration_Sec").desc())
        .limit(10)
        .toPandas()
    )

    print("üê¢ Top 10 Slowest Successful Operations:")
    display(slowest_items)