# Monitor Hub Analysis (Fix)

This notebook performs the analysis using the raw downloaded data directly, bypassing the potentially incomplete CSV reports.

## Fixes Implemented:
1.  **Workspace & Error Messages**: Merges detailed job history to populate missing fields.
2.  **User ID Recovery (Smart Merge)**: Correlates detailed jobs with base activity logs (by Item ID & Time) to preserve the original `User ID` instead of defaulting to "System".
3.  **Non-Destructive**: Runs entirely within this notebook, leaving the core library untouched to prevent breaking changes.

In [None]:
import os
import pandas as pd
from usf_fabric_monitoring.core.pipeline import MonitorHubPipeline
from usf_fabric_monitoring.core.data_loader import load_activities_from_directory

# Configuration
# Value handed to MonitorHubPipeline as its single constructor argument.
OUTPUT_DIR = "monitor_hub_analysis" 

# Initialize Pipeline (to access helper methods)
# The notebook only calls the pipeline's private helpers
# (_prepare_extraction_directory, _load_detailed_jobs) and reads its
# `output_directory` attribute.
pipeline = MonitorHubPipeline(OUTPUT_DIR)

print(f"üìÇ Output Directory: {pipeline.output_directory}")

In [None]:
# 1. Load Raw Data (Skip API Extraction)
# Produces the two inputs used by the rest of the notebook:
#   activities    - result of load_activities_from_directory (len() is taken,
#                   and it is passed to pd.DataFrame below)
#   detailed_jobs - result of pipeline._load_detailed_jobs (same usage)

# A. Load Base Activities from 'raw_data/daily'
extraction_dir = pipeline._prepare_extraction_directory()
print(f"Loading raw activities from: {extraction_dir}")
# The loader is given the directory as a plain string.
activities = load_activities_from_directory(str(extraction_dir))
print(f"‚úÖ Loaded {len(activities)} base activities.")

# B. Load Detailed Jobs from 'fabric_item_details'
print("Loading detailed job history...")
detailed_jobs = pipeline._load_detailed_jobs()
print(f"‚úÖ Loaded {len(detailed_jobs)} detailed job records.")

# C. Optimized Smart Merge (Pandas)
# NOTE(review): pandas is already imported in the first cell; these imports
# only matter when this cell is run on its own.
import pandas as pd
import numpy as np

print("üîÑ Starting Optimized Smart Merge (Pandas)...")

# 1. Convert to DataFrames
# presumably each record is a dict, giving one column per key - TODO confirm
df_activities = pd.DataFrame(activities)
df_jobs = pd.DataFrame(detailed_jobs)

# 2. Pre-process for Merge
# Ensure timestamps are datetime and UTC
def to_utc(df, col):
    """Parse column ``col`` of ``df`` into timezone-aware UTC datetimes.

    The conversion is written back into ``df`` itself. Values that cannot be
    parsed become ``NaT``. If ``col`` is not a column of ``df`` nothing is
    changed.

    Returns:
        The same ``df`` object that was passed in.
    """
    if col not in df.columns:
        return df
    parsed = pd.to_datetime(df[col], utc=True, errors='coerce')
    df[col] = parsed
    return df

# Normalise both timestamp columns to tz-aware UTC so they can be compared.
df_activities = to_utc(df_activities, "start_time")
df_jobs = to_utc(df_jobs, "startTimeUtc")

# merge_asof needs its key columns on both sides. An empty job history (or an
# unexpected schema) produces a frame without them, in which case the
# activities are passed through unenriched instead of raising KeyError.
job_key_cols = ["startTimeUtc", "itemId"]
can_merge = (
    {"start_time", "item_id"}.issubset(df_activities.columns)
    and set(job_key_cols).issubset(df_jobs.columns)
)

if can_merge:
    # Filter out jobs without start time or item id
    df_jobs = df_jobs.dropna(subset=job_key_cols)

    # Rename job columns for merge preparation
    # We map 'itemId' to 'item_id' for the join key
    df_jobs = df_jobs.rename(columns={
        "startTimeUtc": "job_start_time",
        "itemId": "item_id",
        "status": "job_status",
        "failureReason": "job_failure_reason",
    })

    # Sort for merge_asof (required)
    df_activities = df_activities.sort_values("start_time")
    df_jobs = df_jobs.sort_values("job_start_time")

    # merge_asof rejects null keys on the left side, and to_utc coerces
    # unparseable timestamps to NaT. Those activities are held back and
    # appended afterwards so no record is lost.
    has_start_time = df_activities["start_time"].notna()
    undated_activities = df_activities[~has_start_time]

    # 3. Merge Asof
    # Find the nearest job for each activity to enrich it
    # Tolerance: 5 minutes (API logs vs Job History can drift)
    merged_df = pd.merge_asof(
        df_activities[has_start_time],
        df_jobs,
        left_on="start_time",
        right_on="job_start_time",
        by="item_id",
        tolerance=pd.Timedelta("5min"),
        direction="nearest",
    )

    if not undated_activities.empty:
        merged_df = pd.concat(
            [merged_df, undated_activities], ignore_index=True, sort=False
        )
else:
    print("   - Merge keys unavailable (empty job history?); skipping job enrichment.")
    merged_df = df_activities.copy()

print(f"   - Merged {len(merged_df)} records.")

# 4. Enrich Data
# Extract error message from the job's failure details
def extract_error_msg(val):
    """Return a human-readable error message from a job failure value.

    A dict yields its ``"message"`` entry (``None`` when the key is absent),
    a missing value (``None``/``NaN``) yields ``None``, and anything else is
    converted with ``str``.
    """
    if isinstance(val, dict):
        return val.get("message")
    return None if pd.isna(val) else str(val)

def extract_error_code(val):
    """Return the error code carried by a job failure value.

    A dict yields its ``"errorCode"`` entry (``None`` when the key is absent),
    a missing value (``None``/``NaN``) yields ``None``, and any other value
    yields the placeholder ``"Unknown"``.
    """
    if isinstance(val, dict):
        return val.get("errorCode")
    if pd.isna(val):
        return None
    return "Unknown"

# Ensure target columns exist before filling
for col_name in ["failure_reason", "error_message", "error_code"]:
    if col_name not in merged_df.columns:
        merged_df[col_name] = None

# Apply extraction if job data was found
if "job_failure_reason" in merged_df.columns:
    job_failure = merged_df["job_failure_reason"]
    merged_df["job_error_message"] = job_failure.apply(extract_error_msg)
    merged_df["job_error_code"] = job_failure.apply(extract_error_code)

    # Stringify only the values that are actually present. A plain
    # .astype(str) would turn missing values into the literal text "nan",
    # which would then fill failure_reason on every row and hide the later
    # error_message / error_code / "Unknown Error" fallbacks.
    job_failure_text = job_failure.map(str, na_action="ignore")

    # Coalesce with existing columns
    # If activity has no error info, take it from the job
    merged_df["failure_reason"] = merged_df["failure_reason"].fillna(job_failure_text)
    merged_df["error_message"] = merged_df["error_message"].fillna(merged_df["job_error_message"])
    merged_df["error_code"] = merged_df["error_code"].fillna(merged_df["job_error_code"])

    # Enrich other metadata: the activity's own value wins, the job-side
    # column only fills gaps. The target column is created when the
    # activities did not carry it at all.
    for target_col, job_col in (
        ("workspace_name", "_workspace_name"),
        ("item_name", "_item_name"),
        ("item_type", "_item_type"),
    ):
        if job_col not in merged_df.columns:
            continue
        if target_col in merged_df.columns:
            merged_df[target_col] = merged_df[target_col].fillna(merged_df[job_col])
        else:
            merged_df[target_col] = merged_df[job_col]

    # Update status: If job failed, the activity failed (even if API said InProgress)
    if "job_status" in merged_df.columns:
        merged_df.loc[merged_df["job_status"] == "Failed", "status"] = "Failed"

# 5. Convert back to list of dicts for compatibility
merged_activities = merged_df.to_dict(orient="records")

print(f"‚úÖ Smart Merge Complete.")
print(f"   - Total Activities: {len(merged_activities)}")

In [None]:
# 2. Prepare DataFrame for Analysis (Pandas Fallback)

# Note: We are using Pandas directly because the local Spark environment 
# is experiencing connection issues. The data volume is small enough for Pandas.

import pandas as pd
import numpy as np

print("üîÑ Preparing Analysis DataFrame (Pandas)...")

# Convert to Pandas DataFrame
df_pd = pd.DataFrame(merged_activities)

# Ensure critical columns exist
expected_cols = ["workspace_name", "failure_reason", "error_message", "error_code", "submitted_by", "item_name", "item_type"]
for c in expected_cols:
    if c not in df_pd.columns:
        df_pd[c] = None

# Filter for Failures
final_df = df_pd[df_pd["status"] == "Failed"].copy()

count = len(final_df)
print(f"‚úÖ Filtered to {count} failures.")

In [None]:
# 3. Prepare Analysis DataFrame (Pandas)

# Helper for Coalesce
def coalesce_series(*series):
    """Return the first non-null value per position across ``series``.

    Starts from a copy of the first series and fills its remaining gaps from
    each following series in turn, so earlier arguments take precedence. The
    inputs themselves are not modified.
    """
    merged = series[0].copy()
    for fallback in series[1:]:
        merged = merged.fillna(fallback)
    return merged

# Helper for User Name Extraction
def extract_user_name(user_id):
    """Derive a display name from an e-mail style user id.

    ``"john.doe@contoso.com"`` becomes ``"John Doe"``: the part before the
    first ``@`` is taken, dots become spaces and the result is title-cased.
    A string without ``@`` is transformed the same way as a whole.

    Any non-string input (``None``, ``NaN``, numbers, list-likes, ...) is
    returned unchanged.
    """
    # The type check alone covers missing values (None/NaN are not str) and
    # avoids the ambiguous-truth-value error pd.isna() triggers on list-likes.
    if not isinstance(user_id, str):
        return user_id
    # The string operations below cannot raise, so no try/except is needed.
    local_part = user_id.split("@", 1)[0]
    return local_part.replace(".", " ").title()

# Select and Rename columns
# Each report column is derived from final_df first, then the frame is
# assembled in one step (column order matches the report layout).

# Workspace: name if known, else the workspace id, else "Unknown"
workspace_col = coalesce_series(
    final_df["workspace_name"],
    final_df["workspace_id"],
).fillna("Unknown")

# User: raw id plus a derived display name (falls back to the id itself)
submitted_by = final_df["submitted_by"]
user_name_col = submitted_by.apply(extract_user_name).fillna(submitted_by)

# Error text: first non-null of failure reason, message, code
error_message_col = coalesce_series(
    final_df["failure_reason"],
    final_df["error_message"],
    final_df["error_code"],
).fillna("Unknown Error")

analysis_df = pd.DataFrame({
    "Workspace": workspace_col,
    "Item Name": final_df["item_name"].fillna("Unknown"),
    "Item Type": final_df["item_type"].fillna("Unknown"),
    "Invoke Type": final_df["activity_type"],
    "Start Time": final_df["start_time"],
    "End Time": final_df["end_time"],
    "Duration (s)": final_df["duration_seconds"],
    "User ID": submitted_by,
    "User Name": user_name_col,
    "Error Message": error_message_col,
    "Error Code": final_df["error_code"],
})

print("‚úÖ Analysis DataFrame Prepared.")
print(analysis_df.head(5))

In [None]:
# 4. Execute Analysis (Pandas)

# Prints five text reports over analysis_df: summary counts, top failing
# items, failures per user, error message distribution and the most recent
# failures. Prints a single notice when there are no failures.
if not analysis_df.empty:
    # --- 1. Summary Statistics ---
    total_failures = len(analysis_df)
    unique_workspaces = analysis_df["Workspace"].nunique()
    unique_items = analysis_df["Item Name"].nunique()

    print(f"\nüìä SUMMARY STATISTICS")
    print(f"Total Failures: {total_failures}")
    print(f"Affected Workspaces: {unique_workspaces}")
    print(f"Affected Items: {unique_items}")

    # --- 2. Top 10 Failing Items ---
    print("\nüèÜ TOP 10 FAILING ITEMS")
    top_items = (
        analysis_df.groupby(["Workspace", "Item Name", "Item Type"])
        .size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
        .head(10)
    )
    print(top_items.to_string(index=False))

    # --- 3. Failures by User ---
    print("\nüë§ FAILURES BY USER")
    # dropna=False keeps failures that have no user attached; by default
    # groupby drops them and the per-user counts no longer add up to the
    # total.
    user_stats = (
        analysis_df.groupby("User Name", dropna=False)
        .size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    )
    print(user_stats.to_string(index=False))

    # --- 4. Error Message Distribution ---
    print("\n‚ö†Ô∏è ERROR MESSAGE DISTRIBUTION")
    error_stats = (
        analysis_df.groupby("Error Message")
        .size()
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    )
    print(error_stats.to_string(index=False))

    # --- 5. Recent Failures (Last 20) ---
    print("\nüïí MOST RECENT FAILURES")
    recent_failures = (
        analysis_df[["Start Time", "Workspace", "Item Name", "User Name", "Error Message"]]
        .sort_values("Start Time", ascending=False)
        .head(20)
    )

    # Truncate long error messages for display.
    # The global option is kept because it affects later rich displays in the
    # notebook, but to_string() ignores it and only truncates when
    # max_colwidth is passed explicitly.
    pd.set_option('display.max_colwidth', 100)
    print(recent_failures.to_string(index=False, max_colwidth=100))
else:
    print("No failure data found.")

In [None]:
# Notebook display: first five rows of the failure analysis table.
analysis_df.head(5)

In [None]:
# 5. Investigate Missing End Times and Duration Issues

print("üîç INVESTIGATING DURATION AND END TIME ISSUES")
print("=" * 50)

# Check the original raw data structure
print("üìã SAMPLE RAW ACTIVITY STRUCTURE:")
if activities:
    sample_activity = activities[0]
    for key, value in sample_activity.items():
        print(f"  {key}: {value}")

print(f"\nüìä RAW ACTIVITIES DATA ANALYSIS:")
print(f"Total raw activities: {len(activities)}")

# Convert to DataFrame for analysis
raw_df = pd.DataFrame(activities)

# Check end_time availability in raw data
if 'end_time' in raw_df.columns:
    end_time_missing = raw_df['end_time'].isna().sum()
    end_time_total = len(raw_df)
    print(f"Missing end_time in raw data: {end_time_missing}/{end_time_total} ({end_time_missing/end_time_total*100:.1f}%)")
else:
    print("‚ùå 'end_time' column not found in raw activities")
    print("Available columns:", list(raw_df.columns))

# Check detailed jobs data for duration info
print(f"\nüìä DETAILED JOBS DATA ANALYSIS:")
jobs_df = pd.DataFrame(detailed_jobs)
print(f"Total detailed jobs: {len(detailed_jobs)}")

if detailed_jobs:
    sample_job = detailed_jobs[0]
    print("Sample job structure:")
    for key, value in sample_job.items():
        print(f"  {key}: {value}")

# Check for duration-related fields in jobs
duration_fields = ['duration', 'durationSeconds', 'endTime', 'endTimeUtc', 'startTime', 'startTimeUtc']
available_duration_fields = [field for field in duration_fields if field in jobs_df.columns]
print(f"\nAvailable duration-related fields in jobs: {available_duration_fields}")

for field in available_duration_fields:
    if field in jobs_df.columns:
        missing_count = jobs_df[field].isna().sum()
        total_count = len(jobs_df)
        print(f"  {field}: {missing_count}/{total_count} missing ({missing_count/total_count*100:.1f}%)")

In [None]:
# 6. Fix Duration Calculation Using Job Data

print("üîß IMPLEMENTING DURATION FIX")
print("=" * 40)

# Create a copy of merged_df to avoid modifying the original
fixed_df = merged_df.copy()

# Convert job times to datetime if they aren't already
if 'job_start_time' in fixed_df.columns:
    fixed_df['job_start_time'] = pd.to_datetime(fixed_df['job_start_time'], utc=True, errors='coerce')

if 'endTimeUtc' in fixed_df.columns:
    fixed_df['job_end_time'] = pd.to_datetime(fixed_df['endTimeUtc'], utc=True, errors='coerce')
else:
    # Create job_end_time from endTimeUtc if it exists in the merge
    job_columns = [col for col in fixed_df.columns if 'endTime' in col]
    print(f"Available end time columns: {job_columns}")
    
    if job_columns:
        end_time_col = job_columns[0]  # Use the first available end time column
        fixed_df['job_end_time'] = pd.to_datetime(fixed_df[end_time_col], utc=True, errors='coerce')

# Fix end_time: Use job end time when activity end time is missing
print("Fixing end_time...")
original_missing_end_time = fixed_df['end_time'].isna().sum()
print(f"  Activities missing end_time: {original_missing_end_time}")

if 'job_end_time' in fixed_df.columns:
    # Fill missing end_time with job_end_time
    fixed_df['end_time'] = fixed_df['end_time'].fillna(fixed_df['job_end_time'])
    
    after_fix_missing_end_time = fixed_df['end_time'].isna().sum()
    fixed_count = original_missing_end_time - after_fix_missing_end_time
    print(f"  Fixed {fixed_count} missing end times using job data")
    print(f"  Remaining missing end_time: {after_fix_missing_end_time}")

# Recalculate duration_seconds
print("Recalculating duration...")
def calculate_duration(start_time, end_time):
    """Return the elapsed seconds between two timestamps, never negative.

    Returns ``0.0`` when either value is missing (``None``/``NaN``/``NaT``),
    when the two values cannot be subtracted (for example a string against a
    timestamp, or tz-naive against tz-aware), or when ``end_time`` precedes
    ``start_time``.
    """
    if pd.isna(start_time) or pd.isna(end_time):
        return 0.0
    try:
        elapsed = (end_time - start_time).total_seconds()
    except (TypeError, AttributeError, ValueError, OverflowError):
        # Incompatible or non-datetime operands: treat as "no duration"
        # rather than swallowing every exception with a bare except.
        return 0.0
    return max(0.0, elapsed)  # Ensure non-negative duration

def _row_duration(row):
    """Duration in seconds for one row, from its start_time and end_time."""
    return calculate_duration(row['start_time'], row['end_time'])


# Overwrite duration_seconds row by row from the repaired timestamps
fixed_df['duration_seconds'] = fixed_df.apply(_row_duration, axis=1)

# Record-list form of the repaired frame, for the cells that consume dicts
fixed_activities = fixed_df.to_dict(orient="records")

# Compare how many rows had a zero duration before and after the repair
original_zero_duration = merged_df['duration_seconds'].eq(0.0).sum()
fixed_zero_duration = fixed_df['duration_seconds'].eq(0.0).sum()
improvement = original_zero_duration - fixed_zero_duration

print(f"\nüìà IMPROVEMENT STATISTICS:")
print(f"  Original zero duration records: {original_zero_duration}")
print(f"  Fixed zero duration records: {fixed_zero_duration}")  
print(f"  Records with duration restored: {improvement}")

# Publish the repaired data under the names the downstream cells read
merged_df = fixed_df
merged_activities = fixed_activities

print("‚úÖ Duration fix applied successfully!")

In [None]:
# 7. Regenerate Analysis with Fixed Duration Data

print("üîÑ REGENERATING ANALYSIS WITH FIXED DURATION DATA")
print("=" * 50)

# Re-prepare DataFrame for Analysis with fixed data
df_pd_fixed = pd.DataFrame(merged_activities)

# Ensure critical columns exist
for c in expected_cols:
    if c not in df_pd_fixed.columns:
        df_pd_fixed[c] = None

# Filter for Failures
final_df_fixed = df_pd_fixed[df_pd_fixed["status"] == "Failed"].copy()

# Regenerate Analysis DataFrame with fixed duration
analysis_df_fixed = pd.DataFrame()

# Workspace  
analysis_df_fixed["Workspace"] = coalesce_series(
    final_df_fixed["workspace_name"], 
    final_df_fixed["workspace_id"]
).fillna("Unknown")

# Item Name
analysis_df_fixed["Item Name"] = final_df_fixed["item_name"].fillna("Unknown")

# Item Type
analysis_df_fixed["Item Type"] = final_df_fixed["item_type"].fillna("Unknown")

# Invoke Type
analysis_df_fixed["Invoke Type"] = final_df_fixed["activity_type"]

# Time & Duration (FIXED)
analysis_df_fixed["Start Time"] = final_df_fixed["start_time"]
analysis_df_fixed["End Time"] = final_df_fixed["end_time"]
analysis_df_fixed["Duration (s)"] = final_df_fixed["duration_seconds"]

# User ID
analysis_df_fixed["User ID"] = final_df_fixed["submitted_by"]

# User Name
analysis_df_fixed["User Name"] = final_df_fixed["submitted_by"].apply(extract_user_name)
analysis_df_fixed["User Name"] = analysis_df_fixed["User Name"].fillna(analysis_df_fixed["User ID"])

# Error Details
analysis_df_fixed["Error Message"] = coalesce_series(
    final_df_fixed["failure_reason"], 
    final_df_fixed["error_message"], 
    final_df_fixed["error_code"]
).fillna("Unknown Error")

analysis_df_fixed["Error Code"] = final_df_fixed["error_code"]

# Show duration improvement
zero_duration_original = (analysis_df["Duration (s)"] == 0.0).sum()
zero_duration_fixed = (analysis_df_fixed["Duration (s)"] == 0.0).sum()
non_zero_duration_fixed = (analysis_df_fixed["Duration (s)"] > 0.0).sum()

print(f"üìä DURATION ANALYSIS IMPROVEMENT:")
print(f"  Original analysis - Zero duration failures: {zero_duration_original}")
print(f"  Fixed analysis - Zero duration failures: {zero_duration_fixed}")
print(f"  Fixed analysis - Non-zero duration failures: {non_zero_duration_fixed}")
print(f"  Improvement: {zero_duration_original - zero_duration_fixed} failures now have duration data")

# Show sample of fixed data
print(f"\n‚úÖ SAMPLE OF FIXED ANALYSIS DATA:")
print(analysis_df_fixed[analysis_df_fixed["Duration (s)"] > 0].head(5))

In [None]:
# 8. Enhanced Duration-Based Analysis

print("‚è±Ô∏è ENHANCED DURATION-BASED ANALYSIS")
print("=" * 45)

# Only failures with a strictly positive duration take part in the statistics
valid_duration_failures = analysis_df_fixed[analysis_df_fixed["Duration (s)"] > 0].copy()

if len(valid_duration_failures) == 0:
    print("‚ùå No failures with valid duration data found")
else:
    durations = valid_duration_failures["Duration (s)"]

    print(f"üìä DURATION STATISTICS:")
    print(f"  Failures with duration data: {len(valid_duration_failures)}")
    print(f"  Average failure duration: {durations.mean():.2f} seconds")
    print(f"  Median failure duration: {durations.median():.2f} seconds") 
    print(f"  Max failure duration: {durations.max():.2f} seconds")
    print(f"  Min failure duration: {durations.min():.2f} seconds")

    # Spread of the durations at a few fixed percentiles
    percentiles = [25, 50, 75, 90, 95, 99]
    print(f"\n  Duration Percentiles:")
    for p in percentiles:
        value = durations.quantile(p/100)
        print(f"    {p}th percentile: {value:.2f}s")

    # The ten failures that ran longest before failing
    print(f"\nüêå TOP 10 LONGEST RUNNING FAILURES:")
    longest_failures = valid_duration_failures.nlargest(10, "Duration (s)")
    for _, row in longest_failures.iterrows():
        print(f"  {row['Duration (s)']:.1f}s - {row['Workspace']} / {row['Item Name']} ({row['Item Type']})")

    # Bucket the failures: quick (<= 30 s), long (>= 5 min), medium (the rest)
    quick_threshold = 30  # 30 seconds
    long_threshold = 300  # 5 minutes

    quick_failures = int((durations <= quick_threshold).sum())
    long_failures = int((durations >= long_threshold).sum())
    medium_failures = len(valid_duration_failures) - quick_failures - long_failures

    print(f"\n‚ö° FAILURE CATEGORIES BY DURATION:")
    print(f"  Quick failures (‚â§{quick_threshold}s): {quick_failures}")
    print(f"  Long failures (‚â•{long_threshold}s): {long_failures}")
    print(f"  Medium failures: {medium_failures}")

print(f"\n‚úÖ Duration analysis complete!")

In [None]:
# Display the fixed analysis DataFrame
# Notebook display: first ten rows of the analysis rebuilt after the
# duration fix.
analysis_df_fixed.head(10)