In [None]:
# Import statements
import pandas as pd
import subprocess
import os
import json
from IPython.display import display, Image, Markdown

In [None]:
# Fixed file locations used by the cells below (relative to the working directory)
config_path = "config/generation_params.json"  # JSON file displayed later in the notebook
dataset_path = "data/processed/labeled_asset_dataset_enriched.csv"  # CSV read into `df`

In [None]:
# Load the dataset for analysis. If the CSV is missing, run the MLflow "pipeline"
# entry point with its default settings to generate it, then load the result.
# Raises RuntimeError if the pipeline exits non-zero, and FileNotFoundError if the
# pipeline succeeds but the CSV is still absent.
try:
    df = pd.read_csv(dataset_path)
    print("✅ Dataset loaded.")
except FileNotFoundError:
    print(f"⚠️ {dataset_path} not found. Running MLflow pipeline to generate data...")
    result = subprocess.run(
        ["mlflow", "run", ".", "-e", "pipeline", "--env-manager=local"],
        capture_output=True, text=True, encoding="utf-8", errors="replace"
    )
    # Output was decoded as UTF-8 with undecodable bytes replaced, so printing it is safe
    print(result.stdout)
    # capture_output=True also captures stderr; echo it explicitly, otherwise the
    # diagnostics that the failure message below points to are silently discarded.
    if result.stderr:
        print(result.stderr)
    if result.returncode != 0:
        print("❌ MLflow pipeline failed to run. Check the error above.")
        raise RuntimeError("MLflow pipeline execution failed")
    if os.path.exists(dataset_path):
        df = pd.read_csv(dataset_path)
        print("✅ Dataset generated and loaded.")
    else:
        raise FileNotFoundError(f"Dataset still not found at {dataset_path} after running pipeline.")

In [None]:
# Count the rows whose "missing_in_*" flag equals 0, i.e. the asset is present in that system
total_assets = df.shape[0]
present_inventory = df["missing_in_inventory"].eq(0).sum()
present_ipam = df["missing_in_ipam"].eq(0).sum()
# Present in both systems: both flags are 0 on the same row
present_all = df[["missing_in_inventory", "missing_in_ipam"]].eq(0).all(axis=1).sum()

In [None]:
# Express each presence count as a percentage of all assets
pct_inventory, pct_ipam, pct_all = (
    count / total_assets * 100
    for count in (present_inventory, present_ipam, present_all)
)

In [None]:
# Headline presence figures: the asset total, then count and percentage per system
print(
    f"Total Observability Assets: {total_assets:,}",
    f"Present in Inventory: {present_inventory:,} ({pct_inventory:.1f}%)",
    f"Present in IPAM: {present_ipam:,} ({pct_ipam:.1f}%)",
    f"Present in BOTH Inventory and IPAM: {present_all:,} ({pct_all:.1f}%)",
    sep="\n",
)

In [None]:
# The same presence figures as a table, one row per metric
summary = pd.DataFrame(
    [
        ("Present in Inventory", present_inventory, pct_inventory),
        ("Present in IPAM", present_ipam, pct_ipam),
        ("Present in BOTH", present_all, pct_all),
    ],
    columns=["Metric", "Count", "Percent"],
)
display(summary)

In [None]:
# Display the contents of the generation parameters file to show what failure rates
# were set at data generation.
# Read explicitly as UTF-8: without `encoding`, open() falls back to the platform
# locale encoding, which can mis-decode or fail on non-ASCII content.
with open(config_path, "r", encoding="utf-8") as f:
    params = json.load(f)

# Pretty-print as Markdown for notebook display
display(Markdown(f"### Contents of `{config_path}`:"))
display(Markdown(f"```json\n{json.dumps(params, indent=4)}\n```"))

In [None]:
# Display all images in the reports directory (including subdirectories)
reports_dir = "reports"
image_extensions = ('.png', '.jpg', '.jpeg', '.gif')

# Collect every file under reports_dir whose name ends with an image extension
# (case-insensitive). A missing directory simply yields no files.
found_images = []
for root, dirs, files in os.walk(reports_dir):
    for file in files:
        if file.lower().endswith(image_extensions):
            found_images.append(os.path.join(root, file))

# os.walk returns entries in filesystem-dependent order; sort so the images are
# shown in the same order on every run and machine.
found_images.sort()

if found_images:
    display(Markdown("### Generated Report Images"))
    for img_path in found_images:
        # Caption each image with its path relative to the reports directory
        rel_path = os.path.relpath(img_path, reports_dir)
        display(Markdown(f"**{rel_path}**"))
        display(Image(filename=img_path))
else:
    print(f"No report images found in '{reports_dir}' or its subdirectories.")