# 01 – Smart-Grid Data Overview

Quick-look analytics for interim and processed datasets (clean + augmented) plus optional PED2-derived features.

## Objectives
- Confirm ingestion outputs retain the expected feature count, timeline density, and label coverage.
- Compare interim vs cleaned/augmented versions to see how preprocessing shifts summary statistics.
- Visualize correlations, time-series windows, and synthetic fault behavior before feeding the data to models.

In [None]:
from __future__ import annotations

import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

sns.set_theme(style="whitegrid")

# Locate the repository root: the first of cwd, its parent, or its grandparent
# that contains a README.md. The for/else raises when none of them does.
CANDIDATES = [Path.cwd(), Path.cwd().parent, Path.cwd().parent.parent]
for candidate in CANDIDATES:
    if not (candidate / "README.md").exists():
        continue
    PROJECT_ROOT = candidate.resolve()
    break
else:
    raise RuntimeError("Run this notebook from inside the repository (README.md not found).")

# Put <root>/src at the front of sys.path (once) so local modules can be imported.
src_dir = str(PROJECT_ROOT / "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Directories the later cells read from.
CONFIGS = PROJECT_ROOT / "configs"
DATA_INTERIM = PROJECT_ROOT / "data" / "interim"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"

print(f"Project root: {PROJECT_ROOT}")
print(f"Interim data dir: {DATA_INTERIM}")
print(f"Processed data dir: {DATA_PROCESSED}")

In [None]:
from smart_grid_fault_detection.data_prep import load_manifest

# Use configs/data_manifest.yaml when it exists; otherwise fall back to the
# example manifest and say so.
primary_manifest = CONFIGS / "data_manifest.yaml"
if primary_manifest.exists():
    manifest_path = primary_manifest
else:
    manifest_path = CONFIGS / "data_manifest.example.yaml"
    print("Using example manifest – update configs/data_manifest.yaml for production experiments.")

manifest = load_manifest(manifest_path)
manifest

In [None]:
# Locations of the three dataset variants compared in this notebook.
# The manifest value is coerced to Path so `.exists()` works even if the
# manifest yields a plain string, and it is anchored at PROJECT_ROOT so a
# relative entry resolves the same way regardless of the notebook's cwd
# (joining an absolute path onto PROJECT_ROOT leaves it unchanged).
INTERIM_PATH = PROJECT_ROOT / Path(manifest.output.interim_table)
PROCESSED_PATH = DATA_PROCESSED / "smart_grid_clean.parquet"
AUGMENTED_PATH = DATA_PROCESSED / "smart_grid_augmented.parquet"

# label -> DataFrame for every variant whose parquet file is present.
# Missing files only produce a warning so the later cells can skip themselves.
frames = {}
for label, path in {
    "interim": INTERIM_PATH,
    "clean": PROCESSED_PATH,
    "augmented": AUGMENTED_PATH,
}.items():
    if not path.exists():
        print(f"[warning] {label} path missing: {path}")
        continue
    frame = pd.read_parquet(path)
    frames[label] = frame
    print(f"Loaded {label} frame from {path} -> {len(frame):,} rows, {frame.shape[1]} columns")

frames.keys()

In [None]:
def summarize(df: pd.DataFrame, label: str) -> pd.DataFrame:
    """Return a one-row overview of *df*, indexed by *label*.

    Columns: ``rows`` (row count), ``start`` / ``end`` (min and max of the
    ``timestamp`` column), ``fault_rate`` (mean of ``fault_flag``) and
    ``features`` (number of columns in *df*).

    Raises:
        KeyError: if *df* has no ``timestamp`` column.
    """
    timestamps = df["timestamp"]
    # Frames without a fault_flag column fall back to a single zero,
    # which yields a fault rate of 0.0.
    fault_flags = df.get("fault_flag", pd.Series([0]))
    stats = {
        "rows": len(df),
        "start": timestamps.min(),
        "end": timestamps.max(),
        "fault_rate": fault_flags.mean(),
        "features": len(df.columns),
    }
    return pd.DataFrame(
        {name: [value] for name, value in stats.items()},
        index=[label],
    )

# One summary row per loaded frame, stacked into a single comparison table.
summary_tables = [summarize(df, label) for label, df in frames.items()]

if not summary_tables:
    print("No frames loaded – rerun ingestion/processing first.")
else:
    display(pd.concat(summary_tables))

In [None]:
# Transposed describe() output (one row per described column) for the interim
# frame and then the clean frame, skipping whichever was not loaded.
for stage in ("interim", "clean"):
    if stage in frames:
        display(frames[stage].describe().T)

In [None]:
if "clean" not in frames:
    print("Clean dataframe missing – cannot compute fault distribution.")
else:
    # Row count per fault_type value, largest group first.
    rows_per_type = frames["clean"].groupby("fault_type").size()
    faults = rows_per_type.sort_values(ascending=False)
    display(faults.to_frame(name="count"))

In [None]:
if "clean" not in frames:
    print("Clean dataframe missing – skip timeline plot.")
else:
    timeline_cols = ["timestamp", "load_mw", "voltage_kv", "frequency_hz", "fault_flag"]
    # Index by timestamp and sort so the positional slice below takes the
    # earliest rows.
    subset = frames["clean"][timeline_cols].set_index("timestamp").sort_index()
    # NOTE(review): 96 * 2 rows matches the "first 2 days" title only at
    # 96 samples per day (15-minute cadence) — confirm against the data.
    window = subset.iloc[: 96 * 2]
    plotted = window[["load_mw", "voltage_kv"]]
    ax = plotted.plot(subplots=False, figsize=(12, 4))
    ax.set_title("Load vs Voltage (first 2 days)")
    plt.show()

In [None]:
if "augmented" not in frames:
    print("Augmented dataframe missing – skip correlation heatmap.")
else:
    aug = frames["augmented"]
    # Columns included in the pairwise correlation matrix.
    corr_cols = [
        "load_mw", "voltage_kv", "frequency_hz", "reactive_power_mvar",
        "ambient_temp_c", "wind_speed_ms", "solar_mw",
    ]
    selected = aug[corr_cols]
    corr = selected.corr()

    # Annotated heatmap of the matrix, values shown to two decimals.
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation heatmap (augmented dataset)")
    plt.show()

In [None]:
# Optional PED2-derived features: show summary statistics for two of the
# feature columns when the parquet file is present.
PED2_PATH = DATA_INTERIM / "ped2_training_features.parquet"
if PED2_PATH.exists():
    ped2 = pd.read_parquet(PED2_PATH)
    # A bare expression nested inside an `if` body is not echoed by the
    # notebook (only a cell's final top-level expression is), so the table
    # must be rendered explicitly.
    display(ped2.describe()[["grad_mean", "frame_absdiff_mean"]])
else:
    print("PED2 feature parquet not found – run ped2_converter first.")

In [None]:
if "augmented" in frames:
    # Work on a copy so the timestamp conversion does not modify the cached frame.
    aug = frames["augmented"].copy()
    aug["timestamp"] = pd.to_datetime(aug["timestamp"])
    # Mean of fault_flag within consecutive 6-hour bins. The lowercase "h"
    # alias is used because the uppercase "H" spelling is deprecated in
    # recent pandas releases.
    window = aug.set_index("timestamp")["fault_flag"].resample("6h").mean()
    plt.figure(figsize=(10, 3))
    window.plot()
    # resample() produces per-bin means, not a rolling window, so label it as such.
    plt.ylabel("Fault frequency (6h bin mean)")
    plt.title("Synthetic fault density over time")
    plt.show()
else:
    print("Augmented dataframe missing – skip fault density plot.")

## Next Steps
- Layer PCA or autoencoder latent projections on top of the cleaned dataframe and append the plots here.
- Export summary tables/figures to `reports/figures/` for inclusion in status updates.
- Repeat the analysis after ingesting real SCADA/PMU streams to quantify differences vs the synthetic baseline.