In [None]:
# ================================================================
# SUPER PLOT 
# ================================================================

import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from matplotlib.cm import get_cmap

# ---------- Where the runs live and how they are named ----------
BASE_ROOT = r"C:\Monitor\all"            # parent directory of all run folders
RUN_PREFIX = "ASUTOSH_20250818-"         # run folder = prefix + 6-digit run number
CSV_NAME = "DataCollector01.csv"         # CSV file name inside each run folder

# True -> divide bytes by 1000^3; False -> divide by 1024^3
USE_DECIMAL_GB = False

# ---------- Fixed x-axis share and color of each phase (60/10/30) ----------
SEG_SPANS = dict(
    training=(0.00, 0.60),   # 60%
    idle=(0.60, 0.70),       # 10%
    evaluate=(0.70, 1.00),   # 30%
)
SEG_COLORS = dict(
    training="#ffd700",      # yellow
    idle="#9e9e9e",          # gray
    evaluate="#d62728",      # red
)

def _gb_divisor():
    """Return the byte count of one reported unit: 1000**3 if USE_DECIMAL_GB, else 1024**3."""
    if USE_DECIMAL_GB:
        return 1000.0 ** 3
    return 1024.0 ** 3

def _run_csv_path(n: int) -> str:
    """Return BASE_ROOT/<RUN_PREFIX + n zero-padded to six digits>/CSV_NAME."""
    run_dir = RUN_PREFIX + format(n, "06d")
    return os.path.join(BASE_ROOT, run_dir, CSV_NAME)

def read_pdh_sum_working_set(path: str) -> pd.DataFrame:
    """
    Read a PDH-CSV 4.0 file and total the per-process working set of each row.

    Only columns under a ``Process(...)`` object whose counter name is exactly
    ``Working Set`` are summed, so ``Working Set - Private`` and
    ``Working Set Peak`` are left out. The ``Process(_Total)`` aggregate is
    skipped whenever per-process columns are present, because adding it would
    count every process a second time.

    Parameters
    ----------
    path : str
        Location of the CSV file; its first column holds the sample timestamps.

    Returns
    -------
    pd.DataFrame
        Columns ['timestamp', 'app_gb'], sorted by time. Rows whose timestamp
        could not be parsed, or that have no numeric working-set value at all,
        are dropped.

    Raises
    ------
    FileNotFoundError
        If *path* does not exist.
    ValueError
        If no matching columns are found or no valid rows remain.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)

    def _is_working_set(col) -> bool:
        # pandas renames duplicate headers to "name.1", "name.2", ...; strip
        # that suffix before testing the counter name.
        name = str(col).rstrip()
        head, dot, tail = name.rpartition(".")
        if dot and tail.isdigit():
            name = head
        return "Process(" in name and name.endswith("\\Working Set")

    df = pd.read_csv(path, header=0)
    timestamps = pd.to_datetime(df[df.columns[0]], errors="coerce")

    ws_cols = [c for c in df.columns if _is_working_set(c)]
    per_process = [c for c in ws_cols if "Process(_Total)" not in str(c)]
    if per_process:
        # Keep _Total only when it is the sole working-set column available.
        ws_cols = per_process
    if not ws_cols:
        raise ValueError(f"No Process(*)\\Working Set columns in {os.path.basename(path)}")

    ws_bytes = df[ws_cols].apply(pd.to_numeric, errors="coerce")
    # min_count=1 keeps a row NaN when every value failed to parse, so dropna()
    # removes it instead of it showing up as a 0-byte sample.
    total_bytes = ws_bytes.sum(axis=1, min_count=1)

    out = (pd.DataFrame({
            "timestamp": timestamps,
            "app_gb": total_bytes / _gb_divisor()
          })
          .dropna()
          .sort_values("timestamp")
          .reset_index(drop=True))

    if out.empty:
        raise ValueError(f"No valid rows after parsing in {os.path.basename(path)}")

    return out

def normalize_segment(df: pd.DataFrame, seg_name: str) -> pd.DataFrame:
    """
    Rescale one segment's timestamps linearly onto SEG_SPANS[seg_name].

    An empty input yields an empty frame. When the first and last timestamps
    do not span a positive interval, the result is a two-point flat line
    across the span at the first sample's value.

    Returns DataFrame with ['x_global_norm','app_gb','segment'].
    """
    lo, hi = SEG_SPANS[seg_name]
    if df.empty:
        return pd.DataFrame(columns=["x_global_norm","app_gb","segment"])

    ordered = df.sort_values("timestamp").reset_index(drop=True)
    stamps = ordered["timestamp"]
    first_ns = int(pd.to_datetime(stamps.iloc[0]).value)
    last_ns = int(pd.to_datetime(stamps.iloc[-1]).value)

    if last_ns - first_ns <= 0:
        # Nothing to stretch: hold the first value over the whole span.
        held = ordered["app_gb"].iloc[0]
        return pd.DataFrame({
            "x_global_norm": [lo, hi],
            "app_gb":        [held, held],
            "segment":       seg_name
        })

    ticks = stamps.astype("int64").to_numpy()
    # Fraction of the segment elapsed at each sample, in [0, 1].
    frac = (ticks - ticks[0]) / float(ticks[-1] - ticks[0])
    return pd.DataFrame({
        "x_global_norm": lo + frac * (hi - lo),
        "app_gb":        ordered["app_gb"].to_numpy(),
        "segment":       seg_name
    })

# ---------- Five (training, evaluation) run-number pairs: (6,7), (8,9), ... (14,15)
pairs = [(first, first + 1) for first in range(6, 15, 2)]

all_runs = []
valid_runs = 0

for idx, (n_train, n_eval) in enumerate(pairs, start=1):
    train_csv, eval_csv = _run_csv_path(n_train), _run_csv_path(n_eval)
    try:
        train_df = read_pdh_sum_working_set(train_csv)
        eval_df  = read_pdh_sum_working_set(eval_csv)
    except Exception as e:
        # Any read/parse failure drops the whole pair; the remaining pairs continue.
        print(f"[SKIP set {idx}] {e}")
        continue

    # ----- Two-point idle "bridge": last training sample -> first evaluation sample
    last_train, first_eval = train_df.iloc[-1], eval_df.iloc[0]
    t_end_time, t_end_val     = last_train["timestamp"], float(last_train["app_gb"])
    e_start_time, e_start_val = first_eval["timestamp"], float(first_eval["app_gb"])

    idle_df = pd.DataFrame({
        "timestamp": [t_end_time, e_start_time],
        "app_gb":    [t_end_val,  e_start_val],
    })
    idle_df = idle_df.sort_values("timestamp").reset_index(drop=True)

    # ----- Map each phase onto its 60/10/30 span and stack them
    d_train = normalize_segment(train_df, "training")
    d_idle  = normalize_segment(idle_df,  "idle")
    d_eval  = normalize_segment(eval_df,  "evaluate")
    d_run = pd.concat([d_train, d_idle, d_eval], ignore_index=True)
    d_run["run_id"] = f"set_{idx:02d}"

    all_runs.append(d_run)
    valid_runs += 1

if not all_runs:
    raise RuntimeError("No valid runs found. Check folder names and CSV contents.")

all_df = pd.concat(all_runs, ignore_index=True)
all_df = all_df.sort_values(["run_id","x_global_norm"]).reset_index(drop=True)

# ---------- Rows processed per run (drives legend labels & line colors) ----------
unique_runs = sorted(set(all_df["run_id"]))

# Optional manual overrides: exact row counts keyed by run id.
# Example: {"set_01": 100_000_000, "set_02": 80_000_000, ...}
RUN_ROWS_HINT = {}

# 1) Start from the manual hints, coerced to int
rows_map = {rid: int(v) for rid, v in RUN_ROWS_HINT.items()}

# 2) Optional heuristic: parse a size in millions ('100m', '80M', etc.) from a run's
#    folder name. The helper below is defined but not called anywhere in this cell,
#    so sizes currently come only from RUN_ROWS_HINT or the fallback in step 3.
def _parse_size_from_folder(num: int) -> int | None:
    """
    Look for a 2-3 digit number followed by 'm'/'M' in the run folder name.

    The folder name is RUN_PREFIX plus *num* zero-padded to six digits.
    Returns the number times 1,000,000, or None when the pattern is absent.
    """
    match = re.search(r'(\d{2,3})\s*[mM]\b', f"{RUN_PREFIX}{num:06d}")
    if match is None:
        return None
    return 1_000_000 * int(match.group(1))

# 3) Fallback for runs without a manual hint: assume the planned sets step evenly
#    from ~100M down to ~20M rows, e.g. [100, 80, 60, 40, 20] for five sets.
#    The size is looked up by the set number embedded in the run id ("set_03" ->
#    third planned size), so a set that was skipped while loading does not shift
#    the labels of the sets after it.
missing = [rid for rid in unique_runs if rid not in rows_map]
if missing:
    planned = max(len(pairs), 1)
    approx_millions = np.linspace(100, 20, planned)
    for pos, rid in enumerate(unique_runs):
        if rid in rows_map:
            continue
        m = re.fullmatch(r"set_(\d+)", rid)
        # Fall back to the run's position when the id does not carry a set number.
        slot = int(m.group(1)) - 1 if m else pos
        slot = min(max(slot, 0), planned - 1)
        rows_map[rid] = int(round(approx_millions[slot])) * 1_000_000

# ---------- SUPER PLOT: overlay all normalized runs ----------
fig, ax = plt.subplots(figsize=(14, 6))

# Background bands (training yellow, idle gray, evaluation red)
for seg, (a, b) in SEG_SPANS.items():
    ax.axvspan(a, b, color=SEG_COLORS[seg], alpha=0.12, zorder=0)

# Dashed boundaries where training ends (0.60) and where idle ends (0.70)
for boundary in (SEG_SPANS["training"][1], SEG_SPANS["idle"][1]):
    ax.axvline(boundary, color="black", linewidth=1, alpha=0.35, linestyle="--", zorder=1)

# Color runs by dataset size. viridis runs from dark purple (smallest) to yellow
# (largest), so bigger runs get the lighter colors; use "viridis_r" to invert.
# plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9.
min_rows = min(rows_map.values())
max_rows = max(rows_map.values())
norm = Normalize(vmin=min_rows, vmax=max_rows)
cmap = plt.get_cmap("viridis")

# Draw the largest runs first so smaller runs end up on top
runs_sorted = sorted(unique_runs, key=lambda r: rows_map[r], reverse=True)

for rid in runs_sorted:
    d = all_df[all_df["run_id"] == rid].sort_values("x_global_norm")
    label = f"{rows_map[rid]/1e6:.0f}M rows"
    ax.plot(
        d["x_global_norm"], d["app_gb"],
        linewidth=2.25,           # bolder lines
        alpha=0.85,
        color=cmap(norm(rows_map[rid])),
        label=label,
        zorder=2
    )

# Phase labels, placed 5% above the lowest plotted value
_seg_titles = {"training": "Training", "idle": "Idle", "evaluate": "Evaluation"}
y_min, y_max = all_df["app_gb"].min(), all_df["app_gb"].max()
y_text = y_min + 0.05 * (y_max - y_min) if np.isfinite(y_min) and y_min != y_max else y_min
for seg, (a, b) in SEG_SPANS.items():
    ax.text(a + (b-a)/2, y_text,
            _seg_titles.get(seg, "Evaluation"),
            ha="center", va="center", fontsize=10, fontweight="bold",
            bbox=dict(boxstyle="round,pad=0.25", facecolor="white", alpha=0.6),
            zorder=3)

# Titles & axes
ax.set_xlim(0, 1)
# The divisor is 1000^3 (GB) or 1024^3 (GiB); label the axis to match.
unit_label = "GB" if USE_DECIMAL_GB else "GiB"
ax.set_title("InDatabase (CART) -Regression Tree Model", fontweight="bold")
ax.set_xlabel("Normalized Time")
ax.set_ylabel(f"Memory Usage ({unit_label})")
ax.grid(True, alpha=0.3, linestyle="--")

# Legend inside the axes (top-right), de-duplicated by size label
handles, labels = ax.get_legend_handles_labels()
seen = set(); h_unique = []; l_unique = []
for h, l in zip(handles, labels):
    if l not in seen:
        h_unique.append(h); l_unique.append(l); seen.add(l)

leg = ax.legend(
    h_unique, l_unique,
    title="Rows processed",
    loc="upper right",bbox_to_anchor=(0.99, 0.99),
    frameon=True, fancybox=True, borderaxespad=0.0,
    handlelength=2.4
)
leg.get_frame().set_alpha(0.85)

plt.tight_layout()
plt.show()

# Report against the number of pairs actually attempted, not a fixed count.
print(f"Plotted {valid_runs} run(s) out of {len(pairs)}.")


In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# ---------- One fixed color per phase ----------
COLOR_TRAINING = "#ffd700"   # yellow (gold)
COLOR_EVAL     = "#d62728"   # red
COLOR_IDLE     = "#9e9e9e"   # neutral gray
SEG_COLORS = dict(Training=COLOR_TRAINING, Idle=COLOR_IDLE, Evaluation=COLOR_EVAL)

# ---------- Share of the normalized x-axis given to each phase (60/10/30) ----------
SEG_SPANS = dict(
    Training=(0.00, 0.60),    # 60%
    Idle=(0.60, 0.70),        # 10%
    Evaluation=(0.70, 1.00),  # 30%
)

# ---------- INPUT: one run folder per phase; adjust if your names differ ----------
folder  = r"C:\Monitor\all\ASUTOSH_20250824-000016"   # training
folder1 = r"C:\Monitor\all\ASUTOSH_20250824-000017"   # evaluation
train_csv_path, eval_csv_path = (
    os.path.join(run_dir, "DataCollector01.csv") for run_dir in (folder, folder1)
)

USE_DECIMAL_GB = False   # set True if you prefer GB (1000^3)


def _gb_divisor():
    """Return the byte count of one reported unit: 1000**3 if USE_DECIMAL_GB, else 1024**3."""
    if USE_DECIMAL_GB:
        return 1000.0 ** 3
    return 1024.0 ** 3

def read_pdh_sum_working_set(path: str):
    """
    Read a PDH-CSV 4.0 file and total the per-process working set of each row.

    Only columns under a ``Process(...)`` object whose counter name is exactly
    ``Working Set`` are summed, so ``Working Set - Private`` and
    ``Working Set Peak`` are left out. The ``Process(_Total)`` aggregate is
    skipped whenever per-process columns are present, because adding it would
    count every process a second time.

    Parameters
    ----------
    path : str
        Location of the CSV file; its first column holds the sample timestamps.

    Returns
    -------
    tuple
        ``(out, ws_cols)``: *out* is a DataFrame with ['timestamp', 'app_gb']
        sorted by time, and *ws_cols* is the list of column names summed.

    Raises
    ------
    FileNotFoundError
        If *path* does not exist.
    ValueError
        If no matching columns are found or no valid rows remain.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)

    def _is_working_set(col) -> bool:
        # pandas renames duplicate headers to "name.1", "name.2", ...; strip
        # that suffix before testing the counter name.
        name = str(col).rstrip()
        head, dot, tail = name.rpartition(".")
        if dot and tail.isdigit():
            name = head
        return "Process(" in name and name.endswith("\\Working Set")

    df = pd.read_csv(path, header=0)
    timestamps = pd.to_datetime(df[df.columns[0]], errors="coerce")

    ws_cols = [c for c in df.columns if _is_working_set(c)]
    per_process = [c for c in ws_cols if "Process(_Total)" not in str(c)]
    if per_process:
        # Keep _Total only when it is the sole working-set column available.
        ws_cols = per_process
    if not ws_cols:
        raise ValueError(f"No Process(... )\\Working Set columns found in {os.path.basename(path)}")

    ws_bytes = df[ws_cols].apply(pd.to_numeric, errors="coerce")
    # min_count=1 keeps a row NaN when every value failed to parse, so dropna()
    # removes it instead of it showing up as a 0-byte sample.
    total_bytes = ws_bytes.sum(axis=1, min_count=1)

    out = (pd.DataFrame({
            "timestamp": timestamps,
            # Plain bytes -> GB conversion. The previous extra factor of 10 made
            # every value ten times larger than the unit shown on the axes and
            # in the console summary.
            "app_gb": total_bytes / _gb_divisor()
        })
        .dropna()
        .sort_values("timestamp")
        .reset_index(drop=True))

    if out.empty:
        raise ValueError(f"No valid rows after parsing in {os.path.basename(path)}")

    return out, ws_cols

# Load both phases; each call also returns the list of columns that were summed.
train_app, train_cols = read_pdh_sum_working_set(train_csv_path)
eval_app,  eval_cols  = read_pdh_sum_working_set(eval_csv_path)

# Last training sample and first evaluation sample bound the idle gap.
t_end_time   = train_app["timestamp"].iloc[-1]
t_end_val    = float(train_app["app_gb"].iloc[-1])
e_start_time = eval_app["timestamp"].iloc[0]
e_start_val  = float(eval_app["app_gb"].iloc[0])

# Two-point bridge joining the end of training to the start of evaluation.
idle_app = pd.DataFrame({
    "timestamp": [t_end_time, e_start_time],
    "app_gb":    [t_end_val,  e_start_val],
})
idle_app = idle_app.sort_values("timestamp").reset_index(drop=True)

# Full timeline; the bridge repeats the boundary timestamps, so keep the first of each.
app_all = pd.concat([train_app, idle_app, eval_app], ignore_index=True)
app_all = app_all.drop_duplicates(subset=["timestamp"])
app_all = app_all.sort_values("timestamp").reset_index(drop=True)

# (label, start, end) for each phase in wall-clock time.
segments = [
    ("Training",   train_app["timestamp"].iloc[0], train_app["timestamp"].iloc[-1]),
    ("Idle",       t_end_time,                     e_start_time),
    ("Evaluation", eval_app["timestamp"].iloc[0],  eval_app["timestamp"].iloc[-1]),
]

def segment_stats(df: pd.DataFrame):
    """
    Summarize a ['timestamp', 'app_gb'] frame.

    Returns a dict with:
      mean          -- time-weighted mean (trapezoidal area / duration); NaN when
                       the frame is empty or the duration is not positive
      peak          -- largest non-NaN value (NaN for an empty frame)
      duration_secs -- seconds between the first and last sample
      samples       -- number of rows
    A single-row frame reports that row's value as both mean and peak.
    """
    if df.empty:
        return dict(mean=np.nan, peak=np.nan, duration_secs=0.0, samples=0)
    df = df.sort_values("timestamp").reset_index(drop=True)
    if len(df) == 1:
        y = float(df["app_gb"].iloc[0])
        return dict(mean=y, peak=y, duration_secs=0.0, samples=1)

    # Elapsed seconds via timedeltas: independent of the datetime resolution
    # (ns/us/ms/s) the column happens to be stored in.
    stamps = pd.to_datetime(df["timestamp"])
    t = (stamps - stamps.iloc[0]).dt.total_seconds().to_numpy(dtype=float)
    y = df["app_gb"].to_numpy(dtype=float)
    duration = float(t[-1] - t[0])

    # NumPy 2.0 renamed trapz to trapezoid; use whichever this NumPy provides.
    integrate = getattr(np, "trapezoid", None)
    if integrate is None:
        integrate = np.trapz
    area = float(integrate(y, t))

    mean = area / duration if duration > 0 else np.nan
    peak = float(np.nanmax(y))
    return dict(mean=mean, peak=peak, duration_secs=duration, samples=len(df))

def normalize_segment(df: pd.DataFrame, seg_label: str) -> pd.DataFrame:
    """
    Rescale one segment's timestamps linearly onto SEG_SPANS[seg_label].

    An empty input yields an empty frame. When the segment has no positive
    time span (a single row, or all rows sharing one timestamp), the result is
    a two-point line across the span from the first row's value to the last
    row's value, which avoids dividing by a zero span.

    Returns DataFrame with ['x_global_norm','app_gb','segment'].
    """
    a, b = SEG_SPANS[seg_label]
    if df.empty:
        return pd.DataFrame(columns=["x_global_norm","app_gb","segment"])
    df = df.sort_values("timestamp").reset_index(drop=True)
    t = df["timestamp"].astype("int64").to_numpy()
    span = t[-1] - t[0]
    if span <= 0:
        # Covers the single-row case too (first and last row are the same row).
        y_first, y_last = df["app_gb"].iloc[0], df["app_gb"].iloc[-1]
        return pd.DataFrame({"x_global_norm":[a, b], "app_gb":[y_first, y_last], "segment":seg_label})
    x_seg = (t - t[0]) / float(span)  # [0,1] within the segment
    x_global = a + x_seg * (b - a)
    return pd.DataFrame({"x_global_norm": x_global, "app_gb": df["app_gb"].to_numpy(float), "segment": seg_label})

# Per-phase and whole-timeline statistics (same call order: train, eval, overall).
train_stats, eval_stats, overall_stats = (
    segment_stats(frame) for frame in (train_app, eval_app, app_all)
)

# Each phase mapped onto its fixed share of the normalized axis.
norm_train = normalize_segment(train_app, "Training")
norm_idle  = normalize_segment(idle_app,  "Idle")
norm_eval  = normalize_segment(eval_app,  "Evaluation")

norm_df = pd.concat([norm_train, norm_idle, norm_eval], ignore_index=True)
norm_df = norm_df.sort_values("x_global_norm").reset_index(drop=True)

import matplotlib.ticker as mticker

# ===================== PLOT 1: ACTUAL TIME ======================
fig1, ax1 = plt.subplots(figsize=(16, 6))

# Whole timeline: line plus a faint fill down to zero.
time_axis, usage = app_all["timestamp"], app_all["app_gb"]
ax1.plot(time_axis, usage, linewidth=2, alpha=0.95, zorder=3)
ax1.fill_between(time_axis, 0, usage, alpha=0.08, zorder=1)

# Height for the phase labels: 6% above the minimum, or the minimum itself
# when the range is flat or not finite.
y_min, y_max = usage.min(), usage.max()
if np.isfinite(y_min) and y_min != y_max:
    y_text = y_min + 0.06 * (y_max - y_min)
else:
    y_text = y_min

for label, s, e in segments:
    # Skip phases with no positive duration.
    if e <= s:
        continue
    in_window = (app_all["timestamp"] >= s) & (app_all["timestamp"] <= e)
    seg = app_all[in_window]
    if seg.empty:
        continue
    color = SEG_COLORS.get(label, "#cccccc")
    ax1.fill_between(seg["timestamp"], 0, seg["app_gb"], color=color, alpha=0.28, zorder=2)
    # Mark the first and last sample of the phase.
    first_pt, last_pt = seg.iloc[0], seg.iloc[-1]
    ax1.scatter([first_pt["timestamp"], last_pt["timestamp"]],
                [first_pt["app_gb"],    last_pt["app_gb"]],
                s=80, zorder=4, color=color, edgecolors="black")
    midpoint = s + (e - s)/2
    ax1.text(midpoint, y_text, label,
             ha="center", va="center", fontsize=10, fontweight="bold",
             bbox=dict(boxstyle="round,pad=0.25", facecolor="white", alpha=0.6),
             zorder=5)

unit_label = "GB"
ax1.set_title("Memory Profile: Join-Aware In-Database Regression Tree (Full Dataset)",
              fontweight="bold")
ax1.set_xlabel("Time")
ax1.set_ylabel(f"Memory Usage ({unit_label})")
ax1.grid(True, alpha=0.3, linestyle="--")
ax1.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M:%S"))
ax1.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.1f'))  # one decimal

fig1.autofmt_xdate()
plt.tight_layout()
plt.show()

# =================== PLOT 2: NORMALIZED TIME ====================
fig2, ax2 = plt.subplots(figsize=(16, 6))

# Background band per phase.
for lbl, (a, b) in SEG_SPANS.items():
    ax2.axvspan(a, b, color=SEG_COLORS[lbl], alpha=0.12, zorder=0)

# Dashed lines at the end of the Training span and the end of the Idle span.
for boundary in (SEG_SPANS["Training"][1], SEG_SPANS["Idle"][1]):
    ax2.axvline(boundary, color="black", linewidth=1, alpha=0.35, linestyle="--", zorder=1)

ax2.plot(norm_df["x_global_norm"], norm_df["app_gb"], linewidth=2, alpha=0.95, zorder=3)

# Height for the phase labels: 6% above the minimum, or the minimum itself
# when the range is flat or not finite.
y_min2, y_max2 = norm_df["app_gb"].min(), norm_df["app_gb"].max()
if np.isfinite(y_min2) and y_min2 != y_max2:
    y_text2 = y_min2 + 0.06 * (y_max2 - y_min2)
else:
    y_text2 = y_min2

for lbl in ["Training", "Idle", "Evaluation"]:
    a, b = SEG_SPANS[lbl]
    inside = (norm_df["x_global_norm"] >= a) & (norm_df["x_global_norm"] <= b)
    df_seg = norm_df[inside]
    if df_seg.empty:
        continue
    color = SEG_COLORS[lbl]
    ax2.fill_between(df_seg["x_global_norm"], 0, df_seg["app_gb"], color=color, alpha=0.28, zorder=2)
    # Endpoint markers sit on the span edges, at the value of the nearest sample.
    x0, x1 = a, b
    dist_to_start = (df_seg["x_global_norm"] - x0).abs()
    dist_to_end = (df_seg["x_global_norm"] - x1).abs()
    y0 = df_seg.iloc[dist_to_start.argmin()]["app_gb"]
    y1 = df_seg.iloc[dist_to_end.argmin()]["app_gb"]
    ax2.scatter([x0, x1], [y0, y1], s=80, zorder=4, color=color, edgecolors="black")
    ax2.text(a + (b - a)/2, y_text2, lbl,
             ha="center", va="center", fontsize=10, fontweight="bold",
             bbox=dict(boxstyle="round,pad=0.25", facecolor="white", alpha=0.6),
             zorder=5)

unit_label = "GB"
ax2.set_xlim(0, 1)
ax2.set_title("Memory Profile: Join-Aware In-Database Regression Tree (Full Dataset)",
              fontweight="bold")
ax2.set_xlabel("Normalized Time")
ax2.set_ylabel(f"Memory Usage ({unit_label})")
ax2.grid(True, alpha=0.3, linestyle="--")
ax2.yaxis.set_major_formatter(mticker.FormatStrFormatter('%.1f'))  # one decimal
plt.tight_layout()
plt.show()

# ======================= Console summary ========================
def pretty_stats(name, st, unit=None):
    """
    Print one summary line for a stats dict.

    Parameters
    ----------
    name : str
        Row title, right-aligned in a 10-character field.
    st : dict
        Must provide 'mean', 'peak', 'duration_secs' and 'samples'.
    unit : str, optional
        Unit suffix for mean and peak. When omitted, the module-level
        ``unit_label`` is used if it exists, otherwise "GB"; this keeps the
        function usable before any plotting code has defined ``unit_label``.
    """
    if unit is None:
        unit = globals().get("unit_label", "GB")
    print(f"{name:>10}: mean={st['mean']:.1f} {unit} | "
          f"peak={st['peak']:.1f} {unit} | "
          f"duration={st['duration_secs']:.1f}s | samples={st['samples']}")

print("\n=== RAM SUMMARY (time-weighted) ===")
for title, stats in (("OVERALL", overall_stats),
                     ("TRAINING", train_stats),
                     ("EVALUATION", eval_stats)):
    pretty_stats(title, stats)

# How many columns were summed for each phase.
n_train_cols, n_eval_cols = len(train_cols), len(eval_cols)
print("\nParsed Working Set columns (training):", n_train_cols)
print("Parsed Working Set columns (evaluation):", n_eval_cols)

# Plain (not time-weighted) medians of the per-sample totals.
train_median = float(np.nanmedian(train_app['app_gb']))
eval_median = float(np.nanmedian(eval_app['app_gb']))
print("Median totals — TRAIN(GB):", f"{train_median:.1f}",
      "| EVAL(GB):", f"{eval_median:.1f}")
