In [None]:
# Phase 0: Environment Setup and Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# NOTE(review): this silences *every* warning for the whole kernel session,
# including pandas deprecation / chained-assignment warnings raised by the
# cells below. Kept as-is to preserve the notebook's output.
warnings.filterwarnings("ignore")

# Set random seeds for reproducibility
np.random.seed(42)

# Display settings
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
pd.set_option("display.width", None)

# The bundled seaborn style sheets are named "seaborn-v0_8-*" from matplotlib
# 3.6 onwards and "seaborn-*" before that. Try the current name first and fall
# back to the legacy one so the setup cell does not crash on older installs.
for _style_name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    print("⚠️ Warning: seaborn darkgrid style not available; using matplotlib defaults.")

print("✅ Phase 0: Environment Setup Complete")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

In [None]:
print(
    "🚀 Starting Phase 1, Step 1: Load Raw Data & Type Conversion (Corrected Alarm Merge)"
)

# --- 1.1 Load tables ---
# Every source table is a Parquet file resolved against the kernel's working
# directory. A missing file is reported and the exception re-raised, because
# none of the later steps can run without the raw tables.
try:
    df_raw = pd.read_parquet("ClimateLog.parquet")
    df_ext_raw = pd.read_parquet("ClimateLogExtra.parquet")
    ahu_raw = pd.read_parquet("AHUClimateLog.parquet")
    mc_raw = pd.read_parquet("MachineCycle.parquet")
    mp_raw = pd.read_parquet("MachinePositions.parquet")
    # ErrorLog carries 'AlarmID'
    error_log_temp = pd.read_parquet("ErrorLog.parquet")
    # AlarmTranslation carries 'AlarmNumber', 'AlarmCode', 'Engels'
    alarm_translation_temp = pd.read_parquet("AlarmTranslation.parquet")
except FileNotFoundError as load_error:
    print(
        f"ERROR: Could not load one or more Parquet files. Please check paths. Error: {load_error}"
    )
    raise
else:
    # Only reached when all seven reads succeeded.
    print("  All raw Parquet files loaded.")

# Merge ErrorLog and AlarmTranslation so each error row carries an English
# description ("AlarmDescription_Eng") and a numeric "AlarmNumber".
el_raw = error_log_temp.copy()

# A translation is only possible when ErrorLog has the AlarmID key and the
# translation table has the English text column.
can_translate = (
    "AlarmID" in el_raw.columns and "Engels" in alarm_translation_temp.columns
)

# Option 1: ErrorLog.AlarmID maps to AlarmTranslation.AlarmNumber (preferred).
if can_translate and "AlarmNumber" in alarm_translation_temp.columns:
    translation_key = "AlarmNumber"
    print(
        "  Attempting to merge ErrorLog (on AlarmID) with AlarmTranslation (on AlarmNumber)..."
    )
# Option 2: fallback - map ErrorLog.AlarmID to AlarmTranslation.AlarmCode.
# This branch is only reached when the translation table has no AlarmNumber
# column, so AlarmNumber must NOT be selected from it here (the previous code
# did, which made the fallback fail with a KeyError).
elif can_translate and "AlarmCode" in alarm_translation_temp.columns:
    translation_key = "AlarmCode"
    print(
        "  Attempting to merge ErrorLog (on AlarmID) with AlarmTranslation (on AlarmCode as fallback)..."
    )
else:
    translation_key = None
    print(
        "ERROR: Cannot determine correct keys to merge ErrorLog with AlarmTranslation for descriptions."
    )

if translation_key is not None:
    # Keep only the first translation per key so the left merge cannot
    # duplicate ErrorLog rows.
    translations_subset = alarm_translation_temp[
        [translation_key, "Engels"]
    ].drop_duplicates(subset=[translation_key])
    el_raw = pd.merge(
        el_raw,
        translations_subset,
        left_on="AlarmID",  # Key from ErrorLog
        right_on=translation_key,  # Key from AlarmTranslation
        how="left",
    )
    # "Engels" can be missing here only if ErrorLog already had a column of
    # that name and the merge suffixed both copies.
    if "Engels" in el_raw.columns:
        el_raw = el_raw.rename(columns={"Engels": "AlarmDescription_Eng"})

# Guarantee the two columns later cells rely on, whichever path was taken.
if "AlarmDescription_Eng" not in el_raw.columns:
    el_raw["AlarmDescription_Eng"] = pd.NA
if "AlarmNumber" not in el_raw.columns:
    # Without an AlarmNumber from the translation table, AlarmID is the only
    # numeric identifier available.
    el_raw["AlarmNumber"] = el_raw["AlarmID"] if "AlarmID" in el_raw.columns else pd.NA

# Rename TimeStamp in ErrorLog to avoid conflict (no-op when the column is absent)
el_raw = el_raw.rename(columns={"TimeStamp": "ErrorLogRecordTimeStamp"})

print(
    f"  ErrorLog processed. el_raw shape: {el_raw.shape}. Columns include: {el_raw.columns.to_list()}"
)

# --- 1.2 Convert epochs / numeric timestamps to datetime objects ---
# For each climate table, add "TimeStamp_dt" (datetime) and "TimeStamp_epoch"
# (seconds since 1970-01-01) next to the raw "TimeStamp" column. The frames are
# modified in place.
print("\n  Converting timestamp columns to datetime objects...")
for df_temp, name in [
    (df_raw, "df_raw"),
    (df_ext_raw, "df_ext_raw"),
    (ahu_raw, "ahu_raw"),
]:
    if "TimeStamp" not in df_temp.columns:
        print(f"    Warning: 'TimeStamp' column not found in {name}.")
        continue

    raw_timestamps = df_temp["TimeStamp"]
    if pd.api.types.is_numeric_dtype(raw_timestamps):
        # Numeric values are interpreted as epoch seconds; unparseable values
        # are coerced to NaT and reported.
        df_temp["TimeStamp_epoch"] = raw_timestamps
        df_temp["TimeStamp_dt"] = pd.to_datetime(
            raw_timestamps, unit="s", errors="coerce"
        )
        if df_temp["TimeStamp_dt"].isnull().any():
            print(
                f"      Warning: NaNs produced during TimeStamp conversion for {name}."
            )
    elif pd.api.types.is_datetime64_any_dtype(raw_timestamps):
        print(
            f"    {name}['TimeStamp'] is already datetime. Creating TimeStamp_dt and TimeStamp_epoch."
        )
        df_temp["TimeStamp_dt"] = raw_timestamps
        # is_datetime64_any_dtype also accepts tz-aware columns; subtracting a
        # tz-naive origin from those raises TypeError, so match the origin's
        # time zone awareness to the data.
        if raw_timestamps.dt.tz is not None:
            epoch_origin = pd.Timestamp("1970-01-01", tz="UTC")
        else:
            epoch_origin = pd.Timestamp("1970-01-01")
        df_temp["TimeStamp_epoch"] = (raw_timestamps - epoch_origin) // pd.Timedelta(
            "1s"
        )
    else:
        print(
            f"    Warning: {name}['TimeStamp'] is not numeric or datetime. Cannot convert reliably."
        )
        df_temp["TimeStamp_dt"] = pd.NaT
        df_temp["TimeStamp_epoch"] = pd.NA
# ErrorLog: derive datetime versions of the alarm start/end columns. Numeric
# values are read as epoch seconds (unparseable -> NaT); columns that are
# already datetime are copied through unchanged.
for col_name_numeric, col_name_dt in zip(
    ("StartTimeStamp", "EndTimeStamp"),
    ("ErrorStartTime_dt", "ErrorEndTime_dt"),
):
    if col_name_numeric not in el_raw.columns:
        print(f"    Warning: '{col_name_numeric}' column not found in el_raw.")
        continue

    source_values = el_raw[col_name_numeric]
    if pd.api.types.is_numeric_dtype(source_values):
        el_raw[col_name_dt] = pd.to_datetime(source_values, unit="s", errors="coerce")
        if el_raw[col_name_dt].isnull().any():
            print(f"      Warning: NaNs produced for {col_name_dt}.")
        continue

    if pd.api.types.is_datetime64_any_dtype(source_values):
        # Already datetime: reuse the values as they are.
        el_raw[col_name_dt] = source_values
        continue

    print(f"    Warning: el_raw['{col_name_numeric}'] is not numeric or datetime.")
    el_raw[col_name_dt] = pd.NaT

# For mc_raw (MachineCycle)
def _epoch_seconds(datetime_series):
    """Return whole seconds since 1970-01-01 for a datetime Series.

    Accepts tz-naive and tz-aware input. Subtracting a tz-naive origin from a
    tz-aware Series raises TypeError, so the origin is made UTC-aware when the
    data is tz-aware.
    """
    if datetime_series.dt.tz is not None:
        origin = pd.Timestamp("1970-01-01", tz="UTC")
    else:
        origin = pd.Timestamp("1970-01-01")
    return (datetime_series - origin) // pd.Timedelta("1s")


# Cycle start: add StartDateTime_dt (datetime) and StartDateTime_epoch (seconds).
if "StartDateTime" in mc_raw.columns:
    if pd.api.types.is_numeric_dtype(mc_raw["StartDateTime"]):
        mc_raw["StartDateTime_epoch"] = mc_raw["StartDateTime"]
        mc_raw["StartDateTime_dt"] = pd.to_datetime(
            mc_raw["StartDateTime"], unit="s", errors="coerce"
        )
        if mc_raw["StartDateTime_dt"].isnull().any():
            print("      Warning: NaNs produced for StartDateTime_dt.")
    elif pd.api.types.is_datetime64_any_dtype(mc_raw["StartDateTime"]):
        mc_raw["StartDateTime_dt"] = mc_raw["StartDateTime"]
        mc_raw["StartDateTime_epoch"] = _epoch_seconds(mc_raw["StartDateTime_dt"])
    else:
        print("    Warning: mc_raw['StartDateTime'] is not numeric or datetime.")
        mc_raw["StartDateTime_dt"] = pd.NaT
        mc_raw["StartDateTime_epoch"] = pd.NA
else:
    print("    Warning: 'StartDateTime' column not found in mc_raw.")

# Cycle end: same conversion, except that a numeric 0 is replaced by NaN before
# conversion so it becomes NaT instead of 1970-01-01. The raw value is kept in
# EndDateTime_epoch_raw.
if "EndDateTime" in mc_raw.columns:
    if pd.api.types.is_numeric_dtype(mc_raw["EndDateTime"]):
        mc_raw["EndDateTime_epoch_raw"] = mc_raw["EndDateTime"]
        mc_raw["EndDateTime_numeric_for_dt"] = mc_raw["EndDateTime"].replace(0, np.nan)
        mc_raw["EndDateTime_dt"] = pd.to_datetime(
            mc_raw["EndDateTime_numeric_for_dt"], unit="s", errors="coerce"
        )
        if mc_raw["EndDateTime_dt"].isnull().any():
            print("      Warning: NaNs produced for EndDateTime_dt.")
    elif pd.api.types.is_datetime64_any_dtype(
        mc_raw["EndDateTime"]
    ):  # If it's already datetime (less likely if 0s are present)
        mc_raw["EndDateTime_dt"] = mc_raw["EndDateTime"]
        mc_raw["EndDateTime_epoch_raw"] = _epoch_seconds(mc_raw["EndDateTime_dt"])
    else:
        print("    Warning: mc_raw['EndDateTime'] is not numeric or datetime.")
        mc_raw["EndDateTime_dt"] = pd.NaT
        mc_raw["EndDateTime_epoch_raw"] = pd.NA
else:
    print("    Warning: 'EndDateTime' column not found in mc_raw.")

print("\n✅ Step 1: Load Raw Data & Type Conversion Complete.")

In [None]:
print("\n--- Step 1.1: Inspecting Columns of Loaded Raw DataFrames ---")

# Snapshot the namespace once; any table an earlier cell did not create is
# recorded as None so the report below can say so instead of failing.
notebook_namespace = locals()
dataframes_to_inspect = {
    table_name: (
        notebook_namespace[table_name] if table_name in notebook_namespace else None
    )
    for table_name in (
        "df_raw",
        "df_ext_raw",
        "ahu_raw",
        "mc_raw",
        "mp_raw",
        "el_raw",
        "alarm_translation_temp",  # Also inspect this
    )
}

# Report column names, shape and dtypes for every table that holds data.
for name, df_instance in dataframes_to_inspect.items():
    print(f"\nColumns for DataFrame: {name}")

    if df_instance is None:
        print("  DataFrame not loaded or not defined in locals().")
        continue
    if df_instance.empty:
        print("  DataFrame is empty.")
        continue

    print(df_instance.columns.tolist())
    print(f"Shape: {df_instance.shape}")
    # Print a few dtypes
    if len(df_instance.columns) > 0:
        print("Sample dtypes:")
        print(df_instance.head(1).dtypes.to_string())

print("\n--- End of Column Inspection ---")

In [None]:
# --- 1.3 Identify Setter Machines ---
print("\n🔍 Step 1.2: Identifying Setter Machines...")

# MachinePositions must be loaded and expose both the "text" label column and
# the "MachineID" column before setters can be picked out.
mp_ready = (
    "mp_raw" in locals()
    and mp_raw is not None
    and "text" in mp_raw.columns
    and "MachineID" in mp_raw.columns
)

if not mp_ready:
    print(
        "ERROR: Cannot identify setters. mp_raw not loaded or missing required columns."
    )
    setter_ids = []
else:
    # Setters have IDs starting with 'S' in MachinePositions; rows with a
    # missing label are treated as non-setters.
    is_setter_position = mp_raw.text.str.startswith("S", na=False)
    setter_ids = mp_raw.loc[is_setter_position, "MachineID"].unique()
    print(f"✅ Found {len(setter_ids)} setter machines")
    # Only the first ten IDs are listed when there are more than ten.
    if len(setter_ids) > 10:
        print(f"   Setter IDs: {sorted(setter_ids)[:10]}...")
    else:
        print(f"   Setter IDs: {sorted(setter_ids)}")

# --- 1.4 Filter to Setters Only ---
def _filter_to_setters(raw_table, machine_ids):
    """Return a copy of the rows of ``raw_table`` whose MachineID is in ``machine_ids``.

    An empty DataFrame is returned when the table is missing (None) or has no
    "MachineID" column.
    """
    if raw_table is None or "MachineID" not in raw_table.columns:
        return pd.DataFrame()
    return raw_table[raw_table["MachineID"].isin(machine_ids)].copy()


# Snapshot of the namespace so tables that an earlier cell failed to create are
# handled as "missing" rather than raising NameError.
_notebook_vars = locals()

if len(setter_ids) > 0:
    print("\n🔽 Step 1.3: Filtering all data to setter machines only...")

    # Create filtered datasets with existence checks
    df_setters = _filter_to_setters(_notebook_vars.get("df_raw"), setter_ids)
    df_ext_setters = _filter_to_setters(_notebook_vars.get("df_ext_raw"), setter_ids)
    ahu_setters = _filter_to_setters(_notebook_vars.get("ahu_raw"), setter_ids)
    mc_setters = _filter_to_setters(_notebook_vars.get("mc_raw"), setter_ids)
    el_setters = _filter_to_setters(_notebook_vars.get("el_raw"), setter_ids)

    # Summary of filtered data (tables that ended up empty are not listed)
    print("\n📊 Filtered Data Summary:")
    for _label, _filtered, _raw_name in (
        ("ClimateLog", df_setters, "df_raw"),
        ("ClimateLogExtra", df_ext_setters, "df_ext_raw"),
        ("AHUClimateLog", ahu_setters, "ahu_raw"),
        ("MachineCycle", mc_setters, "mc_raw"),
        ("ErrorLog", el_setters, "el_raw"),
    ):
        if not _filtered.empty:
            print(
                f"  - {_label} setters: {_filtered.shape} (from {_notebook_vars[_raw_name].shape})"
            )

    print("\n✅ Phase 1: Data Ingestion & Setter Filtering Complete!")
else:
    print("ERROR: No setter machines found. Cannot proceed with filtering.")
    # Still define the filtered tables (as empty frames) so later cells see a
    # consistent state instead of undefined names or stale results from an
    # earlier run of this cell.
    df_setters, df_ext_setters, ahu_setters, mc_setters, el_setters = (
        pd.DataFrame() for _ in range(5)
    )

In [None]:
# Phase 0: Enhanced EDA Helper Functions
def perform_detailed_eda(df, table_name, sample_size=None):
    """Print an exploratory summary of ``df`` and return its missing-value table.

    Shape, memory usage, dtype counts and missing values are computed on the
    full frame. Numeric statistics and categorical cardinalities are computed
    on a random sample when ``sample_size`` is given and smaller than the frame.

    Args:
        df: DataFrame to analyse.
        table_name: Label used in the printed report header.
        sample_size: Optional number of rows to sample (fixed random_state=42)
            for the statistics sections. ``None`` or 0 means use every row.

    Returns:
        DataFrame indexed by column name with "Missing_Count" and
        "Missing_Percentage", sorted by "Missing_Count" descending. For a frame
        with zero rows the percentages are 0.0 (not NaN).
    """
    print(f"\n{'=' * 60}")
    print(f"📊 Detailed EDA for {table_name}")
    print(f"{'=' * 60}")

    n_rows = len(df)

    # Sample if requested
    if sample_size and n_rows > sample_size:
        df_sample = df.sample(n=sample_size, random_state=42)
        print(
            f"Note: Sampling {sample_size} rows from {n_rows} total rows for analysis"
        )
    else:
        df_sample = df

    # Basic info
    print(f"\nShape: {df.shape}")
    print("\nMemory Usage:")
    print(df.memory_usage(deep=True).sort_values(ascending=False).head(10))

    # Column types summary
    print("\nColumn Type Summary:")
    print(df.dtypes.value_counts())

    # Missing values
    missing = df.isnull().sum()
    if n_rows > 0:
        missing_pct = (missing / n_rows * 100).round(2)
    else:
        # 0 missing out of 0 rows would evaluate to NaN; report 0.0 instead so
        # callers can compare/threshold the percentages safely.
        missing_pct = missing.astype(float)
    missing_df = pd.DataFrame(
        {"Missing_Count": missing, "Missing_Percentage": missing_pct}
    ).sort_values("Missing_Count", ascending=False)

    if missing_df["Missing_Count"].sum() > 0:
        print("\nMissing Values (Top 10):")
        print(missing_df[missing_df["Missing_Count"] > 0].head(10))
    else:
        print("\nNo missing values found!")

    # Basic statistics for numeric columns
    numeric_cols = df_sample.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print("\nNumeric Column Statistics (first 10 columns):")
        print(df_sample[numeric_cols[:10]].describe().round(2))

    # Unique value counts for categorical columns
    categorical_cols = df_sample.select_dtypes(include=["object", "category"]).columns
    if len(categorical_cols) > 0:
        print("\nCategorical Column Unique Values:")
        for col in categorical_cols[:5]:  # First 5 categorical columns
            unique_count = df_sample[col].nunique()
            print(f"  - {col}: {unique_count} unique values")
            # Only low-cardinality columns get their full value distribution.
            if unique_count <= 10:
                print(f"    Values: {df_sample[col].value_counts().to_dict()}")

    return missing_df


# Smoke-test the EDA helper on the setter climate data. Skipped when the
# filtering step did not produce a non-empty df_setters. The return value is
# not used here.
if "df_setters" in locals():
    if not df_setters.empty:
        perform_detailed_eda(
            df=df_setters, table_name="ClimateLog (Setters)", sample_size=50_000
        )

In [None]:
print("🚀 Starting Phase 2: Cycle Splits & Alarm Detection")

# --- Configuration for Critical Alarms ---
# English alarm descriptions that mark a cycle as "alarm-affected". They are
# matched exactly against AlarmDescription_Eng in the alarm-detection step.
CRITICAL_ALARM_NAMES = [
    "Connection lost error",
    "RS422 connection lost",
    "Clocktime lost error",
    "RTC slow error",
    "sensor error",
    "diagnose error",
    "Temperature high error",
    "Temperature low error",
    "Fan alarm",
    "Power failure",
    "Incubation program lost",
    "Emergency stop",
    "setpoints lost",
    "stack overflow",
    "program lost warning",
    "user int error",
    "can bus warning",
]
print(
    f"📋 Using {len(CRITICAL_ALARM_NAMES)} defined critical alarm names for filtering."
)

# Additional critical alarms from the original plan (if they exist in your data)
# You can add: "temperatuur hoog", "temperature decrease", "cooling failure"

# Always define the result of this step. The following cells read
# mc_completed_setters_all_durations unconditionally, so leaving it undefined
# when the prerequisites fail would surface as a NameError there.
mc_completed_setters_all_durations = pd.DataFrame()

# Check prerequisites
if "mc_setters" not in locals() or mc_setters.empty:
    print(
        "❌ Error: 'mc_setters' is not available or empty. Cannot proceed with Phase 2."
    )
    mc_clean = pd.DataFrame()
    mc_alarm = pd.DataFrame()
else:
    print(f"✅ Input mc_setters shape: {mc_setters.shape}")

    # --- 2.1 Filter for Completed Setter Runs ---
    mc_temp = mc_setters.copy()

    # Filter for CycleType == 0 (normal setter cycles)
    if "CycleType" in mc_temp.columns:
        # .copy() so the column assignment below targets an independent frame
        # rather than a slice (avoids chained-assignment ambiguity).
        mc_temp = mc_temp[mc_temp["CycleType"] == 0].copy()
        print(f"   Filtered to CycleType==0: {mc_temp.shape[0]} cycles")
    else:
        print("   ⚠️ Warning: 'CycleType' column not found in mc_setters.")

    # Filter for completed cycles (EndDateTime exists)
    if "EndDateTime_dt" in mc_temp.columns:
        mc_temp = mc_temp[mc_temp["EndDateTime_dt"].notna()].copy()
        print(f"   Filtered to completed cycles: {mc_temp.shape[0]} cycles")
    else:
        print(
            "   ❌ Warning: 'EndDateTime_dt' column not found. Cannot filter for completed cycles."
        )
        mc_temp = pd.DataFrame()

    # Calculate cycle duration
    if (
        not mc_temp.empty
        and "StartDateTime_dt" in mc_temp.columns
        and "EndDateTime_dt" in mc_temp.columns
    ):
        # 86400 seconds per day
        mc_temp["duration_days"] = (
            mc_temp["EndDateTime_dt"] - mc_temp["StartDateTime_dt"]
        ).dt.total_seconds() / 86400
        print(f"   ✅ Calculated duration for {mc_temp.shape[0]} cycles")

        # Show duration statistics
        print("\n   Duration Statistics (all completed cycles):")
        print(f"     Mean: {mc_temp['duration_days'].mean():.2f} days")
        print(f"     Std:  {mc_temp['duration_days'].std():.2f} days")
        print(f"     Min:  {mc_temp['duration_days'].min():.2f} days")
        print(f"     Max:  {mc_temp['duration_days'].max():.2f} days")
    else:
        # Also reached when no cycles survived the filters above.
        print(
            "   ⚠️ Warning: Could not calculate duration due to missing datetime columns."
        )
        mc_temp["duration_days"] = np.nan

    mc_completed_setters_all_durations = mc_temp

In [None]:
# --- 2.2 Identify CycleIDs affected by critical alarms ---
# Result: alarm_cycle_ids, the set of CycleID values for which at least one
# critical alarm *started* between the cycle's start and end (both inclusive)
# on the same machine. Alarm end times are not considered.
alarm_cycle_ids = set()

# The cycle table is not defined when the previous step bailed out on missing
# prerequisites; treat that the same as "no completed cycles" instead of
# failing with a NameError.
if "mc_completed_setters_all_durations" not in locals():
    mc_completed_setters_all_durations = pd.DataFrame()

if "el_setters" not in locals() or el_setters.empty:
    print("⚠️ Warning: 'el_setters' (ErrorLog for setters) is not available or empty.")
    print("   Assuming no cycles have critical alarms.")
elif not all(
    col in el_setters.columns
    for col in ["MachineID", "ErrorStartTime_dt", "AlarmDescription_Eng"]
):
    print("⚠️ Warning: 'el_setters' is missing required columns.")
    print(f"   Available columns: {el_setters.columns.tolist()}")
elif mc_completed_setters_all_durations.empty:
    print("   ℹ️ No completed setter cycles to check for alarms.")
else:
    # Filter for critical alarms
    critical_error_log = el_setters[
        el_setters["AlarmDescription_Eng"].isin(CRITICAL_ALARM_NAMES)
    ]
    print(f"\n🚨 Found {len(critical_error_log)} critical error log entries")

    if critical_error_log.empty:
        print("   ℹ️ No critical error log entries found matching CRITICAL_ALARM_NAMES.")
    else:
        # Show distribution of critical alarms
        print("\n   Top 10 Critical Alarms:")
        alarm_counts = (
            critical_error_log["AlarmDescription_Eng"].value_counts().head(10)
        )
        for alarm, count in alarm_counts.items():
            print(f"     - {alarm}: {count}")

        print("\n🔍 Checking for critical alarms overlapping with cycle durations...")
        print(f"   Processing {len(mc_completed_setters_all_durations)} cycles...")

        # Sort each machine's alarm start times once. A cycle then needs only
        # a binary search for the first alarm at or after its start, instead
        # of a scan over every alarm row of that machine.
        error_starts_by_machine = {
            machine_id: starts.sort_values().reset_index(drop=True)
            for machine_id, starts in critical_error_log.dropna(
                subset=["ErrorStartTime_dt"]
            ).groupby("MachineID")["ErrorStartTime_dt"]
        }

        # Progress tracking
        cycles_processed = 0

        for _, cycle_row in mc_completed_setters_all_durations.iterrows():
            machine_id = cycle_row["MachineID"]
            cycle_id = cycle_row["CycleID"]
            c_start = cycle_row["StartDateTime_dt"]
            c_end = cycle_row["EndDateTime_dt"]

            if pd.isna(c_start) or pd.isna(c_end):
                continue

            machine_starts = error_starts_by_machine.get(machine_id)
            if machine_starts is not None:
                # Index of the earliest alarm with start >= c_start; the cycle
                # is affected when that alarm also starts no later than c_end.
                first_candidate = machine_starts.searchsorted(c_start, side="left")
                if (
                    first_candidate < len(machine_starts)
                    and machine_starts.iloc[first_candidate] <= c_end
                ):
                    alarm_cycle_ids.add(cycle_id)

            cycles_processed += 1
            if cycles_processed % 500 == 0:
                print(
                    f"     Processed {cycles_processed}/{len(mc_completed_setters_all_durations)} cycles..."
                )

        print(f"\n   ✅ Finished processing {cycles_processed} cycles")
        print(f"   🚨 Identified {len(alarm_cycle_ids)} cycles with critical alarms")

In [None]:
# --- 2.3 Split into mc_clean and mc_alarm ---
def _apply_duration_filter(cycles, label, min_days, max_days):
    """Keep the cycles whose duration_days lies in [min_days, max_days] and report it.

    ``label`` is the name printed in the report line ("mc_clean" / "mc_alarm").
    Returns an empty DataFrame when ``cycles`` is empty or has no
    "duration_days" column.
    """
    if cycles.empty or "duration_days" not in cycles.columns:
        print(f"   ⚠️ {label}_temp is empty or missing 'duration_days'")
        return pd.DataFrame()

    kept = cycles[
        (cycles["duration_days"] >= min_days) & (cycles["duration_days"] <= max_days)
    ].copy()
    print(
        f"   ✅ {label}: {len(kept)} cycles (dropped {len(cycles) - len(kept)} outside duration range)"
    )
    return kept


# The cycle table is not defined when step 2.1 bailed out on missing
# prerequisites; treat that as "no completed cycles" rather than a NameError.
if "mc_completed_setters_all_durations" not in locals():
    mc_completed_setters_all_durations = pd.DataFrame()

if not mc_completed_setters_all_durations.empty:
    # Split based on alarm presence
    # NOTE(review): the split keys on CycleID alone - confirm CycleID is unique
    # across machines.
    has_alarm = mc_completed_setters_all_durations["CycleID"].isin(
        list(alarm_cycle_ids)
    )
    mc_alarm_temp = mc_completed_setters_all_durations[has_alarm].copy()
    mc_clean_temp = mc_completed_setters_all_durations[~has_alarm].copy()

    print("\n📊 Initial split:")
    print(f"   - Cycles with alarms: {mc_alarm_temp.shape[0]}")
    print(f"   - Clean cycles: {mc_clean_temp.shape[0]}")
else:
    mc_alarm_temp = pd.DataFrame()
    mc_clean_temp = pd.DataFrame()

# --- 2.4 Apply 18-21 day duration filter ---
duration_min_days = 18
duration_max_days = 21

print(
    f"\n🔽 Applying duration filter ({duration_min_days}-{duration_max_days} days)..."
)

# Filter clean cycles
mc_clean = _apply_duration_filter(
    mc_clean_temp, "mc_clean", duration_min_days, duration_max_days
)
dropped_clean = len(mc_clean_temp) - len(mc_clean)

# Filter alarm cycles
mc_alarm = _apply_duration_filter(
    mc_alarm_temp, "mc_alarm", duration_min_days, duration_max_days
)
dropped_alarm = len(mc_alarm_temp) - len(mc_alarm)

# Final summary (the window in the label follows the constants above)
print("\n📊 Final Phase 2 Results:")
print(
    f"   - Clean cycles ({duration_min_days}-{duration_max_days} days): {mc_clean.shape if not mc_clean.empty else '0 cycles'}"
)
print(
    f"   - Alarm cycles ({duration_min_days}-{duration_max_days} days): {mc_alarm.shape if not mc_alarm.empty else '0 cycles'}"
)

print("\n✅ Phase 2: Cycle Splits & Alarm Detection Complete!")

In [None]:
# --- Visualizing Cycle Durations ---
print("\n📊 Visualizing Cycle Durations...")


def _plot_cycle_durations(
    ax, cycles, group, icon, bar_color, bound_color, min_days, max_days
):
    """Draw the duration histogram for one cycle group on ``ax``.

    Prints the describe() summary of "duration_days" and plots a 30-bin
    histogram with dashed lines at ``min_days``/``max_days`` and a solid line
    at the mean. When ``cycles`` is None, empty, or lacks "duration_days", a
    "no data" placeholder is drawn instead.

    ``group`` is the display name ("Clean" / "Alarm"); ``icon`` prefixes the
    printed statistics header.
    """
    if cycles is None or cycles.empty or "duration_days" not in cycles.columns:
        ax.text(
            0.5,
            0.5,
            f"No {group.lower()} cycles data available",
            ha="center",
            va="center",
            fontsize=14,
        )
        ax.set_title(f"{group} Cycles - No Data")
        return

    durations = cycles["duration_days"]
    print(f"\n{icon} {group} Cycles Duration Statistics:")
    print(durations.describe().round(2))

    durations.plot(
        kind="hist", bins=30, edgecolor="black", alpha=0.7, color=bar_color, ax=ax
    )
    ax.set_title(f"Histogram of Cycle Durations for {group} Cycles", fontsize=14)
    ax.set_xlabel("Cycle Duration (days)", fontsize=12)
    ax.set_ylabel("Number of Cycles", fontsize=12)
    ax.axvline(
        min_days,
        color=bound_color,
        linestyle="--",
        linewidth=2,
        label=f"{min_days} days (min)",
    )
    ax.axvline(
        max_days,
        color=bound_color,
        linestyle="--",
        linewidth=2,
        label=f"{max_days} days (max)",
    )
    mean_duration = durations.mean()
    ax.axvline(
        mean_duration,
        color="blue",
        linestyle="-",
        linewidth=2,
        label=f"Mean: {mean_duration:.2f}",
    )
    ax.legend()
    ax.grid(True, linestyle=":", alpha=0.7)


# Set plot style
plt.style.use("ggplot")

# Read inputs from the namespace so a missing earlier step yields the "no data"
# panels instead of a NameError. The boundary lines follow the configured
# duration window (falling back to 18/21 when it was never set).
_plot_inputs = locals()
_min_days = _plot_inputs.get("duration_min_days", 18)
_max_days = _plot_inputs.get("duration_max_days", 21)
_clean_cycles = _plot_inputs.get("mc_clean")
_alarm_cycles = _plot_inputs.get("mc_alarm")

# Create figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# --- Plot for mc_clean ---
_plot_cycle_durations(
    axes[0], _clean_cycles, "Clean", "✅", "green", "red", _min_days, _max_days
)

# --- Plot for mc_alarm ---
_plot_cycle_durations(
    axes[1], _alarm_cycles, "Alarm", "🚨", "red", "darkred", _min_days, _max_days
)

plt.tight_layout()
plt.show()

# Summary comparison (only when both groups have data)
if (
    _clean_cycles is not None
    and not _clean_cycles.empty
    and _alarm_cycles is not None
    and not _alarm_cycles.empty
):
    print("\n📊 Duration Comparison:")
    print(
        f"   Clean cycles mean duration: {_clean_cycles['duration_days'].mean():.2f} days"
    )
    print(
        f"   Alarm cycles mean duration: {_alarm_cycles['duration_days'].mean():.2f} days"
    )