#### Lab 1 : Streaming Data for Predictive Maintenance with Linear Regression - Based Alerts
Albright Maduka  

CSCN 8010

##### Import necessary libraries


In [8]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sqlalchemy import create_engine, text
import psycopg2
import os


##### 1. Database Integration

###### 1.1 Connecting to the neon database

In [5]:
# NeonDB connection string
conn_str= 'postgresql://neondb_owner:npg_X0Fy7gmpTsWN@ep-withered-mud-adujci27-pooler.c-2.us-east-1.aws.neon.tech/Group1workshop?sslmode=require&channel_binding=require'

# SQLAlchemy engine
engine = create_engine(conn_str)

# Psycopg2 connection (using SSL)
raw_conn = psycopg2.connect(
 dbname="Group1workshop",
 user="neondb_owner",
 password="npg_X0Fy7gmpTsWN",
host="ep-withered-mud-adujci27-pooler.c-2.us-east-1.aws.neon.tech",
port="5432",
sslmode="require"
 )

print("Connection to NeonDatabase successful!")

Connection to NeonDatabase successful!


###### 1.2 Checking the Table Structure (columns and datatypes)

In [6]:
# Text helps to safely define Sturctured Query Language (SQL) queries
# Pandas helps to manipulate dataframes
# engine is the connection to the database

# Replace my table_name with staging_measurements
table_name = "staging_measurements"

# SQL query to list columns and datatypes
q = text(f""" 
SELECT column_name, data_type
FROM information_schema.columns
WHERE table_name = '{table_name}'
ORDER BY ordinal_position;
""") 

# Run the query with SQLAlchemy engine
df = pd.read_sql(q, engine)
print(df)


  column_name                 data_type
0       trait                      text
1       axis1          double precision
2       axis2          double precision
3       axis3          double precision
4       axis4          double precision
5       axis5          double precision
6       axis6          double precision
7       axis7          double precision
8       axis8          double precision
9        time  timestamp with time zone


###### 1.3 Safe Time Conversion, Downloading and Preprocessing staging measurements from NeonDB and Saving as CSV

In [22]:
# Table to read from
TABLE = "staging_measurements"

# Read the table into a DataFrame
df = pd.read_sql(f'SELECT * FROM "{TABLE}";', engine)
print("Rows loaded:", len(df))
print(df.head())

# converting the time in my staging_measurements table to ISO 8601 format

# checks if the table has a time column
if "time" in df.columns:
    df["time"] = pd.to_datetime(df["time"], errors="coerce", utc=True) # errors=coerce checks for invalid times and sets them to Nat (missing)
    if df["time"].notna().any():
        # Create __time_s (elapsed seconds from the first timestamp)
        t0 = df["time"].min() # where t0 is the first timestamp
        df["__time_s"] = (df["time"] - t0).dt.total_seconds() # __time_s time is coverted to seconds

        # Store time back in ISO 8601 string format
        df["time"] = df["time"].dt.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    else:
        raise ValueError("`time` column could not be parsed to datetimes.") #If pandas could not successfully convert any of the time values into valid dates/times,
else:
    raise KeyError("No `time` column found in the data.")

# Keep only the columns needed for Lab 1
axis_cols = [c for c in df.columns if c.lower().startswith("axis")]
keep_cols = ["time", "__time_s"] + axis_cols
df = df[keep_cols]

print("Filtered columns:", df.columns.tolist())

# saving the original training data to CSV
os.makedirs("./data", exist_ok=True)
out_path = "./data/original_training_data.csv"
df.to_csv(out_path, index=False)

print(f"Data exported to ./data/original_training_data.csv")




Rows loaded: 79344
     trait  axis1  axis2  axis3  axis4  axis5  axis6  axis7  axis8  \
0  current    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
1  current    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
2  current    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
3  current    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
4  current    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   

                              time  
0 2022-10-17 12:18:23.660000+00:00  
1 2022-10-17 12:18:25.472000+00:00  
2 2022-10-17 12:18:27.348000+00:00  
3 2022-10-17 12:18:29.222000+00:00  
4 2022-10-17 12:18:31.117000+00:00  
Filtered columns: ['time', '__time_s', 'axis1', 'axis2', 'axis3', 'axis4', 'axis5', 'axis6', 'axis7', 'axis8']
Data exported to ./data/original_training_data.csv


##### 2. Streaming simulation

###### 2.1 Displaying the maximum and minimum current values in the axis of the original data and synthetic data

In [23]:

# Load datasets
train_df = pd.read_csv("./data/original_training_data.csv") # Original training data
test_df  = pd.read_csv("./data/synthetic_data.csv") # Synthetic testing data

# Choose axis columns (axis1..axis8)
axis_cols = [c for c in train_df.columns if c.lower().startswith("axis")]

# Print max & min for each axis in TRAIN
print(" Original Data (TRAIN) - Max & Min")
for col in axis_cols:
    print(f"{col} -> Max: {train_df[col].max()}, Min: {train_df[col].min()}") # Display max and min for each axis

# Print max & min for each axis in TEST
print("\nSynthetic Data (TEST) - Max & Min")
for col in axis_cols:
    print(f"{col} -> Max: {test_df[col].max()}, Min: {test_df[col].min()}") # Display max and min for each axis


 Original Data (TRAIN) - Max & Min
axis1 -> Max: 23.6093, Min: 0.0
axis2 -> Max: 51.71323, Min: 0.0
axis3 -> Max: 41.85556, Min: 0.0
axis4 -> Max: 15.6663, Min: 0.0
axis5 -> Max: 20.75076, Min: 0.0
axis6 -> Max: 20.93142, Min: 0.0
axis7 -> Max: 8.10848, Min: 0.0
axis8 -> Max: 5.90564, Min: 0.0

Synthetic Data (TEST) - Max & Min
axis1 -> Max: 8.855649127469295, Min: -8.923729773835545
axis2 -> Max: 32.52315117280435, Min: -22.41087609326153
axis3 -> Max: 26.0314144349465, Min: -17.733853606750458
axis4 -> Max: 6.576776803235276, Min: -6.26844347861212
axis5 -> Max: 8.67940667883882, Min: -8.315460084765235
axis6 -> Max: 8.25967787182789, Min: -7.396414203649386
axis7 -> Max: 9.27350726792753, Min: -8.341754711305176
axis8 -> Max: 1.7599017520357634, Min: -1.592010091392268


###### 2.2 Calculating Range (Max - Min) for both datasets

In [24]:

# Load your datasets
train_df = pd.read_csv("./data/original_training_data.csv")
test_df  = pd.read_csv("./data/synthetic_data.csv")

# Choose axis columns (axis1..axis8 typically)
axis_cols = [c for c in train_df.columns if c.lower().startswith("axis")]

# Compute range for each axis in TRAIN
train_ranges = {col: train_df[col].max() - train_df[col].min() for col in axis_cols} #finding the range of each axis

# Compute range for each axis in TEST
test_ranges = {col: test_df[col].max() - test_df[col].min() for col in axis_cols} #finding the range of each axis

print(" Range in Original (Training) Data:")
for col, r in train_ranges.items():
    print(f"{col}: {r}")

print("\n Range in Synthetic (Testing) Data:")
for col, r in test_ranges.items():
    print(f"{col}: {r}")


 Range in Original (Training) Data:
axis1: 23.6093
axis2: 51.71323
axis3: 41.85556
axis4: 15.6663
axis5: 20.75076
axis6: 20.93142
axis7: 8.10848
axis8: 5.90564

 Range in Synthetic (Testing) Data:
axis1: 17.779378901304838
axis2: 54.934027266065875
axis3: 43.765268041696956
axis4: 12.845220281847396
axis5: 16.994866763604055
axis6: 15.656092075477275
axis7: 17.615261979232706
axis8: 3.3519118434280313


###### 2.3 Generating synthetic data that has same mean and standard deviation as the original training data

In [25]:
# Reloading the original training data 
df = pd.read_csv("./data/original_training_data.csv")

# Set random seed for reproducibility
np.random.seed(42)

# It creates the Synthetic Dataset with the same mean and standard deviation as the original dataset.
synthetic_df = pd.DataFrame()  # Makes a new empty DataFrame where your synthetic data will be stored.

for col in df.columns:
    if np.issubdtype(df[col].dtype, np.number):  # numeric columns
        mu, sigma = df[col].mean(), df[col].std()  # mean (mu) and standard deviation (sigma)
        synthetic_df[col] = np.random.normal(mu, sigma, len(df))  # random normal using mu & sigma
    else:
        # Keep non-numeric columns (like timestamps or IDs) unchanged
        synthetic_df[col] = df[col]

# Save synthetic data to a new CSV file in the data folder
synthetic_df.to_csv("./data/synthetic_data.csv", index=False)
print("Synthetic data saved:", synthetic_df.shape)

# Validation: compare ranges
train_ranges = {c: df[c].max() - df[c].min() for c in df.columns if "axis" in c}
test_ranges  = {c: synthetic_df[c].max() - synthetic_df[c].min() for c in train_ranges}

print("\nRange comparison:")
for c in train_ranges:
    print(f"{c}: Train={train_ranges[c]:.2f}, Test={test_ranges[c]:.2f}")


Synthetic data saved: (79344, 10)

Range comparison:
axis1: Train=23.61, Test=18.51
axis2: Train=51.71, Test=56.39
axis3: Train=41.86, Test=44.08
axis4: Train=15.67, Test=12.96
axis5: Train=20.75, Test=19.18
axis6: Train=20.93, Test=15.21
axis7: Train=8.11, Test=19.22
axis8: Train=5.91, Test=3.74


##### 3. Regression Models & Residual Analysis

###### 3.1 Creating a folder path for the Artifacts (output results)

In [26]:
# create artifacts
DATA_DIR = Path("./data")
OUT_DIR  = Path("./artifacts"); OUT_DIR.mkdir(parents=True, exist_ok=True) # used create the artifacts folder

TRAIN_CSV = DATA_DIR / "original_training_data.csv" 
TEST_CSV  = DATA_DIR / "synthetic_data.csv"


###### 3.2 Loading Train & Test and Converting Time to seconds.

In [27]:
# Reload datasets
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

# Makes sure that the dataframe has time column and it hasnt already been converted (no __times
if "time" in train_df.columns and "__time_s" not in train_df.columns:
    
    # converting the training timestamps to real datetime values
    train_df["time"] = pd.to_datetime(train_df["time"], errors="coerce", utc=True)
    # where t0 is the first timestamp
    # __time_s is the elapsed time in seconds since the start time and __time_s = (time - t0). dt.total_seconds() converts all the time to seconds
    t0 = train_df["time"].min()
    train_df["__time_s"] = (train_df["time"] - t0).dt.total_seconds()

    # while this is for the test or synthetic data, we want to use the same reference start time (t0) as the training data
    test_df["time"] = pd.to_datetime(test_df["time"], errors="coerce", utc=True)
    test_df["__time_s"] = (test_df["time"] - t0).dt.total_seconds()

# Save back to CSVs
train_df.to_csv(TRAIN_CSV, index=False)
test_df.to_csv(TEST_CSV, index=False)

# confirmation message that i has been converted
print("Converted timestamps to seconds (__time_s) in both train/test CSVs")


Converted timestamps to seconds (__time_s) in both train/test CSVs


###### 3.3 Train Univariate Linear Regression (Time → Axis values)

In [28]:
# Detect axis columns
axis_cols = [c for c in train_df.columns if c.lower().startswith("axis")]

# Train one linear regression model per axis
models = {}
params_rows = []

for col in axis_cols:
    # where x =time in seconds and where y = axis values (current)
    X = train_df[["__time_s"]].values
    y = pd.to_numeric(train_df[col], errors="coerce").values
    mask = ~np.isnan(X).ravel() & ~np.isnan(y) # where mask removes missing values so that the model trains cleanly

    # function that fits a linear regression (y= slope * X(time) + intercept)
    lr = LinearRegression().fit(X[mask].reshape(-1,1), y[mask]) # Train model
    models[col] = lr

    params_rows.append({
        # extract the slope as lr.coef_[0] and intercept as lr.intercept_
        "axis": col,
        "slope": float(lr.coef_[0]),
        "intercept": float(lr.intercept_)
    })

# Save model parameters to CSV
params_df = pd.DataFrame(params_rows).sort_values("axis")
params_df.to_csv(OUT_DIR / "model_params.csv", index=False)
print("Model parameters saved:", OUT_DIR / "model_params.csv")


Model parameters saved: artifacts\model_params.csv


###### 3.4 Residuals on Original Training Data

In [29]:
train_resid = train_df[["__time_s"]].copy()
sigma_rows = []

for col in axis_cols:
    lr = models[col]
    X = train_df[["__time_s"]].values
    y = pd.to_numeric(train_df[col], errors="coerce").values
    mask = ~np.isnan(X).ravel() & ~np.isnan(y)

    y_hat = np.full(y.shape, np.nan, dtype=float)
    y_hat[mask] = lr.predict(X[mask].reshape(-1,1))

    resid = y - y_hat
    train_resid[f"{col}_resid"] = resid

    sigma_rows.append({"axis": col, "sigma": float(np.nanstd(resid, ddof=1))})

# Create DataFrame of sigma values
sigma_df = pd.DataFrame(sigma_rows)
# Save training residuals
train_resid.to_csv(OUT_DIR / "train_residuals.csv", index=False)


##### 4. Testing Residuals with Thresholds (Anomaly Detection)

In [30]:
T_SECONDS = 2  # event must persist ≥30s

thresholds = sigma_df.copy()
thresholds["MinC"] = thresholds["sigma"] * 2.0   # Alert
thresholds["MaxC"] = thresholds["sigma"] * 3.0   # Error
thresholds["T_seconds"] = T_SECONDS

# Saving Thresholds to CSV
thresholds.to_csv(OUT_DIR / "thresholds.csv", index=False)
print("Thresholds saved:", OUT_DIR / "thresholds.csv")
display(thresholds)


Thresholds saved: artifacts\thresholds.csv


Unnamed: 0,axis,sigma,MinC,MaxC,T_seconds
0,axis1,2.162104,4.324209,6.486313,2
1,axis2,6.879731,13.759462,20.639193,2
2,axis3,5.111851,10.223702,15.335554,2
3,axis4,1.574861,3.149722,4.724583,2
4,axis5,2.100172,4.200344,6.300515,2
5,axis6,1.815453,3.630907,5.44636,2
6,axis7,2.166759,4.333518,6.500278,2
7,axis8,0.423068,0.846136,1.269204,2


##### 5. Alerts & Errors Implementation

###### 5.1 Computing Residuals (errors) on Synthetic Data

In [31]:
# where test residuals dataframe = test_reside
# where __time_s is the time in seconds
test_resid = test_df[["__time_s"]].copy()

for col in axis_cols: # Loop through each axis column
    lr = models[col] #using linear regression model
    X = test_df[["__time_s"]].values
    y = pd.to_numeric(test_df[col], errors="coerce").values
    mask = ~np.isnan(X).ravel() & ~np.isnan(y) # Remove NaNs
    # where mask is a boolean array (true or false)

    y_hat = np.full(y.shape, np.nan, dtype=float) 
    y_hat[mask] = lr.predict(X[mask].reshape(-1,1)) # Predict only where we have valid data

    # computing residuals
    test_resid[f"{col}_resid"] = y - y_hat # residuals= observed or original - predicted

#saving test residuals as csv
test_resid.to_csv(OUT_DIR / "test_residuals.csv", index=False)
print("Test residuals saved:", OUT_DIR / "test_residuals.csv")


Test residuals saved: artifacts\test_residuals.csv


###### 5.2 Detect Alerts & Errors

In [34]:
# This function finds continuous runs of True values in a condition
# 1st function: find_runs
def find_runs(time_series, mask_bool, min_sec):
    # converts the true/false values into 1s and 0s
    arr = mask_bool.to_numpy().astype(int)
    
    # where np.diff checks where the values change
    # where it goes from 0 to 1 or 1 to o ie a run starts or end 
    change = np.diff(arr, prepend=0)
    starts = np.where(change == 1)[0] # where the condition changes from False to True
    ends   = np.where(change == -1)[0] - 1 # where the condition changes from True to False
    if mask_bool.iloc[-1]:
        ends = np.append(ends, len(mask_bool)-1)


    events = [] # list to store events
    for s, e in zip(starts, ends):
        dur = float(time_series.iloc[e] - time_series.iloc[s])
        # only keep it if the run lasted for at least min_sec seconds.
        if dur >= min_sec:
            events.append((s, e, dur)) # where s is the start index, e is the end index, dur is the duration in seconds
    return events

# 2nd function: log_axis_events
# This function logs events for a given axis based on residual thresholds
# This function checks for alerts and errors events and records them in details
def log_axis_events(df, axis, MinC, MaxC, Tsec):
    # Get the residuals for the axis and the time column
    resid = df[f"{axis}_resid"]
    time_series = df["__time_s"]

    # start an empty list to collects all the events for this axis
    rows = []
   
    # checks for alerts and errors using the minimum and maximum thresholds conditions
    for level, cond in [("ALERT", resid >= MinC), ("ERROR", resid >= MaxC)]:
        for s, e, dur in find_runs(time_series, cond, Tsec):
            # Take residuals during the runs (from start to end).
            seg = resid.iloc[s:e+1]

            # Records one event row using axis, level, start_time, end_time, duration_sec, peak_residual
            rows.append({
                "axis": axis, # The axis being analyzed
                "level": level, # ALERT or ERROR
                "start_time": float(time_series.iloc[s]),
                "end_time": float(time_series.iloc[e]),
                "duration_sec": dur,
                "peak_residual": float(seg.max())
            })
    return rows # Return list of event rows

# Apply across all axes
# Convert the thresholds DataFrame into a dictionary so you can quickly look up MinC, MaxC, T_seconds for each axis.
thr = thresholds.set_index("axis").to_dict(orient="index")
# loops through all axes: in these way grabs its thresholds, runs the log_axis_events function, and collects all the events into log_rows
log_rows = []
for axis in axis_cols:
    a = thr[axis]
    log_rows += log_axis_events(test_resid, axis, a["MinC"], a["MaxC"], a["T_seconds"])

# Save alerts log
alerts_log = pd.DataFrame(log_rows)
alerts_log.to_csv(OUT_DIR / "alerts_log.csv", index=False)
print("Alerts log saved:", OUT_DIR / "alerts_log.csv")
display(alerts_log.head())




Alerts log saved: artifacts\alerts_log.csv


Unnamed: 0,axis,level,start_time,end_time,duration_sec,peak_residual
0,axis1,ALERT,68286.899595,73641.429023,5354.529428,4.649761
1,axis1,ALERT,13271.60479,78780.400914,65508.796124,4.634452
2,axis1,ALERT,28441.145904,69022.573897,40581.427993,4.795733
3,axis1,ALERT,49865.8225,51275.686105,1409.863605,4.524827
4,axis1,ALERT,56948.090909,74296.401604,17348.310695,5.60765


##### 6. Visualization & Dashboard

###### 6.1 Comparing Observed vs Regression (Visualization)

In [35]:
# Building a fast thresholds lookup for plotting
thr = thresholds.set_index("axis").to_dict(orient="index")

# saving the plots in the plots folder
PLOTS_DIR = (OUT_DIR / "plots"); PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# defining the plotting funcion
def plot_test_observed_vs_regression(test_df, test_resid_df, axis, lr, path): 
    # where test_df is the test or synthetic data, test_resid_df is the test residuals dataframe, axis is the axis column, lr is the linear regression model, path is the file path to save the plot
    t = test_df["__time_s"].values
    y = pd.to_numeric(test_df[axis], errors="coerce").values
    m = ~np.isnan(t) & ~np.isnan(y)
    t = t[m]; y = y[m]

    # Predict using the regriession line
    yhat = lr.predict(t.reshape(-1,1)) # give scikit learn  the required 2D shape

    # get residuals
    resid = test_resid_df[f"{axis}_resid"].loc[m].values
    MinC  = thr[axis]["MinC"]
    MaxC  = thr[axis]["MaxC"]

   # plotting
    plt.figure(figsize=(10,5))
    plt.scatter(t, y, s=6, alpha=0.6, label="Observed (TEST)")
    plt.plot(t, yhat, lw=1.5, label="Regression")
   
    # Highlight points exceeding thresholds
    over_min = resid >= MinC
    over_max = resid >= MaxC
    if over_min.any():
        plt.scatter(t[over_min], y[over_min], s=14, marker="o", label="≥ MinC (Alert)")
    if over_max.any():
        plt.scatter(t[over_max], y[over_max], s=18, marker="x", label="≥ MaxC (Error)")

    # labels
    plt.title(f"{axis}: Observed vs Regression (TEST)")
    plt.xlabel("elapsed seconds"); plt.ylabel(axis); plt.legend()
    plt.tight_layout(); plt.savefig(path, dpi=150); plt.close()

# saving the plots in the plots folder
for axis in axis_cols:
    plot_test_observed_vs_regression(test_df, test_resid, axis, models[axis],
                                     PLOTS_DIR / f"{axis}_test_observed_vs_regression.png")

print("TEST observed-vs-regression plots saved →", PLOTS_DIR)


TEST observed-vs-regression plots saved → artifacts\plots


###### 6.2 Training Regression plot (visualization)

In [161]:
# creates a folder called plots inside my artifacts folder
PLOTS_DIR = (OUT_DIR / "plots"); PLOTS_DIR.mkdir(parents=True, exist_ok=True)

# function to plot one axis at a time
def plot_train_regression(df, axis, lr, path):
    X = df[["__time_s"]].values # where x is time in seconds.
    y = pd.to_numeric(df[axis], errors="coerce").values # where y is the axis observed values.
    m = ~np.isnan(X).ravel() & ~np.isnan(y) #mask to filter out NaN values
    X = X[m]; y = y[m]
    yhat = lr.predict(X) #yhat is the predicted values from the linear regression model.

    # creates the plot
    plt.figure(figsize=(8,5))
    # Scatter in blue
    plt.scatter(X.ravel(), y, s=6, alpha=0.6, color="royalblue", label="actual observed training data")
    # Regression line in orange
    plt.plot(X.ravel(), yhat, linewidth=2, color="darkorange", label="Regression line (the model's predicted trend)")

   #labels axes, adds title and legend 
    plt.xlabel("elapsed seconds")
    plt.ylabel(axis)
    plt.title(f"TRAIN: {axis} vs time (linear fit)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()
 
 #Loops over all 8 axes, and saves 8 training regression plots.
for axis in axis_cols:
    plot_train_regression(train_df, axis, models[axis], PLOTS_DIR / f"{axis}_train_regression.png")

print("TRAIN regression plots saved →", PLOTS_DIR) 


TRAIN regression plots saved → artifacts\plots


###### 6.3 Testing Residuals with Thresholds and events or testing the system for anomalie (Visualization)

In [164]:
# Create a dictionary of events (alerts/errors) by axis
# collects events per axis
events_by_axis = {a: [] for a in axis_cols}
if len(alerts_log) > 0: # if there are any alerts or errors logged
    for a in axis_cols:
        events_by_axis[a] = alerts_log[alerts_log["axis"] == a].to_dict(orient="records")

# function to plot residuals for one axis at a time
def plot_test_residuals(df, axis, MinC, MaxC, events, path):
    t = df["__time_s"].values  # Where x is time in seconds
    r = df[f"{axis}_resid"].values # where r is residuals = observed - predicted 

# plots the residuals  curve in steel blue. And draws alert threshold (green dashed line) and error threshold (red dash-dot line).
    plt.figure(figsize=(10,5))
    # Residual curve in steelblue
    plt.plot(t, r, linewidth=1.2, color="steelblue", label="residual line")
    # Threshold lines
    plt.axhline(MinC, linestyle="--", color="green", label="MinC (Alert threshold)")
    plt.axhline(MaxC, linestyle="-.", color="red", label="MaxC (Error threshold)")

    # Event markers: red for ERROR, orange for ALERT
    # loops through the list of logged events for that axis
    for ev in events:

        # Find the middle time of the event 
        mid_t = (float(ev["start_time"]) + float(ev["end_time"])) / 2

        # Take the peak residual (largest deviation during the event)
        mid_r = ev["peak_residual"]

        # Plot a marker at that point
        if ev["level"] == "ERROR":
            plt.scatter([mid_t], [mid_r], s=40, marker="o", color="red", label="ERROR Event")
        else:
            plt.scatter([mid_t], [mid_r],color="orange", label="ALERT Event")
            # Where Mid_t is xaxis, Mid_r is yaxis, s=40 is the size of the marker

    plt.xlabel("elapsed seconds")
    plt.ylabel("residual (observed - predicted)")
    plt.title(f"TEST residuals: {axis}")
    plt.legend()
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    plt.close()

#Loops through all axes and saves residual plots with thresholds & events.
for axis in axis_cols:
    a = thr[axis]
    plot_test_residuals(test_resid, axis, a["MinC"], a["MaxC"], events_by_axis[axis], PLOTS_DIR / f"{axis}_test_residuals.png")

print("TEST residual plots saved →", PLOTS_DIR)

TEST residual plots saved → artifacts\plots


###### Note: Why is because there are no errors and only alerts is because the synthetic data did not produce residuals large enough and long enough to exceed the error threshold (MaxC)

###### 6.4 Summary Dashoard of Alerts and Errors

In [163]:
# check if alerts_log is empty
if alerts_log.empty:
    summary = pd.DataFrame(columns=["axis","alerts","errors","longest_event_s"])
 # where alerts_log is empty, create an empty summary dataframe with the specified columns

# otherwise, create a summary dataframe with counts of alerts and errors, and the longest event duration for each axis
else:
    counts = alerts_log.groupby(["axis","level"]).size().unstack(fill_value=0)
    longest = alerts_log.groupby("axis")["duration_sec"].max().rename("longest_event_s")
    summary = pd.concat([counts.reindex(columns=["ALERT","ERROR"]).rename(columns=str.lower), longest],
                        axis=1).reset_index()

summary.to_csv(OUT_DIR / "summary_dashboard.csv", index=False)
print("Saved summary_dashboard.csv"); print(summary.head())


Saved summary_dashboard.csv
    axis  alert  error  longest_event_s
0  axis1      8    NaN     46411.627930
1  axis2      9    NaN     65509.208948
2  axis3     14    NaN     81480.079580
3  axis4      9    NaN     53146.592441
4  axis5     11    NaN     98870.678102
