# Import Libraries and Load Data

In [1]:
# Import necessary libraries
import pandas as pd

# Load your dataset (update path accordingly)
df = pd.read_csv("C:/Users/98939/Downloads/MASTER_DATASET.csv")

# Show shape and first rows to verify
print(f"Dataset shape: {df.shape}")
df.head()

Dataset shape: (100000, 43)


Unnamed: 0,Timestamp,Time,Pressure_In,Temperature_In,Flow_Rate,Pressure_Out,Temperature_Out,Efficiency,Power_Consumption,Vibration,...,Velocity,Viscosity,Phase_Angle,vib_min,vib_rms,Pressure_In_Filtered,Temperature_In_Filtered,Label,Flow_Rate_Filtered,Vibration_Filtered
0,2025-01-01 00:00:00,0,3.524836,25.880595,12.156184,17.436829,203.077815,0.813001,5803.345268,1.151771,...,30.063063,1e-05,183.382071,0.523325,1.151771,3.524836,25.880595,1.0,12.156184,1.151771
1,2025-01-01 00:00:01,1,3.503083,23.692395,12.010572,17.406759,197.494853,0.823139,5554.713072,1.010584,...,31.895808,1e-05,183.198709,0.523325,1.010584,3.511784,24.567675,0.0,12.068817,1.067059
2,2025-01-01 00:00:02,2,3.552351,25.41644,11.907004,17.954038,203.341908,0.820196,5657.678081,0.949326,...,31.00597,1e-05,183.712548,0.523325,0.949326,3.536748,25.089992,0.0,11.96924,0.994608
3,2025-01-01 00:00:03,3,3.606039,24.210525,11.920992,18.21332,201.438404,0.819783,5644.959322,1.079461,...,30.242627,1e-05,180.863281,0.523325,1.079461,3.579546,24.546792,0.0,11.93944,1.047017
4,2025-01-01 00:00:04,4,3.528026,24.48664,12.045394,17.854835,200.631142,0.825436,5630.174826,1.068124,...,29.277585,1e-05,184.095677,0.523325,1.068124,3.547708,24.50962,0.0,12.004917,1.060061


# Define Physical Bound Rules

In [2]:
# Physical Bound Rules Explanation:
# These rules check if each feature value lies within physically plausible ranges.
# For example, pressure cannot be negative, efficiency should be between 0 and 1, vibration within sensor limits, etc.
# The thresholds here are based on data exploration and domain knowledge.

# Adding physical bound checks as boolean columns (True if rule passes, False otherwise)

df["rule_pressure_in"] = df["Pressure_In"].between(
    3.0, 4.0
)  # Pressure expected in range 3.0 to 4.0
df["rule_temperature_in"] = df["Temperature_In"].between(
    10, 30
)  # Temperature in degrees Celsius
df["rule_flow_rate"] = df["Flow_Rate"].between(11.0, 13.0)  # Flow rate reasonable range
df["rule_pressure_diff"] = (df["Pressure_Out"] - df["Pressure_In"]).between(
    12, 16
)  # Pressure difference range from data stats
df["rule_efficiency"] = df["Efficiency"].between(
    0, 1
)  # Efficiency should be between 0 and 1
df["rule_vibration"] = df["Vibration"].between(
    0.5, 1.3
)  # Vibration sensor range based on data
df["rule_ambient_temp"] = df["Ambient_Temperature"].between(
    15, 35
)  # Ambient temp reasonable range
df["rule_power"] = df["Power_Consumption"].between(
    4000, 7000
)  # Power consumption expected range

# Check how many samples violate physical bounds
physical_violations = (
    df[[col for col in df.columns if col.startswith("rule_")]].apply(lambda x: ~x).sum()
)
print("Physical Bound Rule Violations per Feature:")
print(physical_violations)

Physical Bound Rule Violations per Feature:
rule_pressure_in          0
rule_temperature_in       0
rule_flow_rate            0
rule_pressure_diff      706
rule_efficiency           0
rule_vibration         7658
rule_ambient_temp         0
rule_power                0
dtype: int64


# Define Rate-of-Change (RoC) Rules

In [3]:
# Rate-of-Change Rules Explanation:
# These rules ensure the measured parameters do not change too abruptly between consecutive time steps.
# Sudden spikes or drops can indicate sensor errors or abnormal operation.

# Thresholds for maximum allowed absolute change between consecutive rows:
temp_thresh = 2.5  # Max allowed temp change between time steps
vib_thresh = 0.15  # Max allowed vibration change
pressure_thresh = 0.3  # Max allowed pressure change

# Calculate absolute differences (rate of change) between consecutive rows for relevant features
df["delta_temperature"] = df["Temperature_In"].diff().abs()
df["delta_vibration"] = df["Vibration"].diff().abs()
df["delta_pressure"] = df["Pressure_In"].diff().abs()

# Check if change is within threshold (True if valid)
df["rule_temp_roc"] = df["delta_temperature"] < temp_thresh
df["rule_vib_roc"] = df["delta_vibration"] < vib_thresh
df["rule_pressure_roc"] = df["delta_pressure"] < pressure_thresh

# For the first row, where diff() results in NaN, fill with True (no prior value to compare)
df[["rule_temp_roc", "rule_vib_roc", "rule_pressure_roc"]] = df[
    ["rule_temp_roc", "rule_vib_roc", "rule_pressure_roc"]
].fillna(True)

# Count violations in rate-of-change rules
roc_violations = (
    df[["rule_temp_roc", "rule_vib_roc", "rule_pressure_roc"]].apply(lambda x: ~x).sum()
)
print("Rate-of-Change Rule Violations per Feature:")
print(roc_violations)

Rate-of-Change Rule Violations per Feature:
rule_temp_roc        7686
rule_vib_roc         3747
rule_pressure_roc       2
dtype: int64


# Combine All Rules for Final Decision

In [6]:
# Combine all rule checks to get an overall rule pass/fail for each row
rule_cols = [col for col in df.columns if col.startswith("rule_")]
df["all_rules_pass"] = df[rule_cols].all(axis=1)

# How many rows fully pass all rules?
print(
    f"Number of samples passing all rules: {df['all_rules_pass'].sum()} out of {len(df)}"
)

# Flag samples that fail any rule for further inspection or cleaning
df["rule_violation_flag"] = ~df["all_rules_pass"]

# Summary of flagged rows
print(f"Number of samples violating any rule: {df['rule_violation_flag'].sum()}")

Number of samples passing all rules: 0 out of 100000
Number of samples violating any rule: 100000


# visualize

In [11]:
# Step 1: Define all rule names by type
# These rules refer to physical limits of each feature
physical_rules = [
    "rule_pressure_in",
    "rule_temperature_in",
    "rule_flow_rate",
    "rule_pressure_diff",
    "rule_efficiency",
    "rule_vibration",
    "rule_ambient_temp",
    "rule_power",
]

# These rules refer to how rapidly a value can change (rate-of-change)
rate_change_rules = [
    "rule_temp_roc",
    "rule_vib_roc",
    "rule_pressure_roc",
    "rule_flow_roc",
]

# Combine all rule names
all_rules = physical_rules + rate_change_rules

# Step 2: Create summary of violated samples
summary_list = []

for rule in all_rules:
    if rule in df.columns:
        # Count the number of samples violating this rule (False)
        count_violated = (~df[rule]).sum()

        # Identify the type of rule
        rule_type = "Physical" if rule in physical_rules else "Rate-of-Change"

        # Append results
        summary_list.append(
            {
                "Rule_Name": rule,
                "Rule_Type": rule_type,
                "Violation_Count": count_violated,
            }
        )

# Step 3: Convert to DataFrame and sort by violation count
summary_df = pd.DataFrame(summary_list)
summary_df = summary_df.sort_values(by="Violation_Count", ascending=False).reset_index(
    drop=True
)

# Step 4: Display final rule violation summary table
summary_df

Unnamed: 0,Rule_Name,Rule_Type,Violation_Count
0,rule_temp_roc,Rate-of-Change,7686
1,rule_vibration,Physical,7658
2,rule_vib_roc,Rate-of-Change,3747
3,rule_pressure_diff,Physical,706
4,rule_pressure_roc,Rate-of-Change,2
5,rule_pressure_in,Physical,0
6,rule_temperature_in,Physical,0
7,rule_flow_rate,Physical,0
8,rule_efficiency,Physical,0
9,rule_ambient_temp,Physical,0


# Handling Violations

In [None]:
# Next Steps Explanation:
# After detecting which rows violate rules, possible actions include:
# 1. Remove violating rows (simplest but may lose data)
# 2. Impute/correct violating values based on nearby data or domain logic
# 3. Flag for manual review or anomaly detection models

# Example 1: Removing violating rows
df_cleaned = df[df["all_rules_pass"]].copy()
print(f"Cleaned dataset shape after removing violations: {df_cleaned.shape}")

# Example 2: Impute violations with rolling mean (window=3)
# Only applying to physical bounds violations for demonstration
for feature, rule in [
    ("Pressure_In", "rule_pressure_in"),
    ("Temperature_In", "rule_temperature_in"),
    ("Flow_Rate", "rule_flow_rate"),
    ("Vibration", "rule_vibration"),
]:
    mask = ~df[rule]
    df.loc[mask, feature] = df[feature].rolling(window=3, center=True).mean()[mask]

# After imputation, re-check rules (optional)
# You can rerun rule checks or just inspect corrected samples

# Example 3: Flagging for anomaly detection or further review
df["anomaly_flag"] = ~df["all_rules_pass"]

# Save flagged samples for separate analysis
df_flagged = df[df["anomaly_flag"]]
print(f"Flagged samples for further analysis: {df_flagged.shape[0]}")