In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# === Dataset loading and preparation ===
df = pd.read_csv("/content/drive/MyDrive/CWDS_DEMO/BATADAL Datasets/BATADAL_normal.csv")  # normal-operation recordings

# Every column except the timestamp and the attack label is used as a model input.
sensor_cols = [name for name in df.columns if name not in ('DATETIME', 'ATT_FLAG')]

# Forecast target: the value L_T1 takes on the following row.
# shift(-1) leaves the last row without a target; dropna() removes it
# (together with any other row that contains a NaN).
df = df.assign(L_T1_next=df['L_T1'].shift(-1)).dropna()

X = df[sensor_cols]
y = df['L_T1_next']

# Rescale inputs with min-max scaling. The scaler is fitted on all rows, before the split.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)

# Random 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, random_state=42, test_size=0.2
)

# === Helper function to evaluate models ===
def evaluate(model_name, y_true, y_pred):
    """Print and return the regression metrics of one model.

    Args:
        model_name: Label used in the printed report; returned unchanged.
        y_true: Ground-truth target values.
        y_pred: Predictions aligned with ``y_true``.

    Returns:
        Tuple ``(model_name, r2, mae, rmse)``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the root of the mean *squared* error. The previous code took the
    # square root of the MAE, which is not an error metric at all.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"\n🔍 {model_name} Performance:")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    return model_name, r2, mae, rmse

# One (model, R2, MAE, RMSE) tuple is collected per model family.
results = []

# === 1. Linear Regression ===
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)
results.append(evaluate("Linear Regression", y_test, lr_pred))

# === 2. Random Forest ===
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)
results.append(evaluate("Random Forest", y_test, rf_pred))

# === 3. XGBoost ===
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42).fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
results.append(evaluate("XGBoost", y_test, xgb_pred))

# === 4. MLP Regressor (SKLearn) ===
mlp = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42).fit(X_train, y_train)
mlp_pred = mlp.predict(X_test)
results.append(evaluate("MLPRegressor", y_test, mlp_pred))

# === 5. Keras Feedforward DNN ===
# Two hidden ReLU layers (64 and 32 units) followed by a single linear output.
dnn = Sequential()
dnn.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
dnn.add(Dense(32, activation='relu'))
dnn.add(Dense(1))
dnn.compile(optimizer='adam', loss='mae')
dnn.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
# predict() returns a column vector; collapse it to 1-D so it lines up with y_test
y_pred_dnn = dnn.predict(X_test).ravel()
results.append(evaluate("Keras DNN", y_test, y_pred_dnn))

# === Show all results ===
results_df = pd.DataFrame(results, columns=["Model", "R2", "MAE", "RMSE"])
ranked_results = results_df.sort_values("R2", ascending=False).reset_index(drop=True)
print("\n📊 Summary of All Models:")
print(ranked_results)



🔍 Linear Regression Performance:
R² Score: 0.9973
MAE: 0.0420
RMSE: 0.2050

🔍 Random Forest Performance:
R² Score: 0.9984
MAE: 0.0340
RMSE: 0.1845

🔍 XGBoost Performance:
R² Score: 0.9989
MAE: 0.0302
RMSE: 0.1737

🔍 MLPRegressor Performance:
R² Score: 0.9963
MAE: 0.0531
RMSE: 0.2305


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

🔍 Keras DNN Performance:
R² Score: 0.9977
MAE: 0.0390
RMSE: 0.1976

📊 Summary of All Models:
               Model        R2       MAE      RMSE
0            XGBoost  0.998853  0.030182  0.173730
1      Random Forest  0.998358  0.034043  0.184507
2          Keras DNN  0.997731  0.039037  0.197579
3  Linear Regression  0.997310  0.042005  0.204951
4       MLPRegressor  0.996325  0.053107  0.230450


Several Targets

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# === Dataset loading ===
df = pd.read_csv("/content/drive/MyDrive/CWDS_DEMO/BATADAL Datasets/BATADAL_normal.csv")  # normal-operation recordings

# Model inputs: every column apart from the timestamp and the attack label.
sensor_cols = [name for name in df.columns if name not in ('DATETIME', 'ATT_FLAG')]

# === Sensors whose next-step value will be forecast ===
target_sensors = ['L_T1', 'L_T2', 'P_J280']

# === Helper function to evaluate models ===
def evaluate(model_name, y_true, y_pred):
    """Compute the regression metrics of one model.

    Args:
        model_name: Label returned unchanged as the first tuple element.
        y_true: Ground-truth target values.
        y_pred: Predictions aligned with ``y_true``.

    Returns:
        Tuple ``(model_name, r2, mae, rmse)``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE must be derived from the mean squared error; the earlier
    # sqrt(MAE) expression reported a meaningless value.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return model_name, r2, mae, rmse

# One (model, target, R2, MAE, RMSE) tuple per model/target combination.
all_results = []

# Benchmark every model family once per target sensor. The target is the
# sensor's value on the following row (one-step-ahead forecast).
for target_sensor in target_sensors:
    print(f"\n=== Evaluating target: {target_sensor}_next ===")
    df_temp = df.copy()
    df_temp[f'{target_sensor}_next'] = df_temp[target_sensor].shift(-1)
    # Removes rows containing NaN, including the final row whose target is undefined.
    df_temp.dropna(inplace=True)

    X = df_temp[sensor_cols]
    y = df_temp[f'{target_sensor}_next']

    # Scale features (fitted on all rows, before the split)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Store metrics per model
    results = []

    # 1. Linear Regression
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    results.append(evaluate("Linear Regression", y_test, lr.predict(X_test)))

    # 2. Random Forest
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    results.append(evaluate("Random Forest", y_test, rf.predict(X_test)))

    # 3. XGBoost
    xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    xgb.fit(X_train, y_train)
    results.append(evaluate("XGBoost", y_test, xgb.predict(X_test)))

    # 4. MLP Regressor
    mlp = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
    mlp.fit(X_train, y_train)
    results.append(evaluate("MLPRegressor", y_test, mlp.predict(X_test)))

    # 5. Keras DNN
    dnn = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    dnn.compile(optimizer='adam', loss='mae')
    dnn.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
    # predict() returns a column vector; flatten it to match y_test
    y_pred_dnn = dnn.predict(X_test).flatten()
    results.append(evaluate("Keras DNN", y_test, y_pred_dnn))

    # Add target label and store
    for res in results:
        all_results.append((res[0], target_sensor, res[1], res[2], res[3]))

# === Convert to DataFrame and aggregate ===
all_df = pd.DataFrame(all_results, columns=["Model", "Target", "R2", "MAE", "RMSE"])

print("\n📊 Full Results by Target:")
print(all_df)

# "Target" holds strings, so averaging every column raises
# "TypeError: agg function failed [how->mean,dtype->object]".
# Restrict the mean to the numeric metric columns.
summary = all_df.groupby("Model")[["R2", "MAE", "RMSE"]].mean().reset_index()
print("\n📈 Average Performance Across Targets:")
print(summary.sort_values("R2", ascending=False))



=== Evaluating target: L_T1_next ===


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

=== Evaluating target: L_T2_next ===


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

=== Evaluating target: P_J280_next ===


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

📊 Full Results by Target:
                Model  Target         R2       MAE      RMSE
0   Linear Regression    L_T1   0.997310  0.042005  0.204951
1       Random Forest    L_T1   0.998358  0.034043  0.184507
2             XGBoost    L_T1   0.998853  0.030182  0.173730
3        MLPRegressor    L_T1   0.996325  0.053107  0.230450
4           Keras DNN    L_T1   0.998117  0.037194  0.192858
5   Linear Regression    L_T2   0.991543  0.090447  0.300745
6       Random Forest    L_T2   0.997504  0.048041  0.219183
7             XGBoost    L_T2   0.998225  0.044987  0.212101
8        MLPRegressor    L_T2   0.993631  0.085258  0.291990
9           Keras DNN    L_T2   0.995885  0.057627  0.240055
10  Linear Regression  P_J280   0.864456  0.001320  0.036334
11      Random Forest  P_J280   0.972027  0.000378  0.019454
12            XGBoost  P_J280   0.966822  0.000419  0.020481
13       MLPRegressor  P_J280 -16.392766  0.01

TypeError: agg function failed [how->mean,dtype->object]

In [None]:
# === Tabulate the per-target metrics and average them per model ===
result_columns = ["Model", "Target", "R2", "MAE", "RMSE"]
all_df = pd.DataFrame(all_results, columns=result_columns)

print("\n📊 Full Results by Target:")
print(all_df)

# Only the numeric metric columns are averaged; "Target" is a string label.
metric_means = all_df.groupby("Model")[["R2", "MAE", "RMSE"]].mean()
summary = metric_means.reset_index()
print("\n📈 Average Performance Across Targets:")
print(summary.sort_values("R2", ascending=False))



📊 Full Results by Target:
                Model  Target         R2       MAE      RMSE
0   Linear Regression    L_T1   0.997310  0.042005  0.204951
1       Random Forest    L_T1   0.998358  0.034043  0.184507
2             XGBoost    L_T1   0.998853  0.030182  0.173730
3        MLPRegressor    L_T1   0.996325  0.053107  0.230450
4           Keras DNN    L_T1   0.998117  0.037194  0.192858
5   Linear Regression    L_T2   0.991543  0.090447  0.300745
6       Random Forest    L_T2   0.997504  0.048041  0.219183
7             XGBoost    L_T2   0.998225  0.044987  0.212101
8        MLPRegressor    L_T2   0.993631  0.085258  0.291990
9           Keras DNN    L_T2   0.995885  0.057627  0.240055
10  Linear Regression  P_J280   0.864456  0.001320  0.036334
11      Random Forest  P_J280   0.972027  0.000378  0.019454
12            XGBoost  P_J280   0.966822  0.000419  0.020481
13       MLPRegressor  P_J280 -16.392766  0.019797  0.140702
14          Keras DNN  P_J280  -2.001980  0.009590  0.0979

In [None]:
import joblib

# Save the trained XGBoost model.
# NOTE: `xgb` is whatever instance the multi-target loop left behind, i.e. the
# model fitted for the final entry of `target_sensors`.
joblib.dump(xgb, "xgboost_predictor.pkl")

# The model was fitted on MinMax-scaled inputs, so the fitted scaler has to be
# stored with it; otherwise new raw readings cannot be brought into the same
# value range at prediction time.
joblib.dump(scaler, "xgboost_scaler.pkl")

# Save the list of features used
joblib.dump(sensor_cols, "xgboost_features.pkl")

print("✅ XGBoost model and feature list saved.")


✅ XGBoost model and feature list saved.


WITH AREAL'S DATASET

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# === Dataset loading ===
df = pd.read_excel("/content/drive/MyDrive/CWDS_DEMO/Areal Dataset/Data without attacks.xlsx")  # normal-operation recordings

# Model inputs: everything except the date/time stamps and the attack label.
# Note that the 'Date ' entry is written with a trailing space.
sensor_cols = [name for name in df.columns if name not in ('Date ', 'Hours', 'ATT_FLAG')]

# === Sensors whose next-step value will be forecast ===
target_sensors = ['TANKLEVEL', 'OUTPUTFLOW', 'RESERVETANKVOLUME']

# === Helper function to evaluate models ===
def evaluate(model_name, y_true, y_pred):
    """Compute the regression metrics of one model.

    Args:
        model_name: Label returned unchanged as the first tuple element.
        y_true: Ground-truth target values.
        y_pred: Predictions aligned with ``y_true``.

    Returns:
        Tuple ``(model_name, r2, mae, rmse)``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # Root of the mean *squared* error. The earlier sqrt(MAE) expression was a
    # copy/paste slip and disagreed with the later cells of this notebook.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return model_name, r2, mae, rmse

# One (model, target, R2, MAE, RMSE) tuple per model/target combination.
all_results = []

for target_sensor in target_sensors:
    print(f"\n=== Evaluating target: {target_sensor}_next ===")

    # Next-row value of the sensor becomes the target; rows with any NaN
    # (including the last row, which has no successor) are removed.
    next_col = f'{target_sensor}_next'
    df_temp = df.assign(**{next_col: df[target_sensor].shift(-1)}).dropna()

    X = df_temp[sensor_cols]
    y = df_temp[next_col]

    # Min-max scaling, fitted on all rows before the split
    scaler = MinMaxScaler()
    X_scaled = scaler.fit(X).transform(X)

    # Random 80/20 hold-out split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, random_state=42, test_size=0.2
    )

    # Metrics of the five model families for this target
    results = []

    # 1. Linear Regression
    lr = LinearRegression().fit(X_train, y_train)
    results.append(evaluate("Linear Regression", y_test, lr.predict(X_test)))

    # 2. Random Forest
    rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    results.append(evaluate("Random Forest", y_test, rf.predict(X_test)))

    # 3. XGBoost
    xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42).fit(X_train, y_train)
    results.append(evaluate("XGBoost", y_test, xgb.predict(X_test)))

    # 4. MLP Regressor
    mlp = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42).fit(X_train, y_train)
    results.append(evaluate("MLPRegressor", y_test, mlp.predict(X_test)))

    # 5. Keras DNN: 64 -> 32 -> 1 fully connected network trained on MAE
    dnn = Sequential()
    dnn.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
    dnn.add(Dense(32, activation='relu'))
    dnn.add(Dense(1))
    dnn.compile(optimizer='adam', loss='mae')
    dnn.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
    y_pred_dnn = dnn.predict(X_test).ravel()
    results.append(evaluate("Keras DNN", y_test, y_pred_dnn))

    # Tag every metric tuple with the target it belongs to
    all_results.extend(
        (name, target_sensor, r2, mae, rmse) for name, r2, mae, rmse in results
    )



=== Evaluating target: TANKLEVEL_next ===


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

=== Evaluating target: OUTPUTFLOW_next ===


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step

=== Evaluating target: RESERVETANKVOLUME_next ===


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [None]:
# === Build the results table and the per-model averages ===
all_df = pd.DataFrame.from_records(
    all_results, columns=["Model", "Target", "R2", "MAE", "RMSE"]
)

print("\n📊 Full Results by Target:")
print(all_df)

# Average the three metric columns per model across all targets.
per_model = all_df.groupby("Model")[["R2", "MAE", "RMSE"]]
summary = per_model.mean().reset_index()
print("\n📈 Average Performance Across Targets:")
print(summary.sort_values("R2", ascending=False))



📊 Full Results by Target:
                Model             Target        R2        MAE      RMSE
0   Linear Regression          TANKLEVEL  0.999995   0.001503  0.038764
1       Random Forest          TANKLEVEL  0.999987   0.004911  0.070079
2             XGBoost          TANKLEVEL  0.999976   0.008254  0.090853
3        MLPRegressor          TANKLEVEL  0.999853   0.016569  0.128720
4           Keras DNN          TANKLEVEL  0.999989   0.004387  0.066235
5   Linear Regression         OUTPUTFLOW  0.982759  11.540820  3.397178
6       Random Forest         OUTPUTFLOW  0.993423   1.702474  1.304789
7             XGBoost         OUTPUTFLOW  0.994071   1.767303  1.329399
8        MLPRegressor         OUTPUTFLOW  0.990095   7.525835  2.743326
9           Keras DNN         OUTPUTFLOW  0.981772   4.897777  2.213092
10  Linear Regression  RESERVETANKVOLUME  0.999999   0.061581  0.248155
11      Random Forest  RESERVETANKVOLUME  0.999987   0.290753  0.539215
12            XGBoost  RESERVETANKVOL

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# === Load datasets ===
df_normal = pd.read_excel("/content/drive/MyDrive/CWDS_DEMO/Areal Dataset/Data without attacks.xlsx")
df_attack = pd.read_excel("/content/drive/MyDrive/CWDS_DEMO/Areal Dataset/Data with attacks.xlsx")

# Strip stray whitespace from the headers so equivalent columns line up in the
# concat below. Elsewhere in this notebook the normal file is filtered with
# 'Date ' (trailing space) and the attack file with 'Date'; left as-is, the
# merge keeps both and a datetime column leaks into the feature matrix
# (DTypePromotionError in MinMaxScaler).
df_normal = df_normal.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)
df_attack = df_attack.rename(columns=lambda c: c.strip() if isinstance(c, str) else c)

print("✅ Normal shape:", df_normal.shape)
print("✅ Attack shape:", df_attack.shape)

# Merge datasets (normal rows first, then attack rows, with a fresh 0..n-1 index)
df_full = pd.concat([df_normal, df_attack], ignore_index=True)
print("✅ Merged dataset shape:", df_full.shape)

# Index of the last row that came from the normal file. Its "next" row belongs
# to the attack file, so it is not a valid one-step-ahead pair.
seam_idx = len(df_normal) - 1

# === Prepare columns ===
candidate_cols = [col for col in df_full.columns if col not in ['Date', 'Hours', 'ATT_FLAG', 'DATETIME']]
# Guard: keep numeric columns only, so no date/time or text column can reach the scaler.
sensor_cols = df_full[candidate_cols].select_dtypes(include=[np.number]).columns.tolist()

# === Define target sensors ===
target_sensors = ['TANKLEVEL', 'OUTPUTFLOW', 'RESERVETANKVOLUME']

# === Evaluate helper ===
def evaluate(model_name, y_true, y_pred):
    """Compute the regression metrics of one model.

    Args:
        model_name: Label returned unchanged as the first tuple element.
        y_true: Ground-truth target values.
        y_pred: Predictions aligned with ``y_true``.

    Returns:
        Tuple ``(model_name, r2, mae, rmse)``.
    """
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return model_name, r2, mae, rmse

all_results = []

for target_sensor in target_sensors:
    print(f"\n=== Target: {target_sensor}_next ===")
    next_col = f'{target_sensor}_next'
    df_temp = df_full.copy()

    # Create prediction target (next time step)
    df_temp[next_col] = df_temp[target_sensor].shift(-1)
    if seam_idx >= 0:
        # Invalidate the pair that straddles the two source files.
        df_temp.loc[seam_idx, next_col] = np.nan

    # Drop rows only where a model input or the target is missing. A blanket
    # dropna() would also discard every row lacking a column that exists in
    # just one of the two files.
    df_temp = df_temp.dropna(subset=sensor_cols + [next_col])

    X = df_temp[sensor_cols]
    y = df_temp[next_col]

    # Scale features (fitted on all rows, before the split)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Split train/test
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    results = []

    # Linear Regression
    lr = LinearRegression().fit(X_train, y_train)
    results.append(evaluate("Linear Regression", y_test, lr.predict(X_test)))

    # Random Forest
    rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    results.append(evaluate("Random Forest", y_test, rf.predict(X_test)))

    # XGBoost
    xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42).fit(X_train, y_train)
    results.append(evaluate("XGBoost", y_test, xgb.predict(X_test)))

    # MLP Regressor
    mlp = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42).fit(X_train, y_train)
    results.append(evaluate("MLPRegressor", y_test, mlp.predict(X_test)))

    # Keras DNN
    dnn = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    dnn.compile(optimizer='adam', loss='mae')
    dnn.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
    # predict() returns a column vector; flatten it to match y_test
    y_pred_dnn = dnn.predict(X_test).flatten()
    results.append(evaluate("Keras DNN", y_test, y_pred_dnn))

    # Save results
    for res in results:
        all_results.append((res[0], target_sensor, res[1], res[2], res[3]))

# Create results dataframe
results_df = pd.DataFrame(all_results, columns=["Model", "Target", "R2", "MAE", "RMSE"])
print("\n📊 Results per target:")
print(results_df)

# Average results (numeric metric columns only)
avg_results = results_df.groupby("Model")[["R2", "MAE", "RMSE"]].mean().reset_index()
print("\n📈 Average performance across targets:")
print(avg_results)


✅ Normal shape: (11521, 40)
✅ Attack shape: (11747, 41)
✅ Merged dataset shape: (23268, 42)

=== Target: TANKLEVEL_next ===


DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>)

In [None]:
import pandas as pd
import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# === Read the attack recordings ===
df_attack = pd.read_excel("/content/drive/MyDrive/CWDS_DEMO/Areal Dataset/Data with attacks.xlsx")
print("✅ Attack dataset shape:", df_attack.shape)

# === Columns that are not model inputs (date/time stamps, label) ===
# 'DATETIME' is only added to the exclusion list when the sheet actually has it.
non_features = ['Date', 'Hours', 'ATT_FLAG'] + (
    ['DATETIME'] if 'DATETIME' in df_attack.columns else []
)

sensor_cols = list(filter(lambda name: name not in non_features, df_attack.columns))
print(f"✅ Selected sensor features: {sensor_cols}")

# === Sensors whose next-step value will be forecast ===
target_sensors = ['TANKLEVEL', 'OUTPUTFLOW', 'RESERVETANKVOLUME']

# === Evaluate helper ===
def evaluate(model_name, y_true, y_pred):
    """Return ``(model_name, R2, MAE, RMSE)`` for one set of predictions.

    ``y_true`` and ``y_pred`` are the aligned ground-truth and predicted
    values; RMSE is the square root of the mean squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        model_name,
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mse),
    )

# One (model, target, R2, MAE, RMSE) tuple per model/target combination.
all_results = []

for target_sensor in target_sensors:
    print(f"\n=== Target: {target_sensor}_next ===")
    df_temp = df_attack.copy()

    # Create next-step prediction target
    df_temp[f"{target_sensor}_next"] = df_temp[target_sensor].shift(-1)
    # Removes rows containing NaN, including the final row whose target is undefined.
    df_temp.dropna(inplace=True)

    X = df_temp[sensor_cols]
    y = df_temp[f"{target_sensor}_next"]

    # Ensure X has only numeric columns
    X_numeric = X.select_dtypes(include=[np.number])
    print(f"✅ X shape for {target_sensor}: {X_numeric.shape}")

    # Scale features (fitted on all rows, before the split)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X_numeric)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    results = []

    # Linear Regression
    lr = LinearRegression().fit(X_train, y_train)
    results.append(evaluate("Linear Regression", y_test, lr.predict(X_test)))

    # Random Forest
    rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    results.append(evaluate("Random Forest", y_test, rf.predict(X_test)))

    # XGBoost
    xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42).fit(X_train, y_train)
    results.append(evaluate("XGBoost", y_test, xgb.predict(X_test)))

    # SAVE the model + features
    joblib.dump(xgb, f"xgb_model_{target_sensor.lower()}.pkl")
    # Store the columns the model was actually fitted on (after the numeric
    # filter), in training order, rather than the unfiltered candidate list.
    joblib.dump(list(X_numeric.columns), f"xgb_features_{target_sensor.lower()}.pkl")
    # The model expects MinMax-scaled inputs, so the fitted scaler is saved
    # too; without it raw readings cannot be transformed at prediction time.
    joblib.dump(scaler, f"xgb_scaler_{target_sensor.lower()}.pkl")
    print(f"✅ Saved model and features for {target_sensor}")

    # MLP Regressor
    mlp = MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42).fit(X_train, y_train)
    results.append(evaluate("MLPRegressor", y_test, mlp.predict(X_test)))

    # Keras DNN
    dnn = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1)
    ])
    dnn.compile(optimizer='adam', loss='mae')
    dnn.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
    # predict() returns a column vector; flatten it to match y_test
    y_pred_dnn = dnn.predict(X_test).flatten()
    results.append(evaluate("Keras DNN", y_test, y_pred_dnn))

    # Save results
    for res in results:
        all_results.append((res[0], target_sensor, res[1], res[2], res[3]))

# Create results dataframe
results_df = pd.DataFrame(all_results, columns=["Model", "Target", "R2", "MAE", "RMSE"])
print("\n📊 Results per target:")
print(results_df)

# Average results (numeric metric columns only)
avg_results = results_df.groupby("Model")[["R2", "MAE", "RMSE"]].mean().reset_index()
print("\n📈 Average performance across targets:")
print(avg_results)


✅ Attack dataset shape: (11747, 41)
✅ Selected sensor features: ['CONSUMERFLOW.DEFECT', 'CONSUMERFLOW', 'DEFECT.PUMP1', 'DEFECT.PUMP2', 'DEFECT.PUMP3', 'DEFECT.PUMP4', 'ENTRYFLOW.DEFECT', 'ENTRYFLOW', 'FLOW.PUMP1', 'FLOW.PUMP2', 'FLOW.PUMP3', 'FLOW.PUMP4', 'INPUTVALVE.CLOSE', 'INPUTVALVE.DEFECT.OPEN', 'INPUTVALVE.FDC.CLOSE', 'INPUTVALVE.FDC.OPEN', 'INPUTVALVE.OPEN', 'OUTPUTFLOW.DEFECT', 'OUTPUTFLOW', 'OUTPUTVALVE.CLOSE', 'OUTPUTVALVE.DEFECT.OPEN', 'OUTPUTVALVE.FDC.CLOSE', 'OUTPUTVALVE.FDC.OPEN', 'OUTPUTVALVE.OPEN', 'RESERVETANKVOLUME.DEFECT', 'RESERVETANKVOLUME', 'STATE.PUMP1', 'STATE.PUMP2', 'STATE.PUMP3', 'STATE.PUMP4', 'TANKLEVEL.DEFECT', 'TANKLEVEL.HIGH', 'TANKLEVEL.LOW', 'TANKLEVEL', 'CURRENT.FLOW.PUMP1', 'CURRENT.FLOW.PUMP2', 'CURRENT.FLOW.PUMP3', 'CURRENT.FLOW.PUMP4']

=== Target: TANKLEVEL_next ===
✅ X shape for TANKLEVEL: (11746, 38)
✅ Saved model and features for TANKLEVEL


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

=== Target: OUTPUTFLOW_next ===
✅ X shape for OUTPUTFLOW: (11746, 38)
✅ Saved model and features for OUTPUTFLOW


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

=== Target: RESERVETANKVOLUME_next ===
✅ X shape for RESERVETANKVOLUME: (11746, 38)
✅ Saved model and features for RESERVETANKVOLUME


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

📊 Results per target:
                Model             Target        R2        MAE       RMSE
0   Linear Regression          TANKLEVEL  0.999929   0.003938   0.017489
1       Random Forest          TANKLEVEL  0.999743   0.007598   0.033173
2             XGBoost          TANKLEVEL  0.999761   0.010768   0.031979
3        MLPRegressor          TANKLEVEL  0.999852   0.014393   0.025135
4           Keras DNN          TANKLEVEL  0.999830   0.016835   0.026992
5   Linear Regression         OUTPUTFLOW  0.983734  11.498421  45.425436
6       Random Forest         OUTPUTFLOW  0.992886   1.724894  30.041569
7             XGBoost         OUTPUTFLOW  0.994157   1.451572  27.224812
8        MLPRegressor         OUTPUTFLOW  0.985660   8.988362  42.651763
9           Keras DNN         OUTPUTFLOW  0.982567   5.240664  47.026741
10  Linear Regression  RESERVETANKVOLUME  0.999969   0.205285   0.940493
11      Random Forest  RESER

In [None]:
import joblib

# Re-save the XGBoost model and feature list left over from the training loop.
# `xgb` and `target_sensor` still hold the values from the loop's final
# iteration, so only that one target's files are written here; the loop itself
# already writes the same two filenames for every target.
joblib.dump(xgb, f"xgb_model_{target_sensor.lower()}.pkl")
joblib.dump(sensor_cols, f"xgb_features_{target_sensor.lower()}.pkl")


['xgb_features_reservetankvolume.pkl']

Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# === Read the attack recordings ===
df_attack = pd.read_excel("/content/drive/MyDrive/CWDS_DEMO/Areal Dataset/Data with attacks.xlsx")
print("✅ Attack dataset shape:", df_attack.shape)

# === Exclusion list: date/time stamps and the label ===
non_features = ['Date', 'Hours', 'ATT_FLAG']
# 'DATETIME' joins the list only when the sheet actually contains it.
non_features.extend(col for col in ['DATETIME'] if col in df_attack.columns)

sensor_cols = [name for name in df_attack.columns if name not in non_features]
print(f"✅ Selected sensor features: {sensor_cols}")

# === Sensors whose next-step value will be forecast ===
target_sensors = ['TANKLEVEL', 'OUTPUTFLOW', 'RESERVETANKVOLUME']

# === Evaluate helper ===
def evaluate(model_name, y_true, y_pred):
    """Score predictions against the ground truth.

    Returns the tuple ``(model_name, r2, mae, rmse)``, where ``rmse`` is the
    square root of the mean squared error.
    """
    scores = (
        r2_score(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
    )
    return (model_name, *scores)

✅ Attack dataset shape: (11747, 41)
✅ Selected sensor features: ['CONSUMERFLOW.DEFECT', 'CONSUMERFLOW', 'DEFECT.PUMP1', 'DEFECT.PUMP2', 'DEFECT.PUMP3', 'DEFECT.PUMP4', 'ENTRYFLOW.DEFECT', 'ENTRYFLOW', 'FLOW.PUMP1', 'FLOW.PUMP2', 'FLOW.PUMP3', 'FLOW.PUMP4', 'INPUTVALVE.CLOSE', 'INPUTVALVE.DEFECT.OPEN', 'INPUTVALVE.FDC.CLOSE', 'INPUTVALVE.FDC.OPEN', 'INPUTVALVE.OPEN', 'OUTPUTFLOW.DEFECT', 'OUTPUTFLOW', 'OUTPUTVALVE.CLOSE', 'OUTPUTVALVE.DEFECT.OPEN', 'OUTPUTVALVE.FDC.CLOSE', 'OUTPUTVALVE.FDC.OPEN', 'OUTPUTVALVE.OPEN', 'RESERVETANKVOLUME.DEFECT', 'RESERVETANKVOLUME', 'STATE.PUMP1', 'STATE.PUMP2', 'STATE.PUMP3', 'STATE.PUMP4', 'TANKLEVEL.DEFECT', 'TANKLEVEL.HIGH', 'TANKLEVEL.LOW', 'TANKLEVEL', 'CURRENT.FLOW.PUMP1', 'CURRENT.FLOW.PUMP2', 'CURRENT.FLOW.PUMP3', 'CURRENT.FLOW.PUMP4']


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Build one "<sensor>_next" column per target so that a single multi-output
# model can forecast all targets at once.
df_temp = df_attack.copy()
for target_sensor in target_sensors:
    df_temp[f"{target_sensor}_next"] = df_temp[target_sensor].shift(-1)
# Removes rows containing NaN, including the final row whose targets are undefined.
df_temp.dropna(inplace=True)

X = df_temp[sensor_cols]
Y = df_temp[[f"{target}_next" for target in target_sensors]]

# Min-max scaling, fitted on all rows before the split
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

# Y_train has one column per target, so the forest is fitted as a multi-output regressor.
rf_multi = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, Y_train)


joblib.dump(rf_multi, "rf_multi_model.pkl")
# The forest was fitted on scaled inputs; store the fitted scaler as well so
# raw readings can be transformed the same way at prediction time.
joblib.dump(scaler, "rf_multi_scaler.pkl")
# Kept as the last expression so the cell's displayed output is unchanged.
joblib.dump(sensor_cols, "rf_multi_features.pkl")


['rf_multi_features.pkl']

Retraining XGBoost on variance-filtered features


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# === Read the attack recordings (the only data used in this section) ===
df_attack = pd.read_excel(
    "/content/drive/MyDrive/CWDS_DEMO/Areal Dataset/Data with attacks.xlsx"
)
# Quick sanity check of the (rows, columns) that were read
print("✅ Attack dataset shape:", df_attack.shape)

✅ Attack dataset shape: (11747, 41)


In [None]:
# Candidate model inputs, listed in the order they are passed to the variance filter.
selected_features = [
    # consumer-side flow
    'CONSUMERFLOW.DEFECT', 'CONSUMERFLOW',
    # pump defect flags
    'DEFECT.PUMP1', 'DEFECT.PUMP2', 'DEFECT.PUMP3', 'DEFECT.PUMP4',
    # entry flow
    'ENTRYFLOW.DEFECT', 'ENTRYFLOW',
    # per-pump flow
    'FLOW.PUMP1', 'FLOW.PUMP2', 'FLOW.PUMP3', 'FLOW.PUMP4',
    # input valve
    'INPUTVALVE.CLOSE', 'INPUTVALVE.DEFECT.OPEN', 'INPUTVALVE.FDC.CLOSE',
    'INPUTVALVE.FDC.OPEN', 'INPUTVALVE.OPEN',
    # output flow
    'OUTPUTFLOW.DEFECT', 'OUTPUTFLOW',
    # output valve
    'OUTPUTVALVE.CLOSE', 'OUTPUTVALVE.DEFECT.OPEN', 'OUTPUTVALVE.FDC.CLOSE',
    'OUTPUTVALVE.FDC.OPEN', 'OUTPUTVALVE.OPEN',
    # reserve tank volume
    'RESERVETANKVOLUME.DEFECT', 'RESERVETANKVOLUME',
    # pump state
    'STATE.PUMP1', 'STATE.PUMP2', 'STATE.PUMP3', 'STATE.PUMP4',
    # tank level
    'TANKLEVEL.DEFECT', 'TANKLEVEL.HIGH', 'TANKLEVEL.LOW', 'TANKLEVEL',
    # per-pump current flow
    'CURRENT.FLOW.PUMP1', 'CURRENT.FLOW.PUMP2', 'CURRENT.FLOW.PUMP3', 'CURRENT.FLOW.PUMP4',
]

In [None]:
from sklearn.feature_selection import VarianceThreshold
# Feature matrix restricted to the hand-picked candidate columns
X = df_attack[selected_features]

# Remove (near-)constant columns: anything whose variance does not exceed the threshold
threshold = 1e-3
selector = VarianceThreshold(threshold=threshold)
X_filtered = selector.fit_transform(X)

# Map the selector's boolean support mask back to column names
kept_mask = selector.get_support()
retained_feature_names = [name for name, kept in zip(X.columns, kept_mask) if kept]

# Filtered values as a labelled DataFrame, plus a printout of what survived
X_filtered_df = pd.DataFrame(X_filtered, columns=retained_feature_names)
print("Retained Features:", retained_feature_names)


Retained Features: ['CONSUMERFLOW', 'ENTRYFLOW', 'FLOW.PUMP1', 'FLOW.PUMP2', 'FLOW.PUMP3', 'OUTPUTFLOW', 'OUTPUTVALVE.FDC.CLOSE', 'OUTPUTVALVE.FDC.OPEN', 'RESERVETANKVOLUME.DEFECT', 'RESERVETANKVOLUME', 'STATE.PUMP1', 'STATE.PUMP2', 'STATE.PUMP3', 'STATE.PUMP4', 'TANKLEVEL.DEFECT', 'TANKLEVEL.HIGH', 'TANKLEVEL', 'CURRENT.FLOW.PUMP1', 'CURRENT.FLOW.PUMP2', 'CURRENT.FLOW.PUMP3', 'CURRENT.FLOW.PUMP4']


In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor


# Retained features from earlier step (hard-coded copy of the variance-filter
# output, so this cell does not depend on the selector object)
features = ['CONSUMERFLOW', 'ENTRYFLOW', 'FLOW.PUMP1', 'FLOW.PUMP2', 'FLOW.PUMP3', 'OUTPUTFLOW',
            'OUTPUTVALVE.FDC.CLOSE', 'OUTPUTVALVE.FDC.OPEN', 'RESERVETANKVOLUME.DEFECT',
            'RESERVETANKVOLUME', 'STATE.PUMP1', 'STATE.PUMP2', 'STATE.PUMP3', 'STATE.PUMP4',
            'TANKLEVEL.DEFECT', 'TANKLEVEL.HIGH', 'TANKLEVEL',
            'CURRENT.FLOW.PUMP1', 'CURRENT.FLOW.PUMP2', 'CURRENT.FLOW.PUMP3', 'CURRENT.FLOW.PUMP4']

targets = ['TANKLEVEL', 'OUTPUTFLOW', 'RESERVETANKVOLUME']
# One (target, r2, mae, rmse) tuple per trained model
results = []

for target in targets:
    df_temp = df_attack.copy()
    # Target is the sensor's value on the following row
    df_temp[f"{target}_next"] = df_temp[target].shift(-1)
    # Removes rows containing NaN, including the final row whose target is undefined.
    df_temp.dropna(inplace=True)

    X = df_temp[features]
    y = df_temp[f"{target}_next"]

    # Scale features (fitted on all rows, before the split)
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Train XGBoost model
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    model.fit(X_train, y_train)

    # Save model using XGBoost's native format (version-safe)
    model.save_model(f"xgb_model_{target.lower()}.json")

    # Save feature list
    joblib.dump(features, f"xgb_features_{target.lower()}.pkl")

    # The model was fitted on MinMax-scaled inputs; store the fitted scaler
    # next to it so raw readings can be transformed the same way at prediction time.
    joblib.dump(scaler, f"xgb_scaler_{target.lower()}.pkl")

    # Evaluate on the held-out 20 %
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results.append((target, r2, mae, rmse))

# Show results
print("\n=== Model Performance ===")
for target, r2, mae, rmse in results:
    print(f"{target}: R2={r2:.4f}, MAE={mae:.4f}, RMSE={rmse:.4f}")



=== Model Performance ===
TANKLEVEL: R2=0.9998, MAE=0.0108, RMSE=0.0320
OUTPUTFLOW: R2=0.9942, MAE=1.4516, RMSE=27.2248
RESERVETANKVOLUME: R2=0.9976, MAE=0.9245, RMSE=8.2347
