In [130]:
# =====================================================
# PIPELINE FAULT DETECTION MODEL
# =====================================================
# Author: Oluwakpelumi
# Dataset: scada_pipeline.csv
# Goal: Predict whether the pipeline is Faulty or Normal
# =====================================================

# 1️⃣ IMPORT LIBRARIES

In [129]:
import pandas as pd 
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt 

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV ,RandomizedSearchCV

In [None]:
# 2️⃣ LOAD DATA
path = r"C:\Users\PC\Desktop\PIPE Leakage\Project 12 SCADA kagggle\scada_pipeline.csv"
df = pd.read_csv(path)

In [None]:
print(df.head())
print(df.info())
print(df.describe().T)
print(df.columns.tolist())

printdf.nunique()
printdf.isna().sum

printdf.isnull().sum()

print(df.target).value_counts()


In [13]:
fault = df[df.target==1]

In [14]:
fault.info()

<class 'pandas.core.frame.DataFrame'>
Index: 306 entries, 3 to 999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   timestamp           306 non-null    object 
 1   segment_id          306 non-null    int64  
 2   pressure            306 non-null    float64
 3   flow_rate           306 non-null    float64
 4   temperature         306 non-null    float64
 5   valve_status        306 non-null    int64  
 6   pump_state          306 non-null    int64  
 7   pump_speed          306 non-null    float64
 8   compressor_state    306 non-null    int64  
 9   energy_consumption  306 non-null    float64
 10  alarm_triggered     306 non-null    int64  
 11  event_type          306 non-null    object 
 12  target              306 non-null    int64  
dtypes: float64(5), int64(6), object(2)
memory usage: 33.5+ KB


In [15]:
fault.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
segment_id,306.0,25.908497,14.395746,1.0,13.0,26.0,38.75,50.0
pressure,306.0,79.260131,15.022996,43.11,69.045,77.315,90.875,115.3
flow_rate,306.0,3.987451,1.369277,1.01,2.905,4.26,4.995,7.53
temperature,306.0,32.065621,1.933288,25.92,30.7,32.115,33.32,37.26
valve_status,306.0,0.905229,0.555897,0.0,1.0,1.0,1.0,2.0
pump_state,306.0,0.660131,0.47444,0.0,0.0,1.0,1.0,1.0
pump_speed,306.0,841.801307,622.553981,0.0,0.0,1132.6,1356.525,1623.7
compressor_state,306.0,0.611111,0.488297,0.0,0.0,1.0,1.0,1.0
energy_consumption,306.0,29.280065,10.595217,7.02,21.8325,30.19,35.47,58.18
alarm_triggered,306.0,0.705882,0.456391,0.0,0.0,1.0,1.0,1.0


In [16]:
no_fault = df[df.target==0]

In [17]:
no_fault.info()

<class 'pandas.core.frame.DataFrame'>
Index: 694 entries, 0 to 998
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   timestamp           694 non-null    object 
 1   segment_id          694 non-null    int64  
 2   pressure            694 non-null    float64
 3   flow_rate           694 non-null    float64
 4   temperature         694 non-null    float64
 5   valve_status        694 non-null    int64  
 6   pump_state          694 non-null    int64  
 7   pump_speed          694 non-null    float64
 8   compressor_state    694 non-null    int64  
 9   energy_consumption  694 non-null    float64
 10  alarm_triggered     694 non-null    int64  
 11  event_type          694 non-null    object 
 12  target              694 non-null    int64  
dtypes: float64(5), int64(6), object(2)
memory usage: 75.9+ KB


In [18]:
no_fault.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
segment_id,694.0,25.674352,14.302363,1.0,14.0,25.0,38.0,50.0
pressure,694.0,74.854352,4.931706,61.37,71.5625,74.88,78.14,88.82
flow_rate,694.0,4.522896,0.490921,3.07,4.1825,4.54,4.85,6.15
temperature,694.0,32.111873,1.996763,27.08,30.6825,32.125,33.445,38.52
valve_status,694.0,0.920749,0.546062,0.0,1.0,1.0,1.0,2.0
pump_state,694.0,0.701729,0.457829,0.0,0.0,1.0,1.0,1.0
pump_speed,694.0,979.251441,644.581231,0.0,0.0,1336.75,1435.85,1678.8
compressor_state,694.0,0.595101,0.491227,0.0,0.0,1.0,1.0,1.0
energy_consumption,694.0,25.783559,8.410025,5.42,20.785,25.245,33.48,39.49
alarm_triggered,694.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
df["event_type"].nunique()

5

In [20]:
df["event_type"].unique()

array(['normal', 'blockage', 'degradation', 'surge', 'leak'], dtype=object)

In [21]:
df["event_type"].value_counts()

event_type
normal         694
degradation    135
leak            65
surge           61
blockage        45
Name: count, dtype: int64

In [22]:
fault.head(20)

Unnamed: 0,timestamp,segment_id,pressure,flow_rate,temperature,valve_status,pump_state,pump_speed,compressor_state,energy_consumption,alarm_triggered,event_type,target
3,1/1/2024 0:00,16,98.49,1.61,30.61,2,1,1368.7,1,37.07,1,blockage,1
8,1/1/2024 0:00,12,69.77,4.4,33.24,1,0,0.0,1,24.92,0,degradation,1
14,1/1/2024 0:00,29,96.02,2.04,28.7,1,0,0.0,1,23.88,1,blockage,1
20,1/1/2024 0:00,4,88.65,1.4,29.61,1,1,1623.7,0,24.31,1,blockage,1
21,1/1/2024 0:00,3,84.09,5.45,30.02,1,1,1383.2,0,20.71,1,surge,1
23,1/1/2024 0:00,10,88.19,5.96,30.33,1,1,1334.5,1,33.84,1,surge,1
28,1/1/2024 0:00,45,101.75,2.32,29.35,1,1,1262.9,0,21.4,1,blockage,1
32,1/1/2024 0:00,8,101.79,1.49,30.67,1,0,0.0,1,21.95,1,blockage,1
36,1/1/2024 0:00,13,79.14,1.51,33.58,2,1,1326.9,1,29.59,1,blockage,1
42,1/1/2024 0:00,7,66.93,4.4,30.97,0,1,1288.9,1,31.39,1,leak,1


In [23]:
(df.alarm_triggered).value_counts()

alarm_triggered
0    784
1    216
Name: count, dtype: int64

In [24]:
(no_fault.alarm_triggered).value_counts()

alarm_triggered
0    694
Name: count, dtype: int64

In [25]:
(fault.alarm_triggered).value_counts()

alarm_triggered
1    216
0     90
Name: count, dtype: int64

In [26]:
print("Exporing FAULT")

Exporing FAULT


In [27]:
alarm = (fault[fault.alarm_triggered == 1])

In [28]:
no_alarm = (fault[fault.alarm_triggered == 0])

In [29]:
(alarm[["alarm_triggered", "event_type", "target"]]).head(20)

Unnamed: 0,alarm_triggered,event_type,target
3,1,blockage,1
14,1,blockage,1
20,1,blockage,1
21,1,surge,1
23,1,surge,1
28,1,blockage,1
32,1,blockage,1
36,1,blockage,1
42,1,leak,1
43,1,leak,1


In [30]:
(no_alarm[["alarm_triggered", "event_type", "target"]]).head(20)

Unnamed: 0,alarm_triggered,event_type,target
8,0,degradation,1
62,0,degradation,1
88,0,degradation,1
97,0,degradation,1
101,0,degradation,1
103,0,degradation,1
125,0,degradation,1
130,0,degradation,1
131,0,degradation,1
135,0,degradation,1


In [31]:
(alarm["event_type"]).value_counts()

event_type
leak           65
surge          61
blockage       45
degradation    45
Name: count, dtype: int64

In [32]:
(no_alarm["event_type"]).value_counts()

event_type
degradation    90
Name: count, dtype: int64

In [33]:
features1 =  [
 'pressure',
 'flow_rate',
 'temperature',
 'valve_status',
 'pump_state',
 'pump_speed',
 'compressor_state',
 'energy_consumption'
 ]

In [34]:
target1 = "target"

In [35]:
df[features1].corrwith(df["target"])

pressure              0.214237
flow_rate            -0.275917
temperature          -0.010788
valve_status         -0.013038
pump_state           -0.041412
pump_speed           -0.098904
compressor_state      0.015060
energy_consumption    0.173902
dtype: float64

In [36]:
print("=========To predict failure==============")
# MODEL A: BINARY CLASSIFICATION (FAULTY vs NORMAL)
# =============================================================================



In [37]:
X1 = df[features1]

In [38]:
y1 = df[target1]

In [39]:
X1_train,  X1_test, y1_train, y1_test  = train_test_split(X1,y1 ,test_size=0.2, stratify=y1, random_state=12)

In [40]:
X1_train.shape

(800, 8)

In [41]:
X1_test.shape

(200, 8)

In [42]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=10, class_weight="balanced", random_state=12 )

In [43]:
rfc.fit(X1_train,y1_train)

In [44]:
y1_pred = rfc.predict(X1_test)

In [45]:
print("\n=== RandomForest Results (Clean) ===")


=== RandomForest Results (Clean) ===


In [46]:
accuracy_score(y1_test,y1_pred)

0.965

In [47]:
roc_auc_score(y1_test,y1_pred)

0.9564217478476235

In [48]:
confusion_matrix(y1_test,y1_pred)

array([[136,   3],
       [  4,  57]], dtype=int64)

In [49]:
print(classification_report(y1_test,y1_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97       139
           1       0.95      0.93      0.94        61

    accuracy                           0.96       200
   macro avg       0.96      0.96      0.96       200
weighted avg       0.96      0.96      0.96       200



In [50]:
print("\n=== Tuning BaseModel ===")


=== Tuning BaseModel ===


In [51]:
rf = RandomForestClassifier()

In [52]:
param_grid=[{
   "n_estimators":[100,200,500],
    "max_depth":[5,10,15,None],
    "min_samples_leaf":[1,2,4],
    "class_weight":["balanced"]
}]

In [53]:
"""param_grid=[{
   "n_estimators":[100, 200,500],
    "max_depth":[5,10,15,None ],
    "class_weight":["balanced"]
}]"""

'param_grid=[{\n   "n_estimators":[100, 200,500],\n    "max_depth":[5,10,15,None ],\n    "class_weight":["balanced"]\n}]'

In [54]:
gridsearch = GridSearchCV(
    estimator = rf,
    param_grid = param_grid,
    cv = 5,
    n_jobs = -1,
    scoring = "accuracy"
)

In [55]:
best_rfc = gridsearch.fit(X1_train,y1_train)

In [56]:
best_rfc.best_params_

{'class_weight': 'balanced',
 'max_depth': 10,
 'min_samples_leaf': 2,
 'n_estimators': 200}

In [57]:
best_rfc.best_score_

0.9350000000000002

In [58]:
pred2 = best_rfc.predict(X1_test)

In [59]:
accuracy_score(y1_test,pred2)

0.975

In [60]:
roc_auc_score(y1_test,pred2)

0.9636159924519401

In [61]:
confusion_matrix(y1_test,pred2)

array([[138,   1],
       [  4,  57]], dtype=int64)

In [62]:
print(classification_report(y1_test,pred2))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       139
           1       0.98      0.93      0.96        61

    accuracy                           0.97       200
   macro avg       0.98      0.96      0.97       200
weighted avg       0.98      0.97      0.97       200



In [63]:
model_data ={
    "model":best_rfc,
    "features": X1_train.columns.tolist(),
    "features_dtypes": X1_train.dtypes.to_dict()
}

In [64]:
joblib.dump(model_data, "m1_fault_detection.plk")

['m1_fault_detection.plk']

In [65]:
print("=========To predict what type of Event==============")



In [66]:
#========THIS MODEL WILL BE TRAIN ON THE TARGET=1 (Faulty) DATA===========

In [67]:
fault.info()

<class 'pandas.core.frame.DataFrame'>
Index: 306 entries, 3 to 999
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   timestamp           306 non-null    object 
 1   segment_id          306 non-null    int64  
 2   pressure            306 non-null    float64
 3   flow_rate           306 non-null    float64
 4   temperature         306 non-null    float64
 5   valve_status        306 non-null    int64  
 6   pump_state          306 non-null    int64  
 7   pump_speed          306 non-null    float64
 8   compressor_state    306 non-null    int64  
 9   energy_consumption  306 non-null    float64
 10  alarm_triggered     306 non-null    int64  
 11  event_type          306 non-null    object 
 12  target              306 non-null    int64  
dtypes: float64(5), int64(6), object(2)
memory usage: 33.5+ KB


In [68]:
fault_df = fault.copy()

In [69]:
features2 = fault_df.iloc[:,2:10]

In [70]:
features2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 306 entries, 3 to 999
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   pressure            306 non-null    float64
 1   flow_rate           306 non-null    float64
 2   temperature         306 non-null    float64
 3   valve_status        306 non-null    int64  
 4   pump_state          306 non-null    int64  
 5   pump_speed          306 non-null    float64
 6   compressor_state    306 non-null    int64  
 7   energy_consumption  306 non-null    float64
dtypes: float64(5), int64(3)
memory usage: 21.5 KB


In [71]:
(fault_df["event_type"]).unique()

array(['blockage', 'degradation', 'surge', 'leak'], dtype=object)

In [72]:
(fault_df["event_type"]).value_counts()

event_type
degradation    135
leak            65
surge           61
blockage        45
Name: count, dtype: int64

In [73]:
labelencoder = LabelEncoder()

In [74]:
fault_df["fault_type"] = labelencoder.fit_transform(fault_df["event_type"])

In [75]:
print("\nLabelEncoder classes:")
print(labelencoder.classes_)


LabelEncoder classes:
['blockage' 'degradation' 'leak' 'surge']


In [77]:
# Get the mapping between numbers and original labels
label_mapping = dict(zip(labelencoder.classes_, range(len(labelencoder.classes_))))
print("Label Encoder Mapping:")
for label, number in label_mapping.items():
    print(f"  {number} → {label}")

Label Encoder Mapping:
  0 → blockage
  1 → degradation
  2 → leak
  3 → surge


In [78]:
fault_df.columns.tolist()

['timestamp',
 'segment_id',
 'pressure',
 'flow_rate',
 'temperature',
 'valve_status',
 'pump_state',
 'pump_speed',
 'compressor_state',
 'energy_consumption',
 'alarm_triggered',
 'event_type',
 'target',
 'fault_type']

In [79]:
fault_df.head()

Unnamed: 0,timestamp,segment_id,pressure,flow_rate,temperature,valve_status,pump_state,pump_speed,compressor_state,energy_consumption,alarm_triggered,event_type,target,fault_type
3,1/1/2024 0:00,16,98.49,1.61,30.61,2,1,1368.7,1,37.07,1,blockage,1,0
8,1/1/2024 0:00,12,69.77,4.4,33.24,1,0,0.0,1,24.92,0,degradation,1,1
14,1/1/2024 0:00,29,96.02,2.04,28.7,1,0,0.0,1,23.88,1,blockage,1,0
20,1/1/2024 0:00,4,88.65,1.4,29.61,1,1,1623.7,0,24.31,1,blockage,1,0
21,1/1/2024 0:00,3,84.09,5.45,30.02,1,1,1383.2,0,20.71,1,surge,1,3


In [80]:
target2 = fault_df.iloc[:,-1]

In [81]:
target2.info()

<class 'pandas.core.series.Series'>
Index: 306 entries, 3 to 999
Series name: fault_type
Non-Null Count  Dtype
--------------  -----
306 non-null    int32
dtypes: int32(1)
memory usage: 3.6 KB


In [82]:
X2 = features2

In [83]:
y2 = target2

In [84]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2, test_size=0.2, stratify=y2 , random_state=12)

In [85]:
print(f"X2 (test and Train) shape:{X2_test.shape, X2_train.shape}")

X2 (test and Train) shape:((62, 8), (244, 8))


In [86]:
print(f"y2 (test and Train) shape:{y2_test.shape, y2_train.shape}")

y2 (test and Train) shape:((62,), (244,))


In [87]:
rfc1 = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight="balanced",
    random_state=12
)

In [88]:
rfc1.fit(X2_train,y2_train)

In [89]:
y2_pred = rfc1.predict(X2_test)

In [90]:
print(f"{accuracy_score(y2_test,y2_pred):.3f}")

0.935


In [91]:
confusion_matrix(y2_test,y2_pred)

array([[ 8,  0,  1,  0],
       [ 0, 24,  0,  3],
       [ 0,  0, 13,  0],
       [ 0,  0,  0, 13]], dtype=int64)

In [92]:
print(classification_report(y2_test,y2_pred))

              precision    recall  f1-score   support

           0       1.00      0.89      0.94         9
           1       1.00      0.89      0.94        27
           2       0.93      1.00      0.96        13
           3       0.81      1.00      0.90        13

    accuracy                           0.94        62
   macro avg       0.94      0.94      0.94        62
weighted avg       0.95      0.94      0.94        62



In [93]:
#MODEL TUNNING

In [94]:
rf1 = RandomForestClassifier()

In [95]:
paramgrid2 = [{
    "n_estimators":[200,500,1000],
    "class_weight":["balanced"],
    "max_depth":[5,10,15,None],
    "min_samples_leaf":[1,2,4],
}]

In [96]:
gridsearch2 = GridSearchCV(
    estimator = rf1 ,
    param_grid = paramgrid2 ,
    cv = 5,
    n_jobs = -1,
    scoring = "accuracy"
)

In [97]:
rfc2 = gridsearch2.fit(X2_train,y2_train)

In [98]:
gridsearch2.best_params_

{'class_weight': 'balanced',
 'max_depth': 15,
 'min_samples_leaf': 1,
 'n_estimators': 200}

In [99]:
rfc2.best_params_

{'class_weight': 'balanced',
 'max_depth': 15,
 'min_samples_leaf': 1,
 'n_estimators': 200}

In [100]:
gridsearch2.best_score_

0.9795068027210885

In [101]:
rfc2.best_score_

0.9795068027210885

In [102]:
pred4 = rfc2.predict(X2_test)

In [103]:
pred3 = gridsearch2.predict(X2_test)

In [104]:
print(f"{accuracy_score(y2_test,pred4):.3f}")

0.952


In [105]:
print(f"{accuracy_score(y2_test,pred3):.3f}")

0.952


In [106]:
confusion_matrix(y2_test,pred3)

array([[ 8,  0,  1,  0],
       [ 0, 25,  0,  2],
       [ 0,  0, 13,  0],
       [ 0,  0,  0, 13]], dtype=int64)

In [107]:
print(classification_report(y2_test,pred3))

              precision    recall  f1-score   support

           0       1.00      0.89      0.94         9
           1       1.00      0.93      0.96        27
           2       0.93      1.00      0.96        13
           3       0.87      1.00      0.93        13

    accuracy                           0.95        62
   macro avg       0.95      0.95      0.95        62
weighted avg       0.96      0.95      0.95        62



In [108]:
model_data2 ={
    "model":rfc2,
    "features": X1_train.columns.tolist(),
    "features_dtypes": X1_train.dtypes.to_dict()
}
joblib.dump(model_data2, "m2_fault_type.plk")

['m2_fault_type.plk']

In [109]:
##TRaining Fault type using another algorithm

In [110]:
gb = GradientBoostingClassifier(
    n_estimators = 300,
    learning_rate=0.05,
    max_depth=5,
    max_features="sqrt",
    subsample=0.8,
    random_state=12
)


In [111]:
gb.fit(X2_train,y2_train)

In [112]:
gp_pred = gb.predict(X2_test)

In [113]:
accuracy_score(y2_test,gp_pred)

0.9516129032258065

In [114]:
confusion_matrix(y2_test,gp_pred)

array([[ 9,  0,  0,  0],
       [ 0, 24,  1,  2],
       [ 0,  0, 13,  0],
       [ 0,  0,  0, 13]], dtype=int64)

In [115]:
print(classification_report(y2_test,gp_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      0.89      0.94        27
           2       0.93      1.00      0.96        13
           3       0.87      1.00      0.93        13

    accuracy                           0.95        62
   macro avg       0.95      0.97      0.96        62
weighted avg       0.96      0.95      0.95        62



In [116]:
#MODEL TUNNING

In [117]:
"""paramgrid3 = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'max_features': ['sqrt', None]
}"""

"paramgrid3 = {\n    'n_estimators': [100, 200, 300],\n    'learning_rate': [0.01, 0.05, 0.1],\n    'max_depth': [3, 5, 7],\n    'subsample': [0.8, 1.0],\n    'max_features': ['sqrt', None]\n}"

In [118]:
paramgrid3 = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
}

In [119]:
gridsearch3 = GridSearchCV(
    estimator=gb,
    param_grid=paramgrid3,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
)

In [120]:
gridsearch3.fit(X2_train,y2_train)

In [121]:
gridsearch3.best_score_

0.9754098360655737

In [122]:
gridsearch3.best_params_

{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 300}

In [123]:
gp_pred2 = gridsearch3.predict(X2_test)

In [124]:
accuracy_score(y2_test,gp_pred2)

0.9354838709677419

In [125]:
confusion_matrix(y2_test,gp_pred2)

array([[ 9,  0,  0,  0],
       [ 0, 23,  2,  2],
       [ 0,  0, 13,  0],
       [ 0,  0,  0, 13]], dtype=int64)

In [126]:
print(classification_report(y2_test,gp_pred2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         9
           1       1.00      0.85      0.92        27
           2       0.87      1.00      0.93        13
           3       0.87      1.00      0.93        13

    accuracy                           0.94        62
   macro avg       0.93      0.96      0.94        62
weighted avg       0.94      0.94      0.94        62



In [127]:
model_data3 = {
    "model" : gb,
    "features" : X2_train.columns.tolist(),
    "features_dtypes" : X2_train.dtypes.to_dict()
}

In [128]:
joblib.dump(model_data3, "m3_fault_type.pkl")

['m3_fault_type.pkl']