In [1]:
import pandas as pd

# Read the raw sensor export.
df = pd.read_csv("/kaggle/input/anomaly/anomaly_detection (1).csv")

# Put the readings in chronological order so row-to-row differences are meaningful.
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.sort_values("timestamp").reset_index(drop=True)

# Derive first differences, but only for diff columns the CSV does not already provide.
for diff_col, source_col in (
    ("tds_diff", "TDS"),
    ("temp_diff", "temperature"),
    ("ph_diff", "pH"),
):
    if diff_col not in df.columns:
        df[diff_col] = df[source_col].diff()

# Rule-based maintenance flag: 1 when any single-step change exceeds its threshold.
tds_jump = df["tds_diff"].abs() > 100    # sudden TDS jump
temp_swing = df["temp_diff"].abs() > 5   # rapid temperature change
ph_swing = df["ph_diff"].abs() > 0.5     # pH fluctuation
df["maintenance_needed"] = (tds_jump | temp_swing | ph_swing).astype(int)

# Report how many rows fall into each label.
print("🔧 Maintenance Needed Distribution:")
print(df["maintenance_needed"].value_counts())

# Write the labelled frame to the working directory (without the index column).
df.to_csv("maintenance_labeled_data.csv", index=False)
print("\n✅ Labeled data saved as 'maintenance_labeled_data.csv'")


🔧 Maintenance Needed Distribution:
maintenance_needed
1    2857
0     441
Name: count, dtype: int64

✅ Labeled data saved as 'maintenance_labeled_data.csv'


In [2]:
import pandas as pd
import numpy as np

# Load the dataset and order it chronologically.
df = pd.read_csv("/kaggle/input/anomaly/anomaly_detection (1).csv")
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.sort_values("timestamp").reset_index(drop=True)

# Always recompute the step-to-step differences from the time-sorted frame,
# overwriting any diff columns that came with the CSV.
df["tds_diff"] = df["TDS"].diff()
df["temp_diff"] = df["temperature"].diff()
df["ph_diff"] = df["pH"].diff()

# Label logic: flag a row when any single-step change exceeds its threshold.
# .diff() leaves NaN in the first row; comparing NaN is presumably what raised
# the warning seen in this cell's output. Filling with 0 *for the comparison
# only* keeps the stored diff columns untouched and yields the same labels,
# because NaN > x already evaluates to False.
df["maintenance_needed"] = (
    (df["tds_diff"].abs().fillna(0) > 100) |    # sudden TDS jump
    (df["temp_diff"].abs().fillna(0) > 5) |     # rapid temperature change
    (df["ph_diff"].abs().fillna(0) > 0.5)       # pH fluctuation
).astype(int)


  return op(a, b)


In [3]:
# Lag-based memory features: the previous two readings of every sensor.
lag_columns = []
for prefix, source_col in (("TDS", "TDS"), ("Temp", "temperature"), ("pH", "pH")):
    for lag in (1, 2):
        lag_name = f"{prefix}_t-{lag}"
        df[lag_name] = df[source_col].shift(lag)
        lag_columns.append(lag_name)

# Drop only rows that are incomplete in the columns the model actually uses.
# A bare dropna() also discards rows whose only gap is in some unrelated
# column of the CSV, which can silently shrink the training set.
model_columns = [
    "TDS", "temperature", "pH",
    "tds_diff", "temp_diff", "ph_diff",
    *lag_columns,
]
df = df.dropna(subset=model_columns).reset_index(drop=True)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Feature set: current readings, their one-step differences, and two lags per sensor.
raw_cols = ["TDS", "temperature", "pH"]
diff_cols = ["tds_diff", "temp_diff", "ph_diff"]
lag_cols = ["TDS_t-1", "TDS_t-2", "Temp_t-1", "Temp_t-2", "pH_t-1", "pH_t-2"]
features = raw_cols + diff_cols + lag_cols

X = df[features]
y = df["maintenance_needed"]

# NOTE(review): maintenance_needed is produced by thresholding tds_diff,
# temp_diff and ph_diff, all of which are also in `features`, so the scores
# below measure how well the forest recovers that rule.

# Hold out the final 20% of rows; shuffle=False keeps the row order intact.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Fit a depth-limited random forest with a fixed seed.
clf = RandomForestClassifier(n_estimators=150, max_depth=7, random_state=42)
clf.fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("📊 Classification Report:\n", report)
print("🔍 Confusion Matrix:\n", matrix)


📊 Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.89      0.94        38
           1       0.99      1.00      0.99       334

    accuracy                           0.99       372
   macro avg       0.99      0.95      0.97       372
weighted avg       0.99      0.99      0.99       372

🔍 Confusion Matrix:
 [[ 34   4]
 [  0 334]]


In [5]:
def predict_maintenance(tds, temp, ph, tds_t1, tds_t2, temp_t1, temp_t2, ph_t1, ph_t2, model=None):
    """Predict the maintenance flag for one reading and its two predecessors.

    Builds a single-row DataFrame with the same twelve columns, in the same
    order, as the notebook's training feature list, then asks the classifier
    for a prediction.

    Parameters
    ----------
    tds, temp, ph
        Current TDS, temperature and pH readings.
    tds_t1, temp_t1, ph_t1
        The readings one step earlier; also used to derive the diff features.
    tds_t2, temp_t2, ph_t2
        The readings two steps earlier.
    model : optional
        Any object with a ``predict(DataFrame)`` method. Defaults to the
        notebook-level ``clf``, so existing calls behave as before; pass a
        model reloaded from disk to avoid depending on kernel state.

    Returns
    -------
    The first element of ``model.predict(...)`` for the single input row.
    """
    if model is None:
        # Fall back to the classifier trained earlier in the notebook.
        model = clf

    input_data = pd.DataFrame([{
        "TDS": tds,
        "temperature": temp,
        "pH": ph,
        # One-step differences, derived the same way as the training diffs.
        "tds_diff": tds - tds_t1,
        "temp_diff": temp - temp_t1,
        "ph_diff": ph - ph_t1,
        "TDS_t-1": tds_t1,
        "TDS_t-2": tds_t2,
        "Temp_t-1": temp_t1,
        "Temp_t-2": temp_t2,
        "pH_t-1": ph_t1,
        "pH_t-2": ph_t2
    }])
    return model.predict(input_data)[0]



In [6]:
# Example: temperature rises by 6.0 from the previous reading (36.5 -> 42.5).
spike_reading = dict(
    tds=800, temp=42.5, ph=6.6,
    tds_t1=700, tds_t2=690,
    temp_t1=36.5, temp_t2=35.2,
    ph_t1=7.0, ph_t2=7.2,
)
status = predict_maintenance(**spike_reading)

# Report the outcome; any truthy prediction counts as "maintenance".
if status:
    print("🔮 Maintenance needed:")
else:
    print("✅ All Clear")


🔮 Maintenance needed:


In [7]:
# Example: a steady reading whose values barely move from the two before it.
steady_reading = dict(
    tds=550, temp=33.2, ph=7.0,
    tds_t1=552, tds_t2=551,
    temp_t1=33.0, temp_t2=32.9,
    ph_t1=6.95, ph_t2=6.97,
)
status = predict_maintenance(**steady_reading)

# Report the outcome; any truthy prediction counts as an alert.
if status:
    print("🔧 Maintenance Alert:")
else:
    print("✅ All Clear")


✅ All Clear


In [8]:
import joblib
# Persist the fitted classifier to the working directory so it can be reloaded later.
model_path = "predictive_maintenance_model.pkl"
joblib.dump(clf, model_path)
print("💾 Model saved as predictive_maintenance_model.pkl")


💾 Model saved as predictive_maintenance_model.pkl
