In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
import joblib



# STEP 1: LOAD AND SORT DATA

In [2]:



df = pd.read_csv("/kaggle/input/shuf-co-data-water/shu_cor_water_Q.csv")
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.sort_values("timestamp").reset_index(drop=True)




# ENGINEER FEATURES



In [3]:

# Basic engineered features
df["ph_dev"] = abs(df["pH"] - 7)
df["tds_temp_ratio"] = df["TDS"] / df["temperature"]
df["turbidity_x_ph"] = df["turbidity"] * df["pH"]
df["temp_sqr"] = df["temperature"] ** 2

# Spike detection / trend features
df["tds_diff"] = df["TDS"].diff().fillna(0)
df["temp_diff"] = df["temperature"].diff().fillna(0)
df["ph_diff"] = df["pH"].diff().fillna(0)
df["tds_rollmean_3"] = df["TDS"].rolling(window=3).mean().bfill()
df["ph_rollstd_3"] = df["pH"].rolling(window=3).std().fillna(0)

# Final selected features
feature_cols = [
    "TDS", "turbidity", "temperature", "pH",
    "ph_dev", "tds_temp_ratio", "turbidity_x_ph", "temp_sqr",
    "tds_diff", "temp_diff", "ph_diff",
    "tds_rollmean_3", "ph_rollstd_3"
]

X_all = df[feature_cols]
X_train = df[df["status"] == "Safe"][feature_cols]  # Only train on safe rows




# STEP 3: TRAIN ISOLATION FOREST

In [4]:


iso_model = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
iso_model.fit(X_train)





# STEP 4: PREDICT + ADD LABELS

In [5]:

df["anomaly_score"] = iso_model.decision_function(X_all)
df["anomaly_raw"] = iso_model.predict(X_all)
df["anomaly_status"] = df["anomaly_raw"].apply(lambda x: "Anomaly" if x == -1 else "Normal")
df = df.round(2)


In [6]:
df

Unnamed: 0,timestamp,TDS,turbidity,temperature,pH,status,cause,ph_dev,tds_temp_ratio,turbidity_x_ph,temp_sqr,tds_diff,temp_diff,ph_diff,tds_rollmean_3,ph_rollstd_3,anomaly_score,anomaly_raw,anomaly_status
0,2025-01-01 00:00:00,735.91,13.30,46.09,6.74,Warning,"TDS Warning, Turbidity Warning, Temp Warning",0.26,15.97,89.64,2124.29,0.00,0.00,0.00,682.39,0.00,-0.00,-1,Anomaly
1,2025-01-01 00:10:00,717.53,16.98,51.26,6.88,Warning,High Temp,0.12,14.00,116.82,2627.59,-18.38,5.17,0.14,682.39,0.00,-0.03,-1,Anomaly
2,2025-01-01 00:20:00,593.72,4.30,27.94,7.40,Safe,,0.40,21.25,31.82,780.64,-123.81,-23.32,0.52,682.39,0.35,0.01,1,Normal
3,2025-01-01 00:30:00,477.20,16.48,39.42,6.79,Warning,Turbidity Warning,0.21,12.11,111.90,1553.94,-116.52,11.48,-0.61,596.15,0.33,0.05,1,Normal
4,2025-01-01 00:40:00,392.80,11.23,35.66,6.28,Safe,Turbidity Warning,0.72,11.02,70.52,1271.64,-84.40,-3.76,-0.51,487.91,0.56,0.02,1,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3293,2025-01-23 20:50:00,517.60,5.55,32.68,6.77,Safe,,0.23,15.84,37.57,1067.98,-48.25,-6.21,-0.18,486.71,0.14,0.14,1,Normal
3294,2025-01-23 21:00:00,546.86,14.22,40.55,6.89,Safe,"Turbidity Warning, Temp Warning",0.11,13.49,97.98,1644.30,29.26,7.87,0.12,543.44,0.09,0.07,1,Normal
3295,2025-01-23 21:10:00,709.69,12.48,38.83,7.10,Safe,"TDS Warning, Turbidity Warning",0.10,18.28,88.61,1507.77,162.83,-1.72,0.21,591.38,0.17,0.05,1,Normal
3296,2025-01-23 21:20:00,546.08,6.74,38.71,6.65,Safe,,0.35,14.11,44.82,1498.46,-163.61,-0.12,-0.45,600.88,0.23,0.10,1,Normal


# STEP 5: SAVE RESULTS & MODEL

In [7]:


df.to_csv("anomaly_detection.csv", index=False)
joblib.dump(iso_model, "anomaly_detection_model.pkl")

print("💾 Model Saved")

💾 Model Saved
