In [None]:
# AquaSentinel ML Model Training & Evaluation

This notebook trains and evaluates machine learning models for the AquaSentinel water quality monitoring system.

## Models Included:
1. **Water Quality Predictor** - Determines water potability
2. **Filter Saturation Predictor** - Predicts filter replacement timing
3. **Anomaly Detector** - Identifies unusual sensor readings
4. **Quality Optimizer** - Provides optimization recommendations


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# ML imports
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    mean_squared_error, r2_score, mean_absolute_error
)
from sklearn.impute import SimpleImputer
import joblib

# Local imports
from ml_utils import WaterQualityPredictor, DataProcessor
from ml_model import FilterSaturationPredictor, AnomalyDetector
from sensors import SensorReading

# Plot appearance for the whole notebook.
# The style sheet is applied first; the figure-size override and the colour
# palette are set afterwards so they are layered on top of the style sheet.
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'figure.figsize': (12, 8)})
sns.set_palette("husl")

# Status banner: confirms the import cell ran and records the start time.
print(
    "✅ Libraries imported successfully!",
    f"📊 Training started at: {datetime.now()}",
    sep="\n",
)


In [None]:
# Water Quality Prediction Model Training

This notebook trains a machine learning model to predict water potability based on various water quality parameters.

## Dataset
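Uses the Kaggle Water Potability dataset (`water_potability.csv`): https://www.kaggle.com/datasets/adityakadiwal/water-potability. If the CSV is not found in the working directory, the notebook falls back to synthetic data generated by `WaterQualityPredictor.generate_synthetic_data`.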

## Features
- pH: pH level of water
- Hardness: Capacity of water to precipitate soap
- Solids: Total dissolved solids
- Chloramines: Amount of chloramines
- Sulfate: Amount of sulfates dissolved
- Conductivity: Electrical conductivity
- Organic_carbon: Amount of organic carbon
- Trihalomethanes: Amount of trihalomethanes
- Turbidity: Cloudiness of the water, i.e. how strongly suspended particles scatter light (measured in NTU)

## Target
- Potability: 1 if water is safe for consumption, 0 otherwise


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.impute import SimpleImputer
import joblib
import warnings
warnings.filterwarnings('ignore')

# Plot styling
# The 'seaborn-v0_8' style sheet and the (12, 8) default figure size are
# configured in the first setup cell of this notebook. This cell previously
# called plt.style.use('default'), which restores Matplotlib's default rcParams
# and therefore discarded both settings. Only the palette is (re)applied here so
# the two setup cells agree with each other.
sns.set_palette('husl')

print("Libraries imported successfully!")


In [None]:
# Load the dataset
# Prefer the Kaggle CSV from the working directory; if it is not there, fall
# back to synthetic data produced by the project's WaterQualityPredictor.
try:
    df = pd.read_csv('water_potability.csv')
    print("Dataset loaded from file successfully!")
except FileNotFoundError:
    print("Dataset file not found. Generating synthetic data...")
    # Kept local so this cell still works when run without the first setup cell.
    from ml_utils import WaterQualityPredictor
    predictor = WaterQualityPredictor()
    df = predictor.generate_synthetic_data(2000)
    print("Synthetic dataset generated successfully!")

# Normalise column names to lower case. The Kaggle CSV uses capitalised headers
# (e.g. 'Hardness', 'Potability'), while every later cell addresses columns in
# lower case ('potability', and the keys of the sample prediction input).
# Without this step the file-based path fails later with a KeyError.
df = df.rename(columns=lambda name: str(name).strip().lower())

# Fail here, with a clear message, rather than deep inside the training cell.
if 'potability' not in df.columns:
    raise KeyError(
        f"Expected a 'potability' target column, found: {df.columns.tolist()}"
    )

# Display basic information about the dataset
print(f"\nDataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")
print(f"\nFirst few rows:")
df.head()


In [None]:
# Data preprocessing and model training
print("Starting data preprocessing and model training...")

# Separate features and target
X = df.drop('potability', axis=1)
y = df['potability']

# Split FIRST, so that no statistic learned during preprocessing (imputation
# means, scaling mean/variance) is influenced by rows of the held-out test set.
# Fitting the imputer on the full dataset before splitting leaks test
# information into training and makes the reported accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing values: column means are learned from the training rows only
# and then applied unchanged to the test rows.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Fully imputed feature frame, retained for any later cell that refers to it.
# It is produced with the training-set means; it is NOT used for fitting.
X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)

# Scale the features (statistics again come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
model.fit(X_train_scaled, y_train)

# Evaluate on the held-out test split
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("Model trained successfully!")
print(f"Test accuracy: {accuracy:.4f}")

# Feature importance, most important first
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
print(feature_importance.head())


In [None]:
# Save the trained model and scaler, then verify the saved artifacts
print("Saving trained model and scaler...")

# Save the model
joblib.dump(model, 'model.pkl')
print("Model saved as 'model.pkl'")

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')
print("Scaler saved as 'scaler.pkl'")

# Test the saved model
print("\nTesting the saved model...")
# NOTE: joblib.load unpickles arbitrary objects. Only load artifacts that were
# produced by a trusted run (here: the two files written just above).
loaded_model = joblib.load('model.pkl')
loaded_scaler = joblib.load('scaler.pkl')

# Round-trip check: the reloaded scaler + model must reproduce the predictions
# of the in-memory ones on the held-out test split.
assert (loaded_model.predict(loaded_scaler.transform(X_test)) == y_pred).all(), \
    "Reloaded model/scaler do not reproduce the in-memory predictions"

# Test with sample data
sample_data = {
    'ph': 7.2,
    'hardness': 180.0,
    'solids': 20000.0,
    'chloramines': 7.5,
    'sulfate': 250.0,
    'conductivity': 400.0,
    'organic_carbon': 14.0,
    'trihalomethanes': 70.0,
    'turbidity': 3.5
}

# Every training feature needs a value in the sample; report all gaps at once
# instead of failing with a bare KeyError on the first one.
missing_features = [col for col in X.columns if col not in sample_data]
if missing_features:
    raise KeyError(
        f"sample_data is missing values for feature(s): {missing_features}"
    )

# Prepare features as a one-row DataFrame in training column order. The scaler
# was fitted on a DataFrame, so passing named columns lets scikit-learn validate
# the feature names instead of silently trusting positional order.
features = pd.DataFrame([sample_data], columns=list(X.columns))
scaled_features = loaded_scaler.transform(features)

# Make prediction
prediction = loaded_model.predict(scaled_features)[0]
prediction_proba = loaded_model.predict_proba(scaled_features)[0]
# Confidence is the probability of the most likely class, as a percentage.
confidence = max(prediction_proba) * 100

print(f"\nSample prediction:")
print(f"Input: {sample_data}")
print(f"Prediction: {'Potable' if prediction == 1 else 'Non-potable'}")
print(f"Confidence: {confidence:.1f}%")

print("\nModel training and saving completed successfully!")
print("The model is now ready for deployment!")
