# Water Quality Prediction using Deep Learning Neural Networks
### Central Pollution Control Board (CPCB) Dataset

This notebook trains deep neural networks (multi-layer perceptrons, using scikit-learn's `MLPRegressor` and `MLPClassifier`) to predict:
1. **Water Quality Index (WQI)** – Regression task
2. **Water Quality Classification** – Multi-class classification task

**Dataset:** 19,029 records with 15 water quality indicators measured across India (2019–2022)

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import (
    r2_score, mean_squared_error, mean_absolute_error,
    accuracy_score, f1_score, classification_report,
    confusion_matrix
)

# Confirm the imports succeeded and record the core library versions so the
# results can be tied to a specific environment.
print('All libraries imported successfully.')
print(f'NumPy  : {np.__version__}', f'Pandas : {pd.__version__}', sep='\n')

## 2. Load and Explore the Dataset

In [None]:
# Load the raw data. The path is relative, so it is resolved against the
# notebook's working directory.
df = pd.read_csv('water_quality.csv')
n_rows, n_cols = df.shape
print('Dataset Shape:', (n_rows, n_cols))

# Last expression of the cell: rendered as a rich table preview.
df.head(5)

In [None]:
# Per-column dtype and missing-value count, used to decide what cleaning is needed.
print('Column Data Types:', df.dtypes, sep='\n')
print('\nMissing Values per Column:', df.isna().sum(), sep='\n')

In [None]:
# Summary statistics from DataFrame.describe(); the resulting frame is the
# cell's displayed output.
print('Descriptive Statistics:')
summary_stats = df.describe()
summary_stats

In [None]:
# Inspect the class balance of the classification target and the spread of
# the regression target before any modelling.
class_counts = df['Water Quality Classification'].value_counts()
print('Water Quality Classification Distribution:')
print(class_counts)

fig, (ax_classes, ax_wqi) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
title_style = {'fontsize': 13, 'fontweight': 'bold'}

# Left panel: one bar per class, coloured from the viridis palette.
bar_colors = sns.color_palette('viridis', len(class_counts))
ax_classes.bar(class_counts.index, class_counts.values, color=bar_colors)
ax_classes.set_title('Water Quality Classification Distribution', **title_style)
ax_classes.set_xlabel('Classification')
ax_classes.set_ylabel('Count')
ax_classes.tick_params(axis='x', rotation=20)  # class names are long; tilt them

# Right panel: histogram of the WQI values.
ax_wqi.hist(df['WQI'], bins=60, color='steelblue', edgecolor='white')
ax_wqi.set_title('WQI Distribution', **title_style)
ax_wqi.set_xlabel('WQI')
ax_wqi.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 3. Data Preprocessing

In [None]:
# The fifteen input columns fed to both models.
FEATURE_COLS = [
    'Year', 'pH', 'EC', 'CO3', 'HCO3', 'Cl', 'SO4', 'NO3',
    'TH', 'Ca', 'Mg', 'Na', 'K', 'F', 'TDS',
]

TARGET_REGRESSION = 'WQI'
TARGET_CLASSIFICATION = 'Water Quality Classification'

# Keep only the modelling columns and discard every row that has a missing
# value in any of them, so both tasks work from the same complete records.
# The result is a new frame; `df` itself is left untouched.
required_cols = [*FEATURE_COLS, TARGET_REGRESSION, TARGET_CLASSIFICATION]
df_clean = (
    df.loc[:, required_cols]
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

n_before, n_after = len(df), len(df_clean)
print(f'Original rows : {n_before}')
print(f'After cleaning: {n_after}')
print(f'Rows dropped  : {n_before - n_after}')

In [None]:
# Pull the feature matrix and the two targets out of the cleaned frame as
# NumPy arrays.
features_frame = df_clean[FEATURE_COLS]
X = features_frame.values
y_reg = df_clean[TARGET_REGRESSION].values
y_class = df_clean[TARGET_CLASSIFICATION].values

# Map class names to integer codes. `le` is kept so that later cells can
# translate the codes back to names for reports and plots.
le = LabelEncoder().fit(y_class)
y_class_enc = le.transform(y_class)

n_samples, n_features = X.shape
print('Classes       :', le.classes_)
print('Encoded labels:', np.unique(y_class_enc))
print(f'Features: {n_features}, Samples: {n_samples}')

In [None]:
# Both tasks use an 80/20 hold-out with the same seed.
split_options = {'test_size': 0.2, 'random_state': 42}

# Regression: plain random split.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X, y_reg, **split_options
)

# Classification: stratified on the encoded label so the class proportions
# are preserved in both parts. Because of the stratification, these rows are
# not the same rows as in the regression split.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_class_enc, stratify=y_class_enc, **split_options
)

print(
    f'Regression     — Train: {X_train_r.shape}, Test: {X_test_r.shape}',
    f'Classification — Train: {X_train_c.shape}, Test: {X_test_c.shape}',
    sep='\n',
)

In [None]:
# One scaler per task, fitted on the training rows only. The test rows are
# transformed with the training statistics so that no information from the
# hold-out set reaches the scaler.
scaler_r = StandardScaler().fit(X_train_r)
X_train_r_sc = scaler_r.transform(X_train_r)
X_test_r_sc = scaler_r.transform(X_test_r)

scaler_c = StandardScaler().fit(X_train_c)
X_train_c_sc = scaler_c.transform(X_train_c)
X_test_c_sc = scaler_c.transform(X_test_c)

print('StandardScaler applied. Features have zero mean and unit variance.')

## 4. Exploratory Data Analysis

In [None]:
# Pairwise correlations among the features and WQI. The matrix is symmetric,
# so the upper triangle (diagonal included) is masked and only the lower
# triangle is drawn.
corr_columns = FEATURE_COLS + [TARGET_REGRESSION]
corr_matrix = df_clean[corr_columns].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    ax=ax,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 8},
    cmap='coolwarm',
    center=0,
    linewidths=0.5,
)
ax.set_title('Feature Correlation Heatmap (with WQI)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Rank the features by the magnitude of their correlation with WQI. The
# target's correlation with itself is dropped before ranking.
wqi_corr = (
    corr_matrix['WQI']
    .drop('WQI')
    .abs()
    .sort_values(ascending=False)
)

plt.figure(figsize=(9, 5))
wqi_corr.plot.bar(color='steelblue', edgecolor='white')
plt.title('Feature Correlation with WQI (Absolute)', fontsize=13, fontweight='bold')
plt.ylabel('Absolute Correlation')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

print('Top 5 correlated features with WQI:', wqi_corr.head(5), sep='\n')

In [None]:
# Order the classes by their median WQI so the boxes run from the lowest
# to the highest typical value.
median_wqi_by_class = df_clean.groupby('Water Quality Classification')['WQI'].median()
order = median_wqi_by_class.sort_values().index

plt.figure(figsize=(10, 5))
sns.boxplot(
    x='Water Quality Classification',
    y='WQI',
    data=df_clean,
    order=order,
    palette='viridis',
)
plt.title('WQI Distribution by Water Quality Class', fontsize=13, fontweight='bold')
plt.xticks(rotation=20)
plt.tight_layout()
plt.show()

## 5. Model 1 — WQI Regression (Deep Neural Network)

**Architecture:** `Input(15)` → `Dense(512, ReLU)` → `Dense(256, ReLU)` → `Dense(128, ReLU)` → `Dense(64, ReLU)` → `Output(1, Linear)`

In [None]:
# Deep MLP regressor for WQI. With early_stopping=True, 10% of the training
# rows are held out internally as a validation set and training stops once
# the validation score has not improved for 20 consecutive epochs.
reg_model = MLPRegressor(
    hidden_layer_sizes=(512, 256, 128, 64),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    batch_size=256,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42,
    verbose=False
)

# Build the banner from the configured estimator and the training matrix so
# the printed description cannot drift from what is actually trained.
# The final "1" is the single regression output (y_train_r is 1-D).
reg_layers = (X_train_r_sc.shape[1], *reg_model.hidden_layer_sizes, 1)
print('Architecture:', ' -> '.join(str(width) for width in reg_layers))
print(
    f'Optimizer   : Adam (lr={reg_model.learning_rate_init}) | '
    f'Early Stopping | Batch={reg_model.batch_size}'
)
print('Training...')
reg_model.fit(X_train_r_sc, y_train_r)
print(f'Done. Iterations: {reg_model.n_iter_}')

In [None]:
# Score the fitted regressor on both splits. The training R² is reported
# only as an overfitting check against the test R².
y_pred_train_r = reg_model.predict(X_train_r_sc)
y_pred_test_r = reg_model.predict(X_test_r_sc)

r2_train = r2_score(y_train_r, y_pred_train_r)
r2_test = r2_score(y_test_r, y_pred_test_r)
mse_test = mean_squared_error(y_test_r, y_pred_test_r)
rmse_test = np.sqrt(mse_test)  # RMSE is in the same units as WQI
mae_test = mean_absolute_error(y_test_r, y_pred_test_r)

banner = '=' * 50
print(banner)
print('    WQI REGRESSION METRICS')
print(banner)
print(
    f'  R² Score (Train) : {r2_train:.4f}',
    f'  R² Score (Test)  : {r2_test:.4f}',
    f'  RMSE     (Test)  : {rmse_test:.4f}',
    f'  MAE      (Test)  : {mae_test:.4f}',
    sep='\n',
)
print(banner)

In [None]:
# Two diagnostics side by side: the training loss curve, and predicted
# versus actual WQI on the test split.
fig, (ax_loss, ax_fit) = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# Left panel: training loss recorded by the estimator during fitting.
ax_loss.plot(reg_model.loss_curve_, color='steelblue', linewidth=2)
ax_loss.set_title('Regression — Training Loss Curve', fontweight='bold')
ax_loss.set_xlabel('Iterations')
ax_loss.set_ylabel('MSE Loss')
ax_loss.grid(True, alpha=0.3)

# Right panel: points on the dashed diagonal are perfect predictions.
ax_fit.scatter(y_test_r, y_pred_test_r, alpha=0.25, s=8, color='steelblue')
diagonal = [y_test_r.min(), y_test_r.max()]
ax_fit.plot(diagonal, diagonal, 'r--', lw=2, label='Perfect Fit')
ax_fit.set_title(f'WQI: Actual vs Predicted  R²={r2_test:.4f}', fontweight='bold')
ax_fit.set_xlabel('Actual WQI')
ax_fit.set_ylabel('Predicted WQI')
ax_fit.legend()
ax_fit.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Residual = actual - predicted, so a positive residual means the model
# under-predicted WQI for that sample.
residuals = y_test_r - y_pred_test_r

fig, (ax_scatter, ax_hist) = plt.subplots(nrows=1, ncols=2, figsize=(13, 4))

# Left panel: residuals against predictions, with a zero reference line.
ax_scatter.scatter(y_pred_test_r, residuals, alpha=0.25, s=8, color='darkorange')
ax_scatter.axhline(0, color='red', lw=2, linestyle='--')
ax_scatter.set_title('Residuals vs Predicted Values', fontweight='bold')
ax_scatter.set_xlabel('Predicted WQI')
ax_scatter.set_ylabel('Residuals')
ax_scatter.grid(True, alpha=0.3)

# Right panel: distribution of the residuals.
ax_hist.hist(residuals, bins=60, color='darkorange', edgecolor='white')
ax_hist.set_title('Residuals Distribution', fontweight='bold')
ax_hist.set_xlabel('Residual')
ax_hist.set_ylabel('Frequency')
ax_hist.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Model 2 — Water Quality Classification (Deep Neural Network)

**Architecture:** `Input(15)` → `Dense(512, ReLU)` → `Dense(256, ReLU)` → `Dense(128, ReLU)` → `Dense(64, ReLU)` → `Output(5, Softmax)`

In [None]:
# Deep MLP classifier for the water-quality class. With early_stopping=True,
# 10% of the training rows are held out internally as a validation set and
# training stops once the validation score has not improved for 20
# consecutive epochs.
clf_model = MLPClassifier(
    hidden_layer_sizes=(512, 256, 128, 64),
    activation='relu',
    solver='adam',
    learning_rate_init=0.001,
    batch_size=256,
    max_iter=500,
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=20,
    random_state=42,
    verbose=False
)

# Derive the banner from the data and the estimator instead of hard-coding
# "15" inputs and "5" classes, so it stays correct if the feature list or
# the set of classes in the dataset changes.
n_classes = len(le.classes_)
clf_layers = (X_train_c_sc.shape[1], *clf_model.hidden_layer_sizes)
clf_arch = ' -> '.join(str(width) for width in clf_layers)
print(f'Architecture: {clf_arch} -> {n_classes}(Softmax)')
print('Optimizer   : Adam | Loss: Cross-Entropy | Early Stopping')
print('Classes     :', list(le.classes_))
print('Training...')
clf_model.fit(X_train_c_sc, y_train_c)
print(f'Done. Iterations: {clf_model.n_iter_}')

In [None]:
# Score the fitted classifier on both splits. The training accuracy is
# reported only as an overfitting check against the test accuracy.
y_pred_train_c = clf_model.predict(X_train_c_sc)
y_pred_test_c  = clf_model.predict(X_test_c_sc)

acc_train   = accuracy_score(y_train_c, y_pred_train_c)
acc_test    = accuracy_score(y_test_c,  y_pred_test_c)
f1_macro    = f1_score(y_test_c, y_pred_test_c, average='macro')
f1_weighted = f1_score(y_test_c, y_pred_test_c, average='weighted')

banner = '=' * 58
print(banner)
print('  WATER QUALITY CLASSIFICATION METRICS')
print(banner)
print(f'  Accuracy       (Train) : {acc_train:.4f}')
print(f'  Accuracy       (Test)  : {acc_test:.4f}')
print(f'  F1 Score Macro (Test)  : {f1_macro:.4f}')
print(f'  F1 Score Wtd   (Test)  : {f1_weighted:.4f}')
print(banner)

# LabelEncoder assigns the codes 0..n_classes-1 in the order of le.classes_.
# Passing them explicitly keeps `target_names` aligned with the rows of the
# report; without `labels`, classification_report raises a ValueError when a
# class is absent from both the test labels and the predictions.
class_labels = np.arange(len(le.classes_))
print('\nClassification Report:')
print(classification_report(y_test_c, y_pred_test_c,
                            labels=class_labels, target_names=le.classes_))

In [None]:
# Plot the training loss and, when early stopping recorded them, the
# validation scores on a secondary y-axis.
#
# Explicit Axes objects are used on purpose: plt.twinx() makes the twin the
# current axes, so state-machine calls issued afterwards (plt.title,
# plt.xlabel, plt.ylabel, plt.legend, plt.grid) act on the accuracy axis.
# That would overwrite its "Validation Accuracy" label and its legend, and
# leave the loss axis without a label or legend.
fig, ax_loss = plt.subplots(figsize=(8, 4))
ax_loss.plot(clf_model.loss_curve_, color='steelblue', linewidth=2, label='Train Loss')
ax_loss.set_title('Classification Model — Training Loss Curve', fontweight='bold')
ax_loss.set_xlabel('Iterations')
ax_loss.set_ylabel('Cross-Entropy Loss')
ax_loss.grid(True, alpha=0.3)
ax_loss.legend(loc='upper right')

# validation_scores_ may be missing or None when early stopping is disabled,
# depending on the scikit-learn version; getattr covers both cases.
val_scores = getattr(clf_model, 'validation_scores_', None)
if val_scores is not None:
    ax_val = ax_loss.twinx()
    ax_val.plot(val_scores, color='darkorange', linewidth=2, alpha=0.8, label='Val Acc')
    ax_val.set_ylabel('Validation Accuracy', color='darkorange')
    ax_val.legend(loc='lower right')

fig.tight_layout()
plt.show()

In [None]:
# Confusion matrix on the test split (rows = true class, columns = predicted).
#
# LabelEncoder assigns the codes 0..n_classes-1 in the order of le.classes_.
# Passing them as `labels` fixes the matrix to n_classes x n_classes, so it
# always lines up with the tick labels even if some class is absent from
# both the test labels and the predictions.
class_labels = np.arange(len(le.classes_))
cm = confusion_matrix(y_test_c, y_pred_test_c, labels=class_labels)

plt.figure(figsize=(9, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_, linewidths=0.5)
plt.title(f'Confusion Matrix — Classification  Acc={acc_test:.4f}',
          fontsize=13, fontweight='bold')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.xticks(rotation=25, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Per-class precision, recall and F1 on the test split, as grouped bars.
#
# `labels` is passed explicitly (LabelEncoder codes are 0..n_classes-1 in the
# order of le.classes_) so that every class name is a key of the report;
# without it, classification_report raises a ValueError when a class is
# absent from both the test labels and the predictions.
classes = le.classes_
report_dict = classification_report(y_test_c, y_pred_test_c,
                                     labels=np.arange(len(classes)),
                                     target_names=classes, output_dict=True)
f1_pc   = [report_dict[c]['f1-score']  for c in classes]
prec_pc = [report_dict[c]['precision'] for c in classes]
rec_pc  = [report_dict[c]['recall']    for c in classes]

x = np.arange(len(classes))  # one group of bars per class
w = 0.25                     # bar width; three bars per group
fig, ax = plt.subplots(figsize=(11, 5))
ax.bar(x - w, prec_pc, w, label='Precision', color='steelblue', edgecolor='white')
ax.bar(x,     rec_pc,  w, label='Recall',    color='darkorange', edgecolor='white')
ax.bar(x + w, f1_pc,   w, label='F1-Score',  color='seagreen',  edgecolor='white')
ax.set_xticks(x)
ax.set_xticklabels(classes, rotation=20, ha='right')
ax.set_ylim(0, 1.08)  # scores top out at 1; leave a little headroom
ax.set_ylabel('Score')
ax.set_title('Per-Class Precision, Recall & F1-Score', fontweight='bold', fontsize=13)
ax.legend()
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Final Summary

In [None]:
# Final recap of both models. The architecture strings and the learning rate
# are derived from the estimators and the training matrices rather than
# typed in again, so this summary cannot drift from the models that were
# actually trained above.
reg_arch = ' -> '.join(
    str(width) for width in (X_train_r_sc.shape[1], *reg_model.hidden_layer_sizes, 1)
)
clf_arch = ' -> '.join(
    str(width)
    for width in (X_train_c_sc.shape[1], *clf_model.hidden_layer_sizes, len(le.classes_))
)

banner = '=' * 62
print('\n' + banner)
print('        FINAL MODEL PERFORMANCE SUMMARY')
print(banner)
print('\n[Model 1]  WQI Regression — Deep Neural Network')
print(f'  Architecture : {reg_arch}')
print(f'  Optimizer    : Adam (lr={reg_model.learning_rate_init}) | Early Stopping')
print(f'  R² Score Train : {r2_train:.4f}')
print(f'  R² Score Test  : {r2_test:.4f}')
print(f'  RMSE     Test  : {rmse_test:.4f}')
print(f'  MAE      Test  : {mae_test:.4f}')
print('\n[Model 2]  Water Quality Classification — Deep Neural Network')
print(f'  Architecture : {clf_arch}')
print('  Optimizer    : Adam | Loss: Cross-Entropy')
print(f'  Accuracy Train : {acc_train:.4f}')
print(f'  Accuracy Test  : {acc_test:.4f}')
print(f'  F1-Macro Test  : {f1_macro:.4f}')
print(f'  F1-Wtd   Test  : {f1_weighted:.4f}')
print('\n' + banner)