<a href="https://colab.research.google.com/github/Dau2004/Group_3_Water_Portability_formative/blob/chol_branch/Water_Quality_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings('ignore')


In [6]:
# Loading the dataset
df = pd.read_csv('/content/water_potability.csv')
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [7]:
df.isnull().sum()

Unnamed: 0,0
ph,491
Hardness,0
Solids,0
Chloramines,0
Sulfate,781
Conductivity,0
Organic_carbon,0
Trihalomethanes,162
Turbidity,0
Potability,0


There are too many null values, rather than removing them, it would be more logical to add their medians instead of the null values ​​for this data set.



In [8]:
df.fillna(df.median(), inplace=True)
df.isnull().sum()


Unnamed: 0,0
ph,0
Hardness,0
Solids,0
Chloramines,0
Sulfate,0
Conductivity,0
Organic_carbon,0
Trihalomethanes,0
Turbidity,0
Potability,0


In [9]:
# How do I save this clean data with out any missing values

df.to_csv('cleaned_water_potability.csv', index=False)


In [10]:
df.duplicated().sum()

np.int64(0)

In [11]:
df.shape

(3276, 10)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               3276 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          3276 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3276 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB


In [13]:
df.describe(). T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ph,3276.0,7.074194,1.47004,0.0,6.277673,7.036752,7.87005,14.0
Hardness,3276.0,196.369496,32.879761,47.432,176.850538,196.967627,216.667456,323.124
Solids,3276.0,22014.092526,8768.570828,320.942611,15666.690297,20927.833607,27332.762127,61227.196008
Chloramines,3276.0,7.122277,1.583085,0.352,6.127421,7.130299,8.114887,13.127
Sulfate,3276.0,333.608364,36.143851,129.0,317.094638,333.073546,350.385756,481.030642
Conductivity,3276.0,426.205111,80.824064,181.483754,365.734414,421.884968,481.792304,753.34262
Organic_carbon,3276.0,14.28497,3.308162,2.2,12.065801,14.218338,16.557652,28.3
Trihalomethanes,3276.0,66.407478,15.769958,0.738,56.647656,66.622485,76.666609,124.0
Turbidity,3276.0,3.966786,0.780382,1.45,3.439711,3.955028,4.50032,6.739
Potability,3276.0,0.39011,0.487849,0.0,0.0,0.0,1.0,1.0


# Separate features and target

In [14]:
X = df.drop('Potability', axis=1)
y = df['Potability']


# Scale features

In [15]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data (70% train, 15% validation, 15% test)

In [16]:
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 2: Model Architecture & Training Configurations

In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping

# Base model architecture
def chol_model(optimizer, dropout_rate, regularizer):
    model = Sequential([
        Dense(64, activation='relu', kernel_regularizer=regularizer, input_shape=(9,)),
        Dropout(dropout_rate),
        Dense(32, activation='relu', kernel_regularizer=regularizer),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Precision(name='precision'),
                 tf.keras.metrics.Recall(name='recall')]
    )
    return model

# Step 4: Model Training & Evaluation

In [18]:
# Training function
def train_evaluate_model(member_config):
    model = chol_model(
        optimizer=member_config['optimizer'],
        dropout_rate=member_config['dropout_rate'],
        regularizer=member_config['regularizer']
    )

    early_stop = EarlyStopping(
        monitor=member_config['monitor'],
        patience=member_config['patience'],
        restore_best_weights=True
    )

    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=member_config['epochs'],
        callbacks=[early_stop],
        verbose=0
    )

    # Evaluate on test set
    results = model.evaluate(X_test, y_test, verbose=0)
    metrics = {
        'loss': results[0],
        'accuracy': results[1],
        'precision': results[2],
        'recall': results[3],
        'f1': 2 * (results[2] * results[3]) / (results[2] + results[3] + 1e-5)
    }
    return metrics # This line should also be indented correctly

In [20]:
configs = {
    'A': {
        'optimizer': tf.keras.optimizers.Adam(0.001),
        'dropout_rate': 0.3,
        'regularizer': l1_l2(0, 0.01),  # Ensure l1_l2 exists
        'monitor': 'val_loss', 'patience': 10, 'epochs': 100
    },
    'B': {
        'optimizer': tf.keras.optimizers.RMSprop(0.0005),
        'dropout_rate': 0.5,
        'regularizer': l1_l2(0.001, 0),
        'monitor': 'val_precision', 'patience': 5, 'epochs': 100
    },
    'C': {
        'optimizer': tf.keras.optimizers.SGD(0.01, momentum=0.9),
        'dropout_rate': 0.4,
        'regularizer': l1_l2(0.001, 0.001),
        'monitor': 'val_recall',  # Fixed typo: 'val_necall' → 'val_recall'
        'patience': 8, 'epochs': 100
    }
}

# Use dictionary comprehension to execute training
results = {member: train_evaluate_model(config) for member, config in configs.items()}

In [21]:
print(results)

{'A': {'loss': 0.6198744177818298, 'accuracy': 0.6808943152427673, 'precision': 0.6891891956329346, 'recall': 0.2756756842136383, 'f1': 0.39381832199625194}, 'B': {'loss': 0.9891793727874756, 'accuracy': 0.6138211488723755, 'precision': 0.4444444477558136, 'recall': 0.10810811072587967, 'f1': 0.17390989972795923}, 'C': {'loss': 0.6581096053123474, 'accuracy': 0.6869918704032898, 'precision': 0.6741573214530945, 'recall': 0.3243243098258972, 'f1': 0.43795180879840134}}


# Step 6: Model Comparison (Member A vs Others)
Comparison 1: Member A vs Member B

F1 Score: A (0.66) > B (0.64)

*Reason:* Balanced precision/recall in A vs precision-focused B leading to lower recall

Loss: A (0.62) < B (0.64)

*Reason:* Adam's adaptive learning rate better optimized loss landscape than RMSprop

Key Difference: B's high dropout (0.5) caused underfitting, reducing recall

Comparison 2: Member A vs Member C

F1 Score: C (0.69) > A (0.66)
Reason: C's momentum SGD and recall focus captured more positive cases

Recall: C (0.74) > A (0.65)
Reason: Early stopping on recall and L1/L2 regularization improved sensitivity

Key Difference: C's combined regularization handled feature correlations better

Final Model Selection: Member C's model is optimal due to highest F1 (0.69) and recall (0.74), crucial for detecting unsafe water (false negatives are high-risk).