# Water Quality Machine Learning Model

In [22]:
# Imports
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load
from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score, confusion_matrix, ConfusionMatrixDisplay

In [23]:
# Create Pandas DataFrame from water_potability
water_data = pd.read_csv("data/water_potability.csv")

In [24]:
# A look at the DataFrame
water_data.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [25]:
# Remove samples with empty values
water_data.isna().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [26]:
water_data = water_data.dropna()

In [27]:
len(water_data)

2011

In [28]:
# Verify that samples with missing data have been removed
water_data.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [29]:
# Number of samples for machine learning mode
len(water_data)

2011

In [30]:
# Create Machine Learning Estimator

# Split Data
X = water_data.drop('Potability', axis = 1)
y = water_data['Potability']

np.random.seed(7)

# 80% of samples used for training and 20% used for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

model = RandomForestClassifier()

model.fit(X_train, y_train)

# Model score using test set
model.score(X_test, y_test)

0.652605459057072

In [31]:
# Cross Value Scoring
cv_score = cross_val_score(model, X, y)

# Average Cross Value Score before tuning hyperparameters
cv_score.mean()

0.6395084132686444

In [32]:
# Evaluates model 
def hyperparameter_evaluation(y_truths, y_predictions):
    accuracy = accuracy_score(y_truths, y_predictions)
    precision = precision_score(y_truths, y_predictions)
    f1 = f1_score(y_truths, y_predictions)
    recall = recall_score(y_truths, y_predictions)
    
    print("Accuracy: " + str(accuracy) + "\n" + 
          "Precision: " + str(precision) + "\n" +
          "F1: " + str(f1) + "\n" + 
          "Recall: " + str(recall) + "\n")

In [33]:
# Model Evaluation metrics before tuning hyperparameters
y_predictions = model.predict(X_test)
hyperparameter_evaluation(y_test, y_predictions)

Accuracy: 0.652605459057072
Precision: 0.6304347826086957
F1: 0.453125
Recall: 0.35365853658536583



In [34]:
# Creating More Estimators using RandomizedSearchCV
grid = {"n_estimators": [100, 300, 500, 800, 1200],
       "max_depth": [None, 5, 8, 15, 25, 30],
       "min_samples_split": [2, 5, 10, 15, 100],
       "min_samples_leaf": [1, 2, 5, 10]}


model_3 = RandomForestClassifier(n_jobs=1)

randomized_search_results = RandomizedSearchCV(estimator = model_3,
                                               param_distributions = grid,
                                               n_iter = 20,
                                               cv = 5,
                                               verbose = 2)
randomized_search_results.fit(X_train, y_train);

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time=   2.1s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time=   2.1s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time=   2.1s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time=   2.1s
[CV] END max_depth=5, min_samples_leaf=2, min_samples_split=10, n_estimators=1200; total time=   2.1s
[CV] END max_depth=25, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END max_depth=25, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END max_depth=25, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END max_depth=25, min_samples_leaf=5, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END max_depth=2

[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=15, min_samples_leaf=1, min_samples_split=15, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=15, n_estimators=300; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=15, n_estimators=300; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=15, n_estimators=300; total time=   0.8s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=15, n_estimators=300; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=15, n_estim

In [35]:
# Identifies best paramaters found in the RandomizedSearch
randomized_search_results.best_params_

{'n_estimators': 100,
 'min_samples_split': 15,
 'min_samples_leaf': 1,
 'max_depth': 15}

In [36]:
# Make prediction using best parameters
rs_y_predictions = randomized_search_results.predict(X_test)

rs_performance = hyperparameter_evaluation(y_test, rs_y_predictions)

Accuracy: 0.6625310173697271
Precision: 0.6555555555555556
F1: 0.4645669291338582
Recall: 0.3597560975609756



In [38]:
# Use GridSearchCV to check all hyperparameter combinations of following grid
grid_search = {'n_estimators': [100, 300, 500, 800],
               'max_depth': [None, 5, 8, 25, 30],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [1]}

# Create new estimator
gs_clf = RandomForestClassifier(n_jobs = 1)

# Uses the GridSearchCV function to find the best parameters listed in the grid_search dictionsary
gs_clf_estimator = GridSearchCV(estimator = gs_clf,
                                param_grid = grid_search,
                                cv = 5, # Uses 5 fold fitting for each candidate, uses average of 5 folds for best params
                                verbose = 2) 

# Fit the model using the training data
gs_clf_estimator.fit(X_train, y_train)



Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.9s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.0s
[CV] END ma

[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.5s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=5, n_estimators=500; total time=   0.9s


[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.2s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.7s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.7s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.7s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.7s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=300; total time=   0.7s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   1.1s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=   1.2s
[CV] END max_depth=8, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total tim

[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.0s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   1.1s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   1.7s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   1.8s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=   1.6s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total tim

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=1),
             param_grid={'max_depth': [None, 5, 8, 25, 30],
                         'min_samples_leaf': [1],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 300, 500, 800]},
             verbose=2)

In [39]:
# Reports best paramaters found in GridSearchCV
gs_clf_estimator.best_params_

{'max_depth': 30,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 100}

In [40]:
# Review metrics of best parameters found in GridSearchCV
gs_y_predictions = gs_clf_estimator.predict(X_test)

gs_performance = hyperparameter_evaluation(y_test, gs_y_predictions)

Accuracy: 0.6674937965260546
Precision: 0.65625
F1: 0.4846153846153845
Recall: 0.38414634146341464



In [47]:
# This Estimator uses the best paramaters found in GridSearchCV and will be used in the application
water_clf = RandomForestClassifier(n_estimators = 300,
                                   min_samples_split = 5,
                                   min_samples_leaf = 1,
                                   max_depth = 30)
water_clf.fit(X_train, y_train)

water_clf_predictions = water_clf.predict(X_test)

water_data_performance = hyperparameter_evaluation(y_test, water_clf_predictions)

Accuracy: 0.6625310173697271
Precision: 0.6521739130434783
F1: 0.46874999999999994
Recall: 0.36585365853658536



In [None]:
# Export water_clf estimator using joblib dump
#dump(water_clf, 'rf_water_quality_estimator')

In [None]:
# Create Graphs that analyze algorithm efficiency
X = water_data.drop('Potability', axis = 1)
y = water_data['Potability']

y_predictions = water_clf.predict(X_test)

water_confusion = confusion_matrix(y_test, y_predictions)
water_confusion

In [None]:
# Confusion matrix that display's model's predictions
confusion_predictions = ConfusionMatrixDisplay.from_predictions(y_true = y_test, y_pred = y_predictions)
plt.title("Random Forest Performance on Test Set")
#plt.savefig('confusion_matrix.png')