**Importing Libraries**

In [1]:
# 1. to handle the data
import pandas as pd
import numpy as np
from scipy import stats

# to visualize the data
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# To preprocess the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
# import iterative imputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# machine learning
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
#for classification tasks
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestRegressor
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
# pipeline
from sklearn.pipeline import Pipeline
# metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error,mean_squared_error,r2_score

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

Reading the **Data**


In [None]:
# read data
df = pd.read_csv("water_potability.csv")
# print first 5 rows
df.head()

**EDA**

In [None]:
# print descriptive statistics
df.describe().style.background_gradient(cmap='coolwarm')

**Print Info**

In [None]:
# print info
print(df.info())

**Target Distribution**

In [None]:
# Target distribution
plt.figure(figsize=(6,6))
# Pie plot
df['Potability'].value_counts().plot.pie(explode=[0.1,0.1],
                    autopct='%1.1f%%', shadow=True,
                    textprops={'fontsize':16}).set_title("Target distribution");

**Histogram**

In [None]:
# Assuming df is your DataFrame
variables = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
             'Organic_carbon', 'Trihalomethanes', 'Turbidity']

# Set up the figure and axes
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 15))

# Flatten the axes for easy iteration
axes = axes.flatten()

# Loop through each variable and create histograms
for i, var in enumerate(variables):
    ax = axes[i]
    sns.histplot(df[var], kde=True, ax=ax)
    ax.axvline(df[var].mean(), color='red', linestyle='--', label='Mean')
    ax.axvline(df[var].median(), color='blue', linestyle='--', label='Median')

    # Annotate plot with mean and median
    ax.annotate(f'Mean: {df[var].mean():.2f}\nMedian: {df[var].median():.2f}',
                xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top')

    ax.set_title(f'Histogram with KDE for {var}')
    ax.set_xlabel(var)
    ax.legend()

# Adjust layout and display
plt.tight_layout()
plt.show()

**Corelation**

In [None]:
# now check the correlation of all columns
plt.figure(figsize=(12,8))
sns.heatmap(df.corr(),annot=True,cmap='coolwarm',linewidths=0.4);

**Parallel coordinates**

In [None]:
import plotly.express as px

# Assuming df is your DataFrame
fig = px.parallel_coordinates(df, color="Potability", color_continuous_scale=['#8DBAFF', '#890D0D'],
                               color_continuous_midpoint=2, height=800, width=1200)

fig.show()

**Missing values**

In [None]:
# print missing values
df.isnull().sum().sort_values(ascending=False)

**Imputing Missing Values**

In [None]:
# missing data columns
missing_data_cols =df.columns[df.isnull().any()]

numeric_cols = ['Sulfate','ph','Trihalomethanes']
# Define numeric_cols,

def impute_continuous_missing_data(passed_col):
    df_null = df[df[passed_col].isnull()]
    df_not_null = df[df[passed_col].notnull()]

    X = df_not_null.drop(passed_col, axis=1)
    y = df_not_null[passed_col]

    other_missing_cols = [col for col in missing_data_cols if col != passed_col]

    iterative_imputer = IterativeImputer(estimator=RandomForestRegressor(random_state=42), add_indicator=True)

    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:
            col_with_missing_values = X[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            X[col] = imputed_values[:, 0]
        else:
            pass

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    rf_regressor = RandomForestRegressor()
    rf_regressor.fit(X_train, y_train)
    y_pred = rf_regressor.predict(X_test)

    print("MAE =", mean_absolute_error(y_test, y_pred))
    print("RMSE =", mean_squared_error(y_test, y_pred, squared=False))
    print("R2 =", r2_score(y_test, y_pred))

    X = df_null.drop(passed_col, axis=1)

    for col in other_missing_cols:
        if X[col].isnull().sum() > 0:
            col_with_missing_values = X[col].values.reshape(-1, 1)
            imputed_values = iterative_imputer.fit_transform(col_with_missing_values)
            X[col] = imputed_values[:, 0]
        else:
            pass

    if len(df_null) > 0:
        df_null[passed_col] = rf_regressor.predict(X)
    else:
        pass

    df_combined = pd.concat([df_not_null, df_null])
    # Check if all missing values have been imputed
    if df_null[passed_col].isnull().sum() == 0:
        print("All missing values in column", passed_col, "imputed successfully.")

    return df_combined[passed_col]

# Impute missing values for numeric columns
for col in numeric_cols:
    print("Missing Values", col, ":", str(round((df[col].isnull().sum() / len(df)) * 100, 2))+"%")
    if col in numeric_cols:
        df[col] = impute_continuous_missing_data(col)

**Training Machine Learning Model** Using Random Forest

In [13]:
# split data into X and y
X = df.drop('Potability', axis=1)
y = df['Potability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, precision_score, accuracy_score

# Define the pipeline with a scaler and the random forest classifier
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Define the hyperparameters for grid search
params = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30]
}

# Perform grid search using the pipeline and parameters
clf = GridSearchCV(pipeline, params, cv=5)
clf.fit(X_train, y_train)

# Get the best model and its parameters
best_model = clf.best_estimator_
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
conf_matrix = confusion_matrix(y_test, y_pred_best)
f1 = f1_score(y_test, y_pred_best)
precision = precision_score(y_test, y_pred_best)

# Print the results
print("Best Random Forest Model:")
print("Test Accuracy:", accuracy_best)
print("F1 Score:", f1)
print("Precision Score:", precision)
print("Confusion Matrix:",conf_matrix)

Best Random Forest Model:
Test Accuracy: 0.7073170731707317
F1 Score: 0.5102040816326531
Precision Score: 0.6756756756756757
Confusion Matrix: [[364  48]
 [144 100]]


In [None]:
df.Potability.value_counts()


**Integrating Flask**

In [16]:
# Import necessary libraries
from flask import Flask, render_template, request
import joblib

# Load the trained ML model
model = joblib.load('your_model.pkl')

# Initialize Flask app
app = Flask(__name__)

# Define route for home page
@app.route('/')
def home():
    return render_template('index.html')

# Define route for prediction
@app.route('/predict', methods=['POST'])
def predict():
    # Get user input from the form
    ph = float(request.form['ph'])
    hardness = float(request.form['hardness'])
    # Add more variables as needed

    # Make prediction using the ML model
    prediction = model.predict([[ph, hardness]])[0]

    # Determine the result message
    if prediction == 1:
        result = 'Potable'
    else:
        result = 'Non-Potable'

    return render_template('result.html', result=result)

if __name__ == '__main__':
    app.run(debug=True)

FileNotFoundError: [Errno 2] No such file or directory: 'your_model.pkl'