In [1]:
# 📦 Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [2]:
# 📥 Step 2: Load Dataset
# Read the raw CSV, then normalise the header by trimming stray whitespace
# from every column name before anything refers to columns by name.
df = pd.read_csv("Water Quality Prediction.csv")
df.columns = [name.strip() for name in df.columns]
print("Shape:", df.shape)
df.head()

Shape: (1048575, 24)


Unnamed: 0,Index,pH,Iron,Nitrate,Chloride,Lead,Zinc,Color,Turbidity,Fluoride,...,Chlorine,Manganese,Total Dissolved Solids,Source,Water Temperature,Air Temperature,Month,Day,Time of Day,Target
0,0,8.332988,8.3e-05,8.605777,122.799772,3.71e-52,3.434827,Colorless,0.022683,0.607283,...,3.708178,2.27e-15,332.118789,,,43.493324,January,29.0,4.0,0
1,1,6.917863,8.1e-05,3.734167,227.029851,7.85e-94,1.245317,Faint Yellow,0.019007,0.622874,...,3.292038,8.02e-07,284.641984,Lake,15.348981,71.220586,November,26.0,16.0,0
2,2,5.443762,0.020106,3.816994,230.99563,5.29e-76,0.52828,Light Yellow,0.319956,0.423423,...,3.560224,0.07007989,570.054094,River,11.643467,44.89133,January,31.0,8.0,0
3,3,7.955339,0.143988,8.224944,178.12994,4e-176,4.027879,Near Colorless,0.166319,0.208454,...,3.516907,0.02468295,100.043838,Ground,10.092392,60.843233,April,1.0,21.0,0
4,4,8.091909,0.002167,9.925788,186.540872,4.17e-132,3.807511,Light Yellow,0.004867,0.222912,...,3.177849,0.003296139,168.075545,Spring,15.249416,69.336671,June,29.0,7.0,0


In [3]:
# 🧹 Step 3: Data Cleaning
# 1) Discard exact duplicate rows.
# 2) Fill gaps in numeric columns with that column's median; non-numeric
#    columns are not touched by this fill and keep their missing values.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed later in the notebook - confirm this is intended.
df = df.drop_duplicates()
numeric_medians = df.median(numeric_only=True)
df = df.fillna(numeric_medians)

In [4]:
# Show the cleaned-up column names as a plain Python list.
print(list(df.columns))

['Index', 'pH', 'Iron', 'Nitrate', 'Chloride', 'Lead', 'Zinc', 'Color', 'Turbidity', 'Fluoride', 'Copper', 'Odor', 'Sulfate', 'Conductivity', 'Chlorine', 'Manganese', 'Total Dissolved Solids', 'Source', 'Water Temperature', 'Air Temperature', 'Month', 'Day', 'Time of Day', 'Target']


In [5]:
# 🏗️ Step 4: Encode Categorical Features (if any)
# Candidate categorical columns. Only the ones actually stored as strings
# (dtype 'object') are encoded; columns that are already numeric are skipped.
categorical_cols = ['Source', 'Time of Day', 'Month', 'Day']

# Keep every fitted encoder instead of throwing it away: the category -> integer
# mapping used for training (le.classes_) is needed to encode raw inputs the
# same way at prediction time and to decode the integers back to labels.
label_encoders = {}

for col in categorical_cols:
    if df[col].dtype == 'object':
        # LabelEncoder numbers the labels in sorted order, so e.g. month names
        # end up ordered alphabetically rather than by calendar position.
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le


In [6]:
# 🧪 Step 5: Train-Test Split
# NOTE(review): X, y and the four split variables created here are reassigned
# by the later "Train-Test Split" cell, and 'Color' is still a string column at
# this point - consider removing this cell.
target_col = 'Target'

# errors='ignore': a later cell removes 'Index' from df, so without it a re-run
# of this cell after that point fails with KeyError.
X = df.drop(columns=[target_col, 'Index'], errors='ignore')
y = df[target_col]

# 80/20 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
# ✅ Detect and encode all object-type columns
# (LabelEncoder is already imported in the Step 1 imports cell.)

# Drop the row-identifier column. errors='ignore' keeps this cell re-runnable:
# without it a second execution raises KeyError because 'Index' is already gone.
df = df.drop(columns=['Index'], errors='ignore')

# Strip column names
df.columns = df.columns.str.strip()

# Identify non-numeric columns (excluding the target)
non_numeric_cols = [
    col
    for col in df.select_dtypes(include=['object']).columns
    if col != 'Target'
]

# Keep the fitted encoders so the category -> integer mapping (le.classes_) can
# be re-applied to raw inputs at prediction time. An existing registry in the
# notebook namespace is extended rather than replaced.
label_encoders = globals().get('label_encoders', {})

# Encode them
for col in non_numeric_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [8]:
# 🧪 Train-Test Split
# Features are every column except the label; the label is 'Target'.
X = df.drop(columns='Target')
y = df['Target']

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ⚖️ Feature Scaling
# The scaler learns its statistics from the training part only and is then
# applied unchanged to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
# 🤖 Step 7: Model Training - Random Forest
# 100 trees, fixed seed; trained on the scaled training features.
forest_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestClassifier(**forest_params)
model.fit(X_train_scaled, y_train)

In [10]:
# ✅ Step 8: Model Evaluation
# Score the held-out rows, compute all three metrics, then print them.
y_pred = model.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_mat)


Accuracy: 0.8766516462818588

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.85      0.91    161568
           1       0.66      0.96      0.78     48147

    accuracy                           0.88    209715
   macro avg       0.82      0.91      0.85    209715
weighted avg       0.91      0.88      0.88    209715


Confusion Matrix:
 [[137416  24152]
 [  1716  46431]]


In [11]:
# 💾 Step 9: Save Model & Scaler
# Both artifacts are needed together at prediction time: the scaler must be
# applied to the input before it is handed to the model.
model_path, scaler_path = "water_quality_model.pkl", "scaler.pkl"
joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)


['scaler.pkl']

In [12]:
# Function to predict potability from raw input
def predict_water_potability(input_data, model=None, scaler=None):
    """
    Predict whether a single water sample is potable.

    Parameters
    ----------
    input_data : sequence of numbers
        Feature values for ONE sample, already numerically encoded, in the
        same order as the training feature columns.
    model : optional
        Fitted classifier exposing ``predict``. When omitted it is loaded from
        "water_quality_model.pkl". Pass it explicitly to avoid re-reading the
        file on every call.
    scaler : optional
        Fitted scaler exposing ``transform``. When omitted it is loaded from
        "scaler.pkl".

    Returns
    -------
    str
        "Potable" when the predicted class equals 1, otherwise "Not Potable".

    Raises
    ------
    ValueError
        If the number of values differs from the number of features the
        scaler reports it was fitted on.
    """
    # joblib.load unpickles arbitrary objects: only load files you created.
    if model is None:
        model = joblib.load("water_quality_model.pkl")
    if scaler is None:
        scaler = joblib.load("scaler.pkl")

    # One sample -> a single row of floats.
    row = np.asarray(input_data, dtype=float).reshape(1, -1)

    # Fail early with a readable message when the input has the wrong width.
    expected = getattr(scaler, "n_features_in_", None)
    if expected is not None and row.shape[1] != expected:
        raise ValueError(
            f"input_data has {row.shape[1]} values, expected {expected}"
        )

    # A scaler fitted on a DataFrame remembers its column names; give the row
    # the same names so transform() does not warn about missing feature names.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        row = pd.DataFrame(row, columns=list(feature_names))

    input_scaled = scaler.transform(row)
    prediction = model.predict(input_scaled)

    # NOTE(review): this assumes Target == 1 means potable - confirm against
    # the dataset's documentation.
    return "Potable" if prediction[0] == 1 else "Not Potable"


In [13]:
# Example data from the dataset (just an illustration, modify as needed)
# Take the first held-out row as a plain list of feature values and run it
# through the prediction helper.
first_test_row = X_test.iloc[0]
sample = first_test_row.to_list()
print("Sample Input:", sample)

result = predict_water_potability(sample)
print("Prediction:", result)


Sample Input: [7.89059963, 1.614662589, 17.49056312, 228.6828047, 5.8e-16, 4.815446969, 3.0, 0.431479328, 0.275609108, 0.249108179, 0.628937773, 101.9821163, 612.984735, 3.420375836, 0.0005372949999999999, 478.1628949, 8.0, 29.93279315, 39.80603676, 9.0, 27.0, 23.0]
Prediction: Potable




In [14]:
# Sanity check: how many feature values the sample row carries.
len(sample)

22

In [16]:
# Hand-edited variant of `sample`: only the second value (the 'Iron' column)
# is changed, from 1.614662589 to 5.614662589; all other values are identical.
sample1 = [7.89059963, 5.614662589, 17.49056312, 228.6828047, 5.8e-16, 4.815446969, 3.0, 0.431479328, 0.275609108, 0.249108179, 0.628937773, 101.9821163, 612.984735, 3.420375836, 0.0005372949999999999, 478.1628949, 8.0, 29.93279315, 39.80603676, 9.0, 27.0, 23.0]

In [17]:

# Run the prediction helper on the modified sample and show the label.
result = predict_water_potability(sample1)
print("Prediction:", result)

Prediction: Potable


