<a href="https://colab.research.google.com/github/AtreyeeDas/Water-Portability-Analysis/blob/main/WaterPortabilityAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import pandas as pd
import os
import warnings
import google.generativeai as genai
from google.generativeai import types

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

warnings.simplefilter("ignore")

import kagglehub

In [7]:
# Download the dataset from Kaggle. The returned path is walked below to
# locate the extracted data file.
path = kagglehub.dataset_download("uom190346a/water-quality-and-potability")

# Select a CSV explicitly rather than "the first file os.walk yields":
# os.walk ordering is arbitrary, so a non-CSV file (README, metadata, ...)
# could otherwise be handed to read_csv. Sorting makes the choice stable.
csv_files = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(path)
    for filename in filenames
    if filename.lower().endswith(".csv")
)
if not csv_files:
    raise FileNotFoundError(f"No CSV file found under {path!r}")

filepath = csv_files[0]
df = pd.read_csv(filepath)

Downloading from https://www.kaggle.com/api/v1/datasets/download/uom190346a/water-quality-and-potability?dataset_version_number=2...


100%|██████████| 251k/251k [00:00<00:00, 60.0MB/s]

Extracting files...





In [8]:
# One seed for every stochastic step in this cell, so that
# Restart Kernel -> Run All rebuilds exactly the same training set.
RANDOM_STATE = 18

# Handle missing values: fill each column's gaps with that column's median.
# Rebinding (instead of inplace=True) keeps the cell safe to re-run.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split, so test rows influence the imputation. Revisit this
# before reporting any metric on X_test / y_test.
df = df.fillna(df.median())

# Split features and target
X = df.drop("Potability", axis=1)
y = df.Potability

# Train-test split (stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
)

# Balance data: SMOTE is applied to the training split only.
# SMOTE generates synthetic rows at random; without a seed every run would
# train on a different dataset.
smt = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = smt.fit_resample(X_train, y_train)

# Normalize data: fit the scaler on the training split, then reuse the same
# fitted scaler for the test split (and later for user-supplied samples).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [9]:
# Train a RandomForest model.
# A fixed random_state makes the fitted forest, and therefore every
# prediction made with it, reproducible across kernel restarts.
model = RandomForestClassifier(random_state=18)
model.fit(X_train, y_train)

In [10]:
def predict_potability(user_input):
    """Predict water potability for a single sample.

    Uses the module-level ``scaler`` and ``model`` fitted earlier in the
    notebook.

    Args:
        user_input: Sequence of numeric feature values for one water sample,
            in the same column order the scaler was fitted on.

    Returns:
        str: A formatted banner followed by a safe / not-safe verdict.

    Raises:
        ValueError: If a value is not numeric or not finite, or if the number
            of values differs from the number of features the scaler expects.
    """
    # Coerce up front so bad entries fail here with a clear message instead
    # of somewhere inside scikit-learn.
    input_data = np.asarray(user_input, dtype=float).reshape(1, -1)
    if not np.isfinite(input_data).all():
        raise ValueError("All water parameters must be finite numbers.")

    expected_features = getattr(scaler, "n_features_in_", None)
    if expected_features is not None and input_data.shape[1] != expected_features:
        raise ValueError(
            f"Expected {expected_features} water parameters, "
            f"got {input_data.shape[1]}."
        )

    # If the scaler was fitted on a DataFrame it remembers the column names;
    # passing them back avoids scikit-learn's feature-name mismatch warning.
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is not None:
        input_data = pd.DataFrame(input_data, columns=feature_names)

    input_data = scaler.transform(input_data)
    prediction = model.predict(input_data)

    result = "\n💧 Water Quality Prediction 💧\n" + "="*30 + "\n"
    result += "🔍 Analyzing water sample...\n"
    # Label 1 is reported as drinkable; anything else as not safe.
    result += "✅ Safe to Drink! 😊" if prediction[0] == 1 else "⚠️ Not Safe for Drinking! ❌"
    return result

In [11]:
def analyze_with_gemini(user_input, model_name="gemini-2.5-pro-exp-03-25"):
    """Ask the Gemini API for a textual assessment of a water sample.

    Args:
        user_input: The water parameter values; they are interpolated into
            the prompt as-is.
        model_name: Gemini model identifier to query. Defaults to the model
            this notebook was originally written against.

    Returns:
        str: A formatted banner followed by Gemini's answer, a fallback
        message if the response carries no text, or
        ``"Gemini API key not found."`` if no API key is configured.
    """
    # SECURITY: never commit an API key to the notebook. The key is read from
    # the environment (GEMINI_API_KEY, falling back to GOOGLE_API_KEY).
    # The key that used to be hardcoded here has been published and should
    # be revoked.
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        return "Gemini API key not found."

    genai.configure(api_key=api_key)
    prompt = f"Given the water parameters {user_input} for  the format pH,Hardness (mg/L),Total Dissolved Solids (TDS) (mg/L),Chloramines (mg/L),Sulfate (mg/L),Conductivity (µS/cm), Organic Carbon (mg/L), Trihalomethanes (µg/L), Turbidity (NTU), is the water safe? If not, what are the contamination reasons and purification methods?"
    response = genai.GenerativeModel(model_name).generate_content(prompt)

    gemini_result = "\n🤖 AI Analysis from Gemini API 🤖\n" + "="*35 + "\n"
    try:
        answer = response.text if response else ""
    except ValueError:
        # NOTE(review): the `.text` accessor is expected to raise ValueError
        # when the response has no text part (e.g. a blocked prompt) —
        # confirm against the google-generativeai documentation.
        answer = ""
    gemini_result += answer or "❌ No response from Gemini API."
    return gemini_result

In [12]:
# Show the last five rows of the frame as a quick sanity check.
df.tail(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.80216,8.061362,333.073546,392.44958,19.903225,66.622485,2.798243,1
3273,9.41951,175.762646,33155.578218,7.350233,333.073546,432.044783,11.03907,69.8454,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,333.073546,402.883113,11.168946,77.488213,4.708658,1
3275,7.874671,195.102299,17404.177061,7.509306,333.073546,327.45976,16.140368,78.698446,2.309149,1


In [13]:
# Collect user input: one value per feature column, in training order.
print("Enter water parameters in the following order:")
print(", ".join(X.columns))
user_input = []
for col in X.columns:
    # Keep asking until the entry parses as a finite number, so that a typo
    # does not abort the cell and discard the values already entered.
    while True:
        raw_value = input(f"Enter value for {col}: ")
        try:
            value = float(raw_value)
        except ValueError:
            print(f"'{raw_value}' is not a number, please try again.")
            continue
        # float() accepts "nan" and "inf"; neither is a usable measurement.
        if not np.isfinite(value):
            print(f"'{raw_value}' is not a finite number, please try again.")
            continue
        break
    user_input.append(value)

Enter water parameters in the following order:
ph, Hardness, Solids, Chloramines, Sulfate, Conductivity, Organic_carbon, Trihalomethanes, Turbidity
Enter value for ph: 9
Enter value for Hardness: 350
Enter value for Solids: 40000
Enter value for Chloramines: 8
Enter value for Sulfate: 400
Enter value for Conductivity: 300
Enter value for Organic_carbon: 15
Enter value for Trihalomethanes: 65
Enter value for Turbidity: 5


In [14]:
# Predict water quality and Get Gemini API Analysis
# Both calls receive the same `user_input` list: first the local classifier's
# verdict, then the Gemini API's textual assessment. Each returns a formatted
# string that is printed in the next cell.
prediction = predict_potability(user_input)
gemini_analysis = analyze_with_gemini(user_input)

In [15]:
# Show the classifier verdict followed by the Gemini write-up,
# each starting on its own line.
print(prediction, gemini_analysis, sep="\n")


💧 Water Quality Prediction 💧
🔍 Analyzing water sample...
⚠️ Not Safe for Drinking! ❌

🤖 AI Analysis from Gemini API 🤖
Based on standard drinking water guidelines (like those from the WHO or EPA), let's analyze each parameter:

1.  **pH: 9.0**
    *   Standard Range: Typically 6.5 - 8.5.
    *   Assessment: **High**. Slightly outside the recommended range. High pH can affect taste and potentially piping.

2.  **Hardness: 350.0 mg/L**
    *   Standard Range: No strict health limit, but >300 mg/L is considered very hard.
    *   Assessment: **Very High**. Primarily an aesthetic issue (scaling, soap lathering, taste) rather than a direct health risk according to most standards.

3.  **Total Dissolved Solids (TDS): 40000.0 mg/L**
    *   Standard Range: EPA secondary standard is 500 mg/L. WHO suggests < 600 mg/L is good, > 1200 mg/L is unacceptable.
    *   Assessment: **Extremely High**. This level is far beyond any acceptable limit for drinking water and is comparable to seawater (averag