In [None]:
# 📘 Train Once If Not Already Trained
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# Train only when at least one saved artifact is missing.
# The loading cell needs all four files, so checking the model file alone
# would skip training and then fail on a missing scaler/encoder/column file.
model_path = "water_model_large.pkl"
scaler_path = "scaler_large.pkl"
encoders_path = "encoders_large.pkl"
columns_path = "columns_large.pkl"
artifact_paths = [model_path, scaler_path, encoders_path, columns_path]

if not all(os.path.exists(path) for path in artifact_paths):
    print("🚀 Training model...")

    # Load dataset
    df = pd.read_csv("large_dataset.csv")

    # Use only minimal important columns
    features = ['pH', 'Nitrate', 'Turbidity', 'Chloride', 'Color', 'Source', 'City']
    target = 'Target'
    # .copy() gives an independent frame so the imputations below are not
    # writes into a slice of the originally loaded frame.
    df = df[features + [target]].copy()

    # Handle null values: median for numeric columns, most frequent value otherwise.
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`;
    # the chained in-place form does not reliably update `df` in recent pandas.
    # NOTE(review): this also imputes missing values in the target column,
    # as the original did — confirm that is intended.
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            fill_value = df[col].median()
        else:
            fill_value = df[col].mode()[0]
        df[col] = df[col].fillna(fill_value)

    # Encode categorical columns (encoders are saved for reuse at prediction time)
    label_encoders = {}
    for col in ['Color', 'Source', 'City']:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    # Extract features and target
    X = df.drop(target, axis=1)
    y = df[target]

    # Flip labels unconditionally so that 1 means drinkable.
    # Assumes the raw data encodes 0 = Drinkable and 1 = Not Drinkable — TODO confirm.
    # Any target value other than 0 or 1 becomes NaN here.
    y = y.map({0: 1, 1: 0})

    # Split before scaling so the scaler never sees test rows;
    # stratify keeps the class ratio the same in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Scale features: fit on the training partition only, then apply to both
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train model with class_weight balancing
    model = HistGradientBoostingClassifier(class_weight='balanced', random_state=42)
    model.fit(X_train_scaled, y_train)

    # Evaluate on the held-out partition
    y_pred = model.predict(X_test_scaled)
    print("✅ Model Trained.")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # Save files
    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)
    joblib.dump(label_encoders, encoders_path)
    joblib.dump(X.columns.tolist(), columns_path)

else:
    print("✅ Model already trained. Skipping training.")


In [2]:
# 📦 Load Saved Components
import joblib
import pandas as pd

# Restore the four training artifacts, in the same order as before:
# classifier, feature scaler, per-column label encoders, feature column order.
model, scaler, label_encoders, feature_columns = (
    joblib.load(artifact_file)
    for artifact_file in (
        "water_model_large.pkl",
        "scaler_large.pkl",
        "encoders_large.pkl",
        "columns_large.pkl",
    )
)

# 🔮 Define Prediction Function
def predict_drinkability(pH, Nitrate, Turbidity, Chloride, Color, Source, City):
    """Classify a single water sample with the saved model.

    Uses the notebook-level ``label_encoders``, ``feature_columns``,
    ``scaler`` and ``model`` objects loaded from disk.

    Args:
        pH, Nitrate, Turbidity, Chloride: measurement values, passed through
            the scaler unchanged.
        Color, Source, City: categorical values; each must be one of the
            categories its saved encoder was fitted on.

    Returns:
        "Drinkable ✅" when the model predicts class 1, otherwise
        "Not Drinkable ❌".

    Raises:
        ValueError: if a categorical value is not among the categories
            known to its encoder.
    """
    sample = {
        'pH': pH,
        'Nitrate': Nitrate,
        'Turbidity': Turbidity,
        'Chloride': Chloride,
        'Color': Color,
        'Source': Source,
        'City': City
    }
    data = pd.DataFrame([sample])

    for col, le in label_encoders.items():
        # Check up front so the error names the offending field and the
        # accepted values, rather than failing inside the encoder.
        known = list(le.classes_)
        if sample[col] not in known:
            raise ValueError(
                f"Unknown value {sample[col]!r} for '{col}'; expected one of {known}"
            )
        data[col] = le.transform(data[col])

    # Reorder columns to match the order used when the scaler was fitted
    data = data[feature_columns]
    data_scaled = scaler.transform(data)
    pred = model.predict(data_scaled)[0]

    # Class 1 is treated as drinkable
    return "Drinkable ✅" if pred == 1 else "Not Drinkable ❌"


In [7]:
# 🧪 Sample Tests
# print(predict_drinkability(7.2, 10.0, 0.4, 110.0, 'Colorless', 'River', 'Fremont'))          # ✅
# print(predict_drinkability(5.5, 60.0, 3.2, 280.0, 'Light Yellow', 'Ground', 'San Francisco')) # ❌
# print(predict_drinkability(6.8, 8.0, 0.5, 95.0, 'Near Colorless', 'Ground', 'Alameda'))       # ✅
# print(predict_drinkability(4.9, 48.0, 5.5, 330.0, 'Faint Yellow', 'River', 'Oakland'))        # ❌
# print(predict_drinkability(7.5, 6.5, 0.2, 85.0, 'Colorless', 'Lake', 'San Jose'))             # ✅
# print(predict_drinkability(6.3, 25.0, 4.1, 400.0, 'Light Yellow', 'River', 'Fremont'))        # ❌
# print(predict_drinkability(7.0, 9.0, 0.6, 100.0, 'Colorless', 'Lake', 'Redwood City'))        # ✅
# print(predict_drinkability(4.2, 55.0, 2.8, 260.0, 'Faint Yellow', 'Ground', 'San Francisco')) # ❌
