In [14]:
pip install xgboost shap

Collecting shap
  Downloading shap-0.46.0-cp311-cp311-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Downloading shap-0.46.0-cp311-cp311-win_amd64.whl (456 kB)
   ---------------------------------------- 0.0/456.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/456.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/456.1 kB ? eta -:--:--
   ---------------------------------------- 0.0/456.1 kB ? eta -:--:--
   - -------------------------------------- 20.5/456.1 kB ? eta -:--:--
   --- ----------------------------------- 41.0/456.1 kB 653.6 kB/s eta 0:00:01
   --- ----------------------------------- 41.0/456.1 kB 653.6 kB/s eta 0:00:01
   --- ----------------------------------- 41.0/456.1 kB 653.6 kB/s eta 0:00:01
   --- ----------------------------------- 41.0/456.1 kB 653.6 kB/s eta 0:00:01
   --- ----------------------------------- 41.0/456.1 kB 653.6 kB/s eta 0:00:01
 

ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'D:\\Anaconda\\Lib\\site-packages\\slicer\\__init__.py'
Consider using the `--user` option or check the permissions.



Collecting shap
  Using cached shap-0.46.0-cp311-cp311-win_amd64.whl.metadata (25 kB)
Collecting slicer==0.0.8 (from shap)
  Using cached slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)
Using cached shap-0.46.0-cp311-cp311-win_amd64.whl (456 kB)
Using cached slicer-0.0.8-py3-none-any.whl (15 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.46.0 slicer-0.0.8
Note: you may need to restart the kernel to use updated packages.


In [10]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# ==============================
# Define Paths
# ==============================
dataset_path = r"C:\Users\hp\Desktop\EPICS\water_potability.csv"
save_model_path = r"C:\Users\hp\Desktop\EPICS\models2"

# Ensure models directory exists
os.makedirs(save_model_path, exist_ok=True)

# ==============================
# Load & Preprocess Dataset
# ==============================
df = pd.read_csv(dataset_path)

# Handle missing values using median imputation
imputer = SimpleImputer(strategy="median")
df.iloc[:, :-1] = imputer.fit_transform(df.iloc[:, :-1])

# Feature Engineering: Safe limits for water quality
df["safe_solids"] = df["Solids"].apply(lambda x: 1 if x <= 500 else 0)
df["optimal_ph"] = df["ph"].apply(lambda x: 1 if 6.5 <= x <= 8.5 else 0)
df["safe_chloramines"] = df["Chloramines"].apply(lambda x: 1 if 1 <= x <= 4 else 0)
df["safe_sulfate"] = df["Sulfate"].apply(lambda x: 1 if x <= 250 else 0)
df["safe_conductivity"] = df["Conductivity"].apply(lambda x: 1 if 50 <= x <= 1500 else 0)
df["safe_organic_carbon"] = df["Organic_carbon"].apply(lambda x: 1 if x <= 5 else 0)
df["safe_trihalomethanes"] = df["Trihalomethanes"].apply(lambda x: 1 if x <= 80 else 0)
df["safe_turbidity"] = df["Turbidity"].apply(lambda x: 1 if x <= 5 else 0)

# Define input features (X) and target variable (y)
X = df.drop(columns=["Potability"])
y = df["Potability"]

# Normalize data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTE
smote = SMOTE(sampling_strategy=0.75, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split data into training & testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# ==============================
# Train Random Forest with Hyperparameter Tuning
# ==============================
rf = RandomForestClassifier(random_state=42)

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}

grid_search = GridSearchCV(rf, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_

# Evaluate Random Forest
rf_preds = best_rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)
print(f"\nRandom Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("Random Forest Classification Report:\n", classification_report(y_test, rf_preds))

# Save Random Forest model
joblib.dump(best_rf, os.path.join(save_model_path, "random_forest_model.pkl"))

# ==============================
# Train XGBoost with Improved Hyperparameters
# ==============================
xgb = XGBClassifier(
    n_estimators=300, 
    learning_rate=0.05, 
    max_depth=10, 
    scale_pos_weight=1.2, 
    random_state=42
)

xgb.fit(X_train, y_train)

# Evaluate XGBoost
xgb_preds = xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_preds)
print(f"\nXGBoost Accuracy: {xgb_accuracy * 100:.2f}%")
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_preds))

# Save XGBoost model
joblib.dump(xgb, os.path.join(save_model_path, "xgboost_model.pkl"))

# ==============================
# Save Preprocessing Objects
# ==============================
joblib.dump(scaler, os.path.join(save_model_path, "scaler.pkl"))
joblib.dump(imputer, os.path.join(save_model_path, "imputer.pkl"))

print("\nBoth models trained and saved successfully in 'models2' folder!")



Random Forest Accuracy: 71.29%
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.86      0.78       414
           1       0.71      0.50      0.59       286

    accuracy                           0.71       700
   macro avg       0.71      0.68      0.68       700
weighted avg       0.71      0.71      0.70       700


XGBoost Accuracy: 69.86%
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.77      0.75       414
           1       0.64      0.59      0.62       286

    accuracy                           0.70       700
   macro avg       0.69      0.68      0.68       700
weighted avg       0.70      0.70      0.70       700


Both models trained and saved successfully in 'models2' folder!


In [18]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Define file paths
dataset_path = r"C:\Users\hp\Desktop\EPICS\water_potability.csv"
save_model_path = r"C:\Users\hp\Desktop\EPICS\models2"

# Ensure models directory exists
os.makedirs(save_model_path, exist_ok=True)

# Load dataset
df = pd.read_csv(dataset_path)

# Handle missing values using median imputation
imputer = SimpleImputer(strategy="median")
df.iloc[:, :-1] = imputer.fit_transform(df.iloc[:, :-1])

# Define input features and target variable
X = df.drop(columns=["Potability"])
y = df["Potability"]

# Normalize data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# ==============================
# Train Random Forest
# ==============================
rf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, random_state=42)
rf.fit(X_train, y_train)

# Evaluate Random Forest
rf_preds = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("Random Forest Classification Report:\n", classification_report(y_test, rf_preds))

# Save Random Forest model
joblib.dump(rf, os.path.join(save_model_path, "random_forest_model.pkl"))

# ==============================
# Train XGBoost
# ==============================
xgb = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
xgb.fit(X_train, y_train)

# Evaluate XGBoost
xgb_preds = xgb.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_preds)
print(f"\nXGBoost Accuracy: {xgb_accuracy * 100:.2f}%")
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_preds))

# Save XGBoost model
joblib.dump(xgb, os.path.join(save_model_path, "xgboost_model.pkl"))

# Save Preprocessing Objects
joblib.dump(scaler, os.path.join(save_model_path, "scaler.pkl"))
joblib.dump(imputer, os.path.join(save_model_path, "imputer.pkl"))

print("\nBoth models trained and saved successfully in 'models2' folder!")

Random Forest Accuracy: 73.75%
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.75      0.74       394
           1       0.75      0.72      0.74       406

    accuracy                           0.74       800
   macro avg       0.74      0.74      0.74       800
weighted avg       0.74      0.74      0.74       800


XGBoost Accuracy: 67.12%
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.68      0.67       394
           1       0.68      0.66      0.67       406

    accuracy                           0.67       800
   macro avg       0.67      0.67      0.67       800
weighted avg       0.67      0.67      0.67       800


Both models trained and saved successfully in 'models2' folder!


In [21]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Define file paths
dataset_path = r"C:\Users\hp\Desktop\EPICS\water_potability.csv"
save_model_path = r"C:\Users\hp\Desktop\EPICS\models2"

# Ensure models directory exists
os.makedirs(save_model_path, exist_ok=True)

# Load dataset
df = pd.read_csv(dataset_path)

# Handle missing values using median imputation
imputer = SimpleImputer(strategy="median")
df.iloc[:, :-1] = imputer.fit_transform(df.iloc[:, :-1])

# Define input features and target variable
X = df.drop(columns=["Potability"])
y = df["Potability"]

# Normalize data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# ==============================
# Train Random Forest (Optimized)
# ==============================
rf = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, random_state=42)
rf.fit(X_train, y_train)

# Evaluate Random Forest
rf_preds = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_preds)
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("Random Forest Classification Report:\n", classification_report(y_test, rf_preds))

# Save Random Forest model
joblib.dump(rf, os.path.join(save_model_path, "random_forest_model.pkl"))

# Save Preprocessing Objects
joblib.dump(scaler, os.path.join(save_model_path, "scaler.pkl"))
joblib.dump(imputer, os.path.join(save_model_path, "imputer.pkl"))

print("\nRandom Forest model trained and saved successfully in 'models2' folder!")


Random Forest Accuracy: 73.75%
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.75      0.74       394
           1       0.75      0.72      0.74       406

    accuracy                           0.74       800
   macro avg       0.74      0.74      0.74       800
weighted avg       0.74      0.74      0.74       800


Random Forest model trained and saved successfully in 'models2' folder!


In [1]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectFromModel

# Define file paths
dataset_path = r"C:\Users\hp\Desktop\EPICS\water_potability.csv"
save_model_path = r"C:\Users\hp\Desktop\EPICS\models2"

# Ensure models directory exists
os.makedirs(save_model_path, exist_ok=True)

# Load dataset
df = pd.read_csv(dataset_path)

# Handle missing values using median imputation
imputer = SimpleImputer(strategy="median")
df.iloc[:, :-1] = imputer.fit_transform(df.iloc[:, :-1])

# Define input features and target variable
X = df.drop(columns=["Potability"])
y = df["Potability"]

# Normalize data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# ==============================
# Feature Selection: Reduce importance of Hardness & Organic Carbon
# ==============================
rf_initial = RandomForestClassifier(n_estimators=200, random_state=42)
rf_initial.fit(X_train, y_train)

# Get feature importances
feature_importances = rf_initial.feature_importances_
feature_names = X.columns

# Reduce importance of Hardness & Organic Carbon
for i, feature in enumerate(feature_names):
    if feature in ["Hardness", "Organic_carbon"]:
        feature_importances[i] *= 0.3  # Reduce weight by 70%

# Select features based on importance
selector = SelectFromModel(rf_initial, threshold=np.percentile(feature_importances, 20), prefit=True)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# ==============================
# Train Final Random Forest Model
# ==============================
rf_final = RandomForestClassifier(n_estimators=200, max_depth=None, min_samples_split=2, random_state=42)
rf_final.fit(X_train_selected, y_train)

# Evaluate Random Forest
rf_preds = rf_final.predict(X_test_selected)
rf_accuracy = accuracy_score(y_test, rf_preds)
print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")
print("Random Forest Classification Report:\n", classification_report(y_test, rf_preds))

# Save Final Random Forest Model
joblib.dump(rf_final, os.path.join(save_model_path, "random_forest_model.pkl"))

# Save Preprocessing Objects
joblib.dump(scaler, os.path.join(save_model_path, "scaler.pkl"))
joblib.dump(imputer, os.path.join(save_model_path, "imputer.pkl"))
joblib.dump(selector, os.path.join(save_model_path, "feature_selector.pkl"))

print("\nOptimized Random Forest model trained and saved successfully in 'models2' folder!")

Random Forest Accuracy: 73.75%
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.75      0.74       394
           1       0.75      0.72      0.74       406

    accuracy                           0.74       800
   macro avg       0.74      0.74      0.74       800
weighted avg       0.74      0.74      0.74       800


Optimized Random Forest model trained and saved successfully in 'models2' folder!
