In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("water_potability.csv")
for column in df.columns:
    if df[column].isnull().sum() > 0:
        df[column] = df.groupby('Potability')[column].transform(lambda x: x.fillna(x.median()))
df['TDS_to_Conductivity'] = df['Solids'] / df['Conductivity']
df['Organic_to_Turbidity'] = df['Organic_carbon'] / df['Turbidity']
df['Hardness_to_Solids'] = df['Hardness'] / df['Solids']
df['pH_Deviation'] = abs(df['ph'] - 7.0)  # Deviation from neutral pH
df['TDS_Concentration'] = df['Solids'] / df['Conductivity']  # Another TDS measure
df['Organic_Load'] = df['Organic_carbon'] * df['Turbidity']  # Combined organic measure

X = df.drop('Potability', axis=1)
y = df['Potability']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.005,
    subsample=0.85,
    colsample_bytree=0.85,
    min_child_weight=2,
    gamma=0.1,
    objective='binary:logistic',
    random_state=42,
    n_jobs=-1,
    tree_method='hist',  
    enable_categorical=False 
)


print("Training model...")
model.fit(
    X_train_scaled, 
    y_train,
    eval_set=[(X_test_scaled, y_test)],
    verbose=False
)

y_pred = model.predict(X_test_scaled)

print("\nModel Performance on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Save the model and scaler
joblib.dump(model, 'gradient_boosting_model1.pkl')
joblib.dump(scaler, 'scaler1.pkl')

print("\nModel and scaler saved successfully!") 

Training model...

Model Performance on Test Set:
Accuracy: 0.7942073170731707

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.92      0.84       400
           1       0.82      0.60      0.70       256

    accuracy                           0.79       656
   macro avg       0.80      0.76      0.77       656
weighted avg       0.80      0.79      0.79       656


Confusion Matrix:
[[367  33]
 [102 154]]

Feature Importance:
                 feature  importance
4                Sulfate    0.205499
12          pH_Deviation    0.168648
0                     ph    0.093166
13     TDS_Concentration    0.058774
7        Trihalomethanes    0.057095
9    TDS_to_Conductivity    0.054708
11    Hardness_to_Solids    0.054359
3            Chloramines    0.050173
1               Hardness    0.049971
2                 Solids    0.043135
6         Organic_carbon    0.035415
5           Conductivity    0.034838
8              Turbidity  