In [61]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix,roc_auc_score,roc_curve
from sklearn.tree import  DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from sklearn.model_selection import KFold,cross_val_score

In [2]:
water = pd.read_excel("water_potability.xlsx")

In [3]:
water

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0.0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0.0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0.0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0.0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0.0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1.0
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1.0
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1.0
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1.0


In [4]:
water.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [5]:
water["ph"].fillna(value = water["ph"].mean(), inplace = True)
water["Sulfate"].fillna(value = water["Sulfate"].mean(), inplace = True)
water["Trihalomethanes"].fillna(value = water["Trihalomethanes"].mean(), inplace = True)

In [6]:
water.isna().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [7]:
water.Potability.value_counts()

0.0    1998
1.0    1278
Name: Potability, dtype: int64

In [16]:
X = water.drop(columns="Potability")
Y = water.Potability
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=.80, random_state=42, stratify=Y)

In [17]:
sm = SMOTE(random_state = 42)
X_train, Y_train = sm.fit_resample(X_train, Y_train)
  

In [18]:
Y_train.value_counts()

0.0    1598
1.0    1598
Name: Potability, dtype: int64

In [19]:
Y_test.value_counts()

0.0    400
1.0    256
Name: Potability, dtype: int64

In [20]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [22]:
logit = LogisticRegression()
logit.fit(X_train, Y_train)

In [25]:
logit.score(X_train, Y_train), logit.score(X_test, Y_test)

(0.5181476846057572, 0.5198170731707317)

In [26]:
pred1 = logit.predict(X_train)
pred_test1 = logit.predict(X_test)

print(confusion_matrix(Y_test, pred_test1))

print(classification_report(Y_test, pred_test1))

[[214 186]
 [129 127]]
              precision    recall  f1-score   support

         0.0       0.62      0.54      0.58       400
         1.0       0.41      0.50      0.45       256

    accuracy                           0.52       656
   macro avg       0.51      0.52      0.51       656
weighted avg       0.54      0.52      0.53       656



In [44]:
model1 = DecisionTreeClassifier(max_depth=5)
model1.fit(X_train, Y_train) 

In [45]:
model1.score(X_train, Y_train), model1.score(X_test, Y_test)

(0.619837296620776, 0.5548780487804879)

In [46]:
pred2 = model1.predict(X_train)
pred_test2 = model1.predict(X_test)

print(confusion_matrix(Y_test, pred_test2))

print(classification_report(Y_test, pred_test2))

[[226 174]
 [118 138]]
              precision    recall  f1-score   support

         0.0       0.66      0.56      0.61       400
         1.0       0.44      0.54      0.49       256

    accuracy                           0.55       656
   macro avg       0.55      0.55      0.55       656
weighted avg       0.57      0.55      0.56       656



In [59]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=123)
cvscore_dt_train = cross_val_score(estimator=model1, X=X_train, y=Y_train, cv=k_fold)

cvscore_dt_train, cvscore_imb_dt_train.std(), cvscore_dt_train.mean()

(array([0.571875  , 0.57276995, 0.62128326, 0.56494523, 0.57902973]),
 0.019442074078046826,
 0.5819806338028168)

In [58]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=123)
cvscore_dt_test = cross_val_score(estimator=model1, X=X_test, y=Y_test, cv=k_fold)

cvscore_dt_test, cvscore_dt_test.std(), cvscore_dt_test.mean()

(array([0.49242424, 0.58778626, 0.60305344, 0.58778626, 0.59541985]),
 0.04082926882018105,
 0.573294008790192)

In [64]:
rf= RandomForestClassifier()
param_grid = [
{'n_estimators': [10, 25, 50,75,100], 
 'max_depth': [2, 3, 4, 5, 8, 10, 12, 15], 
 'bootstrap': [True, False],
 'max_features':["sqrt","auto","log2", 0.2, None]}
]

random_search = RandomizedSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')
random_search.fit(X_train, Y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [65]:
random_search.best_params_, random_search.best_estimator_

({'n_estimators': 50,
  'max_features': 'log2',
  'max_depth': 15,
  'bootstrap': False},
 RandomForestClassifier(bootstrap=False, max_depth=15, max_features='log2',
                        n_estimators=50))

In [66]:
rf= RandomForestClassifier(bootstrap=False, max_depth=15, max_features='log2',
                       n_estimators=50)
rf.fit(X_train, Y_train)

In [67]:
rf.score(X_train, Y_train), rf.score(X_test, Y_test)

(0.9946808510638298, 0.614329268292683)

In [68]:
pred3 = rf.predict(X_train)
pred_test3 = rf.predict(X_test)

print(confusion_matrix(Y_test, pred_test2))

print(classification_report(Y_test, pred_test2))

[[226 174]
 [118 138]]
              precision    recall  f1-score   support

         0.0       0.66      0.56      0.61       400
         1.0       0.44      0.54      0.49       256

    accuracy                           0.55       656
   macro avg       0.55      0.55      0.55       656
weighted avg       0.57      0.55      0.56       656



In [69]:
params={
 "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
 "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15, 17, 20],
 "min_child_weight" : [0.25, 0.05, 0.5, 1, 3, 5, 7 ],
 "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ],
 "n_estimators"     : [50, 100, 200, 150, 250, 300]  
}

xgb_clf = XGBClassifier()

random_search = RandomizedSearchCV(xgb_clf, param_distributions=params, n_iter=5, scoring='roc_auc',
                                 n_jobs=-1, cv=5, verbose=3, random_state=42)
random_search.fit(X_train, Y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [71]:
random_search.best_params_, random_search.best_estimator_

({'n_estimators': 250,
  'min_child_weight': 1,
  'max_depth': 17,
  'learning_rate': 0.05,
  'gamma': 0.0,
  'colsample_bytree': 0.7},
 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
               colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.7,
               early_stopping_rounds=None, enable_categorical=False,
               eval_metric=None, gamma=0.0, gpu_id=-1, grow_policy='depthwise',
               importance_type=None, interaction_constraints='',
               learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
               max_delta_step=0, max_depth=17, max_leaves=0, min_child_weight=1,
               missing=nan, monotone_constraints='()', n_estimators=250,
               n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
               reg_alpha=0, reg_lambda=1, ...))

In [73]:
xgb_clf_1 = XGBClassifier(n_estimators=250, max_depth=17, min_child_weight=1,learning_rate= 0.05)
xgb_clf_1.fit(X_train, Y_train)
predictions = xgb_clf_1.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(Y_test, predictions))

print("Classification Report")
print(classification_report(Y_test, predictions))


Confusion Matrix:
[[292 108]
 [125 131]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.70      0.73      0.71       400
         1.0       0.55      0.51      0.53       256

    accuracy                           0.64       656
   macro avg       0.62      0.62      0.62       656
weighted avg       0.64      0.64      0.64       656



In [74]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=123)
cvscore_train = cross_val_score(estimator=xgb_clf_1, X=X_train, y=Y_train, cv=k_fold)

cvscore_train, cvscore_train.std(), cvscore_train.mean()

(array([0.690625  , 0.69170579, 0.70735524, 0.7057903 , 0.69014085]),
 0.0077477930094921555,
 0.6971234350547731)

In [76]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=123)
cvscore_test = cross_val_score(estimator=xgb_clf_1, X=X_test, y=Y_test, cv=k_fold)

cvscore_test, cvscore_test.std(), cvscore_test.mean()

(array([0.51515152, 0.52671756, 0.66412214, 0.58015267, 0.57251908]),
 0.05260250188628259,
 0.5717325931066389)