In [18]:
import yaml

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import optuna
from optuna import trial


In [28]:
df = pd.read_csv("/home/kenny/osda/osda_datasets/processed/water_potability.csv")
df


Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,7.080795,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,333.775777,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,333.775777,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,333.775777,392.449580,19.903225,66.396293,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,333.775777,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,333.775777,402.883113,11.168946,77.488213,4.708658,1


In [30]:
y = df["Potability"]
X = df.drop(["Potability"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Decision tree pipeline

In [31]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 32),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1.0),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy'])
    }
    clf = DecisionTreeClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = DecisionTreeClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 23:04:45,881] A new study created in memory with name: no-name-d8aebf9d-e9b0-4c16-8381-98aca9dae93b
[I 2023-10-07 23:04:45,952] Trial 0 finished with value: 0.6053435114503817 and parameters: {'max_depth': 14, 'min_samples_split': 0.11691956267957426, 'min_samples_leaf': 0.22829042164558003, 'criterion': 'entropy'}. Best is trial 0 with value: 0.6053435114503817.
[I 2023-10-07 23:04:46,032] Trial 1 finished with value: 0.6118320610687024 and parameters: {'max_depth': 17, 'min_samples_split': 0.6595494098447979, 'min_samples_leaf': 0.10273779127713355, 'criterion': 'gini'}. Best is trial 1 with value: 0.6118320610687024.
[I 2023-10-07 23:04:46,082] Trial 2 finished with value: 0.6053435114503817 and parameters: {'max_depth': 19, 'min_samples_split': 0.9279509934947681, 'min_samples_leaf': 0.2429617153799098, 'criterion': 'entropy'}. Best is trial 1 with value: 0.6118320610687024.
[I 2023-10-07 23:04:46,138] Trial 3 finished with value: 0.6053435114503817 and parameters: {'

Best Parameters: {'max_depth': 19, 'min_samples_split': 0.4189581839100983, 'min_samples_leaf': 0.12032325336541312, 'criterion': 'gini'}
Accuracy on Test Set: 0.6311
F1-Score on Test Set: 0.5020


Random Forest Classifier

In [33]:
def objective(trial):
    params = {
        "n_estimators":trial.suggest_int("n_estimators", 50, 200),
        "max_depth":trial.suggest_int("max_depth", 1, 32),
        "min_samples_split":trial.suggest_float("min_samples_split", 0.1, 1.0),
        "min_samples_leaf":trial.suggest_float("min_samples_leaf", 0.1, 0.5),
    }
    clf = RandomForestClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = RandomForestClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 23:05:40,617] A new study created in memory with name: no-name-eb684e96-5712-4d80-acfd-3bb3e97bb6e1
[I 2023-10-07 23:05:42,284] Trial 0 finished with value: 0.6053435114503817 and parameters: {'n_estimators': 168, 'max_depth': 24, 'min_samples_split': 0.20763536206639366, 'min_samples_leaf': 0.3200318233262416}. Best is trial 0 with value: 0.6053435114503817.
[I 2023-10-07 23:05:44,102] Trial 1 finished with value: 0.6053435114503817 and parameters: {'n_estimators': 189, 'max_depth': 6, 'min_samples_split': 0.8072474324980102, 'min_samples_leaf': 0.3520349641042875}. Best is trial 0 with value: 0.6053435114503817.
[I 2023-10-07 23:05:46,422] Trial 2 finished with value: 0.6053435114503817 and parameters: {'n_estimators': 178, 'max_depth': 19, 'min_samples_split': 0.21138424980040715, 'min_samples_leaf': 0.2347737570460213}. Best is trial 0 with value: 0.6053435114503817.
[I 2023-10-07 23:05:47,656] Trial 3 finished with value: 0.6053435114503817 and parameters: {'n_estima

Best Parameters: {'n_estimators': 168, 'max_depth': 24, 'min_samples_split': 0.20763536206639366, 'min_samples_leaf': 0.3200318233262416}
Accuracy on Test Set: 0.6280
F1-Score on Test Set: 0.3858


XGBoost

In [23]:
def objective(trial):
    params = {
        "n_estimators":trial.suggest_int("n_estimators", 50, 200),
        "max_depth":trial.suggest_int("max_depth", 1, 32),
        "learning_rate":trial.suggest_float("learning_rate", 0.001, 1.0),
        "gamma":trial.suggest_float("gamma", 0.0, 1.0),
        "tree_method": 'hist',
        "device": "cuda",
    }

    clf = XGBClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
best_params = best_trial.params
best_params["tree_method"] = "hist"
best_params["device"] = "cuda"
best_classifier = XGBClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 22:54:43,612] A new study created in memory with name: no-name-6ecd99fe-b104-44c4-82f8-dba7dc9f4595
[I 2023-10-07 22:54:43,980] Trial 0 finished with value: 0.7905442035273287 and parameters: {'n_estimators': 61, 'max_depth': 28, 'learning_rate': 0.9853176806939056, 'gamma': 0.10375417902320494}. Best is trial 0 with value: 0.7905442035273287.
[I 2023-10-07 22:54:44,953] Trial 1 finished with value: 0.7779745266984094 and parameters: {'n_estimators': 106, 'max_depth': 3, 'learning_rate': 0.9016174728346926, 'gamma': 0.548576150420376}. Best is trial 0 with value: 0.7905442035273287.
[I 2023-10-07 22:54:45,656] Trial 2 finished with value: 0.7899838098614828 and parameters: {'n_estimators': 167, 'max_depth': 21, 'learning_rate': 0.4666314771696521, 'gamma': 0.11257110442163465}. Best is trial 0 with value: 0.7905442035273287.
[I 2023-10-07 22:54:46,428] Trial 3 finished with value: 0.7990099933145496 and parameters: {'n_estimators': 186, 'max_depth': 15, 'learning_rate': 0

Best Parameters: {'n_estimators': 84, 'max_depth': 4, 'learning_rate': 0.2746548521342681, 'gamma': 0.13876433773507824, 'tree_method': 'hist', 'device': 'cuda'}
Accuracy on Test Set: 0.8361
F1-Score on Test Set: 0.8360


Catboost

In [24]:
def objective(trial):
    params = {
        "n_estimators":trial.suggest_int("n_estimators", 50, 200),
        "max_depth":trial.suggest_int("max_depth", 1, 12),
        "learning_rate":trial.suggest_float("learning_rate", 0.001, 1.0),
        "task_type":"GPU",
        "verbose":False,
    }
    clf = CatBoostClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
best_params = best_trial.params
best_params["task_type"] = "GPU"
best_params["verbose"] = False
best_classifier = CatBoostClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 22:57:34,728] A new study created in memory with name: no-name-25ee4e1a-8f08-49b0-836b-ed4dc260eee6
[I 2023-10-07 22:57:37,759] Trial 0 finished with value: 0.7967896572173263 and parameters: {'n_estimators': 57, 'max_depth': 8, 'learning_rate': 0.4648595993442728}. Best is trial 0 with value: 0.7967896572173263.
[I 2023-10-07 22:57:41,682] Trial 1 finished with value: 0.7938054528869248 and parameters: {'n_estimators': 186, 'max_depth': 5, 'learning_rate': 0.5225356655105206}. Best is trial 0 with value: 0.7967896572173263.
[I 2023-10-07 22:57:55,258] Trial 2 finished with value: 0.7879285009033871 and parameters: {'n_estimators': 104, 'max_depth': 12, 'learning_rate': 0.7823002106293817}. Best is trial 0 with value: 0.7967896572173263.
[I 2023-10-07 22:58:02,271] Trial 3 finished with value: 0.7884741555610564 and parameters: {'n_estimators': 167, 'max_depth': 9, 'learning_rate': 0.9909436052793451}. Best is trial 0 with value: 0.7967896572173263.
[I 2023-10-07 22:58:05

Best Parameters: {'n_estimators': 73, 'max_depth': 1, 'learning_rate': 0.5725578005604699, 'task_type': 'GPU', 'verbose': False}
Accuracy on Test Set: 0.8689
F1-Score on Test Set: 0.8685


KNN

In [25]:
def objective(trial):
    params = {
        "n_neighbors":trial.suggest_int("n_neighbors", 1, 50),
        "weights":trial.suggest_categorical("weights", ["uniform", "distance"]),
    }
    clf = KNeighborsClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = KNeighborsClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 23:02:53,751] A new study created in memory with name: no-name-2b3dffd6-908f-4463-88e6-8ada6875c11c
[I 2023-10-07 23:02:53,819] Trial 0 finished with value: 0.6296102413935293 and parameters: {'n_neighbors': 29, 'weights': 'uniform'}. Best is trial 0 with value: 0.6296102413935293.
[I 2023-10-07 23:02:53,874] Trial 1 finished with value: 0.6239045013371072 and parameters: {'n_neighbors': 5, 'weights': 'uniform'}. Best is trial 0 with value: 0.6296102413935293.
[I 2023-10-07 23:02:53,930] Trial 2 finished with value: 0.62983558470515 and parameters: {'n_neighbors': 18, 'weights': 'uniform'}. Best is trial 2 with value: 0.62983558470515.
[I 2023-10-07 23:02:53,971] Trial 3 finished with value: 0.6196847328658868 and parameters: {'n_neighbors': 44, 'weights': 'distance'}. Best is trial 2 with value: 0.62983558470515.
[I 2023-10-07 23:02:54,029] Trial 4 finished with value: 0.6332172875338995 and parameters: {'n_neighbors': 16, 'weights': 'uniform'}. Best is trial 4 with valu

Best Parameters: {'n_neighbors': 21, 'weights': 'uniform'}
Accuracy on Test Set: 0.6557
F1-Score on Test Set: 0.6497


NaiveBayes

In [26]:
def objective(trial):
    params = {
    }

    clf = GaussianNB(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = GaussianNB(**best_params)


best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 23:03:39,675] A new study created in memory with name: no-name-486b2c48-f725-4812-a58a-a9e59974f541
[I 2023-10-07 23:03:39,713] Trial 0 finished with value: 0.798940560244908 and parameters: {}. Best is trial 0 with value: 0.798940560244908.
[I 2023-10-07 23:03:39,749] Trial 1 finished with value: 0.798940560244908 and parameters: {}. Best is trial 0 with value: 0.798940560244908.
[I 2023-10-07 23:03:39,786] Trial 2 finished with value: 0.798940560244908 and parameters: {}. Best is trial 0 with value: 0.798940560244908.
[I 2023-10-07 23:03:39,820] Trial 3 finished with value: 0.798940560244908 and parameters: {}. Best is trial 0 with value: 0.798940560244908.
[I 2023-10-07 23:03:39,855] Trial 4 finished with value: 0.798940560244908 and parameters: {}. Best is trial 0 with value: 0.798940560244908.
[I 2023-10-07 23:03:39,892] Trial 5 finished with value: 0.798940560244908 and parameters: {}. Best is trial 0 with value: 0.798940560244908.
[I 2023-10-07 23:03:39,927] Trial 

Best Parameters: {}
Accuracy on Test Set: 0.8852
F1-Score on Test Set: 0.8852


Logistic Regression

In [27]:
def objective(trial):
    params = {
        "C":trial.suggest_float("C", 0.001, 10),
        "solver":"liblinear",
        "penalty":trial.suggest_categorical("penalty", ["l1", "l2"]),
    }

    clf = LogisticRegression(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)


best_trial = study.best_trial
best_params = best_trial.params
best_params["solver"] = "liblinear"
best_classifier = LogisticRegression(**best_params)


best_classifier.fit(X_train, y_train)


y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 23:03:55,431] A new study created in memory with name: no-name-c75decac-d898-46c0-86fd-0cfcf4f623c7
[I 2023-10-07 23:03:55,469] Trial 0 finished with value: 0.8352695849309842 and parameters: {'C': 1.4705187456036537, 'penalty': 'l2'}. Best is trial 0 with value: 0.8352695849309842.
[I 2023-10-07 23:03:55,505] Trial 1 finished with value: 0.8398486527106428 and parameters: {'C': 6.35243449249008, 'penalty': 'l2'}. Best is trial 1 with value: 0.8398486527106428.
[I 2023-10-07 23:03:55,567] Trial 2 finished with value: 0.831791239109751 and parameters: {'C': 7.250306314063035, 'penalty': 'l1'}. Best is trial 1 with value: 0.8398486527106428.
[I 2023-10-07 23:03:55,603] Trial 3 finished with value: 0.8300242775257161 and parameters: {'C': 0.3791330507702049, 'penalty': 'l2'}. Best is trial 1 with value: 0.8398486527106428.
[I 2023-10-07 23:03:55,638] Trial 4 finished with value: 0.8311587905762874 and parameters: {'C': 2.444456015827832, 'penalty': 'l2'}. Best is trial 1 wit

Best Parameters: {'C': 6.35243449249008, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy on Test Set: 0.8197
F1-Score on Test Set: 0.8195
