In [19]:
import yaml

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import optuna
from optuna import trial


In [20]:
df = pd.read_csv("/home/kenny/osda/osda_datasets/processed/Healthcare-Diabetes.csv")
df


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,126.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,126.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,126.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
773,1,100.0,62.0,29.0,126.0,64.4,0.152,36,0
774,0,163.0,40.0,23.0,64.0,40.7,0.322,33,0
775,6,139.0,84.0,37.0,126.0,50.7,0.320,50,1
776,2,167.0,44.0,30.0,140.0,52.7,0.452,28,0


In [21]:
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Decision tree pipeline

In [23]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 32),
        'min_samples_split': trial.suggest_float('min_samples_split', 0.1, 1.0),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.1, 0.5),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy'])
    }
    clf = DecisionTreeClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = DecisionTreeClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 22:03:42,490] A new study created in memory with name: no-name-f87579ac-c503-40e4-9cad-16a2aefb2edd


[I 2023-10-07 22:03:42,570] Trial 0 finished with value: 0.6170241875852815 and parameters: {'max_depth': 1, 'min_samples_split': 0.7184466923719466, 'min_samples_leaf': 0.17353974310416911, 'criterion': 'entropy'}. Best is trial 0 with value: 0.6170241875852815.
[I 2023-10-07 22:03:42,623] Trial 1 finished with value: 0.6773451135728629 and parameters: {'max_depth': 3, 'min_samples_split': 0.9522369968634157, 'min_samples_leaf': 0.49755674051818277, 'criterion': 'gini'}. Best is trial 1 with value: 0.6773451135728629.
[I 2023-10-07 22:03:42,677] Trial 2 finished with value: 0.6274435305914106 and parameters: {'max_depth': 30, 'min_samples_split': 0.8370785417322915, 'min_samples_leaf': 0.2401086425491936, 'criterion': 'entropy'}. Best is trial 1 with value: 0.6773451135728629.
[I 2023-10-07 22:03:42,736] Trial 3 finished with value: 0.7302811554983681 and parameters: {'max_depth': 20, 'min_samples_split': 0.23619312446134758, 'min_samples_leaf': 0.1194544723031859, 'criterion': 'entro

Best Parameters: {'max_depth': 22, 'min_samples_split': 0.14888899446224862, 'min_samples_leaf': 0.10039840249136735, 'criterion': 'entropy'}
Accuracy on Test Set: 0.7115
F1-Score on Test Set: 0.6901


Random Forest Classifier

In [24]:
def objective(trial):
    params = {
        "n_estimators":trial.suggest_int("n_estimators", 50, 200),
        "max_depth":trial.suggest_int("max_depth", 1, 32),
        "min_samples_split":trial.suggest_float("min_samples_split", 0.1, 1.0),
        "min_samples_leaf":trial.suggest_float("min_samples_leaf", 0.1, 0.5),
    }
    clf = RandomForestClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = RandomForestClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 22:04:27,279] A new study created in memory with name: no-name-3200b44d-33f7-4fa5-bae4-86fc8635896a
[I 2023-10-07 22:04:29,773] Trial 0 finished with value: 0.38847183062158147 and parameters: {'n_estimators': 193, 'max_depth': 22, 'min_samples_split': 0.35471323774533836, 'min_samples_leaf': 0.31705275946660905}. Best is trial 0 with value: 0.38847183062158147.
[I 2023-10-07 22:04:30,561] Trial 1 finished with value: 0.38847183062158147 and parameters: {'n_estimators': 59, 'max_depth': 16, 'min_samples_split': 0.8089139899406724, 'min_samples_leaf': 0.34643918564024745}. Best is trial 0 with value: 0.38847183062158147.
[I 2023-10-07 22:04:32,479] Trial 2 finished with value: 0.38847183062158147 and parameters: {'n_estimators': 151, 'max_depth': 32, 'min_samples_split': 0.3312746731874682, 'min_samples_leaf': 0.41716493440135416}. Best is trial 0 with value: 0.38847183062158147.
[I 2023-10-07 22:04:33,186] Trial 3 finished with value: 0.4207439306547306 and parameters: {'

Best Parameters: {'n_estimators': 158, 'max_depth': 6, 'min_samples_split': 0.12760192115905514, 'min_samples_leaf': 0.10002544787434561}
Accuracy on Test Set: 0.7628
F1-Score on Test Set: 0.6963


XGBoost

In [25]:
def objective(trial):
    params = {
        "n_estimators":trial.suggest_int("n_estimators", 50, 200),
        "max_depth":trial.suggest_int("max_depth", 1, 32),
        "learning_rate":trial.suggest_float("learning_rate", 0.001, 1.0),
        "gamma":trial.suggest_float("gamma", 0.0, 1.0),
        "tree_method": 'hist',
        "device": "cuda",
    }

    clf = XGBClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = XGBClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 22:09:05,985] A new study created in memory with name: no-name-6c868ae8-3323-40d1-a198-e61eafede5a8
[I 2023-10-07 22:09:06,743] Trial 0 finished with value: 0.7480853686977088 and parameters: {'n_estimators': 162, 'max_depth': 10, 'learning_rate': 0.2659048052002756, 'gamma': 0.8411405149110109}. Best is trial 0 with value: 0.7480853686977088.
[I 2023-10-07 22:09:07,163] Trial 1 finished with value: 0.7564048063516204 and parameters: {'n_estimators': 86, 'max_depth': 4, 'learning_rate': 0.3524340286429865, 'gamma': 0.6276131847807174}. Best is trial 1 with value: 0.7564048063516204.
[I 2023-10-07 22:09:07,594] Trial 2 finished with value: 0.7224815613261528 and parameters: {'n_estimators': 91, 'max_depth': 27, 'learning_rate': 0.754184373354832, 'gamma': 0.6851910467080642}. Best is trial 1 with value: 0.7564048063516204.
[I 2023-10-07 22:09:08,126] Trial 3 finished with value: 0.7184272197413746 and parameters: {'n_estimators': 119, 'max_depth': 12, 'learning_rate': 0.88

Best Parameters: {'n_estimators': 108, 'max_depth': 2, 'learning_rate': 0.2513887245394026, 'gamma': 0.6062044711728032}
Accuracy on Test Set: 0.7115
F1-Score on Test Set: 0.6800


Catboost

In [26]:
def objective(trial):
    params = {
        "n_estimators":trial.suggest_int("n_estimators", 50, 200),
        "max_depth":trial.suggest_int("max_depth", 1, 12),
        "learning_rate":trial.suggest_float("learning_rate", 0.001, 1.0),
        "task_type":"GPU",
        "verbose":False,
    }
    clf = CatBoostClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = CatBoostClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 22:10:45,284] A new study created in memory with name: no-name-495a3d28-1c76-4841-9522-227f4f90dafc
[I 2023-10-07 22:10:47,395] Trial 0 finished with value: 0.7553869233541064 and parameters: {'n_estimators': 63, 'max_depth': 2, 'learning_rate': 0.5983882886620515}. Best is trial 0 with value: 0.7553869233541064.
[I 2023-10-07 22:10:51,826] Trial 1 finished with value: 0.7259651440423825 and parameters: {'n_estimators': 173, 'max_depth': 7, 'learning_rate': 0.6965608729844162}. Best is trial 0 with value: 0.7553869233541064.
[I 2023-10-07 22:10:55,186] Trial 2 finished with value: 0.7325610157853869 and parameters: {'n_estimators': 198, 'max_depth': 3, 'learning_rate': 0.5596936369255838}. Best is trial 0 with value: 0.7553869233541064.
[I 2023-10-07 22:11:00,398] Trial 3 finished with value: 0.7437833477755045 and parameters: {'n_estimators': 54, 'max_depth': 11, 'learning_rate': 0.7245417170967214}. Best is trial 0 with value: 0.7553869233541064.
[I 2023-10-07 22:11:02,

KNN

In [11]:
def objective(trial):
    params = {
        "n_neighbors":trial.suggest_int("n_neighbors", 1, 20),
        "weights":trial.suggest_categorical("weights", ["uniform", "distance"]),
    }
    clf = KNeighborsClassifier(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = KNeighborsClassifier(**best_params)

best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 21:30:56,628] A new study created in memory with name: no-name-d4d56a03-c998-4d01-8a77-913337219f3c
[I 2023-10-07 21:30:56,701] Trial 0 finished with value: 0.49313154343812027 and parameters: {'n_neighbors': 14, 'weights': 'distance'}. Best is trial 0 with value: 0.49313154343812027.
[I 2023-10-07 21:30:56,774] Trial 1 finished with value: 0.489479565419748 and parameters: {'n_neighbors': 19, 'weights': 'distance'}. Best is trial 0 with value: 0.49313154343812027.
[I 2023-10-07 21:30:57,018] Trial 2 finished with value: 0.4665355459818237 and parameters: {'n_neighbors': 16, 'weights': 'uniform'}. Best is trial 0 with value: 0.49313154343812027.
[I 2023-10-07 21:30:57,087] Trial 3 finished with value: 0.48494842061955934 and parameters: {'n_neighbors': 16, 'weights': 'distance'}. Best is trial 0 with value: 0.49313154343812027.
[I 2023-10-07 21:30:57,331] Trial 4 finished with value: 0.4714215803287828 and parameters: {'n_neighbors': 8, 'weights': 'uniform'}. Best is tria

Best Parameters: {'n_neighbors': 3, 'weights': 'distance'}
Accuracy on Test Set: 0.5320
F1-Score on Test Set: 0.4825


NaiveBayes

In [12]:
def objective(trial):
    params = {
    }

    clf = GaussianNB(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_trial = study.best_trial
best_params = best_trial.params
best_classifier = GaussianNB(**best_params)


best_classifier.fit(X_train, y_train)

y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 21:31:29,173] A new study created in memory with name: no-name-615e11e5-3cee-41e4-80c1-67d5504b2f45
[I 2023-10-07 21:31:29,215] Trial 0 finished with value: 0.5314562105525856 and parameters: {}. Best is trial 0 with value: 0.5314562105525856.
[I 2023-10-07 21:31:29,254] Trial 1 finished with value: 0.5314562105525856 and parameters: {}. Best is trial 0 with value: 0.5314562105525856.
[I 2023-10-07 21:31:29,289] Trial 2 finished with value: 0.5314562105525856 and parameters: {}. Best is trial 0 with value: 0.5314562105525856.
[I 2023-10-07 21:31:29,325] Trial 3 finished with value: 0.5314562105525856 and parameters: {}. Best is trial 0 with value: 0.5314562105525856.
[I 2023-10-07 21:31:29,360] Trial 4 finished with value: 0.5314562105525856 and parameters: {}. Best is trial 0 with value: 0.5314562105525856.
[I 2023-10-07 21:31:29,396] Trial 5 finished with value: 0.5314562105525856 and parameters: {}. Best is trial 0 with value: 0.5314562105525856.
[I 2023-10-07 21:31:29

Best Parameters: {}
Accuracy on Test Set: 0.6311
F1-Score on Test Set: 0.5268


Logistic Regression

In [17]:
def objective(trial):
    params = {
        "C":trial.suggest_float("C", 0.001, 10),
        "penalty":trial.suggest_categorical("penalty", ["l1", "l2"]),
        "solver":"liblinear",
    }

    clf = LogisticRegression(**params)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    f1_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='f1_macro')

    mean_accuracy = np.mean(accuracy_scores)
    mean_f1 = np.mean(f1_scores)

    trial.report(mean_accuracy, step=1)
    trial.report(mean_f1, step=2)

    return mean_f1


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)


best_trial = study.best_trial
best_params = best_trial.params
best_classifier = LogisticRegression(**best_params)


best_classifier.fit(X_train, y_train)


y_pred = best_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')

print(f'Best Parameters: {best_params}')
print(f'Accuracy on Test Set: {accuracy:.4f}')
print(f'F1-Score on Test Set: {f1:.4f}')


[I 2023-10-07 21:53:19,714] A new study created in memory with name: no-name-943639c4-aa40-4be0-b3e0-9a36f3884c9a
[I 2023-10-07 21:53:19,813] Trial 0 finished with value: 0.37796573646884246 and parameters: {'C': 4.887143174332373, 'penalty': 'l2'}. Best is trial 0 with value: 0.37796573646884246.
[I 2023-10-07 21:53:20,011] Trial 1 finished with value: 0.37677504190395683 and parameters: {'C': 3.861887325979434, 'penalty': 'l1'}. Best is trial 0 with value: 0.37796573646884246.
[I 2023-10-07 21:53:20,188] Trial 2 finished with value: 0.37677504190395683 and parameters: {'C': 4.385524842896019, 'penalty': 'l1'}. Best is trial 0 with value: 0.37796573646884246.
[I 2023-10-07 21:53:20,362] Trial 3 finished with value: 0.37677504190395683 and parameters: {'C': 5.97889848068869, 'penalty': 'l1'}. Best is trial 0 with value: 0.37796573646884246.
[I 2023-10-07 21:53:20,456] Trial 4 finished with value: 0.37796573646884246 and parameters: {'C': 2.0389699809800543, 'penalty': 'l2'}. Best is tr

Best Parameters: {'C': 4.887143174332373, 'penalty': 'l2'}
Accuracy on Test Set: 0.6280
F1-Score on Test Set: 0.3858
