In [30]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization
import pandas as pd

In [31]:
df = pd.read_csv('../data/diabetes.csv')

In [32]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [33]:
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [35]:
def xGb():
    clf = xgb.XGBClassifier()
    clf.fit(X_train , y_train)
    pred = clf.predict(X_test) 
    return pred

In [48]:
from optuna import create_study, Trial
from sklearn.metrics import accuracy_score


def objective(trial: Trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 400),
        'gamma': trial.suggest_loguniform('gamma', 0.1, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # Optuna expects the objective function to return a value to be maximized or minimized
    return accuracy

def optuna_xgb():
    study = create_study(direction='maximize')
    n_trials = 100
    try:
        study.optimize(objective, n_trials=n_trials)
    except Exception as e:
        print("Optimization stopped:", e)

    best_trial = study.best_trial
    best_params = best_trial.params
    best_model = xgb.XGBClassifier(**best_params)
    best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    return predictions

In [46]:
from SklearnTuner import SklearnTuner

def gen_xgb():

    hyperparameter_param_distributions = {
  "learning_rate": [(0.01, 0.3), float],
  "n_estimators": [(50, 200), int],
  "max_depth": [(3, 10), int],
  "min_child_weight": [(1, 10), int],
  "subsample": [(0.5, 1.0), float],
  "colsample_bytree": [(0.5, 1.0), float],
  "gamma": [(0, 1), float],
  "objective": [["binary:logistic"], str]
    }

    model = xgb.XGBClassifier()
    model_type = "classifier"
    pops = 100
    generations = 100
    mutation_rate = 0.3
    SklearnTuner = SklearnTuner(X_train,y_train,model=model,model_type=model_type ,param_distributions = hyperparameter_param_distributions, population=pops , max_generation=generations , mutation_rate=mutation_rate)
    SklearnTuner.search()
    best_params = SklearnTuner.get_best_params()

    tuned_model = model.set_params(**best_params)

    model.fit(X_train , y_train)
    tuned_model.fit(X_train , y_train)

    tuned_pred  = tuned_model.predict(X_test)

    return tuned_pred


In [38]:
from sklearn.metrics import accuracy_score,classification_report

pred = xGb()
score = accuracy_score(pred , y_test)
report = classification_report(xGb() , y_test)
print(score , report)


0.7077922077922078               precision    recall  f1-score   support

           0       0.74      0.79      0.76        92
           1       0.65      0.58      0.62        62

    accuracy                           0.71       154
   macro avg       0.70      0.69      0.69       154
weighted avg       0.70      0.71      0.70       154



In [49]:
from sklearn.metrics import accuracy_score,classification_report

pred = optuna_xgb()
score = accuracy_score(pred , y_test)
report = classification_report(xGb() , y_test)
print(score , report)


[I 2024-02-06 07:00:31,322] A new study created in memory with name: no-name-e69aa36e-ebda-439e-883e-a26bc1427984
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'gamma': trial.suggest_loguniform('gamma', 0.1, 5),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
[I 2024-02-06 07:00:31,378] Trial 0 finished with value: 0.7792207792207793 and parameters: {'max_depth': 4, 'learning_rate': 0.016573488743709116, 'n_estimators': 53, 'gamma': 0.7667564348895597, 'min_child_weight': 8, 'subsample': 0.7064578299900208, 'colsample_bytree': 0.528083817925969}. Best is trial 0 with value: 0.7792207792207793.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'gamma': trial.suggest_loguniform('gamma', 0.1, 5),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1),
[I 2024-02-06 07:00:31,631]

0.7987012987012987               precision    recall  f1-score   support

           0       0.74      0.79      0.76        92
           1       0.65      0.58      0.62        62

    accuracy                           0.71       154
   macro avg       0.70      0.69      0.69       154
weighted avg       0.70      0.71      0.70       154



In [47]:
from sklearn.metrics import accuracy_score,classification_report

pred = gen_xgb()
score = accuracy_score(pred , y_test)
report = classification_report(xGb() , y_test)
print(score , report)


Generation 0 : Best Score : 0.7687591630014661, Avg Score : 0.7541147540983605
Generation 1 : Best Score : 0.7687591630014661, Avg Score : 0.7570939624150338
Generation 2 : Best Score : 0.7687591630014661, Avg Score : 0.7590984939357593
Generation 3 : Best Score : 0.7719978675196588, Avg Score : 0.7603858456617355
Generation 4 : Best Score : 0.7719978675196588, Avg Score : 0.7615096628015463
Generation 5 : Best Score : 0.7719978675196588, Avg Score : 0.7623883779821405
Generation 6 : Best Score : 0.7719978675196588, Avg Score : 0.7632658936425427
Generation 7 : Best Score : 0.7719978675196588, Avg Score : 0.7637058509929358
Generation 8 : Best Score : 0.7719978675196588, Avg Score : 0.764307077169132
Generation 9 : Best Score : 0.7719978675196588, Avg Score : 0.7647639610822332
Generation 10 : Best Score : 0.7719978675196588, Avg Score : 0.7650586432093821
Generation 11 : Best Score : 0.7719978675196588, Avg Score : 0.7655301879248291
Generation 12 : Best Score : 0.7720111955217913, Av