In [27]:
import os
import joblib
import mlflow
import numpy as np
import pickle
import logging

import xgboost as xgb
from catboost import CatBoostClassifier

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, accuracy_score, recall_score, classification_report

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [28]:
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [29]:
def load_data(filename):
    data_path = '../Data/'
    path = os.path.join(data_path, filename)
    with open(path, 'rb') as f_in:
        X, y  = pickle.load(f_in)

    if X.empty or y.empty:
        logging.error(f'{filename} data is empty')
    else:
        logging.info('Data Loaded succesfully')

    return X, y

In [30]:
train = 'train.pkl'
test = 'test.pkl'

X_train, y_train = load_data(train)
X_test, y_test = load_data(test)

2025-07-05 00:06:02,243 - INFO - Data Loaded succesfully
2025-07-05 00:06:02,260 - INFO - Data Loaded succesfully


In [31]:
y_train

0         0
1         0
2         2
3         0
4         0
         ..
509949    2
509950    2
509951    2
509952    2
509953    2
Name: Diabetes_012, Length: 509954, dtype: int64

In [32]:
X_train

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,20,1,0,0,1,1,1,...,1,0,2,0,0,0,1,12,6,8
1,0,0,1,34,0,0,0,1,0,1,...,1,0,3,0,0,0,1,8,5,8
2,1,1,1,24,0,0,0,1,1,1,...,1,0,2,0,5,0,1,12,5,6
3,0,1,1,27,0,0,0,1,1,1,...,1,0,1,0,0,0,1,5,6,7
4,0,1,1,24,0,0,0,1,1,1,...,1,0,3,0,0,1,0,12,4,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509949,0,1,1,26,0,0,0,1,1,1,...,1,0,3,0,0,0,0,9,6,8
509950,1,1,1,28,0,0,1,1,0,1,...,1,0,3,0,0,0,1,10,6,6
509951,1,1,1,25,1,0,0,1,1,1,...,1,0,3,0,0,0,0,11,6,8
509952,1,1,1,35,0,0,0,0,0,1,...,1,0,2,5,2,1,0,11,4,1


In [33]:
mlflow.set_tracking_uri('http://127.0.0.1:5000')
mlflow.set_experiment('optimization_v3')

<Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1751654916930, experiment_id='3', last_update_time=1751654916930, lifecycle_stage='active', name='optimization_v3', tags={}>

In [None]:
def catboost_objective(params):
    with mlflow.start_run():
        mlflow.set_tag('Model', 'Catboost')
        mlflow.log_params(params)

        classes = np.unique(y_train)
        weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
        class_weights = dict(zip(classes, weights))

        if 1 in class_weights:
            class_weights[1] *= 1.5
        if 2 in class_weights:
            class_weights[2] *= 1.2

        categorical_features_indices = [X_train.columns.get_loc(col) for col in ['Age', 'GenHlth', 'Education', 'Income']]

        model = CatBoostClassifier(**params,
                                   class_weights=class_weights,
                                   cat_features=categorical_features_indices,
                                   early_stopping_rounds=50,
                                   eval_metric='TotalF1')

        model.fit(X_train, y_train, 
                  eval_set=(X_test, y_test),
                  use_best_model=True, 
                  logging_level='Silent')

        y_pred = model.predict(X_test)

        # Logging important metrics
        score = f1_score(y_test, y_pred, average='macro')
        loss = 1 - score 
        mlflow.log_metric('f1_macro', score)
        print(classification_report(y_test, y_pred))

        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy)

        recall = recall_score(y_test, y_pred, average='macro')
        mlflow.log_metric('recall', recall)

    return {'loss': loss, 'status': STATUS_OK}

catboost_search_space = {
    'depth': scope.int(hp.quniform('depth', 4, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, -1.6),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', -2, 1),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 1.0),
    'random_strength': hp.uniform('random_strength', 0.0, 1.0),
    'border_count': scope.int(hp.quniform('border_count', 32, 64, 16)),
    'iterations': 500,
    'loss_function': 'MultiClass',
    'verbose': 0,
    'random_seed': 42
}

best_result = fmin(
    fn=catboost_objective,
    space=catboost_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

In [None]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_test, label=y_test)

def xgboost_objective(params):
    with mlflow.start_run():
        mlflow.set_tag('Model', 'XGboost')
        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50,
            verbose_eval=False
        )

        y_pred_proba = booster.predict(valid) 
        y_pred = np.argmax(y_pred_proba, axis=1)

        #Logging important metrics
        score = f1_score(y_test, y_pred, average='macro')
        loss = 1 - score
        mlflow.log_metric('f1_macro', score)

        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy)

        recall = recall_score(y_test, y_pred, average='macro')
        mlflow.log_metric('recall', recall)

    return {'loss': loss, 'status': STATUS_OK}

search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', -5, -1.6),
    'reg_alpha': hp.loguniform('reg_alpha', -8, 0),
    'reg_lambda': hp.loguniform('reg_lambda', -7, 0),
    'min_child_weight': hp.loguniform('min_child_weight',  1, 10),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'objective': 'multi:softprob',
    'num_class': 3,
    'seed': 42
}

best_result = fmin(
    fn=xgboost_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

2025-07-05 00:19:25,609 - INFO - build_posterior_wrapper took 0.002452 seconds
2025-07-05 00:19:25,609 - INFO - TPE using 0 trials


🏃 View run industrious-fly-107 at: http://127.0.0.1:5000/#/experiments/3/runs/091f558689474212824ddbcb8788f784

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3

  2%|▏         | 1/50 [01:10<57:37, 70.57s/trial, best loss: 0.5817733495785127]

2025-07-05 00:20:36,176 - INFO - build_posterior_wrapper took 0.001487 seconds
2025-07-05 00:20:36,177 - INFO - TPE using 1/1 trials with best loss 0.581773


🏃 View run handsome-colt-313 at: http://127.0.0.1:5000/#/experiments/3/runs/e3adefdd8d7f4cada7f1a857e64787b1

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                    

  4%|▍         | 2/50 [02:19<55:40, 69.60s/trial, best loss: 0.581537511679526] 

2025-07-05 00:21:45,106 - INFO - build_posterior_wrapper took 0.002004 seconds
2025-07-05 00:21:45,107 - INFO - TPE using 2/2 trials with best loss 0.581538


🏃 View run classy-skink-193 at: http://127.0.0.1:5000/#/experiments/3/runs/12b55d08ba4f41fd8f89982ec3cf5d14

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                   

  6%|▌         | 3/50 [03:16<49:53, 63.68s/trial, best loss: 0.581537511679526]

2025-07-05 00:22:41,746 - INFO - build_posterior_wrapper took 0.002605 seconds
2025-07-05 00:22:41,747 - INFO - TPE using 3/3 trials with best loss 0.581538


🏃 View run entertaining-ray-106 at: http://127.0.0.1:5000/#/experiments/3/runs/0c9a5f6f4a85435594ab26d2c489f3c3

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                   

  8%|▊         | 4/50 [04:02<43:27, 56.68s/trial, best loss: 0.581537511679526]

2025-07-05 00:23:27,700 - INFO - build_posterior_wrapper took 0.002079 seconds
2025-07-05 00:23:27,701 - INFO - TPE using 4/4 trials with best loss 0.581538


🏃 View run skillful-mule-90 at: http://127.0.0.1:5000/#/experiments/3/runs/0831ecef153d4013b055146e46f0a69a

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                   

 10%|█         | 5/50 [04:55<41:31, 55.37s/trial, best loss: 0.5809791522124197]

2025-07-05 00:24:20,752 - INFO - build_posterior_wrapper took 0.002090 seconds
2025-07-05 00:24:20,753 - INFO - TPE using 5/5 trials with best loss 0.580979


🏃 View run sedate-jay-683 at: http://127.0.0.1:5000/#/experiments/3/runs/65746a8f34b34450ad5f2cd2d99db325

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                    

 12%|█▏        | 6/50 [05:40<38:14, 52.14s/trial, best loss: 0.5809791522124197]

2025-07-05 00:25:06,609 - INFO - build_posterior_wrapper took 0.002091 seconds
2025-07-05 00:25:06,610 - INFO - TPE using 6/6 trials with best loss 0.580979


🏃 View run thoughtful-fawn-527 at: http://127.0.0.1:5000/#/experiments/3/runs/4711ac8a670b43f387ea2dc820ef0f83

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3                    

 14%|█▍        | 7/50 [06:30<36:41, 51.20s/trial, best loss: 0.5809791522124197]

2025-07-05 00:25:55,878 - INFO - build_posterior_wrapper took 0.002073 seconds
2025-07-05 00:25:55,879 - INFO - TPE using 7/7 trials with best loss 0.580979
