In [11]:
import logging
import os
import pickle
import sys

import catboost as cb
import mlflow
import numpy as np
import pandas as pd
import pkg_resources
import xgboost as xgb
from catboost import CatBoostClassifier
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, recall_score, classification_report
from sklearn.model_selection import cross_val_score


In [12]:
# Record the interpreter and library versions next to the results.
print(
    f"Python Version: {sys.version}",
    f"Pandas Version: {pd.__version__}",
    f"NumPy Version: {np.__version__}",
    f"CatBoost Version: {cb.__version__}",
    sep="\n",
)

Python Version: 3.12.10 (tags/v3.12.10:0cc8128, Apr  8 2025, 12:21:36) [MSC v.1943 64 bit (AMD64)]
Pandas Version: 2.3.0
NumPy Version: 2.3.1
CatBoost Version: 1.2.8


In [13]:
# Root logger: INFO and above, formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [14]:
def load_data(filename, data_path='../Data/'):
    """Load a pickled ``(X, y)`` pair from ``data_path``.

    Args:
        filename: Name of the pickle file inside ``data_path``.
        data_path: Directory that holds the pickle. Defaults to ``'../Data/'``,
            the location the notebook has always used.

    Returns:
        The ``(X, y)`` tuple stored in the pickle. Both objects must expose an
        ``.empty`` attribute (pandas DataFrame / Series).

    Raises:
        OSError: If the file cannot be opened.
        TypeError / ValueError: If the pickle does not unpack into two objects.

    An empty ``X`` or ``y`` is reported through ``logging.error`` but is still
    returned, so the caller decides how to react.
    """
    path = os.path.join(data_path, filename)

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point this at pickles produced by this project.
    with open(path, 'rb') as f_in:
        X, y = pickle.load(f_in)

    if X.empty or y.empty:
        logging.error(f'{filename} data is empty')
    else:
        logging.info('Data Loaded succesfully')

    return X, y

In [15]:
# Pickle file names; load_data resolves them against its data directory.
train, test = 'train.pkl', 'test.pkl'

# Each pickle holds a (features, labels) pair.
X_train, y_train = load_data(train)
X_test, y_test = load_data(test)

2025-07-06 15:21:44,532 - INFO - Data Loaded succesfully
2025-07-06 15:21:44,553 - INFO - Data Loaded succesfully


In [16]:
# Inspect the training labels returned by load_data.
y_train

0        0
1        0
2        0
3        0
4        0
        ..
70687    1
70688    1
70689    1
70690    1
70691    1
Name: Diabetes_binary, Length: 70692, dtype: int64

In [17]:
# Inspect the training features returned by load_data.
X_train

Unnamed: 0,BMI,Age,Income,PhysHlth,Education,GenHlth,MentHlth,HighBP,Fruits
0,26,4,8,30,6,3,5,1,0
1,26,12,8,0,6,3,0,1,1
2,26,13,8,10,6,1,0,0,1
3,28,11,8,3,6,3,0,1,1
4,29,8,8,0,5,2,0,0,1
...,...,...,...,...,...,...,...,...,...
70687,37,6,1,0,4,4,0,0,0
70688,29,10,6,0,3,2,0,0,1
70689,25,13,4,0,6,5,15,1,1
70690,18,11,4,0,2,4,0,1,0


In [18]:
# Count missing values per feature column.
print(X_train.isnull().sum())

BMI          0
Age          0
Income       0
PhysHlth     0
Education    0
GenHlth      0
MentHlth     0
HighBP       0
Fruits       0
dtype: int64


In [None]:
# Show the dtype of each feature column.
print(X_train.dtypes)

BMI          int64
Age          int64
Income       int64
PhysHlth     int64
Education    int64
GenHlth      int64
MentHlth     int64
HighBP       int64
Fruits       int64
dtype: object


PermissionError: [WinError 5] Failed to open local file '...'. Detail: [Windows error 5] Přístup byl odepřen.


In [29]:
# Point MLflow at the tracking server on localhost:5000.
# NOTE(review): presumably the server is started outside this notebook — confirm it is running first.
mlflow.set_tracking_uri('http://127.0.0.1:5000')

In [8]:
# Remove the superseded experiment if the tracking server knows one by this name.
# Any failure is printed and swallowed so the notebook keeps running.
old_experiment_name = "optimization_v3"

try:
    old_experiment = mlflow.get_experiment_by_name(old_experiment_name)

    if not old_experiment:
        print(f"Experiment '{old_experiment_name}' nebyl nalezen.")
    else:
        mlflow.delete_experiment(old_experiment.experiment_id)
        print(f"Experiment '{old_experiment_name}' (ID: {old_experiment.experiment_id}) byl archivován.")

except Exception as exc:
    print(f"Došlo k chybě při archivaci experimentu: {exc}")

Experiment 'optimization_v3' (ID: 3) byl archivován.


In [30]:
# Make "optimization_v4" the active experiment for the runs started below.
mlflow.set_experiment("optimization_v4")

<Experiment: artifact_location='mlflow-artifacts:/4', creation_time=1751728213237, experiment_id='4', last_update_time=1751728213237, lifecycle_stage='active', name='optimization_v4', tags={}>

In [31]:
def catboost_objective(params):
    """Hyperopt objective for CatBoost: run one trial inside an MLflow run.

    Builds a ``CatBoostClassifier`` from ``params``, fits it on the module-level
    ``X_train`` / ``y_train`` with ``(X_test, y_test)`` as the early-stopping
    eval set, and logs to MLflow: the parameters, 5-fold ``f1_macro`` mean/std
    on the training data, and macro F1 / accuracy / macro recall on ``X_test``.
    A classification report for ``X_test`` is printed as well.

    Args:
        params: Keyword arguments for ``CatBoostClassifier`` (one sample from
            the search space).

    Returns:
        ``{'loss': 1 - macro_f1_on_X_test, 'status': STATUS_OK}``.

    NOTE(review): the same ``(X_test, y_test)`` pair drives early stopping /
    best-iteration selection and the reported score, so the hold-out metrics
    are optimistic. Consider a separate validation split.
    """
    cat_columns = ('Age', 'GenHlth', 'Education', 'Income')

    with mlflow.start_run():
        mlflow.set_tag('Model', 'Catboost')
        mlflow.log_params(params)

        # CatBoost receives the categorical columns as positional indices.
        cat_positions = list(map(X_train.columns.get_loc, cat_columns))

        classifier = CatBoostClassifier(
            eval_metric='TotalF1',
            early_stopping_rounds=50,
            cat_features=cat_positions,
            **params,
        )
        classifier.fit(
            X_train,
            y_train,
            eval_set=(X_test, y_test),
            use_best_model=True,
            logging_level='Silent',
        )
        holdout_pred = classifier.predict(X_test)

        # Cross-validated macro F1 on the training data.
        fold_f1 = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1_macro')
        mlflow.log_metric("cv_f1_mean", fold_f1.mean())
        mlflow.log_metric("cv_f1_std", fold_f1.std())

        # Hold-out metrics; macro F1 doubles as the optimisation target.
        macro_f1 = f1_score(y_test, holdout_pred, average='macro')
        mlflow.log_metric('f1_macro', macro_f1)
        print(classification_report(y_test, holdout_pred))

        mlflow.log_metric('accuracy', accuracy_score(y_test, holdout_pred))
        mlflow.log_metric('recall', recall_score(y_test, holdout_pred, average='macro'))

    return {'loss': 1 - macro_f1, 'status': STATUS_OK}

# CatBoost search space. hp.loguniform bounds are exponents:
# learning_rate ~ [e^-3, e^-1.6] ≈ [0.05, 0.20], l2_leaf_reg ~ [e^-2, e^1] ≈ [0.14, 2.7].
# The last four entries are fixed settings passed through unchanged to every trial.
catboost_search_space = {
    'depth': scope.int(hp.quniform('depth', 4, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, -1.6),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', -2, 1),
    'bagging_temperature': hp.uniform('bagging_temperature', 0.0, 1.0),
    'random_strength': hp.uniform('random_strength', 0.0, 1.0),
    'border_count': scope.int(hp.quniform('border_count', 32, 64, 16)),  # 32, 48 or 64
    'iterations': 500,
    'loss_function': 'Logloss',
    'verbose': 0,
    'random_seed': 42
}

# Seed the TPE sampler so a re-run proposes the same trial sequence. The models
# were already seeded; the search itself was not.
# NOTE(review): a numpy Generator for `rstate` assumes hyperopt >= 0.2.7 — confirm.
best_result = fmin(
    fn=catboost_objective,
    space=catboost_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
    rstate=np.random.default_rng(42)
)

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

2025-07-06 15:41:56,776 - INFO - build_posterior_wrapper took 0.001022 seconds
2025-07-06 15:41:56,776 - INFO - TPE using 0 trials


              precision    recall  f1-score   support 

           0       0.94      0.73      0.82    213703
           1       0.34      0.74      0.47     39977

    accuracy                           0.73    253680
   macro avg       0.64      0.74      0.64    253680
weighted avg       0.84      0.73      0.77    253680

🏃 View run enthused-doe-715 at: http://127.0.0.1:5000/#/experiments/4/runs/6ef7a180eb164a41a98f93fcf9fad754

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4

  2%|▏         | 1/50 [02:49<2:18:09, 169.17s/trial, best loss: 0.35555440365186364]

2025-07-06 15:44:45,951 - INFO - build_posterior_wrapper took 0.001999 seconds
2025-07-06 15:44:45,952 - INFO - TPE using 1/1 trials with best loss 0.355554


🏃 View run rogue-eel-468 at: http://127.0.0.1:5000/#/experiments/4/runs/c1e60e0474af47078e30cda04e815255

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4                        

  2%|▏         | 1/50 [02:53<2:21:23, 173.13s/trial, best loss: 0.35555440365186364]


KeyboardInterrupt: 

In [28]:
# Spot-check the four columns that catboost_objective passes to CatBoost as
# categorical features: overall dtypes, first rows, and the distinct values of each.
print(X_train.dtypes)
print(X_train[['Age', 'GenHlth', 'Education', 'Income']].head())
for col in ['Age', 'GenHlth', 'Education', 'Income']:
    print(f"Unique values in {col}: {X_train[col].unique()}")

BMI          int64
Age          int64
Income       int64
PhysHlth     int64
Education    int64
GenHlth      int64
MentHlth     int64
HighBP       int64
Fruits       int64
dtype: object
   Age  GenHlth  Education  Income
0    4        3          6       8
1   12        3          6       8
2   13        1          6       8
3   11        3          6       8
4    8        2          5       8
Unique values in Age: [ 4 12 13 11  8  1  6  3  7 10  9  5  2]
Unique values in GenHlth: [3 1 2 4 5]
Unique values in Education: [6 5 4 3 2 1]
Unique values in Income: [8 7 6 3 4 1 5 2]


In [27]:
# Spot-check by position: dtypes, the first ten rows, and the distinct values
# of the column at positional index 1.
print(X_train.dtypes)
print(X_train.head(10))
print(X_train.iloc[:, 1].unique())

BMI          int64
Age          int64
Income       int64
PhysHlth     int64
Education    int64
GenHlth      int64
MentHlth     int64
HighBP       int64
Fruits       int64
dtype: object
   BMI  Age  Income  PhysHlth  Education  GenHlth  MentHlth  HighBP  Fruits
0   26    4       8        30          6        3         5       1       0
1   26   12       8         0          6        3         0       1       1
2   26   13       8        10          6        1         0       0       1
3   28   11       8         3          6        3         0       1       1
4   29    8       8         0          5        2         0       0       1
5   18    1       7         0          4        2         7       0       1
6   26   13       6         0          5        1         0       0       1
7   31    6       3         0          4        4         0       0       1
8   32    3       8         0          6        3         0       0       1
9   27    6       4         6          4        3      

In [None]:
# Wrap the training and hold-out frames in DMatrix objects; xgboost_objective
# reads them as the globals `train` and `valid`.
# NOTE(review): `train` previously held the file name 'train.pkl' in the
# data-loading cell; the name now refers to a different kind of object.
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_test, label=y_test)

def xgboost_objective(params):
    """Hyperopt objective for XGBoost: run one trial inside an MLflow run.

    Trains a booster with ``xgb.train`` on the module-level ``train`` DMatrix,
    early-stopping on ``valid``, and logs to MLflow: the effective parameters,
    5-fold ``f1_macro`` mean/std on ``X_train`` / ``y_train``, and macro F1 /
    accuracy / macro recall on ``y_test``.

    Args:
        params: Booster parameters (one sample from the search space).

    Returns:
        ``{'loss': 1 - macro_f1_on_hold_out, 'status': STATUS_OK}``.

    NOTE(review): ``valid`` is used both for early stopping and for the reported
    score, so the hold-out metrics are optimistic. Consider a separate
    validation split.
    """
    train_params = dict(params)
    # `num_class` belongs to the multi-class objectives only; combined with a
    # binary objective the predictions and labels no longer line up.
    if str(train_params.get('objective', '')).startswith('binary:'):
        train_params.pop('num_class', None)

    with mlflow.start_run():
        mlflow.set_tag('Model', 'XGboost')
        mlflow.log_params(train_params)

        booster = xgb.train(
            params=train_params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50,
            verbose_eval=False
        )

        y_pred_proba = np.asarray(booster.predict(valid))
        if y_pred_proba.ndim == 1:
            # Binary objective: one P(class 1) per row. argmax over axis 1 would
            # raise here, so threshold at 0.5 (the two-class equivalent).
            y_pred = (y_pred_proba > 0.5).astype(int)
        else:
            # Multi-class probabilities: one column per class.
            y_pred = np.argmax(y_pred_proba, axis=1)

        # cross_val_score needs a scikit-learn estimator; a raw Booster cannot
        # be cloned. Re-express the trial as an XGBClassifier with the number of
        # rounds that early stopping selected.
        sk_params = {k: v for k, v in train_params.items() if k not in ('num_class', 'seed')}
        cv_model = xgb.XGBClassifier(
            n_estimators=booster.best_iteration + 1,
            random_state=train_params.get('seed'),
            **sk_params,
        )
        cv_scores = cross_val_score(cv_model, X_train, y_train, cv=5, scoring='f1_macro')
        cv_score = cv_scores.mean()
        cv_std = cv_scores.std()
        mlflow.log_metric("cv_f1_mean", cv_score)
        mlflow.log_metric("cv_f1_std", cv_std)

        score = f1_score(y_test, y_pred, average='macro')
        loss = 1 - score
        mlflow.log_metric('f1_macro', score)

        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy)

        recall = recall_score(y_test, y_pred, average='macro')
        mlflow.log_metric('recall', recall)

    return {'loss': loss, 'status': STATUS_OK}

# XGBoost search space. hp.loguniform bounds are exponents:
# learning_rate ~ [e^-5, e^-1.6] ≈ [0.007, 0.20], reg_alpha ~ [e^-8, 1], reg_lambda ~ [e^-7, 1].
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 3, 10, 1)),
    'learning_rate': hp.loguniform('learning_rate', -5, -1.6),
    'reg_alpha': hp.loguniform('reg_alpha', -8, 0),
    'reg_lambda': hp.loguniform('reg_lambda', -7, 0),
    # Bounds are given in log space so the sampled value lies in [1, 10].
    # The previous raw bounds (1, 10) sampled e^1..e^10 (up to ~22026), far more
    # than the hessian mass the ~70k training rows can supply, which blocks splits.
    'min_child_weight': hp.loguniform('min_child_weight', np.log(1), np.log(10)),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    # Binary objective: `num_class` is only valid for the multi-class
    # objectives, so it is not set here.
    'objective': 'binary:logistic',
    'seed': 42
}

# Seed the TPE sampler so a re-run proposes the same trial sequence.
# NOTE(review): a numpy Generator for `rstate` assumes hyperopt >= 0.2.7 — confirm.
best_result = fmin(
    fn=xgboost_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
    rstate=np.random.default_rng(42)
)

In [None]:
def rf_objective(params):
    """Hyperopt objective for a random forest: run one trial inside an MLflow run.

    Fits a ``RandomForestClassifier`` on the module-level ``X_train`` /
    ``y_train`` and logs to MLflow: the effective parameters, 5-fold
    ``f1_macro`` mean/std on the training data, and macro F1 / accuracy /
    macro recall on ``X_test``.

    Args:
        params: One sample from the search space. Must contain every key read
            below. The dict itself is not modified.

    Returns:
        ``{'loss': 1 - macro_f1_on_X_test, 'status': STATUS_OK}``.
    """
    # Work on a copy so the reconciliation below never mutates the caller's dict.
    params = dict(params)

    # Out-of-bag scoring needs bootstrap sampling; scikit-learn rejects
    # oob_score=True together with bootstrap=False.
    if not params['bootstrap']:
        params['oob_score'] = False

    with mlflow.start_run():
        mlflow.set_tag('Model', 'Random Forest')
        # Logged after the reconciliation so MLflow records the configuration
        # that is actually trained.
        mlflow.log_params(params)

        model = RandomForestClassifier(
            n_estimators=int(params['n_estimators']),
            max_depth=params['max_depth'],
            criterion=params['criterion'],
            min_samples_split=int(params['min_samples_split']),
            min_samples_leaf=int(params['min_samples_leaf']),
            min_weight_fraction_leaf=params['min_weight_fraction_leaf'],
            max_features=params['max_features'],
            max_leaf_nodes=params['max_leaf_nodes'],
            bootstrap=params['bootstrap'],
            oob_score=params['oob_score'],
            class_weight=params['class_weight'],
            random_state=42,
            n_jobs=-1
            )

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Cross-validated macro F1 on the training data.
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
        cv_score = cv_scores.mean()
        cv_std = cv_scores.std()
        mlflow.log_metric("cv_f1_mean", cv_score)
        mlflow.log_metric("cv_f1_std", cv_std)

        # Hold-out metrics; macro F1 doubles as the optimisation target.
        f1 = f1_score(y_test, y_pred, average='macro')
        loss = 1 - f1
        mlflow.log_metric('f1_macro', f1)

        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric('accuracy', accuracy)

        recall = recall_score(y_test, y_pred, average='macro')
        mlflow.log_metric('recall', recall)

    # Returned after the run is closed, like the other objectives in this notebook.
    return {'loss': loss, 'status': STATUS_OK}
    

# Random-forest search space. `max_depth` and `max_leaf_nodes` choose between
# None and a sampled integer.
# NOTE(review): `oob_score` is sampled independently of `bootstrap`; rf_objective
# forces it to False whenever bootstrap is off, so part of this dimension is inert.
search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1000, 100)),
    'max_depth': hp.choice('max_depth', [None] + [scope.int(hp.quniform('max_depth_val', 5, 50, 1))]),
    'criterion': hp.choice('criterion', ['gini', 'entropy', 'log_loss']),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 20, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 20, 1)),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0.0, 0.4),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None]),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', [None] + [scope.int(hp.quniform('max_leaf_nodes_val', 10, 100, 1))]),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'oob_score': hp.choice('oob_score', [True, False]),
    'class_weight': hp.choice('class_weight', ['balanced', None])
    }

# Seed the TPE sampler so a re-run proposes the same trial sequence.
# NOTE(review): a numpy Generator for `rstate` assumes hyperopt >= 0.2.7 — confirm.
best_result = fmin(
    fn=rf_objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
    rstate=np.random.default_rng(42)
)

In [None]:
def logreg_objective(params):
    """Hyperopt objective for logistic regression: run one trial inside an MLflow run.

    Fits a class-balanced ``LogisticRegression`` on the module-level
    ``X_train`` / ``y_train`` and logs to MLflow: the parameters, 5-fold
    ``f1_macro`` mean/std on the training data, and macro F1 / accuracy /
    macro recall on ``X_test``.

    Args:
        params: Dict with the keys ``'penalty'``, ``'C'`` and ``'solver'``.

    Returns:
        ``{'loss': 1 - macro_f1_on_X_test, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag('Model', 'LogisticRegression')
        mlflow.log_params(params)

        classifier = LogisticRegression(
            class_weight='balanced',
            max_iter=1000,
            random_state=42,
            penalty=params['penalty'],
            C=params['C'],
            solver=params['solver'],
        )
        classifier.fit(X_train, y_train)
        holdout_pred = classifier.predict(X_test)

        # Cross-validated macro F1 on the training data.
        fold_f1 = cross_val_score(classifier, X_train, y_train, cv=5, scoring='f1_macro')
        mlflow.log_metric("cv_f1_mean", fold_f1.mean())
        mlflow.log_metric("cv_f1_std", fold_f1.std())

        # Hold-out metrics; macro F1 doubles as the optimisation target.
        macro_f1 = f1_score(y_test, holdout_pred, average='macro')
        mlflow.log_metric('f1_macro', macro_f1)
        mlflow.log_metric('accuracy', accuracy_score(y_test, holdout_pred))
        mlflow.log_metric('recall', recall_score(y_test, holdout_pred, average='macro'))

    return {'loss': 1 - macro_f1, 'status': STATUS_OK}

# Logistic-regression search space. C ~ [e^-4, e^2] ≈ [0.018, 7.4], since
# hp.loguniform bounds are exponents.
logreg_search_space = {
    'penalty': hp.choice('penalty', ['l1', 'l2']),
    'C': hp.loguniform('C', -4, 2),
    'solver': hp.choice('solver', ['liblinear', 'saga'])
}

# Seed the TPE sampler so a re-run proposes the same trial sequence.
# NOTE(review): a numpy Generator for `rstate` assumes hyperopt >= 0.2.7 — confirm.
best_result = fmin(
    fn=logreg_objective,
    space=logreg_search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
    rstate=np.random.default_rng(42)
)

In [23]:
# Re-read the training pickle directly: the same steps load_data performs,
# without its empty-check logging.
data_path = '../Data/'
filename = 'train.pkl'
path = os.path.join(data_path, filename)
with open(path, 'rb') as f_in:
    # pickle.load can execute code embedded in the file; only use trusted pickles.
    X, y  = pickle.load(f_in)

In [25]:
# Preview the first rows of the freshly unpickled features.
X.head()

Unnamed: 0,BMI,Age,Income,PhysHlth,Education,GenHlth,MentHlth,HighBP,Fruits
0,26,4,8,30,6,3,5,1,0
1,26,12,8,0,6,3,0,1,1
2,26,13,8,10,6,1,0,0,1
3,28,11,8,3,6,3,0,1,1
4,29,8,8,0,5,2,0,0,1


In [26]:
# dtypes of the freshly unpickled features.
# NOTE(review): the saved output shows int32 here but int64 for X_train in the
# earlier cells — possibly produced in different sessions or from a different pickle; confirm.
X.dtypes

BMI          int32
Age          int32
Income       int32
PhysHlth     int32
Education    int32
GenHlth      int32
MentHlth     int32
HighBP       int32
Fruits       int32
dtype: object

In [28]:
# Self-contained re-check: load the training pickle again and print the column dtypes.
import pickle
with open('../Data/train.pkl', 'rb') as f:
    # pickle.load can execute code embedded in the file; only use trusted pickles.
    X, y = pickle.load(f)

print(X.dtypes)

BMI          int32
Age          int32
Income       int32
PhysHlth     int32
Education    int32
GenHlth      int32
MentHlth     int32
HighBP       int32
Fruits       int32
dtype: object


In [None]:
from mlflow.tracking import MlflowClient
import re
def set_new_experiment_name(base_name="classification_experiment"):
    """Return the next unused ``<base_name>_v<N>`` experiment name.

    Queries the tracking server for experiments with ``ViewType.ALL``, collects
    the version numbers of those named exactly ``<base_name>_v<digits>`` and
    returns the name with the highest version plus one (``_v1`` when there is
    none).

    Despite its name, this function only computes the name. It does not create
    or activate an experiment.

    Args:
        base_name: Experiment name prefix. It is treated as literal text, so
            regex metacharacters in it are safe.

    Returns:
        The new experiment name as a string.
    """
    client = MlflowClient()
    experiments = client.search_experiments(view_type=mlflow.entities.ViewType.ALL)

    # Escape the prefix and require a full match: an unescaped prefix such as
    # "exp(1)" would never match its own names, and an unanchored search would
    # pick up versions from unrelated names such as "exp_old_exp_v9".
    version_pattern = re.compile(rf"{re.escape(base_name)}_v(\d+)")
    versions = [
        int(match.group(1))
        for match in (version_pattern.fullmatch(e.name) for e in experiments)
        if match
    ]

    next_version = max(versions, default=0) + 1
    return f"{base_name}_v{next_version}"

In [34]:
# Compute and show the next free experiment name for the default base name.
name = set_new_experiment_name()
print(name)

classification_experiment_v1


In [37]:
# Fetch the experiments visible under ViewType.ALL for ad-hoc inspection.
client = MlflowClient()
experiments = client.search_experiments(view_type=mlflow.entities.ViewType.ALL)