In [54]:
from collections import defaultdict
import os
import psycopg
import pandas as pd
from numpy import random, array, median
import mlflow
from mlflow.utils.mlflow_tags import MLFLOW_PARENT_RUN_ID
import optuna
from optuna.integration.mlflow import MLflowCallback
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    f1_score,
    precision_score,
    recall_score,
    confusion_matrix,
    log_loss,
)
from mlflow.models.signature import infer_signature

pip_requirements = "/home/mle-user/mle_projects/mle-mlflow/requirements.txt"
TABLE_NAME = "users_churn"
# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ –ø–æ–¥–∫–ª—é—á–µ–Ω–∏—è
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}
connection.update(postgres_credentials)

# –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö
with psycopg.connect(**connection) as conn:
    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

In [55]:
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"
split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=test_size,
    shuffle=False,
)

In [59]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5001

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") 
AWS_BUCKET_NAME="s3-student-mle-20250130-d1608e0ec6"
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
# –£—Å—Ç–∞–Ω–æ–≤–∫–∞ URI –¥–ª—è —Ö—Ä–∞–Ω–µ–Ω–∏—è –∞—Ä—Ç–µ—Ñ–∞–∫—Ç–æ–≤
artifact_location = f"s3://{AWS_BUCKET_NAME}"

In [60]:
# –§—É–Ω–∫—Ü–∏—è –æ–ø—Ç–∏–º–∏–∑–∞—Ü–∏–∏ - –∏–∑–º–µ–Ω–∏–º, —á—Ç–æ–±—ã –∏–∑–±–µ–∂–∞—Ç—å –≤–ª–æ–∂–µ–Ω–Ω—ã—Ö –∑–∞–ø—É—Å–∫–æ–≤ –≤–Ω—É—Ç—Ä–∏
def objective(trial: optuna.Trial) -> float:
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
        "random_strength": trial.suggest_float("random_strength", 0.1, 5),
        "loss_function": "Logloss",
        "task_type": "CPU",
        "random_seed": 0,
        "iterations": 300,
        "verbose": False,
    }

    model = CatBoostClassifier(**param)
    skf = StratifiedKFold(n_splits=2)
    metrics = defaultdict(list)

    for train_index, val_index in skf.split(X_train, y_train):
        train_x, val_x = X_train.iloc[train_index], X_train.iloc[val_index]
        train_y, val_y = y_train.iloc[train_index], y_train.iloc[val_index]

        model.fit(train_x, train_y)
        probas = model.predict_proba(val_x)[:, 1]
        prediction = model.predict(val_x)

        # –ü–æ–ª—É—á–µ–Ω–∏–µ –º–µ—Ç—Ä–∏–∫ –∏–∑ –º–∞—Ç—Ä–∏—Ü—ã –æ—à–∏–±–æ–∫
        _, err1, _, err2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        
        # –°–±–æ—Ä –≤—Å–µ—Ö –º–µ—Ç—Ä–∏–∫
        metrics["err1"].append(err1)
        metrics["err2"].append(err2)
        metrics["auc"].append(roc_auc_score(val_y, probas))
        metrics["f1"].append(f1_score(val_y, prediction))
        metrics["precision"].append(precision_score(val_y, prediction))
        metrics["recall"].append(recall_score(val_y, prediction))
        metrics["logloss"].append(log_loss(val_y, probas))

    # –ê–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –º–µ—Ç—Ä–∏–∫
    err1 = median(array(metrics["err1"]))
    err2 = median(array(metrics["err2"]))
    auc = median(array(metrics["auc"]))
    precision = median(array(metrics["precision"]))
    recall = median(array(metrics["recall"]))
    f1 = median(array(metrics["f1"]))
    logloss = median(array(metrics["logloss"]))

    # –í–º–µ—Å—Ç–æ –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏—è –≤–Ω—É—Ç—Ä–∏ —Ñ—É–Ω–∫—Ü–∏–∏, –ø—Ä–æ—Å—Ç–æ –¥–æ–±–∞–≤–∏–º –º–µ—Ç—Ä–∏–∫–∏ –≤ trial
    trial.set_user_attr("err1", float(err1))
    trial.set_user_attr("err2", float(err2))
    trial.set_user_attr("precision", float(precision))
    trial.set_user_attr("recall", float(recall))
    trial.set_user_attr("f1", float(f1))
    trial.set_user_attr("logloss", float(logloss))
    
    return auc

In [61]:
# –ù–∞—Å—Ç—Ä–æ–π–∫–∞ MLflow –∏ Optuna
EXPERIMENT_NAME = "catboost_hyperparameter_optimization4"
RUN_NAME = "other_model_bayesian_search4"
STUDY_NAME = "churn_bayesian_search_other4"
STUDY_DB_NAME = "sqlite:///local.study.db"


In [62]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

# –£–±–µ–¥–∏–º—Å—è, —á—Ç–æ –Ω–µ—Ç –∞–∫—Ç–∏–≤–Ω—ã—Ö –∑–∞–ø—É—Å–∫–æ–≤ –ø–µ—Ä–µ–¥ –Ω–∞—á–∞–ª–æ–º –Ω–æ–≤–æ–≥–æ
if mlflow.active_run():
    mlflow.end_run()

# –°–æ–∑–¥–∞–µ–º Optuna study –±–µ–∑ –∏—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–∏—è MLflowCallback
study = optuna.create_study(
    direction="maximize",
    study_name=STUDY_NAME,
    storage=STUDY_DB_NAME,
    sampler=optuna.samplers.TPESampler(),
    load_if_exists=True
)

[I 2025-09-17 23:18:57,984] A new study created in RDB with name: churn_bayesian_search_other4


In [63]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as parent_run:
    parent_run_id = parent_run.info.run_id
    
    # –û–ø—Ç–∏–º–∏–∑–∞—Ü–∏—è –±–µ–∑ MLflowCallback
    study.optimize(objective, n_trials=10)
    
    # –õ–æ–≥–∏—Ä—É–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –∫–∞–∂–¥–æ–≥–æ trial –≤—Ä—É—á–Ω—É—é
    for trial in study.trials:
        # –°–æ–∑–¥–∞–µ–º –≤–ª–æ–∂–µ–Ω–Ω—ã–π –∑–∞–ø—É—Å–∫ –¥–ª—è –∫–∞–∂–¥–æ–≥–æ trial
        with mlflow.start_run(run_name=f"trial_{trial.number}", nested=True) as child_run:
            # –õ–æ–≥–∏—Ä—É–µ–º –ø–∞—Ä–∞–º–µ—Ç—Ä—ã
            params = trial.params
            mlflow.log_params(params)
            
            # –õ–æ–≥–∏—Ä—É–µ–º –º–µ—Ç—Ä–∏–∫–∏
            mlflow.log_metric("auc", trial.value)
            for key in ["err1", "err2", "precision", "recall", "f1", "logloss"]:
                if key in trial.user_attrs:
                    mlflow.log_metric(key, trial.user_attrs[key])
    
    # –õ–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–µ –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏ –≤ MLflow
    best_params = study.best_params
    best_model = CatBoostClassifier(
        learning_rate=best_params["learning_rate"],
        depth=best_params["depth"],
        l2_leaf_reg=best_params["l2_leaf_reg"],
        random_strength=best_params["random_strength"],
        loss_function="Logloss",
        task_type="CPU",
        random_seed=0,
        iterations=300,
        verbose=False
    )
    best_model.fit(X_train, y_train)

    signature = mlflow.models.infer_signature(X_test, best_model.predict(X_test))
    input_example = X_test[:10]

    # –ü—Ä–∏ –ª–æ–≥–∏—Ä–æ–≤–∞–Ω–∏–∏ –º–æ–¥–µ–ª–∏, –æ–Ω–∞ –±—É–¥–µ—Ç –∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∏ —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∞ –≤ S3
    mlflow.catboost.log_model(
    best_model,
    artifact_path="cv",
    signature=signature,
    input_example=input_example,
    pip_requirements=pip_requirements,
)
    mlflow.log_params(best_params)
    mlflow.log_metric("auc", study.best_value)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-09-17 23:19:03,535] Trial 0 finished with value: 0.7064139331484232 and parameters: {'learning_rate': 0.00977814032643483, 'depth': 1, 'l2_leaf_reg': 2.648285789734457, 'random_strength': 3.6352919304495988}. Best is trial 0 with value: 0.7064139331484232.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-09-17 23:19:04,683] Trial 1 finished with value: 0.7744640966584169 and parameters: {'learning_rate': 0.006575891606504968, 'depth': 5, 'l2_leaf_reg': 3.7254232292533453, 'random_strength': 4.762556176873608}. Best is trial 1 with value: 0.7744640966584169.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2025-09-17 23:19:07,385] Trial 2 finished with value: 0.7332109110674263 and parameters: {'learning_rate': 0.0020656757853478344, 'depth': 11, 'l2_leaf_reg': 2.71765492274798

üèÉ View run trial_0 at: http://127.0.0.1:5001/#/experiments/24/runs/12d1ca30523a4d4388bfa3a6fb862ae8
üß™ View experiment at: http://127.0.0.1:5001/#/experiments/24
üèÉ View run trial_1 at: http://127.0.0.1:5001/#/experiments/24/runs/fd109b49547643e8b206ec0915c722ac
üß™ View experiment at: http://127.0.0.1:5001/#/experiments/24
üèÉ View run trial_2 at: http://127.0.0.1:5001/#/experiments/24/runs/56034c8d91f04047a04076de6030d1a7
üß™ View experiment at: http://127.0.0.1:5001/#/experiments/24
üèÉ View run trial_3 at: http://127.0.0.1:5001/#/experiments/24/runs/952ae21b618d4ac9a8b425311e7b490a
üß™ View experiment at: http://127.0.0.1:5001/#/experiments/24
üèÉ View run trial_4 at: http://127.0.0.1:5001/#/experiments/24/runs/d11c2c4c3bd94a23ac3172c926f6664c
üß™ View experiment at: http://127.0.0.1:5001/#/experiments/24
üèÉ View run trial_5 at: http://127.0.0.1:5001/#/experiments/24/runs/1bf774db0b934fffb93f18326a676dad
üß™ View experiment at: http://127.0.0.1:5001/#/experiments/2

 - mlflow (current: 3.3.2, required: mlflow==2.7.1)
 - jupyterlab (current: uninstalled, required: jupyterlab==4.0.7)
 - psycopg (current: 3.2.9, required: psycopg[binary,pool]==3.1.12)
 - pandas (current: 2.1.3, required: pandas==2.0.1)
 - scikit-learn (current: 1.4.1.post1, required: scikit-learn==1.3.1)
 - catboost (current: 1.2.3, required: catboost==1.2.2)
 - scipy (current: 1.12.0, required: scipy==1.11.3)
 - optuna (current: 4.5.0, required: optuna==3.4.0)
 - ipywidgets (current: uninstalled, required: ipywidgets==8.1.1)
 - seaborn (current: 0.13.2, required: seaborn==0.13.0)
 - autofeat (current: 2.1.3, required: autofeat==2.1.2)
 - mlxtend (current: 0.23.4, required: mlxtend==0.23.0)
 - plotly (current: 5.19.0, required: plotly==5.20.0)
 - kaleido (current: uninstalled, required: kaleido==0.2.1)
 - boto3 (current: 1.34.51, required: boto3==1.34.77)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install depende

üèÉ View run other_model_bayesian_search4 at: http://127.0.0.1:5001/#/experiments/27/runs/cc4ff940c65945778cc1231bcafb517c
üß™ View experiment at: http://127.0.0.1:5001/#/experiments/27
