In [1]:
import sys
# !{sys.executable} -m pip install optuna-integration[mlflow]


### Настройка окружения

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
import numpy as np
import os
import psycopg
import pandas as pd
import mlflow
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix, precision_score, recall_score, f1_score, log_loss
from optuna.integration.mlflow import MLflowCallback
from collections import defaultdict


* 'schema_extra' has been renamed to 'json_schema_extra'
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
TABLE_NAME = 'users_churn'

TRACKING_SERVER_HOST = '127.0.0.1'
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = 'churn_task_alexdem'

In [4]:
# credentials postgres

connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

In [5]:
# mlflow settings

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [6]:
# выгрузка данных

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(2) 

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,gender,streaming_movies,senior_citizen,partner,dependents,multiple_lines,target
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,Female,No,0,Yes,No,,0
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,Male,No,0,No,No,No,0


In [7]:
# обработка признаков ранее обученным трансформером

cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
    'type'
]
num_features = ["monthly_charges", "total_charges"]
target = ['target'] # колонка с таргетом вашей модели
df[num_features] = df[num_features].fillna(0)

df['senior_citizen'] = df['senior_citizen'].map({1:'Yes', 0:'No'})

logged_transformer = 'runs:/01e47211b28c4a6cbc96fc7f9302b453/column_transformer'

# Load model
logged_transformer = mlflow.sklearn.load_model(logged_transformer)

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 22.63it/s]


In [8]:
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = 'begin_date'
stratify_column = target
test_size = 0.2

df = df.sort_values(by=[split_column])

X_train, X_test, y_train, y_test = train_test_split(df[cat_features + num_features], df[target], test_size=test_size, shuffle=False)

X_train = logged_transformer.transform(X_train)
X_test = logged_transformer.transform(X_test)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

Размер выборки для обучения: (5634, 36)
Размер выборки для теста: (1409, 36)


In [9]:
X_train = pd.DataFrame(X_train, columns=logged_transformer.get_feature_names_out())
X_test = pd.DataFrame(X_test, columns=logged_transformer.get_feature_names_out())


### Подбор гиперпараметров

#### GridSearch

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix, precision_score, recall_score, f1_score, log_loss

In [12]:
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

params = {
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'l2_leaf_reg': [1, 5, 20],
}

model = CatBoostClassifier(loss_function=loss_function, verbose=verbose, task_type=task_type, random_seed=random_seed, iterations=iterations)

cv = GridSearchCV(estimator=model, param_grid=params, cv=2, scoring='accuracy', n_jobs=-1)

clf = cv.fit(X_train, y_train)

In [14]:
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

model_best = CatBoostClassifier(loss_function=loss_function, **best_params, verbose=verbose, task_type=task_type, random_seed=random_seed, iterations=iterations)

model_best.fit(X_train, y_train)

prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]

# расчёт метрик качества
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# сохранение метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

print(cv_results.columns)
# дополнительные метрики из результатов кросс-валидации
metrics["mean_fit_time"] = cv_results['mean_fit_time'].mean()# среднее время обучения
metrics["std_fit_time"] =  cv_results['std_fit_time'].mean()# стандартное отклонение времени обучения
metrics["mean_test_score"] = cv_results['mean_test_score'].mean()# средний результат на тесте
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics['best_score'] = clf.best_score_

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_depth', 'param_l2_leaf_reg', 'param_learning_rate', 'params',
       'split0_test_score', 'split1_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')


In [29]:
EXPERIMENT_NAME

'churn_task_alexndem'

In [32]:
RUN_NAME = 'model_grid_search' # ваш код здесь
REGISTRY_MODEL_NAME = 'model_churn_grid_search'

# настройки для логирования в MLFlow
pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model( 
			cb_model=model_best,
            artifact_path="models",
            registered_model_name=REGISTRY_MODEL_NAME,
            signature=signature,
            input_example=input_example,
            pip_requirements=pip_requirements)
		
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)

Successfully registered model 'model_churn_grid_search'.
2025/06/05 20:32:16 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: model_churn_grid_search, version 1
Created version '1' of model 'model_churn_grid_search'.


#### RandomizedSearch

In [35]:
from sklearn.model_selection import RandomizedSearchCV


In [36]:
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

param_distributions = {
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'l2_leaf_reg': [1, 5, 20],
}

model = CatBoostClassifier(loss_function=loss_function, verbose=verbose, task_type=task_type, random_seed=random_seed, iterations=iterations)

cv = RandomizedSearchCV(estimator=model, param_distributions=param_distributions, n_iter=20, cv=2, scoring='accuracy', n_jobs=-1)

clf = cv.fit(X_train, y_train)

In [37]:
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

model = CatBoostClassifier(loss_function=loss_function, **best_params, verbose=verbose, task_type=task_type, random_seed=random_seed, iterations=iterations)

model.fit(X_train, y_train)

prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]

# расчёт метрик качества
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

# сохранение метрик в словарь
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# дополнительные метрики из результатов кросс-валидации
metrics["mean_fit_time"] = cv_results['mean_fit_time'].mean()# среднее время обучения
metrics["std_fit_time"] =  cv_results['std_fit_time'].mean()# стандартное отклонение времени обучения
metrics["mean_test_score"] = cv_results['mean_test_score'].mean()# средний результат на тесте
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics['best_score'] = clf.best_score_

In [38]:
RUN_NAME = 'model_random_search' # ваш код здесь
REGISTRY_MODEL_NAME = 'model_churn_random_search'

# настройки для логирования в MLFlow
pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model( 
			cb_model=model_best,
            artifact_path="models",
            registered_model_name=REGISTRY_MODEL_NAME,
            signature=signature,
            input_example=input_example,
            pip_requirements=pip_requirements)
		
    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)

Successfully registered model 'model_churn_random_search'.
2025/06/05 20:35:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: model_churn_random_search, version 1
Created version '1' of model 'model_churn_random_search'.


#### optuna

In [11]:
RUN_NAME = "model_bayesian_search"
STUDY_DB_NAME = "sqlite:///local.study.db"
STUDY_NAME = "churn_model"


In [14]:
def objective(trial: optuna.Trial) -> float:
    param = {
    "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
    "depth": trial.suggest_int("depth", 1, 12),
    "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.1, 5),
    "random_strength": trial.suggest_float("random_strength", 0.1, 5),
    "loss_function": "Logloss",
    "task_type": "CPU",
    "random_seed": 0,
    "iterations": 300,
    "verbose": False,
    }
    model = CatBoostClassifier(**param)

    skf = StratifiedKFold(n_splits=2)

    metrics = defaultdict(list)
    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        train_x = X_train.iloc[train_index]
        val_x = X_train.iloc[val_index]
        train_y = y_train.iloc[train_index]
        val_y = y_train.iloc[val_index]

        model.fit(train_x, train_y)
        prediction = model.predict(val_x)
        probas = model.predict_proba(val_x)[:, 1]

        _, err_1, _, err_2 = confusion_matrix(val_y, prediction, normalize='all').ravel()
        auc = roc_auc_score(val_y, probas)
        precision = precision_score(val_y, prediction)
        recall = recall_score(val_y, prediction)
        f1 = f1_score(val_y, prediction)
        logloss = log_loss(val_y, prediction)
        
        metrics["err1"].append(err_1)
        metrics["err2"].append(err_2)
        metrics["auc"].append(auc)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["f1"].append(f1)
        metrics["logloss"].append(logloss)


    # ваш код здесь #
    err_1 = np.median(metrics['err1'])
    err_2 = np.median(metrics['err2'])
    auc = np.median(metrics['auc'])
    precision = np.median(metrics['precision'])
    recall = np.median(metrics['recall'])
    f1 = np.median(metrics['f1'])
    logloss = np.median(metrics['logloss'])
    

    return auc

In [32]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id
    

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    

mlflc = MLflowCallback(
    tracking_uri=f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}",
    metric_name="AUC",
    create_experiment=False,
    mlflow_kwargs={"experiment_id": experiment_id, 'tags': {'mlflow.parentRunId': run_id}, 'artifact_location': f'runs:/{run_id}/models'})

study = optuna.create_study(study_name=STUDY_NAME, storage=STUDY_DB_NAME, direction="maximize", sampler=optuna.samplers.TPESampler(), load_if_exists=True)
study.optimize(objective, n_trials=10, callbacks=[mlflc])
best_params = study.best_params

print(f"Number of finished trials: {len(study.trials)}")
print(f"Best params: {best_params}")


Exception: Run with UUID 7084f9826b0c41c9a5ce9330658d2c6c is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [35]:
param = {
    "loss_function": "Logloss",
    "task_type": "CPU",
    "random_seed": 0,
    "iterations": 300,
    "verbose": False,
    }
param.update(best_params)
model = CatBoostClassifier(**param)

model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f584881be20>

In [38]:
signature = mlflow.models.infer_signature(X_test, model.predict(X_test))
input_example = X_test[:10]

with mlflow.start_run(run_id='240283b019ec4dc59b338b0275436ccf'):
    # Логирование произвольного файла
    
    # Логирование модели CatBoost
    mlflow.sklearn.log_model(
        model,
        artifact_path="cv",
        signature=signature,
        input_example=input_example,
        pip_requirements="requirements.txt"
    )



0:	learn: 0.6895686	total: 279ms	remaining: 4m 38s
1:	learn: 0.6862826	total: 552ms	remaining: 4m 35s
2:	learn: 0.6829717	total: 819ms	remaining: 4m 32s
3:	learn: 0.6799125	total: 946ms	remaining: 3m 55s
4:	learn: 0.6769624	total: 1.21s	remaining: 4m
5:	learn: 0.6738109	total: 1.48s	remaining: 4m 4s
6:	learn: 0.6707051	total: 1.74s	remaining: 4m 6s
7:	learn: 0.6672643	total: 2.01s	remaining: 4m 9s
8:	learn: 0.6634997	total: 2.28s	remaining: 4m 10s
9:	learn: 0.6601847	total: 2.54s	remaining: 4m 11s
10:	learn: 0.6574501	total: 2.8s	remaining: 4m 12s
11:	learn: 0.6544715	total: 3.07s	remaining: 4m 12s
12:	learn: 0.6518143	total: 3.34s	remaining: 4m 13s
13:	learn: 0.6491701	total: 3.62s	remaining: 4m 15s
14:	learn: 0.6462433	total: 3.9s	remaining: 4m 15s
15:	learn: 0.6435364	total: 4.17s	remaining: 4m 16s
16:	learn: 0.6406690	total: 4.43s	remaining: 4m 16s
17:	learn: 0.6378475	total: 4.7s	remaining: 4m 16s
18:	learn: 0.6350608	total: 4.98s	remaining: 4m 16s
19:	learn: 0.6325283	total: 5.25

Successfully registered model 'model_bayesian_search'.
2025/06/07 17:47:29 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: model_bayesian_search, version 1
Created version '1' of model 'model_bayesian_search'.


In [None]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    

Registered model 'model_bayesian_search' already exists. Creating a new version of this model...
2025/06/07 18:21:34 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: model_bayesian_search, version 2
Created version '2' of model 'model_bayesian_search'.


In [28]:
run_id = 'a7260cb0814241859bf6dd363621741e'
    


TypeError: CatBoost.save_model() got an unexpected keyword argument 'run_id'