### Настройка окружения

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostClassifier
import numpy as np
import os
import psycopg
import pandas as pd
import mlflow
from catboost import CatBoostClassifier


* 'schema_extra' has been renamed to 'json_schema_extra'


In [30]:
# --- Notebook-wide configuration -------------------------------------------

# Table that the data-loading cell selects from.
TABLE_NAME: str = "users_churn"

# Address of the MLflow tracking server; host and port are combined into
# the tracking/registry URI by the MLflow settings cell.
TRACKING_SERVER_HOST: str = "127.0.0.1"
TRACKING_SERVER_PORT: int = 5000

# MLflow experiment that the runs in this notebook are logged under.
EXPERIMENT_NAME: str = "churn_task_alexdem"

In [3]:
# Postgres credentials
#
# Every credential is read from a DB_DESTINATION_* environment variable, so
# nothing secret is stored in the notebook. A variable that is not set
# yields None for the corresponding key.
postgres_credentials = {
    field: os.getenv(f"DB_DESTINATION_{suffix}")
    for field, suffix in (
        ("host", "HOST"),
        ("port", "PORT"),
        ("dbname", "NAME"),
        ("user", "USER"),
        ("password", "PASSWORD"),
    )
}

# Keyword arguments for psycopg.connect(): fixed session options first,
# followed by the credentials read above.
connection = {
    "sslmode": "require",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

In [24]:
# MLflow settings

# S3-compatible endpoint used for MLflow artifact storage.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The storage credentials must already be present in the environment.
# Re-assigning os.environ[X] = os.getenv(X) was a no-op when the variable was
# set and raised an opaque "str expected, not NoneType" TypeError when it was
# not, so check explicitly and fail with a readable message instead.
missing_aws_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if name not in os.environ
]
if missing_aws_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_aws_vars)
        + ". Export them before starting the notebook kernel."
    )

# Tracking and model registry are served by the same MLflow server.
tracking_server_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_server_uri)
mlflow.set_registry_uri(tracking_server_uri)

In [5]:
# Download the data
#
# Pull the whole table into memory and build a DataFrame whose column names
# are taken from the cursor metadata.
select_all_query = f"SELECT * FROM {TABLE_NAME}"

with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(select_all_query)
    # cur.description holds one entry per result column; item 0 is its name.
    columns = [column_meta[0] for column_meta in cur.description]
    data = cur.fetchall()

df = pd.DataFrame(data, columns=columns)

# Preview the first two rows as the cell output.
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,gender,streaming_movies,senior_citizen,partner,dependents,multiple_lines,target
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,Female,No,0,Yes,No,,0
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,Male,No,0,No,No,No,0


In [6]:
# Feature processing with a previously fitted transformer

# Columns treated as categorical features.
cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
    'type'
]
# Columns treated as numeric features.
num_features = ["monthly_charges", "total_charges"]
target = ['target']  # column holding the model target

# Missing numeric values are replaced with 0.
df[num_features] = df[num_features].fillna(0)

# Recode the 0/1 flag as 'No'/'Yes'.
# A plain dict .map() turns every value that is not a key into NaN, so running
# this cell a second time (when the column already holds 'Yes'/'No') wiped the
# whole column. Falling back to the existing value keeps the cell idempotent.
senior_citizen_labels = {1: 'Yes', 0: 'No'}
df['senior_citizen'] = df['senior_citizen'].map(
    lambda value: senior_citizen_labels.get(value, value)
)

# URI of the column transformer artifact logged in an earlier MLflow run.
# Kept under its own name so `logged_transformer` always refers to the loaded
# object and never to the URI string.
logged_transformer_uri = 'runs:/01e47211b28c4a6cbc96fc7f9302b453/column_transformer'

# Load the fitted transformer from MLflow.
logged_transformer = mlflow.sklearn.load_model(logged_transformer_uri)

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 24.54it/s]


In [None]:
# Chronological hold-out split followed by feature transformation.
#
# NOTE(review): `features` and `stratify_column` are defined here but are not
# referenced by the split below, which selects cat_features + num_features and
# does not stratify (shuffle=False).
features = ["monthly_charges", "total_charges", "senior_citizen"]
target = "target"

split_column = 'begin_date'
stratify_column = target
test_size = 0.2

# Order rows by the split column so that, with shuffle=False, the last
# `test_size` share of the sorted rows becomes the test set.
df = df.sort_values(by=[split_column])

model_inputs = df[cat_features + num_features]
labels = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    model_inputs,
    labels,
    test_size=test_size,
    shuffle=False,
)

# Apply the already-fitted transformer to both parts (transform only, no fit).
X_train, X_test = (
    logged_transformer.transform(part) for part in (X_train, X_test)
)

print(f"Размер выборки для обучения: {X_train.shape}")
print(f"Размер выборки для теста: {X_test.shape}")

### Подбор гиперпараметров

#### GridSearch

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, confusion_matrix, precision_score, recall_score, f1_score, log_loss

In [12]:
# Settings shared by every CatBoost model built in the grid search.
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

# Hyperparameter grid: 3 x 3 x 3 = 27 candidate combinations.
params = {
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'l2_leaf_reg': [1, 5, 20],
}

# Base estimator that GridSearchCV re-parameterises for each candidate.
model = CatBoostClassifier(
    loss_function=loss_function,
    verbose=verbose,
    task_type=task_type,
    random_seed=random_seed,
    iterations=iterations,
)

# Exhaustive search with 2-fold cross-validation, scored by accuracy and
# parallelised across all available cores.
cv = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=2,
    scoring='accuracy',
    n_jobs=-1,
)

# fit() returns the fitted search object, so `clf` and `cv` are the same object.
clf = cv.fit(X_train, y_train)

In [14]:
# Refit a model with the best hyperparameters from the search and evaluate it
# on the hold-out test set.
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

model_best = CatBoostClassifier(
    loss_function=loss_function,
    **best_params,
    verbose=verbose,
    task_type=task_type,
    random_seed=random_seed,
    iterations=iterations,
)

model_best.fit(X_train, y_train)

prediction = model_best.predict(X_test)
probas = model_best.predict_proba(X_test)[:, 1]

# Quality metrics
metrics = {}

# For binary labels confusion_matrix(...).ravel() is ordered (tn, fp, fn, tp).
# err1 is the false-positive share and err2 the false-negative share; the
# previous unpacking `_, err1, _, err2` stored the true-positive share as err2.
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# Log loss is defined on predicted probabilities; hard 0/1 labels produce a
# clipped, uninformative value, so the positive-class probabilities are used.
logloss = log_loss(y_test, probas)

# Store the metrics in the dictionary that is later logged to MLflow
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

print(cv_results.columns)
# Additional metrics from the cross-validation results; each value is
# averaged over all candidates evaluated by the search.
metrics["mean_fit_time"] = cv_results['mean_fit_time'].mean()  # mean fit time
metrics["std_fit_time"] = cv_results['std_fit_time'].mean()  # std of fit time
metrics["mean_test_score"] = cv_results['mean_test_score'].mean()  # mean CV validation score
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics['best_score'] = clf.best_score_

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_depth', 'param_l2_leaf_reg', 'param_learning_rate', 'params',
       'split0_test_score', 'split1_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score'],
      dtype='object')


In [29]:
# Display the experiment name that the MLflow runs below are logged under.
EXPERIMENT_NAME

'churn_task_alexndem'

In [32]:
RUN_NAME = 'model_grid_search'
REGISTRY_MODEL_NAME = 'model_churn_grid_search'

# MLflow logging settings
pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

# get_experiment_by_name() returns None when no experiment has this name,
# which previously surfaced as an AttributeError on `.experiment_id`.
# Create the experiment on first use instead.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # Log the fitted search object, then log and register the tuned model.
    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    model_info = mlflow.catboost.log_model(
        cb_model=model_best,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
    )

    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)

Successfully registered model 'model_churn_grid_search'.
2025/06/05 20:32:16 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: model_churn_grid_search, version 1
Created version '1' of model 'model_churn_grid_search'.


#### RandomizedSearch

In [35]:
from sklearn.model_selection import RandomizedSearchCV


In [36]:
# Settings shared by every CatBoost model built in the randomized search.
loss_function = "Logloss"
task_type = 'CPU'
random_seed = 0
iterations = 300
verbose = False

# Candidate values; the full grid has 3 x 3 x 3 = 27 combinations.
param_distributions = {
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.9],
    'l2_leaf_reg': [1, 5, 20],
}

# Base estimator that the search re-parameterises for each sampled candidate.
model = CatBoostClassifier(
    loss_function=loss_function,
    verbose=verbose,
    task_type=task_type,
    random_seed=random_seed,
    iterations=iterations,
)

# Only n_iter=20 of the 27 combinations are sampled. Without random_state the
# sampled subset (and therefore best_params_) changed from run to run, so the
# search is seeded with the same seed as the model to make it reproducible.
cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    n_iter=20,
    cv=2,
    scoring='accuracy',
    n_jobs=-1,
    random_state=random_seed,
)

# fit() returns the fitted search object, so `clf` and `cv` are the same object.
clf = cv.fit(X_train, y_train)

In [37]:
# Refit a model with the best hyperparameters from the randomized search and
# evaluate it on the hold-out test set.
cv_results = pd.DataFrame(clf.cv_results_)

best_params = clf.best_params_

# `model` is rebound here to the tuned estimator that is fitted below.
model = CatBoostClassifier(
    loss_function=loss_function,
    **best_params,
    verbose=verbose,
    task_type=task_type,
    random_seed=random_seed,
    iterations=iterations,
)

model.fit(X_train, y_train)

prediction = model.predict(X_test)
probas = model.predict_proba(X_test)[:, 1]

# Quality metrics
metrics = {}

# For binary labels confusion_matrix(...).ravel() is ordered (tn, fp, fn, tp).
# err1 is the false-positive share and err2 the false-negative share; the
# previous unpacking `_, err1, _, err2` stored the true-positive share as err2.
_, err1, err2, _ = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, probas)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
# Log loss is defined on predicted probabilities; hard 0/1 labels produce a
# clipped, uninformative value, so the positive-class probabilities are used.
logloss = log_loss(y_test, probas)

# Store the metrics in the dictionary that is later logged to MLflow
metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

# Additional metrics from the cross-validation results; each value is
# averaged over all candidates evaluated by the search.
metrics["mean_fit_time"] = cv_results['mean_fit_time'].mean()  # mean fit time
metrics["std_fit_time"] = cv_results['std_fit_time'].mean()  # std of fit time
metrics["mean_test_score"] = cv_results['mean_test_score'].mean()  # mean CV validation score
metrics['std_test_score'] = cv_results['std_test_score'].mean()
metrics['best_score'] = clf.best_score_

In [38]:
RUN_NAME = 'model_random_search'
REGISTRY_MODEL_NAME = 'model_churn_random_search'

# MLflow logging settings
pip_requirements = 'requirements.txt'
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

# get_experiment_by_name() returns None when no experiment has this name,
# which previously surfaced as an AttributeError on `.experiment_id`.
# Create the experiment on first use instead.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    cv_info = mlflow.sklearn.log_model(cv, artifact_path='cv')
    # Log the model tuned by the randomized search, which the evaluation cell
    # binds to `model`. The previous `cb_model=model_best` registered the
    # grid-search model under the random-search registry name.
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
    )

    mlflow.log_metrics(metrics)
    mlflow.log_params(best_params)

Successfully registered model 'model_churn_random_search'.
2025/06/05 20:35:24 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: model_churn_random_search, version 1
Created version '1' of model 'model_churn_random_search'.
