 1. Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

import joblib
import time

In [2]:
import mlflow
import mlflow.catboost
import os

os.makedirs("./mlruns", exist_ok=True)

# хранить в текущей папке проекта
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("HMEQ_CatBoost")

  return FileStore(store_uri, store_uri)
2025/12/16 16:56:25 INFO mlflow.tracking.fluent: Experiment with name 'HMEQ_CatBoost' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/alex/dev/DAIB_project/mlruns/921718720432123846', creation_time=1765893385021, experiment_id='921718720432123846', last_update_time=1765893385021, lifecycle_stage='active', name='HMEQ_CatBoost', tags={}>

2. Загрузка данных

Датасет: https://www.kaggle.com/datasets/ajay1735/hmeq-data

In [3]:
df = pd.read_csv("./data/hmeq.csv")
df.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,


3. Разделяем признаки и цель

In [4]:
target = 'BAD'
X = df.drop(columns=[target])
y = df[target]

4. Обработка пропусков

In [5]:
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

X[categorical_cols] = X[categorical_cols].fillna("Unknown")

 5. Создаём модель

In [6]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

cat_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=False,
    random_seed=42
)

params = {
    "iterations": [700, 900],
    "depth": [6, 8],
    "learning_rate": [0.02, 0.03],
    "l2_leaf_reg": [1, 3],
}

grid = GridSearchCV(
    estimator=cat_model,
    param_grid=params,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1
)

6. Train / Test Split

In [7]:
df_split = df[df["CLAGE"].notna()].copy()

# Граница по квантилю CLAGE (70% старых / 30% новых)
clage_threshold = df_split["CLAGE"].quantile(0.3)

# Reference = "исторические" данные
df_reference = df_split[df_split["CLAGE"] >= clage_threshold].copy()

# Current / Production = "новые" данные
df_current = df_split[df_split["CLAGE"] < clage_threshold].copy()

# Разделяем признаки и таргет
target = "BAD"

X_ref = df_reference.drop(columns=[target])
y_ref = df_reference[target]

X_current = df_current.drop(columns=[target])
y_current = df_current[target]

categorical_cols = X_ref.select_dtypes(include=["object"]).columns.tolist()

X_ref[categorical_cols] = X_ref[categorical_cols].fillna("Unknown")
X_current[categorical_cols] = X_current[categorical_cols].fillna("Unknown")

# Из reference делаем train / validation
X_train, X_test, y_train, y_test = train_test_split(
    X_ref,
    y_ref,
    test_size=0.2,
    random_state=42,
    stratify=y_ref
)

print("Train size:", X_train.shape)
print("Reference size:", X_ref.shape)
print("Current (prod-like) size:", X_current.shape)

Train size: (3164, 12)
Reference size: (3956, 12)
Current (prod-like) size: (1696, 12)


In [8]:
current_df = X_current.copy()
current_df.to_csv("data/current_data.csv", index=False)

7. Обучение модели

In [9]:
cat_features = [X.columns.get_loc(col) for col in categorical_cols]

run_name = "catboost_gridsearch_v1"

with mlflow.start_run(run_name=run_name) as run:
    # --- полезные теги (для "управления ЖЦ")
    mlflow.set_tag("dataset", "HMEQ")
    mlflow.set_tag("task", "binary_classification")
    mlflow.set_tag("model_type", "CatBoostClassifier")
    mlflow.set_tag("stage", "training")

    # --- логируем настройки подбора
    mlflow.log_param("cv", 3)
    mlflow.log_param("scoring", "roc_auc")
    mlflow.log_param("param_grid", str(params))

    start = time.time()
    grid.fit(X_train, y_train, cat_features=cat_features)
    train_seconds = time.time() - start
    mlflow.log_metric("train_time_sec", train_seconds)

    model = grid.best_estimator_

    # --- логируем лучшее
    mlflow.log_params(grid.best_params_)
    mlflow.log_metric("cv_best_roc_auc", float(grid.best_score_))

    # --- оцениваем на holdout (test)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    mlflow.log_metric("test_accuracy", float(acc))
    mlflow.log_metric("test_roc_auc", float(auc))

    report = classification_report(y_test, y_pred)
    mlflow.log_param("classification_report", report)

    # --- логируем модель в MLflow
    # input_example помогает потом в деплое/валидации
    input_example = X_test.head(5)

    mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="model",
        input_example=input_example
    )

    print("MLflow run_id:", run.info.run_id)
    print("Best params:", grid.best_params_)
    print("Test Accuracy:", acc)
    print("Test ROC-AUC:", auc)
    
    from mlflow.tracking import MlflowClient

    print("tracking_uri:", mlflow.get_tracking_uri())
    print("experiment_id:", run.info.experiment_id)
    print("run_id:", run.info.run_id)
    
    client = MlflowClient()
    
    root = client.list_artifacts(run.info.run_id, path="")
    print("artifacts root:", [a.path for a in root])
    
    model_dir = client.list_artifacts(run.info.run_id, path="model")
    print("artifacts model/:", [a.path for a in model_dir])




MLflow run_id: 6af00ddd008b4d3ca702042bb2cd65ee
Best params: {'depth': 8, 'iterations': 900, 'l2_leaf_reg': 1, 'learning_rate': 0.02}
Test Accuracy: 0.9482323232323232
Test ROC-AUC: 0.9805097451274363
tracking_uri: file:./mlruns
experiment_id: 921718720432123846
run_id: 6af00ddd008b4d3ca702042bb2cd65ee
artifacts root: []
artifacts model/: ['model/MLmodel', 'model/conda.yaml', 'model/input_example.json', 'model/model.cb', 'model/python_env.yaml', 'model/requirements.txt', 'model/serving_input_example.json']


In [10]:
# reference data для мониторинга
reference_df = X_train.copy()
reference_df["target"] = y_train.values

reference_df.to_csv("./data/reference_data.csv", index=False)

In [11]:
reference_df

Unnamed: 0,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC,target
227,5300,47449.0,63895.0,HomeImp,Office,19.0,0.0,0.0,208.162714,0.0,20.0,19.762114,0
706,8100,28523.0,39816.0,DebtCon,Other,5.0,0.0,0.0,130.643933,0.0,16.0,37.393526,1
397,6500,60054.0,70434.0,HomeImp,Other,1.0,0.0,0.0,127.062702,2.0,21.0,37.800353,1
3532,18600,50167.0,96312.0,DebtCon,ProfExe,5.0,0.0,0.0,128.339434,1.0,21.0,36.377322,0
4948,26100,,120470.0,DebtCon,Mgr,7.0,0.0,0.0,161.287672,0.0,27.0,34.130004,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2825,15800,51163.0,157644.0,DebtCon,ProfExe,19.0,0.0,0.0,239.224789,0.0,28.0,20.747521,0
1188,10000,42321.0,53800.0,DebtCon,Other,4.0,0.0,0.0,275.900000,0.0,19.0,,1
2358,14200,102789.0,123691.0,DebtCon,Unknown,3.0,0.0,0.0,127.060008,1.0,26.0,31.338764,0
1805,12200,82904.0,125057.0,DebtCon,Office,8.0,0.0,0.0,131.659053,1.0,21.0,37.273140,0


In [12]:
!mlflow ui --backend-store-uri file:./mlruns --host 0.0.0.0 --port 5005

Registry store URI not provided. Using backend store URI.
  return FileStore(store_uri, artifact_uri)
  return FileStore(store_uri)
[MLflow] Security middleware enabled with default settings (localhost-only). To allow connections from other hosts, use --host 0.0.0.0 and configure --allowed-hosts and --cors-allowed-origins.
[32mINFO[0m:     Uvicorn running on [1mhttp://0.0.0.0:5005[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started parent process [[36m[1m88965[0m]
[32mINFO[0m:     Started server process [[36m88969[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Started server process [[36m88970[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Started server process [[36m88967[0m]
[32mINFO[0m:     Started server process [[36m88968[0m]
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m: 