In [2]:
import os
import optuna
import mlflow
import mlflow.sklearn
from mlflow.client import MlflowClient
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import optuna
from catboost import CatBoostRegressor
from optuna.integration.mlflow import MLflowCallback
import numpy as np
import random
from optuna.samplers import TPESampler
from numpy import median, array 


* 'schema_extra' has been renamed to 'json_schema_extra'


In [3]:
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "catb00st_optuna" # напишите название вашего эксперимента
RUN_NAME = "optuna"

FS_ASSETS = 'assets/optuna'
os.makedirs(FS_ASSETS, exist_ok=True)


pd.options.display.max_columns = 100
pd.options.display.max_rows = 64

In [4]:
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net" #endpoint бакета от YandexCloud
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID") # получаем id ключа бакета, к которому подключён MLFlow, из .env
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY") # получаем ключ бакета, к которому подключён MLFlow, из .env

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [5]:
def get_artifact_runID(name):
    exp_ids=[]
    exps = client.search_experiments()
    got = False
    for e in exps:
        if not got:
            exp_ids.append(e.experiment_id)
            for e_id in exp_ids:
                runs = client.search_runs(e_id)
                for run in runs:
                    if run.info.status !='FAILED' and not got:
                        artifacts = client.list_artifacts(run.info.run_id)
                        #print(artifacts)
                        try:
                            
                            if name in str(artifacts):
                                run_id = run.info.run_id
                                print(run_id)
                                got = True
                        except:
                            pass
    return run_id

In [6]:
client = MlflowClient()
# Получаем список артефактов
the_run_id = get_artifact_runID("f_test.csv")

local_path = client.download_artifacts(the_run_id, "f_test.csv", 'data')
test = pd.read_csv(local_path)
test.sample(5)

2672a8996f4249ea82958891c914e4e3


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,cats_transformer__encoder__nearest_metro,numeric__kitchen_area living_area^2,numeric__distance_to_metro,numeric__living_area^3,numeric__living_area,numeric__distance_to_metro^2,cats_transformer__bins_to_cats__build_year,numeric__kitchen_area living_area,cats_transformer__bins_to_cats__floors_total,cats_transformer__encoder__rooms,y
29144,16230180.0,18464.767774,6.827563,89915.393148,44.8,46.615614,11348030.0,412.159993,12080040.0,12460980.0,11500000
6416,12650970.0,5832.0,18.103179,19683.0,27.0,327.725078,13683380.0,216.0,14915540.0,9123777.0,8000000
18975,13258210.0,9600.0,8.052151,64000.0,40.0,64.83713,10284300.0,240.0,11596480.0,12460980.0,9400000
26202,13258210.0,25704.409277,3.997304,126506.020978,50.200002,15.978437,14446400.0,512.040008,11596480.0,12460980.0,16800000
30201,15172270.0,4375.0,4.230069,15625.0,25.0,17.893482,14446400.0,175.0,11596480.0,9123777.0,9500000


In [7]:
the_run_id = get_artifact_runID("f_train.csv")

local_path = client.download_artifacts(the_run_id, "f_train.csv", 'data')
train = pd.read_csv(local_path)
train.sample(5)

2672a8996f4249ea82958891c914e4e3


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,cats_transformer__encoder__nearest_metro,numeric__kitchen_area living_area^2,numeric__distance_to_metro,numeric__living_area^3,numeric__living_area,numeric__distance_to_metro^2,cats_transformer__bins_to_cats__build_year,numeric__kitchen_area living_area,cats_transformer__bins_to_cats__floors_total,cats_transformer__encoder__rooms,y
10448,13215130.0,3742.847963,6.817552,11852.352297,22.8,46.47902,10285330.0,164.159997,11596790.0,9123555.0,6200000
26452,15427480.0,8715.924611,2.68881,54010.15895,37.800002,7.229699,14437170.0,230.580006,11596790.0,12463630.0,10700000
78290,13191060.0,37558.438226,1.680423,269586.116897,64.599998,2.823823,11386550.0,581.399986,12088520.0,15668040.0,10000000
14258,10655130.0,20469.5309,19.420567,41781.925756,34.700001,377.158432,12290630.0,589.900013,12086430.0,12463630.0,7400000
17852,12404680.0,3810.240231,13.217156,16003.009453,25.200001,174.693206,14437170.0,151.200005,11596790.0,9123576.0,6300000


In [8]:
X_train = train.drop(['y'], axis=1)
y_train = train['y']
X_test = test.drop(['y'], axis=1)
y_test = test['y']

In [9]:
def objective(trial, X_train, X_test, y_train, y_test, experiment_id):
    """Objective function to be optimized by Optuna"""
    # Use nested runs to track each trial in MLflow
    with mlflow.start_run(experiment_id = experiment_id, nested=True):
        # Define hyperparameters to optimize
        params = {
            'iterations': trial.suggest_int('iterations', 100, 600),
            'depth': trial.suggest_int('depth', 2, 16),
            'min_child_samples': trial.suggest_int('min_child_samples', 2, 10),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.05, 1.0),
            "subsample": trial.suggest_float("subsample", 0.05, 1.0),
            'loss_function': 'RMSE',
            'random_state': 42,
            'silent': True
        }
        
        # Log parameters
        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)
        
        # Create and train the model
        model = CatBoostRegressor(**params)
        model.fit(X_train, y_train)
        
        # Make predictions and evaluate
        y_pred = model.predict(X_test)
        metrics ={
            'neg_mean_absolute_error': - mean_absolute_error(y_pred, y_test),
            'neg_root_mean_squared_error': - mean_squared_error(y_pred, y_test),
            'r2': r2_score(y_pred, y_test),
            'neg_mean_absolute_percentage_error': - mean_absolute_percentage_error(y_pred, y_test)
        }
        
        # Log metrics
        mlflow.log_metrics(metrics)
        
        # Log model
        mlflow.catboost.log_model(model, 'model')
        
        # Log feature importance
        importance = pd.DataFrame({
            'feature': X_train.columns,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        fig, ax = plt.figure(figsize=(10, 6)), plt.axes()
        importance.plot.bar(x='feature', y='importance', ax=ax)
        plt.title('Feature Importance')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        mlflow.log_artifact('feature_importance.png')
        plt.close()
        
        print(f"Trial {trial.number}: RMSE={mean_squared_error(y_pred, y_test):.4f}, MAE={mean_absolute_error(y_pred, y_test):.4f}, R2={r2_score(y_pred, y_test):.4f}, MAPE%={mean_absolute_percentage_error(y_pred, y_test):.4f}")
        
        return mean_squared_error(y_pred, y_test)  # We want to minimize RMSE


In [None]:
def run_optimization(experiment_id):
    """Run the complete optimization process"""
    # Setup MLflow with S3
    
    # Create and run Optuna study with MLflow tracking
    with mlflow.start_run(run_name="optuna_optimization", experiment_id=experiment_id):
        mlflow.log_param("optimizer", "optuna")
        mlflow.log_param("n_trials", 10)
        
        # Create study
        study = optuna.create_study(direction='minimize', 
                                    study_name='cb_hyperparameter_optimization',
                                    pruner=optuna.pruners.MedianPruner())
        
        # Optimize with partial function to pass dataset
        study.optimize(lambda trial: objective(trial, X_train, X_test, y_train, y_test, experiment_id), 
                       n_trials=10)
        
        # Log best trial information
        mlflow.log_params({f"best_{k}": v for k, v in study.best_params.items()})
        #mlflow.log_metric("best_rmse", study.best_value)
        
        # Print results
        print("\n" + "="*50)
        print("Best trial:")
        print(f"  Value (RMSE): {study.best_value:.4f}")
        print("  Params:")
        for key, value in study.best_params.items():
            print(f"    {key}: {value}")
        
        # Train the best model
        best_model = CatBoostRegressor(**study.best_params)
        best_model.fit(X_train, y_train)
        
        # Log the best model
        mlflow.sklearn.log_model(best_model, "optuna")
        
        # Generate and log optimization visualizations
        try:
            fig = optuna.visualization.plot_optimization_history(study)
            fig.write_image("optimization_history.png")
            mlflow.log_artifact("optimization_history.png")
            
            fig = optuna.visualization.plot_param_importances(study)
            fig.write_image("param_importances.png")
            mlflow.log_artifact("param_importances.png")
            
            fig = optuna.visualization.plot_slice(study)
            fig.write_image("slice_plot.png")
            mlflow.log_artifact("slice_plot.png")
        except Exception as e:
            print(f"Visualization error: {e}")
        
        print("\nOptimization completed. All results are logged to MLflow.")
        print(f"Best model saved with parameters: {study.best_params}")

if __name__ == "__main__":
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
    if not experiment:
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
    else:
        experiment_id = experiment.experiment_id
    print(experiment)
    run_optimization(experiment_id)

<Experiment: artifact_location='s3://s3-student-mle-20250130-833968fcc1/20', creation_time=1741694413161, experiment_id='20', last_update_time=1741694413161, lifecycle_stage='active', name='catb00st_optuna', tags={}>


[I 2025-03-11 13:38:19,935] A new study created in memory with name: cb_hyperparameter_optimization
[I 2025-03-11 13:38:32,791] Trial 0 finished with value: 6894244488828.594 and parameters: {'iterations': 206, 'depth': 8, 'min_child_samples': 4, 'colsample_bylevel': 0.9120607611667203, 'subsample': 0.6865255566465035}. Best is trial 0 with value: 6894244488828.594.


Trial 0: RMSE=6894244488828.5938, MAE=2009646.8813, R2=0.6213, MAPE%=0.1656


[I 2025-03-11 13:38:40,672] Trial 1 finished with value: 7379397219179.799 and parameters: {'iterations': 597, 'depth': 3, 'min_child_samples': 6, 'colsample_bylevel': 0.3756431318821266, 'subsample': 0.4394093903294672}. Best is trial 0 with value: 6894244488828.594.


Trial 1: RMSE=7379397219179.7988, MAE=2071857.6277, R2=0.5659, MAPE%=0.1687


[I 2025-03-11 13:38:45,545] Trial 2 finished with value: 7724135977930.743 and parameters: {'iterations': 329, 'depth': 8, 'min_child_samples': 4, 'colsample_bylevel': 0.08487216592806894, 'subsample': 0.2356145665500194}. Best is trial 0 with value: 6894244488828.594.


Trial 2: RMSE=7724135977930.7432, MAE=2118282.9048, R2=0.5337, MAPE%=0.1713


[I 2025-03-11 13:38:57,074] Trial 3 finished with value: 6846879124617.842 and parameters: {'iterations': 434, 'depth': 7, 'min_child_samples': 2, 'colsample_bylevel': 0.4231009927821365, 'subsample': 0.7699557111433408}. Best is trial 3 with value: 6846879124617.842.


Trial 3: RMSE=6846879124617.8418, MAE=2002678.8255, R2=0.6149, MAPE%=0.1641


[I 2025-03-11 13:46:42,381] Trial 4 finished with value: 7116339403512.768 and parameters: {'iterations': 208, 'depth': 16, 'min_child_samples': 3, 'colsample_bylevel': 0.9784305429038128, 'subsample': 0.07235595424910576}. Best is trial 3 with value: 6846879124617.842.


Trial 4: RMSE=7116339403512.7676, MAE=2022276.3202, R2=0.6083, MAPE%=0.1662


[I 2025-03-11 13:46:46,935] Trial 5 finished with value: 7923361528804.435 and parameters: {'iterations': 201, 'depth': 3, 'min_child_samples': 2, 'colsample_bylevel': 0.10744080248864399, 'subsample': 0.8243093647806834}. Best is trial 3 with value: 6846879124617.842.


Trial 5: RMSE=7923361528804.4346, MAE=2142179.1174, R2=0.5213, MAPE%=0.1738


[I 2025-03-11 13:50:17,852] Trial 6 finished with value: 6727388894878.412 and parameters: {'iterations': 297, 'depth': 16, 'min_child_samples': 6, 'colsample_bylevel': 0.3466501333703128, 'subsample': 0.26785414666727836}. Best is trial 6 with value: 6727388894878.412.


Trial 6: RMSE=6727388894878.4121, MAE=1973145.9020, R2=0.6349, MAPE%=0.1618


[I 2025-03-11 13:50:23,048] Trial 7 finished with value: 7219771996007.577 and parameters: {'iterations': 287, 'depth': 5, 'min_child_samples': 8, 'colsample_bylevel': 0.29056084424832795, 'subsample': 0.33427248907556306}. Best is trial 6 with value: 6727388894878.412.


Trial 7: RMSE=7219771996007.5771, MAE=2053224.9880, R2=0.5848, MAPE%=0.1678


[I 2025-03-11 13:50:28,388] Trial 8 finished with value: 7666731800866.478 and parameters: {'iterations': 284, 'depth': 2, 'min_child_samples': 5, 'colsample_bylevel': 0.6786156348366911, 'subsample': 0.5520829026943243}. Best is trial 6 with value: 6727388894878.412.


Trial 8: RMSE=7666731800866.4775, MAE=2109699.6895, R2=0.5439, MAPE%=0.1710


[I 2025-03-11 13:52:59,995] Trial 9 finished with value: 6473304393247.93 and parameters: {'iterations': 578, 'depth': 14, 'min_child_samples': 6, 'colsample_bylevel': 0.48937975070584505, 'subsample': 0.4259784517693547}. Best is trial 9 with value: 6473304393247.93.


Trial 9: RMSE=6473304393247.9297, MAE=1943018.2886, R2=0.6465, MAPE%=0.1597

Best trial:
  Value (RMSE): 6473304393247.9297
  Params:
    iterations: 578
    depth: 14
    min_child_samples: 6
    colsample_bylevel: 0.48937975070584505
    subsample: 0.4259784517693547
Learning rate set to 0.13132
0:	learn: 4601267.1469270	total: 198ms	remaining: 1m 54s
1:	learn: 4269011.9123614	total: 520ms	remaining: 2m 29s
2:	learn: 3987845.0947216	total: 724ms	remaining: 2m 18s
3:	learn: 3757621.4642224	total: 956ms	remaining: 2m 17s
4:	learn: 3569658.4129250	total: 1.17s	remaining: 2m 14s
5:	learn: 3415124.0911319	total: 1.45s	remaining: 2m 18s
6:	learn: 3290890.4329448	total: 1.74s	remaining: 2m 21s
7:	learn: 3190722.3427914	total: 1.93s	remaining: 2m 17s
8:	learn: 3103473.3817407	total: 2.13s	remaining: 2m 14s
9:	learn: 3035006.5439925	total: 2.42s	remaining: 2m 17s
10:	learn: 2979587.7711301	total: 2.65s	remaining: 2m 16s
11:	learn: 2929369.0874972	total: 2.9s	remaining: 2m 16s
12:	learn: 28925




Optimization completed. All results are logged to MLflow.
Best model saved with parameters: {'iterations': 578, 'depth': 14, 'min_child_samples': 6, 'colsample_bylevel': 0.48937975070584505, 'subsample': 0.4259784517693547}
