In [1]:
import os
import psycopg
import pandas as pd
from catboost import CatBoostRegressor
import joblib
import json
from sklearn.model_selection import KFold, cross_validate
import mlflow
import yaml
from mlflow.client import MlflowClient


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:

# S3-compatible endpoint of the Yandex Cloud bucket that MLflow is connected to.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The bucket key id and secret must already be present in the process
# environment (the original notes say they come from .env). Writing
# os.getenv(...) back into os.environ is a no-op when the variable is set and
# raises an opaque "TypeError: str expected, not NoneType" when it is not, so
# check for them explicitly and fail with an actionable message instead.
_missing_credentials = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if name not in os.environ
]
if _missing_credentials:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Export them (for example from .env) before running this notebook."
    )

# Address of the MLflow tracking server.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Experiment, run and model-registry names used when logging.
EXPERIMENT_NAME = 'catboost_legacy2'
RUN_NAME = "baseline+data"
REGISTRY_MODEL_NAME = 'baseline'
# NOTE(review): FS_ASSETS is not referenced in the cells visible here — confirm it is still needed.
FS_ASSETS = "baseline"

In [3]:
# Point both experiment tracking and the model registry at the same MLflow server.
_server_uri = "http://{}:{}".format(TRACKING_SERVER_HOST, TRACKING_SERVER_PORT)
mlflow.set_tracking_uri(_server_uri)
mlflow.set_registry_uri(_server_uri)

In [4]:
# Log the previously fitted baseline model, its parameters, its CV metrics and
# the raw data to MLflow, and register the model in the model registry.
mlflow.set_experiment(EXPERIMENT_NAME)
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=f"{RUN_NAME}_intersection_and_union", experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # The metrics file is JSON, so parse it with the JSON parser. PyYAML's
    # YAML 1.1 resolver reads exponent-only floats such as 1e-05 (the form
    # json.dump writes for small values) as strings rather than numbers.
    with open('cv_results/cv_res.json', 'r') as fd:
        cv_results = json.load(fd)

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files produced by this project.
    model = joblib.load('models/fitted_model.pkl')
    fit_params = model.get_params()
    mlflow.log_params(fit_params)
    mlflow.log_metrics(cv_results)

    data = pd.read_csv('data/initial_data.csv')
    # Uploads the contents of the local data/ directory as run artifacts.
    mlflow.log_artifacts('data')

    # Every column except the target 'price' is passed to the model.
    X = data.drop(['price'], axis=1)
    metadata = {"model_type": "baseline", 'structure': 'Catboost'}

    # A five-row sample is enough to infer the model signature and doubles as
    # the input example stored next to the model.
    train_data = X.head(5)
    pred = model.predict(train_data)
    signature = mlflow.models.infer_signature(train_data, pred)
    input_example = train_data

    # NOTE(review): absolute, machine-specific path — consider a path relative
    # to the project root so the notebook runs on other machines.
    pip_requirements = '/home/mle-user/mle_projects/yn/requirements.txt'
    model_info = mlflow.catboost.log_model(
        cb_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
        metadata=metadata,
        await_registration_for=60,
    )
    # No explicit mlflow.end_run() here: leaving the with-block ends the run.
    

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'baseline' already exists. Creating a new version of this model...
2025/03/12 17:32:45 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: baseline, version 3
Created version '3' of model 'baseline'.
