#### Этап 1: Обучение и логирование baseline-модели

In [1]:
# libs

import pandas as pd
from sqlalchemy import create_engine
import os
from dotenv import load_dotenv
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from category_encoders import CatBoostEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from catboost import CatBoostRegressor

##### Выгрузка данных

In [2]:
def create_connection():
    """Create a SQLAlchemy engine for the destination PostgreSQL database.

    Connection settings are read from the ``DB_DESTINATION_*`` environment
    variables after ``load_dotenv()`` has been called.

    Returns:
        The object returned by ``create_engine`` for a ``postgresql://`` URL,
        with ``sslmode=require`` passed through ``connect_args``.

    Raises:
        RuntimeError: if any of the required environment variables is not set.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import quote

    load_dotenv()

    env_names = {
        'host': 'DB_DESTINATION_HOST',
        'port': 'DB_DESTINATION_PORT',
        'db': 'DB_DESTINATION_NAME',
        'username': 'DB_DESTINATION_USER',
        'password': 'DB_DESTINATION_PASSWORD',
    }
    settings = {key: os.environ.get(name) for key, name in env_names.items()}

    # Fail early with a readable message instead of building a URL that
    # contains the literal string "None".
    missing = [env_names[key] for key, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(
            'Missing database environment variables: ' + ', '.join(missing)
        )

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # in a user name or password cannot break the URL structure.
    username = quote(settings['username'], safe='')
    password = quote(settings['password'], safe='')

    conn = create_engine(
        f"postgresql://{username}:{password}@{settings['host']}:{settings['port']}/{settings['db']}",
        connect_args={'sslmode': 'require'},
    )
    return conn

In [3]:
# Load the whole `buildings_clean` table; the `id` column becomes the index.
conn = create_connection()
try:
    data = pd.read_sql('select * from buildings_clean', conn, index_col='id')
finally:
    # Dispose of the engine even when the query raises, so nothing is left open.
    conn.dispose()

In [4]:
# Remove `building_id` from the working frame.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps the cell idempotent: running it a second time no longer raises KeyError.
data = data.drop(columns='building_id', errors='ignore')

In [5]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0_level_0,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,floor,is_apartment,kitchen_area,living_area,rooms,studio,total_area,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,1961,1,55.706188,37.740948,2.64,60,5,False,3,False,6.0,30.0,2,False,41.0,10400000.0
2,1979,4,55.61895,37.540848,2.64,256,16,True,5,False,9.0,34.0,2,False,54.0,10500000.0
3,1983,4,55.67004,37.742149,2.64,203,17,True,6,False,10.0,31.0,2,False,56.0,12200000.0
4,1973,4,55.763409,37.825542,2.64,431,9,True,1,False,7.0,29.0,2,False,45.0,7300000.0
5,1971,4,55.806629,37.806507,2.64,98,14,True,1,False,6.21,23.790001,2,False,38.32,9000000.0


##### Обучение baseline-модели

In [6]:
# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
target_col = 'price'
features = data.drop(columns=target_col)
target = data[target_col]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [7]:

# Split the training columns into groups by dtype; each group is routed to its
# own transformer below and any column outside the groups is dropped.
cat_features = X_train.select_dtypes(include=['bool'])
# Boolean mask over the bool columns: True where exactly two distinct values occur.
potential_binary_features = cat_features.nunique() == 2

binary_columns = [
    name for name, is_binary in potential_binary_features.items() if is_binary
]
binary_cat_features = cat_features[binary_columns]
# NOTE(review): 'float' / 'int' select only the default-width dtypes, so
# float32 or platform-dependent integer columns may be missed - confirm.
num_features = X_train.select_dtypes(include=['float'])
rank_features = X_train.select_dtypes(include=['int'])

num_columns = list(num_features.columns)
rank_columns = list(rank_features.columns)

# NOTE(review): CatBoostEncoder is created without `cols`; presumably it then
# only encodes object/category columns and leaves integer columns unchanged -
# verify against the category_encoders documentation.
transformers = [
    ('binary', OneHotEncoder(drop='if_binary'), binary_columns),
    ('int', CatBoostEncoder(return_df=False), rank_columns),
    ('num', StandardScaler(), num_columns),
]

preprocessor = ColumnTransformer(
    transformers,
    remainder='drop',
    verbose_feature_names_out=False,
)



In [8]:
# Baseline: CatBoost with default hyperparameters behind the preprocessor.
# `model` keeps its own name because later cells reference it directly.
model = CatBoostRegressor()

pipeline_steps = [('preprocessor', preprocessor), ('model', model)]
pipeline = Pipeline(pipeline_steps)

pipeline.fit(X_train, y_train)

Learning rate set to 0.0646
0:	learn: 6130080.9263715	total: 50.8ms	remaining: 50.8s
1:	learn: 5935207.0536453	total: 55.1ms	remaining: 27.5s
2:	learn: 5727360.8867192	total: 59.1ms	remaining: 19.6s
3:	learn: 5547409.7464499	total: 63.2ms	remaining: 15.7s
4:	learn: 5374555.2365617	total: 67.4ms	remaining: 13.4s
5:	learn: 5224620.7513583	total: 71.5ms	remaining: 11.8s
6:	learn: 5082710.6025057	total: 75.5ms	remaining: 10.7s
7:	learn: 4956189.6147623	total: 81ms	remaining: 10s
8:	learn: 4846123.7490559	total: 84.8ms	remaining: 9.34s
9:	learn: 4737908.0269159	total: 88.8ms	remaining: 8.79s
10:	learn: 4648870.5435199	total: 92.9ms	remaining: 8.35s
11:	learn: 4549653.1664227	total: 96.9ms	remaining: 7.98s
12:	learn: 4467106.7560939	total: 101ms	remaining: 7.66s
13:	learn: 4395598.9480642	total: 105ms	remaining: 7.38s
14:	learn: 4331777.1047923	total: 109ms	remaining: 7.14s
15:	learn: 4254611.1648500	total: 113ms	remaining: 6.94s
16:	learn: 4183605.8330446	total: 117ms	remaining: 6.77s
17:	l

##### Оценка модели и логирование артефактов

In [9]:
import mlflow
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


* 'schema_extra' has been renamed to 'json_schema_extra'


In [10]:
# Compute the main regression metrics on the hold-out split for later logging.
#
# Predict through the full pipeline rather than the bare `model`: the CatBoost
# estimator was fitted inside the pipeline on the preprocessor's output, so it
# must not be fed the raw, untransformed X_test columns.
prediction = pipeline.predict(X_test)

mse = mean_squared_error(y_test, prediction)
mae = mean_absolute_error(y_test, prediction)
r2 = r2_score(y_test, prediction)



In [11]:
# Constants and client configuration for MLflow logging.

EXPERIMENT_NAME = "real_estate_model_alexdem"
RUN_NAME = "baseline_model"
REGISTRY_MODEL_NAME = "model_real_estate_alexndem"


os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
# The S3 credentials must already be present in the environment. Copying a
# variable onto itself did nothing when it was set and raised an obscure
# TypeError when it was not, so check for presence explicitly instead.
for _credential_var in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"):
    if os.getenv(_credential_var) is None:
        raise EnvironmentError(
            f"Environment variable {_credential_var} is not set; "
            "it is required to store MLflow artifacts in S3."
        )

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Tracking and model registry are both pointed at the same local server.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

pip_requirements = "../requirements.txt"
# NOTE(review): the signature is inferred from the raw NumPy values of X_test,
# while the input example below is a DataFrame slice - confirm that both match
# what the logged model actually expects as input.
signature = mlflow.models.infer_signature(X_test.values, prediction)
input_example = X_test[:10]


In [12]:
# Reuse the experiment when it already exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
experiment_id = (
    experiment.experiment_id
    if experiment
    else mlflow.create_experiment(EXPERIMENT_NAME)
)

# NOTE(review): only the bare CatBoost `model` is logged here, not `pipeline`,
# so the registered artifact does not include the preprocessing step - confirm
# that this is intended.
log_model_kwargs = dict(
    cb_model=model,
    artifact_path="models",
    registered_model_name=REGISTRY_MODEL_NAME,
    pip_requirements=pip_requirements,
    signature=signature,
    input_example=input_example,
    await_registration_for=60,
)
run_metrics = {'mse': mse, 'mae': mae, 'r2': r2}

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    # Log and register the model first, then attach the evaluation metrics.
    model_info = mlflow.catboost.log_model(**log_model_kwargs)
    mlflow.log_metrics(run_metrics)

Successfully registered model 'model_real_estate_alexndem'.
2025/06/08 18:15:59 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: model_real_estate_alexndem, version 1
Created version '1' of model 'model_real_estate_alexndem'.


In [15]:
# Ad-hoc inspection of the training target (the `price` series from the split).
# NOTE(review): the execution count jumps from 12 to 15 here, so this looks like
# a leftover scratch cell - consider removing it before sharing the notebook.
y_train

id
593      12800000.0
7555      8100000.0
2685     11200000.0
4744     15500000.0
5169     11800000.0
            ...    
11965    11000000.0
21576    15500000.0
5391      8000000.0
861      27000000.0
15796     9200000.0
Name: price, Length: 17927, dtype: float64