In [1]:
import mlflow
import boto3
import os
import joblib
import json
import pandas as pd
import psycopg2

# --- Notebook configuration -------------------------------------------------
# Source table that is read in full from the destination database.
TABLE_NAME = 'clean_data'

# Address of the MLflow tracking server; the same URI is used for the
# model registry below.
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000
MLFLOW_TRACKING_URI = "http://{}:{}".format(
    TRACKING_SERVER_HOST, TRACKING_SERVER_PORT
)

# Raises KeyError right away if EXPERIMENT_NAME is not set in the environment.
EXPERIMENT_NAME = os.environ['EXPERIMENT_NAME']
RUN_NAME = 'baseline_model'

# Point both the tracking client and the model registry at the same server.
for _set_uri in (mlflow.set_tracking_uri, mlflow.set_registry_uri):
    _set_uri(MLFLOW_TRACKING_URI)

* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
# Load the whole TABLE_NAME table from PostgreSQL into a DataFrame `df`.

# Fixed connection options; the libpq meaning of these keys (TLS required,
# read-write session) is assumed from the option names -- confirm against
# the libpq documentation if it matters.
connection = {"sslmode": "require", "target_session_attrs": "read-write"}

# Credentials are read from the environment so no secrets live in the
# notebook. os.getenv returns None for a variable that is not set.
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

# In psycopg2, `with conn:` only scopes a transaction (commit on success,
# rollback on error); it does NOT close the connection. Close it explicitly
# so the session is not leaked for the lifetime of the kernel.
conn = psycopg2.connect(**connection)
try:
    with conn:
        with conn.cursor() as cur:
            # TABLE_NAME is a notebook-level constant, not user input.
            cur.execute(f"SELECT * FROM {TABLE_NAME}")
            data = cur.fetchall()
            # The first element of each cursor.description entry is the
            # column name.
            columns = [col[0] for col in cur.description]
finally:
    conn.close()

df = pd.DataFrame(data, columns=columns)

df.head(2)

Unnamed: 0,flat_id,building_id,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,floor,kitchen_area,living_area,rooms,is_apartment,studio,total_area,price
0,69899,14700,1987,4,55.509476,37.564724,2.7,287,12,True,9,0.0,0.0,3,False,False,65.800003,12500000.0
1,130947,8418,1969,4,55.755894,37.831425,2.65,212,9,True,5,6.0,27.9,2,False,False,44.400002,9500000.0


In [3]:
# Restore the previously fitted baseline model from disk.
# NOTE(review): joblib uses pickle under the hood, and unpickling can execute
# arbitrary code -- only load model files from a trusted location.
with open('../models/baseline_model.pkl', mode='rb') as model_file:
    model = joblib.load(model_file)

In [4]:
# Read the stored cross-validation results; the "fit_time" entry is reused
# later as the reported training time.
with open('../cv_results/cv_res.json') as cv_file:
    cv_metrics = json.loads(cv_file.read())

In [5]:
# Separate the feature matrix from the regression target ('price').
X = df.drop(columns='price')
y = df['price']

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the model is loaded pre-fitted from disk -- confirm that this
# split matches the one used at training time, otherwise test rows may have
# been seen during fitting.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [7]:
import time

# Measure how long inference on the hold-out set takes.
# time.perf_counter() is the clock intended for measuring short durations:
# it is monotonic and uses the highest available resolution, whereas
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time = time.perf_counter()
test_predictions = model.predict(X_test)
end_time = time.perf_counter()

# Elapsed time in seconds; later logged as the "predict_time" metric.
test_predict_duration = end_time - start_time
print(f"Время предсказания: {test_predict_duration:.6f} сек.")

Время предсказания: 0.180528 сек.


In [9]:
import numpy as np
from sklearn.metrics import (
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
)

# Regression quality on the hold-out set.
mae = mean_absolute_error(y_test, test_predictions)
mse = mean_squared_error(y_test, test_predictions)
rmse = np.sqrt(mse)  # RMSE is derived from MSE; MSE itself is not logged
mape = mean_absolute_percentage_error(y_test, test_predictions)
r2 = r2_score(y_test, test_predictions)

# Everything that will be sent to MLflow: quality metrics plus timings.
# "learn_time" comes from the stored CV results, "predict_time" from the
# timing cell above.
metrics = {
    "MAE": mae,
    "RMSE": rmse,
    "R2": r2,
    "MAPE": mape,
    "learn_time": cv_metrics["fit_time"],
    "predict_time": test_predict_duration,
}

In [12]:
# Display the collected metrics dict as the cell output for a visual check
# before it is logged to MLflow.
metrics

{'MAE': 1875771.2651467226,
 'RMSE': 2404477.38736345,
 'R2': 0.7025861229899222,
 'MAPE': 0.17068341885180735,
 'learn_time': 5.28,
 'predict_time': 0.1805276870727539}

In [10]:
# Metadata stored alongside the model in MLflow.
pip_requirements = '../requirements.txt'

# Infer the model's input/output schema from the hold-out features and the
# predictions produced for them.
signature = mlflow.models.infer_signature(
    model_input=X_test,
    model_output=test_predictions,
)

# First ten hold-out rows, saved with the model as an example input.
input_example = X_test.iloc[:10]

  inputs = _infer_schema(model_input) if model_input is not None else None


In [11]:
# Activate the experiment named by EXPERIMENT_NAME and remember its id.
experiment = mlflow.set_experiment(EXPERIMENT_NAME)
experiment_id = experiment.experiment_id

# One run: log the evaluation metrics and the model itself. Passing
# registered_model_name also registers the model in the Model Registry
# (a new version is created if the name already exists).
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path='ep',
        registered_model_name="estate_prices_model",
        signature=signature,
        input_example=input_example,
        pip_requirements=pip_requirements,
    )

Registered model 'estate_prices_model' already exists. Creating a new version of this model...
2026/02/09 15:33:04 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: estate_prices_model, version 14
Created version '14' of model 'estate_prices_model'.
