In [1]:
import os
import psycopg2 as psycopg
import pandas as pd
import joblib
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Выгрузим данные с помощью psycopg2

In [2]:
# Non-secret connection options: require TLS and a host that accepts
# read-write sessions.
connection = {"sslmode": "require", "target_session_attrs": "read-write"}

# Credentials are taken from the environment so that no secret is stored in
# the notebook. Export DB_DESTINATION_HOST / _PORT / _NAME / _USER / _PASSWORD
# (for example from a local, git-ignored .env file) before starting the kernel.
postgres_credentials = {
    "host": os.environ.get("DB_DESTINATION_HOST"),
    "port": os.environ.get("DB_DESTINATION_PORT"),
    "dbname": os.environ.get("DB_DESTINATION_NAME"),
    "user": os.environ.get("DB_DESTINATION_USER"),
    "password": os.environ.get("DB_DESTINATION_PASSWORD"),
}
# Reject both unset (None) and empty values; comparing with "" alone would let
# unset variables through.
missing_db_settings = [key for key, value in postgres_credentials.items() if not value]
assert not missing_db_settings, (
    f"Missing PostgreSQL settings {missing_db_settings}: "
    "set the DB_DESTINATION_* environment variables"
)

connection.update(postgres_credentials)


TABLE_NAME = "clean_flats"


# psycopg2's `with connection:` only ends the transaction and leaves the
# connection open, so the connection is closed explicitly here.
conn = psycopg.connect(**connection)
try:
    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")

        # fetch every row returned by the query
        data = cur.fetchall()

        # take the column names from the cursor description
        columns = [col[0] for col in cur.description]
finally:
    conn.close()

df = pd.DataFrame(data, columns=columns)

print(f"Размер нашей таблицы: {df.shape[0]} строк; {df.shape[1]} столбцов")

Размер нашей таблицы: 19403 строк; 19 столбцов


Сделаем предобработку данных и создадим модель

In [3]:
def fit_model(data, data_target):
    """Preprocess the flats table and create an (unfitted) linear regression.

    Despite its name, this function does not fit the regression: it fits only
    the preprocessing step and returns a fresh ``LinearRegression`` for the
    caller to train.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. Must contain ``id``, ``price``, ``has_elevator``,
        ``studio``, ``is_apartment`` and ``building_type_int``. The frame is
        NOT modified.
    data_target : pandas.Series
        Target values; passed to ``fit_transform`` as ``y``. If the series is
        named after a column of ``data``, that column is excluded from the
        features so the target cannot leak into the design matrix.

    Returns
    -------
    model : sklearn.linear_model.LinearRegression
        A new, unfitted estimator.
    transformed_data : array-like
        Output of ``ColumnTransformer.fit_transform`` (dense or sparse, as
        chosen by the transformer), with rows in the same order as ``data``.

    Notes
    -----
    NOTE(review): the preprocessor is fitted on every row passed in and is not
    returned, so a caller that splits afterwards lets test-set statistics into
    the scaler, and the logged model cannot be applied to raw data. Fixing this
    needs an interface change (e.g. returning a ``Pipeline``).
    """
    # Identifier and raw price are never features; the target column is also
    # removed when it is present in the frame.
    excluded_columns = ['id', 'price']
    target_name = getattr(data_target, 'name', None)
    if (
        target_name is not None
        and target_name in data.columns
        and target_name not in excluded_columns
    ):
        excluded_columns.append(target_name)

    # Non-inplace drop: the caller's DataFrame stays intact, so the calling
    # cell can be re-run without a KeyError.
    feature_frame = data.drop(columns=excluded_columns)

    binary_columns = ['has_elevator', 'studio', 'is_apartment']
    cat_columns = ['building_type_int']
    # building_type_int is numeric by dtype but categorical by meaning, so it is
    # kept out of the scaled columns; errors='ignore' covers the case where its
    # dtype was not selected in the first place.
    num_columns = (
        feature_frame
        .select_dtypes(['float', 'int'])
        .drop(columns=cat_columns, errors='ignore')
        .columns
        .tolist()
    )

    preprocessor = ColumnTransformer(
        [
            ('num', StandardScaler(), num_columns),
            ('binary', OneHotEncoder(drop='if_binary', handle_unknown='ignore'), binary_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_columns),
        ],
        remainder='drop',
        verbose_feature_names_out=False
    )

    transformed_data = preprocessor.fit_transform(feature_frame, data_target)
    model = LinearRegression()

    return model, transformed_data

    

In [4]:
# Build the preprocessed feature matrix and a fresh estimator from the loaded table.
model, transformed_data = fit_model(data=df, data_target=df["target"])

Разделим датасет на тренировочную и тестовую выборки

In [5]:
split_column = "build_year"

# Row positions of `df` ordered by build year. `transformed_data` has one row
# per row of `df` in the same order, so the same positions reorder both the
# feature matrix and the target. Sorting `df` alone (after the features were
# already extracted) would leave the split in database row order.
# `df` itself is not reassigned, which keeps this cell safe to re-run.
chronological_order = (
    df.reset_index(drop=True)
    .sort_values(by=[split_column], kind="mergesort")  # stable sort
    .index.to_numpy()
)

features = transformed_data[chronological_order]
target = df["target"].iloc[chronological_order]

# shuffle=False keeps the chronological order: the oldest 80% of buildings go
# to train, the newest 20% to test.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=False
)

Обучим модель

In [6]:
# Train on the training part, then predict for the held-out rows
# (scikit-learn's fit() returns the estimator itself, so the calls chain).
prediction = model.fit(X_train, y_train).predict(X_test)

Определим метрики

In [7]:
from sklearn.metrics import max_error, r2_score, mean_absolute_percentage_error

# Score the hold-out predictions with each metric, in a fixed order.
max_err, r2, percentage_error = (
    scorer(y_test, prediction)
    for scorer in (max_error, r2_score, mean_absolute_percentage_error)
)

# Collected under the names that are later sent to MLflow.
metrics = dict(
    max_err=max_err,
    r2=r2,
    percentage_error=percentage_error,
)

Залогируем метрики и модель

In [9]:
EXPERIMENT_NAME = "flats_andreybikmulinvik"
RUN_NAME = "base_model_registry"
REGISTRY_MODEL_NAME = "model_flats_andreybikmulinvik"


# Endpoint of the S3-compatible artifact storage (not a secret).
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The S3 access keys must already be present in the environment (for example
# loaded from a git-ignored .env file); they are deliberately not written in
# the notebook.
missing_s3_settings = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.environ.get(name)
]
assert not missing_s3_settings, (
    f"Missing S3 credentials {missing_s3_settings}: "
    "set them as environment variables before running this cell"
)

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# The same server is used for both tracking and the model registry.
tracking_server_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(tracking_server_uri)  # tracking uri
mlflow.set_registry_uri(tracking_server_uri)  # registry uri

# Reuse the experiment when it exists, otherwise create it.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

pip_requirements = "../requirements.txt"
# The signature and input example describe the preprocessed matrix the model
# was trained on, not the raw table.
signature = mlflow.models.infer_signature(X_test, prediction)
input_example = X_test[:10]

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    mlflow.log_metrics(metrics)
    # Logs the model as a run artifact and registers a new version of it.
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
        pip_requirements=pip_requirements,
        signature=signature,
        input_example=input_example,
        await_registration_for=60
    )

Registered model 'model_flats_andreybikmulinvik' already exists. Creating a new version of this model...
2024/10/22 21:49:20 INFO mlflow.tracking._model_registry.client: Waiting up to 60 seconds for model version to finish creation. Model name: model_flats_andreybikmulinvik, version 8
Created version '8' of model 'model_flats_andreybikmulinvik'.
