In [2]:
import warnings 
warnings.filterwarnings('ignore')

import os
import pandas as pd

import mlflow
from mlflow.models import infer_signature

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

In [3]:
# Display the S3 endpoint configured for this kernel (None when the variable is unset).
os.environ.get("MLFLOW_S3_ENDPOINT_URL")

'http://localhost:9000'

In [4]:
# Display the MLflow tracking URI configured for this kernel (None when the variable is unset).
# NOTE(review): the rendered value embeds the database user and password -
# clear this cell's output before sharing or committing the notebook.
os.environ.get("MLFLOW_TRACKING_URI")

'postgresql+psycopg2://postgres:postgres@localhost:5434/mlflow_db'

In [5]:
# Columns of the housing table that are used as model inputs.
FEATURES = [
    "MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup",
    "Latitude", "Longitude"
]
# Column the regressors are trained to predict.
TARGET = "MedHouseVal"

# Seed shared by every stochastic estimator; matches the seed of the train/test split
# so that repeated runs of the notebook log comparable metrics.
RANDOM_STATE = 42

# Candidate regressors keyed by the name under which each one is trained and logged.
# LinearRegression is deterministic and takes no seed.
models = {
    "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE),
    "LinearRegression": LinearRegression(),
    "HistGB": HistGradientBoostingRegressor(random_state=RANDOM_STATE),
}

In [6]:
def get_data(db_uri=None):
    """Load the whole ``california_housing`` table into a DataFrame.

    Parameters
    ----------
    db_uri : str, optional
        SQLAlchemy connection URI. When omitted, the value of the
        ``CALIFORNIA_HOUSING_DB_URI`` environment variable is used, falling
        back to the local development database this notebook was written for.

    Returns
    -------
    pandas.DataFrame
        Every row and column returned by ``SELECT * FROM california_housing``.
    """
    if db_uri is None:
        # Prefer configuration from the environment so credentials do not have
        # to live in the notebook; the literal is kept only as a local default.
        db_uri = os.getenv(
            "CALIFORNIA_HOUSING_DB_URI",
            "postgresql://postgres:postgres@localhost:5433/postgres",
        )

    engine = create_engine(db_uri)
    try:
        return pd.read_sql_query("SELECT * FROM california_housing", engine)
    finally:
        # Release pooled connections; otherwise each call leaves them open
        # for the lifetime of the kernel.
        engine.dispose()

In [7]:
# Pull the housing table from the database once and keep it for the cells below.
data = get_data()
# Preview the first rows to check which columns came back.
data.head()


Unnamed: 0,index,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [8]:
def preprocess_data(data):
    """Split ``data`` into train/test parts and standardise the features.

    The columns listed in ``FEATURES`` are used as inputs and the ``TARGET``
    column as the label. 20% of the rows are held out for testing with a fixed
    seed. The scaler is fitted on the training features only and then applied
    to both parts.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)`` - the two scaled
        feature matrices produced by ``StandardScaler`` followed by the
        matching target splits.
    """
    # Separate inputs from the label.
    features = data[FEATURES]
    target = data[TARGET]

    # Hold out a test part; the fixed seed keeps the split repeatable.
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Learn scaling statistics from the training part only, then apply them to both.
    standardizer = StandardScaler().fit(features_train)
    return (
        standardizer.transform(features_train),
        standardizer.transform(features_test),
        target_train,
        target_test,
    )

In [9]:
# Build the scaled train/test feature matrices and the matching targets used by every model.
X_train_fitted, X_test_fitted, y_train, y_test = preprocess_data(data)

In [10]:
def train_model(model, name, X_train, X_test, y_train, y_test):
    """Fit ``model``, log it to MLflow under ``name`` and evaluate it on the test part.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    name : str
        Passed to ``mlflow.sklearn.log_model`` as the second positional argument.
    X_train, y_train : training features and targets given to ``model.fit``.
    X_test : features used for prediction, signature inference and evaluation.
    y_test : object with a ``.values`` attribute holding the test targets.

    Returns
    -------
    None
    """
    # Train, then predict on the held-out features.
    model.fit(X_train, y_train)
    test_predictions = model.predict(X_test)

    # Store the model together with the input/output schema inferred from the test data.
    logged_model = mlflow.sklearn.log_model(
        model,
        name,
        signature=infer_signature(X_test, test_predictions),
    )

    # Let MLflow's default regressor evaluator compute and record the metrics.
    evaluation_options = {
        "data": X_test,
        "targets": y_test.values,
        "model_type": "regressor",
        "evaluators": ["default"],
    }
    mlflow.evaluate(logged_model.model_uri, **evaluation_options)

In [16]:
# Create the experiment on first use, reuse it afterwards, and always make it active.
# Looking it up first (instead of catching the "already exists" error) guarantees that
# `experiment_id` is defined and `set_experiment` runs on every execution of this cell.
exp_name = "parent_run_experiment"
experiment = mlflow.get_experiment_by_name(exp_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(
        exp_name,
        artifact_location=os.getenv("MLFLOW_S3_ARTIFACT_ROOT", None)
    )
else:
    experiment_id = experiment.experiment_id
mlflow.set_experiment(exp_name)

In [19]:
# Show the id the tracking server assigned to the experiment (displayed as a string).
mlflow.get_experiment_by_name("parent_run_experiment").experiment_id

'1'

In [21]:
# Resolve the experiment id by name instead of hard-coding `1`: the id is assigned by the
# tracking server (and is a string), so a literal only works on one particular database.
experiment_id = mlflow.get_experiment_by_name("parent_run_experiment").experiment_id

# One parent run groups a nested child run per candidate model.
with mlflow.start_run(run_name="parent_run", experiment_id=experiment_id, description="parent") as parent_run:
    for model_name, model in models.items():
        with mlflow.start_run(run_name=model_name, experiment_id=experiment_id, nested=True) as child_run:
            train_model(model, model_name, X_train_fitted, X_test_fitted, y_train, y_test)

cbdd2d99e723493ea842050fa159921a
Model RandomForest training started


KeyboardInterrupt: 