In [1]:
import sys
import os

import pandas as pd 
import numpy as np

import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge, HuberRegressor
)
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from datetime import datetime
import numpy as np


# Put the parent of the current working directory (taken to be the repository
# root when the notebook is run from its own folder) on sys.path, so that
# project-local packages can be imported. The entry is added at most once.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from data_collection.data_collector import DataCollector


In [2]:
# Load the EURUSD history through the project-local DataCollector
# (imported from data_collection.data_collector) and preview the first rows.
# The preview below shows Open/Close/High/Low/Volume columns indexed by Date.
data_col = DataCollector()
data = data_col.get_historical_data(symbol="EURUSD")
# Bare last expression so the notebook renders the preview table.
data.head() 

Unnamed: 0_level_0,Open,Close,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1971-01-04,0.5369,0.5369,0.5369,0.5369,1
1971-01-05,0.5366,0.5366,0.5366,0.5366,1
1971-01-06,0.5365,0.5365,0.5365,0.5365,1
1971-01-07,0.5368,0.5368,0.5368,0.5368,1
1971-01-08,0.5371,0.5371,0.5371,0.5371,1


In [3]:

def _default_models():
    """Build the default regressors, keyed by the name used for each MLflow run."""
    return {
        'linear': LinearRegression(),
        'ridge': Ridge(),
        'lasso': Lasso(),
        'elasticnet': ElasticNet(),
        'svr': SVR(),
        'decision_tree': DecisionTreeRegressor(),
        'random_forest': RandomForestRegressor(),
        'gbr': GradientBoostingRegressor(),
        'knn': KNeighborsRegressor(),
        'polynomial': Pipeline([
            ('poly', PolynomialFeatures()),
            ('linear', LinearRegression())
        ]),
        'bayesian': BayesianRidge(),
        'huber': HuberRegressor()
    }


def _default_param_grids():
    """Build the default hyper-parameter grids, keyed by the same names as the default models."""
    return {
        'ridge': {'alpha': [0.1, 1.0, 10.0]},
        'lasso': {'alpha': [0.001, 0.01, 0.1, 1.0]},
        'elasticnet': {'alpha': [0.001, 0.01, 0.1, 1.0], 'l1_ratio': [0.1, 0.5, 0.7, 1.0]},
        'svr': {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf']},
        'decision_tree': {'max_depth': [None, 10, 20, 30]},
        'random_forest': {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20, 30]},
        'gbr': {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
        'knn': {'n_neighbors': [3, 5, 7, 10]},
        'polynomial': {'poly__degree': [2, 3, 4]},
        'bayesian': {'alpha_1': [1e-6, 1e-3, 1e-1], 'lambda_1': [1e-6, 1e-3, 1e-1]},
        'huber': {'epsilon': [1.35, 1.5, 1.75]}
    }


def tune_and_log_models(X, y, models=None, param_grids=None, cv=5):
    """
    Grid-search every candidate model and log each winner to MLflow.

    One parent MLflow run named ``Model_Tuning_<timestamp>`` is opened, and each
    model is tuned inside its own nested run named after the model. For every
    model the best parameters, the RMSE and the fitted best estimator are logged.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target vector.
    models : dict or None
        Maps a model name to an estimator. When None, a built-in set of
        regressors is used.
    param_grids : dict or None
        Maps a model name to its hyper-parameter grid. When None, built-in
        grids are used. A model with no entry is searched with an empty grid.
    cv : int
        Number of cross-validation folds, passed straight to GridSearchCV.

    Returns
    -------
    results : dict
        Maps each model name to a dict with keys ``'model'`` (best estimator),
        ``'params'`` (best parameters) and ``'rmse'`` (square root of the
        negated best ``neg_mean_squared_error`` score).
    """
    candidates = _default_models() if models is None else models
    grids = _default_param_grids() if param_grids is None else param_grids

    summary = {}

    # The parent run groups all per-model child runs of this tuning session.
    stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    with mlflow.start_run(run_name=f"Model_Tuning_{stamp}"):
        for model_name, estimator in candidates.items():
            with mlflow.start_run(run_name=model_name, nested=True):
                # Exhaustive search over this model's grid (empty grid if none is given).
                search = GridSearchCV(
                    estimator=estimator,
                    param_grid=grids.get(model_name, {}),
                    cv=cv,
                    scoring='neg_mean_squared_error',
                )
                search.fit(X, y)

                # The score is a negated MSE, so flip the sign before taking the root.
                entry = {
                    'model': search.best_estimator_,
                    'params': search.best_params_,
                    'rmse': np.sqrt(-search.best_score_),
                }

                # Record the winner in the nested MLflow run.
                mlflow.log_params(entry['params'])
                mlflow.log_metric("rmse", entry['rmse'])
                mlflow.sklearn.log_model(entry['model'], "model")

                summary[model_name] = entry

    return summary

# Example usage: the "Close" column is the target, every other column is a feature.
y = data["Close"]
X = data.drop(columns=["Close"])
results = tune_and_log_models(X, y)
print(results)


MlflowException: Detected out-of-date database schema (found version 5b0e9adcef9c, but expected 4465047574b1). Take a backup of your database, then run 'mlflow db upgrade <database_uri>' to migrate your database to the latest schema. NOTE: schema migration may result in database downtime - please consult your database's documentation for more detail.