In [18]:
import numpy as np 
import pandas as pd 
import mlflow
import mlflow.catboost
from catboost import CatBoostRegressor
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error

import optuna
from optuna.integration.mlflow import MLflowCallback
import joblib

from typing import Optional,Dict,Tuple
from pathlib import Path
from enefit_challenge.utils.dataset import load_enefit_training_data

In [7]:
# Load the training frame through the project helper.
df_train = load_enefit_training_data()
# Summary statistics of the frame, rendered as the cell output.
df_train.describe()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,year,...,week_cosine,month,month_sine,month_cosine,target_2_days_ago,eic_count,installed_capacity,euros_per_mwh,lowest_price_per_mwh,highest_price_per_mwh
count,2017824.0,2017824.0,2017824.0,2017824.0,2017824.0,2017824,2017824.0,2017824.0,2017824.0,2017824.0,...,2017824.0,2017824.0,2017824.0,2017824.0,2008656.0,2009184.0,2009184.0,2014628.0,2014896.0,2014896.0
mean,7.297039,0.5368268,1.898927,274.8556,0.5,2022-07-20 08:30:42.518673152,321.8754,1009178.0,33.0454,2022.057,...,0.1239993,6.430605,0.003706141,0.1254672,274.7798,73.42647,1452.349,157.4607,95.46883,108.311
min,0.0,0.0,0.0,0.0,0.0,2021-09-01 00:00:00,0.0,0.0,0.0,2021.0,...,-1.0,1.0,-1.0,-1.0,0.0,5.0,6.0,-10.06,28.1,34.0
25%,3.0,0.0,1.0,0.378,0.0,2022-02-14 04:00:00,166.0,504581.8,16.0,2022.0,...,-0.4647232,3.0,-0.8660254,-0.5,0.374,14.0,323.7,85.29,60.0,67.67
50%,7.0,1.0,2.0,31.133,0.5,2022-07-21 09:00:00,323.0,1009172.0,33.0,2022.0,...,0.2393157,6.0,0.0,6.123234000000001e-17,31.15,32.0,647.08,128.71,85.9,94.0
75%,11.0,1.0,3.0,180.2062,1.0,2022-12-24 14:00:00,479.0,1513763.0,50.0,2022.0,...,0.7485107,10.0,0.8660254,0.8660254,180.129,70.0,1567.15,199.97,109.74,133.0
max,15.0,1.0,3.0,15480.27,1.0,2023-05-31 23:00:00,637.0,2018351.0,68.0,2023.0,...,1.0,12.0,1.0,1.0,15480.27,1517.0,19314.31,4000.0,250.0,305.0
std,4.780994,0.4986421,1.081766,909.5024,0.5,,182.631,582637.8,19.59062,0.6452196,...,0.6650222,3.664933,0.7402501,0.6605109,909.4796,144.1306,2423.222,121.3226,47.58257,54.76264


In [8]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,date,...,week_cosine,month,month_sine,month_cosine,target_2_days_ago,eic_count,installed_capacity,euros_per_mwh,lowest_price_per_mwh,highest_price_per_mwh
0,0,0,1,0.713,0,2021-09-01,0,0,0,2021-09-01,...,-0.568065,9,-0.866025,-0.5,,,,,,
1,11,0,2,7.62,1,2021-09-01,0,89,44,2021-09-01,...,-0.568065,9,-0.866025,-0.5,,,,,,
2,11,0,2,0.0,0,2021-09-01,0,88,44,2021-09-01,...,-0.568065,9,-0.866025,-0.5,,,,,,
3,11,0,1,21.099,1,2021-09-01,0,87,43,2021-09-01,...,-0.568065,9,-0.866025,-0.5,,,,,,
4,11,0,1,0.0,0,2021-09-01,0,86,43,2021-09-01,...,-0.568065,9,-0.866025,-0.5,,,,,,


In [11]:
# Columns that are dropped from the feature matrix below.
not_feature_columns = ['datetime', 'row_id','prediction_unit_id','date','time']

# Features: everything except the target, `data_block_id` and the columns listed above.
X = df_train.drop(['target', 'data_block_id'] + not_feature_columns, axis=1)
y = df_train['target']

# unique year-month combinations -> to be used in cross-validation
# (each (year, month) pair is converted to the first day of that month, then sorted)
timesteps = np.sort(np.array(
    pd.to_datetime(X[['year', 'month']].assign(day=1)).unique().tolist()
))

## MLflow + Optuna
#### Hyperparameter Tuning & Experiment Tracking

In [17]:
from enefit_challenge.models.forecaster import Forecaster

In [28]:
class CatBoostForecaster(Forecaster):
    """
    `Forecaster` built around a `CatBoostRegressor`.

    Hyperparameters are searched with `optuna`; every fitted model is logged
    to `mlflow` inside a nested run.
    """

    def __init__(self)-> None:
        pass

    def fit_model(
        self,
        X:pd.DataFrame,
        y:pd.Series,
        categorical_features: Optional[list]=None,
        params:Optional[Dict]=None,
        experiment_name: str="catboost",
        artifact_path: str="catboost_model",
        metrics: Optional[list]=None
    ) -> CatBoostRegressor:
        """
        Trains a `CatBoostRegressor` with a L1 loss and tracks it using mlflow

        -------
        params:
        -------
        `X`: `pd.DataFrame`
            the feature matrix
        `y`: `pd.Series`
            the target values
        `categorical_features`: `Optional[list]`
            columns handed to CatBoost as `cat_features` (`None` -> no categorical columns)
        `params`: `Optional[Dict]`
            hyperparameters overriding the baseline configuration below;
            they are also logged to mlflow when provided
        `experiment_name`: `str`
            the mlflow experiment the run is stored under
        `artifact_path`: `str`
            the artifact path the fitted model is logged to
        `metrics`: `Optional[list]`
            currently unused, kept for interface compatibility

        -------
        returns:
        -------
        the fitted `CatBoostRegressor`
        """
        cat_features = [] if categorical_features is None else categorical_features

        mlflow.set_experiment(experiment_name=experiment_name)
        with mlflow.start_run(nested=True): # mlflow context manager
            model = CatBoostRegressor(
                n_estimators=100,
                objective='MAE',
                thread_count=1,
                bootstrap_type="Bernoulli",
                sampling_frequency='PerTree',
                verbose=0,
                cat_features=cat_features,
                leaf_estimation_iterations=1
            )
            if params:
                model.set_params(**params)

            model.fit(X, y)

            mlflow.catboost.log_model(
                model,
                artifact_path=artifact_path,
            )
            # `params` defaults to None; only log when there is something to log,
            # otherwise the run fails after the model has already been trained.
            if params:
                mlflow.log_params(params)

        return model

    def fit_and_test_fold(
        self,
        params:Dict,
        X: pd.DataFrame,
        y: pd.Series,
        year_month_train,
        year_month_test,
        categorical_features: Optional[list]=None,
        experiment_name: str="catboost",
        artifact_path: str="catboost_model",
        metrics: Optional[list]=None
    ) -> float:
        """
        Fits a model on the rows belonging to `year_month_train` and returns the
        mean absolute error of its predictions on the rows of `year_month_test`.

        Rows are assigned to a fold through the first day of their
        (`year`, `month`) pair, so `X` must contain `year` and `month` columns.
        `experiment_name`, `artifact_path` and `metrics` are forwarded to
        `fit_model`.
        """
        first_dates_month = pd.to_datetime(X[['year', 'month']].assign(day=1))
        train_mask = first_dates_month.isin(year_month_train)
        test_mask = first_dates_month.isin(year_month_test)
        X_train, X_test = X[train_mask], X[test_mask]
        y_train, y_test = y[train_mask], y[test_mask]

        # fit model on training data
        model = self.fit_model(
            X_train,
            y_train,
            categorical_features=categorical_features,
            params=params,
            experiment_name=experiment_name,
            artifact_path=artifact_path,
            metrics=metrics,
        )
        # generate predictions
        y_test_pred = model.predict(X_test)

        return mean_absolute_error(y_test, y_test_pred)

    def train_model(
        self,
        train_df: pd.DataFrame,
        target_col: str,
        exclude_cols: Optional[list]=None,
        categorical_features: Optional[list]=None,
        experiment_name: str="catboost",
        artifact_path: str="catboost_model",
        metrics: Optional[list]=None,
        n_splits: int=3,
        n_trials: int=2,
        timeout: Optional[float]=3600
    ) -> None:
        """
        Trains an instance of `CatBoostRegressor` model and tracks the hyperparameter tuning
        experiment with `mlflow` and `optuna`

        -------
        params:
        -------
        `train_df`: `pd.DataFrame`
            the training data for the model; must contain `year` and `month` columns,
            which define the cross-validation timesteps.
        `target_col`: `str`
            the time-series target column
        `exclude_cols`: `Optional[list]`
            columns in dataset that should not be used
        `categorical_features`: `Optional[list]`
            list of categorical features in the dataset
        `experiment_name`: `str`
            the name of the experiment used to store runs in mlflow; also used as
            the optuna study name
        `artifact_path`: `str`
            the path pointing to the mlflow artifact
        `metrics`: `Optional[list]`
            list of the metrics to track in the mlflow experiment run (currently unused).
        `n_splits`: `int`
            number of `TimeSeriesSplit` folds evaluated per trial
        `n_trials`: `int`
            number of optuna trials passed to `study.optimize`
        `timeout`: `Optional[float]`
            timeout passed to `study.optimize`
        """
        excluded = [] if exclude_cols is None else list(exclude_cols)

        X = train_df.drop([target_col] + excluded, axis=1)
        y = train_df[target_col]
        # unique year-month combinations -> to be used in cross-validation
        timesteps = np.sort(np.array(
            pd.to_datetime(X[['year', 'month']].assign(day=1)).unique().tolist()
        ))

        # define mlflow callback Handler for optuna
        mlflc = MLflowCallback(
            metric_name="MAE - Enefit",
        )

        @mlflc.track_in_mlflow() # decorator to allow mlflow logging
        def objective(trial):
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 200),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.95,log=True),
                'depth': trial.suggest_int('depth', 3, 10, log=True),
                'l2_leaf_reg': trial.suggest_float('l2_leaf_reg',1e-8,100,log=True),
                'model_size_reg': trial.suggest_float('model_size_reg',1e-8,100,log=True),
                'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.1, 1),
                'subsample': trial.suggest_float("subsample", 0.5, 1)
            }
            cv = TimeSeriesSplit(n_splits=n_splits) # cross validation
            # one MAE per fold; the list length follows `n_splits`
            cv_mae = [
                self.fit_and_test_fold(
                    params,
                    X,
                    y,
                    timesteps[train_index],
                    timesteps[test_index],
                    categorical_features=categorical_features,
                    experiment_name=experiment_name,
                    artifact_path=artifact_path,
                    metrics=metrics,
                )
                for train_index, test_index in cv.split(timesteps)
            ]
            trial.set_user_attr('split_mae', cv_mae)
            return np.mean(cv_mae)

        sampler = optuna.samplers.TPESampler(
            n_startup_trials=10,
            seed=42
        )

        # The study is named after the experiment so that the name used for the
        # study and the one passed to `mlflow.set_experiment` in `fit_model` agree.
        study = optuna.create_study(
            directions=['minimize'],
            sampler=sampler,
            study_name=experiment_name
        )

        study.optimize(objective, n_trials=n_trials, timeout=timeout, callbacks=[mlflc])

    def forecast(self) -> None:
        """Placeholder: forecasting is not implemented yet."""
        pass

In [29]:
# Columns excluded from the feature set.
# NOTE(review): unlike the earlier cell that builds `X`, 'data_block_id' is not
# excluded here, so it is used as a feature in this run -- confirm this is intended.
not_feature_columns = ['datetime', 'row_id','prediction_unit_id','date','time']
# Columns passed to the forecaster as categorical features.
cat_columns = ['county', 'product_type']

# Run the optuna/mlflow hyperparameter search on the training frame.
cbf = CatBoostForecaster()
cbf.train_model(
    train_df=df_train,
    target_col="target",
    exclude_cols=not_feature_columns,
    categorical_features=cat_columns,
    )

  mlflc = MLflowCallback(
  @mlflc.track_in_mlflow() # decorator to allow mlflow logging
[I 2023-11-12 18:42:58,269] A new study created in memory with name: catboost
[I 2023-11-12 18:44:12,736] Trial 0 finished with value: 132.17236164028296 and parameters: {'n_estimators': 106, 'learning_rate': 0.7590145927293601, 'depth': 7, 'l2_leaf_reg': 0.009695826644515229, 'model_size_reg': 3.6323392569431376e-07, 'colsample_bylevel': 0.2403950683025824, 'subsample': 0.5290418060840998}. Best is trial 0 with value: 132.17236164028296.
[I 2023-11-12 18:46:57,883] Trial 1 finished with value: 81.56286116006612 and parameters: {'n_estimators': 180, 'learning_rate': 0.15446829333828344, 'depth': 7, 'l2_leaf_reg': 1.6063676259174453e-08, 'model_size_reg': 50.014798288569374, 'colsample_bylevel': 0.8491983767203796, 'subsample': 0.6061695553391381}. Best is trial 1 with value: 81.56286116006612.


## Old (previous function-based implementation)

In [13]:
def fit_model(
    X:pd.DataFrame,
    y:pd.Series,
    config:Optional[Dict]=None,
    n_jobs:int=1,
    verbose:int=0
) -> CatBoostRegressor:
    """
    Fit a `CatBoostRegressor` with the MAE (L1) objective on `(X, y)`.

    `config`, when given and non-empty, overrides the baseline settings below.
    `n_jobs` is forwarded as `thread_count` and `verbose` as `verbose`.
    Returns the result of `fit`.
    """
    baseline_settings = dict(
        n_estimators=100,
        objective='MAE',
        thread_count=n_jobs,
        bootstrap_type="Bernoulli",
        sampling_frequency='PerTree',
        verbose=verbose,
        cat_features=['county', 'product_type'],
        leaf_estimation_iterations=1,
    )
    regressor = CatBoostRegressor(**baseline_settings)

    # optional hyperparameter overrides on top of the baseline
    if config:
        regressor.set_params(**config)

    return regressor.fit(X, y)

def fit_and_test_fold(params:Dict, X, y, year_month_train, year_month_test) -> float:
    """
    Train on the rows whose month is in `year_month_train` and return the mean
    absolute error on the rows whose month is in `year_month_test`.

    Each row is mapped to the first day of its (`year`, `month`) pair in `X`.
    """
    month_start = pd.to_datetime(X[['year', 'month']].assign(day=1))

    # boolean row selectors for the two sides of the fold
    in_train = month_start.isin(year_month_train)
    in_test = month_start.isin(year_month_test)

    regressor = fit_model(X[in_train], y[in_train], params, n_jobs=4)
    predictions = regressor.predict(X[in_test])

    return mean_absolute_error(y[in_test], predictions)

In [14]:
# MLflow callback handler for optuna; it also provides the `track_in_mlflow`
# decorator applied to the objective function.
mlflc = MLflowCallback(
    metric_name="MAE - Enefit",
)

# loss function
@mlflc.track_in_mlflow() # decorator to allow mlflow logging
def objective(trial):
    """
    Optuna objective: sample a CatBoost configuration, score it with a 3-fold
    time-series split over `timesteps`, and return the mean fold MAE.

    The per-fold scores are stored on the trial under the `split_mae` user attribute.
    """
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.95,log=True),
        'depth': trial.suggest_int('depth', 3, 10, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg',1e-8,100,log=True),
        'model_size_reg': trial.suggest_float('model_size_reg',1e-8,100,log=True),
        'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.1, 1),
        'subsample': trial.suggest_float("subsample", 0.5, 1)
    }
    splitter = TimeSeriesSplit(n_splits=3) # cross validation
    fold_scores = [
        fit_and_test_fold(
            search_space,
            X,
            y,
            timesteps[fit_idx],
            timesteps[eval_idx]
        )
        for fit_idx, eval_idx in splitter.split(timesteps)
    ]
    trial.set_user_attr('split_mae', fold_scores)

    return np.mean(fold_scores)

  mlflc = MLflowCallback(
  @mlflc.track_in_mlflow()


In [16]:
# TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(
    n_startup_trials=10, seed=1234
)

# Single-direction study that minimises the value returned by `objective`.
study = optuna.create_study(
    directions=['minimize'],
    sampler=sampler,
    study_name='catboost'
)

# Run up to 10 trials (timeout=3600) with the mlflow callback attached.
study.optimize(objective, n_trials=10, timeout= 3600, callbacks=[mlflc]) 

[I 2023-11-12 16:22:04,586] A new study created in memory with name: catboost
2023/11/12 16:22:04 INFO mlflow.tracking.fluent: Experiment with name 'catboost' does not exist. Creating a new experiment.
[I 2023-11-12 16:22:33,028] Trial 0 finished with value: 101.25185644797455 and parameters: {'n_estimators': 78, 'learning_rate': 0.16996488374291047, 'depth': 5, 'l2_leaf_reg': 0.7138152748525242, 'model_size_reg': 0.6306059747250784, 'colsample_bylevel': 0.3453333447543775, 'subsample': 0.6382321275715483}. Best is trial 0 with value: 101.25185644797455.
[I 2023-11-12 16:23:56,877] Trial 1 finished with value: 116.92957519377278 and parameters: {'n_estimators': 171, 'learning_rate': 0.7851177877907137, 'depth': 9, 'l2_leaf_reg': 3.785931044085381e-05, 'model_size_reg': 0.001023178145374356, 'colsample_bylevel': 0.7151166416549227, 'subsample': 0.85635101349145}. Best is trial 0 with value: 101.25185644797455.
[I 2023-11-12 16:24:38,774] Trial 2 finished with value: 90.94295828509344 an