In [1]:
import ast
import numpy as np 
import pandas as pd 
import mlflow
import mlflow.xgboost
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from sklearn.model_selection import TimeSeriesSplit
from sktime.performance_metrics.forecasting import (MeanAbsoluteScaledError, 
    MeanAbsolutePercentageError, MeanAbsoluteError, MeanSquaredError)
from xgboost import XGBRegressor

import optuna
from optuna.integration.mlflow import MLflowCallback
import joblib

from typing import Optional, Dict, Tuple, Literal
from enefit_challenge.dataset.dataset import EnefitDataset
from enefit_challenge.models.forecaster import Forecaster

import warnings
warnings.filterwarnings('ignore')


# Address of the MLflow tracking server used by `XGBoostForecaster` for logging and model loading.
TRACKING_URI = "http://127.0.0.1:5000/" # local tracking URI -> the mlflow server must be launched before training


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Columns that are not model features; excluded from training through `exclude_cols`.
not_feature_columns = ['datetime', 'row_id','prediction_unit_id','date','time', 'data_block_id']
# Categorical columns; passed as `categorical_features` so they get one-hot encoded with `pd.get_dummies`.
cat_columns = ['county', 'product_type']
# Feature columns left out of training; concatenated with `not_feature_columns` when calling `train_model`.
# NOTE: names such as `county_12` / `product_type_3` only exist after the one-hot encoding of `cat_columns`.
to_drop_cols = [
    '10_metre_u_wind_component_mean_f',
    '10_metre_v_wind_component_min_f',
    'cloudcover_low_mean_f',
    'dayofweek_sine',
    'direct_solar_radiation_max_f',
    'eic_count',
    'euros_per_mwh', # TODO: not sure this one should be dropped
    'hour_sine',
    'temperature_min_f',
    'total_precipitation_max_f',
    'week_sine',
    '10_metre_u_wind_component_min_f',
    '10_metre_v_wind_component_max_f',
    '10_metre_v_wind_component_std_f',
    'cloudcover_high_mean_f',
    'cloudcover_high_std_f',
    'cloudcover_low_min_f',
    'cloudcover_low_std_f',
    'cloudcover_mid_std_f',
    'cloudcover_total_std_f',
    'county_12', # one-hot column of `county` -- TODO: find a cleaner way to drop it
    'county_3', # one-hot column of `county` -- TODO: find a cleaner way to drop it
    'county_9', # one-hot column of `county` -- TODO: find a cleaner way to drop it
    'direct_solar_radiation_min_f',
    'direct_solar_radiation_std_f',
    'highest_price_per_mwh', # TODO: not sure this one should be dropped
    # 'installed_capacity', # TODO: should this one also be dropped?
    'month_cosine',
    'product_type_3', # one-hot column of `product_type` -- TODO: find a cleaner way to drop it
    'snowfall_max_f',
    'temperature_std_f',
    'total_precipitation_min_f',
    'total_precipitation_std_f',
    # 'year'  # kept: `year` and `month` are used to build the cross-validation time splits
]

In [2]:
# Load the training frame through the project's dataset helper.
dataset = EnefitDataset()
train_df = dataset.load_enefit_training_data()

# Quick look at the size and the first rows of the training data.
print(train_df.shape)
train_df.head()

(2017824, 80)


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,date,time,year,datediff_in_days,hour,hour_sine,hour_cosine,dayofweek,dayofweek_sine,dayofweek_cosine,week,week_sine,week_cosine,month,month_sine,month_cosine,target_1_days_ago,target_7_days_ago,eic_count,installed_capacity,euros_per_mwh,lowest_price_per_mwh,highest_price_per_mwh,temperature_min_f,temperature_mean_f,temperature_max_f,temperature_std_f,dewpoint_min_f,dewpoint_mean_f,dewpoint_max_f,dewpoint_std_f,cloudcover_high_min_f,cloudcover_high_mean_f,cloudcover_high_max_f,cloudcover_high_std_f,cloudcover_low_min_f,cloudcover_low_mean_f,cloudcover_low_max_f,cloudcover_low_std_f,cloudcover_mid_min_f,cloudcover_mid_mean_f,cloudcover_mid_max_f,cloudcover_mid_std_f,cloudcover_total_min_f,cloudcover_total_mean_f,cloudcover_total_max_f,cloudcover_total_std_f,10_metre_u_wind_component_min_f,10_metre_u_wind_component_mean_f,10_metre_u_wind_component_max_f,10_metre_u_wind_component_std_f,10_metre_v_wind_component_min_f,10_metre_v_wind_component_mean_f,10_metre_v_wind_component_max_f,10_metre_v_wind_component_std_f,direct_solar_radiation_min_f,direct_solar_radiation_mean_f,direct_solar_radiation_max_f,direct_solar_radiation_std_f,surface_solar_radiation_downwards_min_f,surface_solar_radiation_downwards_mean_f,surface_solar_radiation_downwards_max_f,surface_solar_radiation_downwards_std_f,snowfall_min_f,snowfall_mean_f,snowfall_max_f,snowfall_std_f,total_precipitation_min_f,total_precipitation_mean_f,total_precipitation_max_f,total_precipitation_std_f
0,0,0,1,0.713,0,2021-09-01,0,0,0,2021-09-01,00:00:00,2021,0,0,0.0,1.0,2,0.974928,-0.222521,35,-0.822984,-0.568065,9,-0.866025,-0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,11,0,2,7.62,1,2021-09-01,0,89,44,2021-09-01,00:00:00,2021,0,0,0.0,1.0,2,0.974928,-0.222521,35,-0.822984,-0.568065,9,-0.866025,-0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,11,0,2,0.0,0,2021-09-01,0,88,44,2021-09-01,00:00:00,2021,0,0,0.0,1.0,2,0.974928,-0.222521,35,-0.822984,-0.568065,9,-0.866025,-0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,11,0,1,21.099,1,2021-09-01,0,87,43,2021-09-01,00:00:00,2021,0,0,0.0,1.0,2,0.974928,-0.222521,35,-0.822984,-0.568065,9,-0.866025,-0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,11,0,1,0.0,0,2021-09-01,0,86,43,2021-09-01,00:00:00,2021,0,0,0.0,1.0,2,0.974928,-0.222521,35,-0.822984,-0.568065,9,-0.866025,-0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [28]:
class XGBoostForecaster(Forecaster):
    """
        Implementation of a Forecaster using `XGBRegressor` as base model, 
        `optuna` for hyperparameters optimization and `mlflow` as backend to track experiments
        and register best-in-class model for time series prediction.
    """
    def __init__(
        self,
        experiment_name: str="xgboost",
        artifact_path: str="xgboost_model",
        model_name: str="enefit_xgboost"
    )-> None:
        """
        Initializes the `XGBoostForecaster` and points `mlflow` to `TRACKING_URI`
        -------
        params:
        -------
        `experiment_name`: `str`
            the name the of the experiment under which mlflow's runs (or Optuna's trials) 
            will be collected
        `artifact_path`: `str`
            the path pointing to the mlflow artifact
        `model_name`: `str`
            the name the final model will have in the registry
        """
        # `mlflow.set_tracking_uri` returns None: store the URI itself on the instance
        mlflow.set_tracking_uri(TRACKING_URI)
        self.tracking_uri = TRACKING_URI
        self.experiment_name = experiment_name
        self.model_name = model_name
        self.artifact_path = artifact_path


    def fit_model(
        self,  
        X:pd.DataFrame,
        y:pd.Series,
        params:Optional[Dict]=None,
    ) -> XGBRegressor:
        """
        Trains a `XGBRegressor`
        -------     
        params:
        -------
        `X`:`pd.DataFrame`
            Features to use for fitting
        `y`:`pd.Series`
            Target variable
        `params`: `Optional[Dict]`
            optional dictionary of parameters to use; 
            when given, they override the defaults set below
        -------     
        returns:
        -------
        fitted `XGBRegressor`
        """
        model = XGBRegressor(
            n_estimators=100, 
            objective='reg:squarederror',
            eval_metric='mae'
        )
        if params:
            model.set_params(**params)

        model.fit(X, y)
    
        return model
    
    def fit_and_test_fold(
        self, 
        params:Dict,
        X: pd.DataFrame, 
        y: pd.Series, 
        year_month_train, 
        year_month_test,
        metrics: Optional[list]=None
    ) -> Tuple[float, float, float, float, float]:
        """
        Used for cross validation on different time splits; 
        also in charge of logging every experiment run / study trial into the backend.
        -------     
        params:
        -------
        `params`: `Dict`
            parameters forwarded to `fit_model` and logged with `mlflow.log_params`
        `X`: `pd.DataFrame`
            features, must contain the `year` and `month` columns used to select rows
        `y`: `pd.Series`
            target variable, aligned with `X`
        `year_month_train`
            first-of-month timestamps whose rows are used for fitting
        `year_month_test`
            first-of-month timestamps whose rows are used for evaluation
        `metrics`: `Optional[list]`
            currently unused, all the metrics listed below are always computed
        -------     
        returns:
        -------
        tuple `(mae, mase, mse, rmse, mape)` computed on the test rows
        """
        # map every row to the first day of its month, then select rows by month membership
        first_dates_month = pd.to_datetime(X[['year', 'month']].assign(day=1))
        train_index = first_dates_month.isin(year_month_train)
        test_index = first_dates_month.isin(year_month_test)

        X_train = X[train_index]
        X_test = X[test_index]
        y_train = y[train_index]
        y_test = y[test_index]

        # fit model on training data
        mlflow.xgboost.autolog(log_datasets=False, log_models=False)
        model = self.fit_model(
            X_train, 
            y_train, 
            params
        )
        
        # generate predictions
        y_test_pred = model.predict(X_test)
        self.signature = infer_signature(X_train, y_test_pred)

        # evaluate predictions on the held-out months
        mae = MeanAbsoluteError()(y_test, y_test_pred)
        # MASE needs the training target to compute its scaling term
        mase = MeanAbsoluteScaledError()(y_test, y_test_pred, y_train=y_train)
        mape = MeanAbsolutePercentageError()(y_test, y_test_pred)
        mse = MeanSquaredError()(y_test, y_test_pred)
        rmse = MeanSquaredError(square_root=True)(y_test, y_test_pred)

        mlflow.xgboost.log_model(
            model, 
            artifact_path=self.artifact_path,
            signature=self.signature
        )
        mlflow.log_params(params)
        
        return mae, mase, mse, rmse, mape

    def train_model(
        self, 
        train_df: pd.DataFrame, 
        target_col: str,
        exclude_cols: Optional[list]=None,
        categorical_features: Optional[list]=None,
        params: Optional[Dict]=None,
        n_trials: int=100
    ) -> None:
        """ 
        Takes an instance of `XGBRegressor` model and tracks the hyperparameter tuning
        experiment on training set using `mlflow` and `optuna`.  
        Registers the best version of the model according to a specified metric (to be implemented).
        The resulting `optuna` study is stored in `self.study`.
        -------     
        params:
        -------
        `train_df`: `pd.DataFrame`
            the training data for the model, must contain the `year` and `month` columns.
        `target_col`: `str`
            the time-series target column
        `exclude_cols`: `Optional[list]`  
            columns in dataset that should not be used; they are dropped after one-hot encoding, 
            so names of generated dummy columns are accepted too. Defaults to no exclusion
        `categorical_features`: `Optional[list]`
            list of categorical features in the dataset, one-hot encoded with `pd.get_dummies`. 
            Defaults to none
        `params`: `Optional[Dict]`
            currently unused, hyperparameters are always sampled by `optuna`
        `n_trials`: `int=100`
            number of optuna trials to conduct for hyperparameters tuning
        """
        # None defaults avoid sharing one mutable list between calls
        exclude_cols = list(exclude_cols) if exclude_cols is not None else []
        self.categorical_features = (
            list(categorical_features) if categorical_features is not None else []
        )
        if len(self.categorical_features) > 0: 
            train_df = pd.get_dummies(train_df, columns=self.categorical_features)

        X = train_df.drop([target_col] + exclude_cols, axis=1)
        y = train_df[target_col]
        # unique year-month combinations -> to be used in cross-validation
        timesteps = np.sort(np.array(
            pd.to_datetime(X[['year', 'month']].assign(day=1)).unique().tolist()
        ))
        n_splits = 3 # number of time-ordered cross validation folds
        
        # define mlflow callback Handler for optuna 
        mlflc = MLflowCallback(
            metric_name=["MAE"]
        )
    
        @mlflc.track_in_mlflow() # decorator to allow mlflow logging
        def objective(trial):
            trial_params = {
                'eval_metric': 'mae',
                'n_estimators': trial.suggest_int('n_estimators', 50, 250, log=True),
                'eta': trial.suggest_float('eta', 0.01, 0.95,log=True),
                'max_depth': trial.suggest_int('max_depth', 1, 10, log=True),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 25, log=True),
                'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1, log=True),
                'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.1, 1, log=True),
                'colsample_bynode': trial.suggest_float("colsample_bynode", 0.1, 1, log=True),
                'subsample': trial.suggest_float("subsample", 0.5, 1, log=True),
                'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
                'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True)
            }
            cv = TimeSeriesSplit(n_splits=n_splits) # cross validation
            # one (mae, mase, mse, rmse, mape) tuple per fold
            fold_scores = [
                self.fit_and_test_fold(
                    trial_params,
                    X, 
                    y, 
                    timesteps[train_index], 
                    timesteps[test_index]
                )
                for train_index, test_index in cv.split(timesteps)
            ]
            # transpose into one list per metric
            cv_mae, cv_mase, cv_mse, cv_rmse, cv_mape = (
                list(scores) for scores in zip(*fold_scores)
            )
            trial.set_user_attr('split_mae', cv_mae)
            trial.set_user_attr('split_mase', cv_mase)
            trial.set_user_attr('split_mse', cv_mse)
            trial.set_user_attr('split_rmse', cv_rmse)
            trial.set_user_attr('split_mape', cv_mape)

            mlflow.log_metrics(
                {
                    "MAE":np.mean(cv_mae),
                    "MASE": np.mean(cv_mase),
                    "MSE": np.mean(cv_mse),
                    "RMSE":np.mean(cv_rmse),
                    "MAPE":np.mean(cv_mape)
                }
            )
            # stored with the run so that `predict` can replay the same one-hot encoding
            mlflow.log_dict(
                dictionary={
                    "categorical_features": self.categorical_features
                },
                artifact_file="categorical_features.json"
            )

            return np.mean(cv_mae) 

        
        sampler = optuna.samplers.TPESampler(
            n_startup_trials=10, 
            seed=0
        )

        self.study = optuna.create_study(
            directions=['minimize'],
            sampler=sampler,
            study_name=self.experiment_name
        )

        # the study stops after `n_trials` trials or 7200 seconds, whichever comes first
        self.study.optimize(objective, n_trials=n_trials, timeout= 7200, callbacks=[mlflc]) 


    def predict(
        self, 
        input_data: pd.DataFrame,
        use_best_from_run: bool=True,
        use_env_model: Literal["Staging", "Production", None]=None,
        use_version: int=None,
        ) -> np.ndarray:
        """ 
        Fetches a version of the model from the mlflow backend and uses it
        to perform prediction on new input data.  
        What version is used depends on params settings, 
        defaults to using the best version from the last experiment run. 
        Exactly one selection mode must be active: `use_best_from_run=True`, 
        or `use_best_from_run=False` together with either `use_env_model` or `use_version`.
        -------     
        params:
        -------
        `input_data`: `pd.DataFrame`
            the input data for prediction,
            must have the same schema as what's in the model's signature.
        `use_best_from_run`: `bool=True`      
            use the best model from the current series of iterations, defaults to True
        `use_env_model`: `Literal["Staging", "Production", None]=None`
            use model from a given mlflow environment, defaults to None.  
            Said model might come from past iterations, depending on what you decide in the UI
        `use_version`: `int=None`
            use a previously trained version of the model. 
            Said version must have been registered from a previous iteration,  
            either by the UI or with mlflow's API
        -------     
        returns:
        -------
        the output of the model's `predict` on the selected input features
        -------     
        raises:
        -------
        `ValueError` when no selection mode, or a conflicting combination of them, is given
        """
        client = MlflowClient(tracking_uri=TRACKING_URI)

        if use_best_from_run and use_env_model is None and use_version is None:

            experiment = mlflow.search_experiments(
                filter_string=f"name='{self.experiment_name}'"
            )
            experiment_id = experiment[0].experiment_id
            best_run = client.search_runs(
                experiment_ids=[experiment_id],
                filter_string="",
                max_results=1,
                order_by=["metrics.MAE ASC"], # best run according to MAE
            )[0]
            model_uri = f"runs:/{best_run.info.run_id}/{self.artifact_path}"

        elif (not use_best_from_run) and use_env_model in ("Staging", "Production") and use_version is None:

            # fetch the latest version registered in the requested stage
            model_metadata = client.get_latest_versions(
                name=self.model_name, 
                stages=[use_env_model]
            )
            model_uri = f"runs:/{model_metadata[0].run_id}/{self.artifact_path}"
            
        elif (not use_best_from_run) and use_env_model is None and use_version is not None:
            model_uri = f"models:/{self.model_name}/{use_version}"

        elif (not use_best_from_run) and use_env_model is None and use_version is None:
            raise ValueError(
                    "You must specify which kind of XGBoostForecaster you intend to use for prediction"
            )

        else:
            # any other combination is ambiguous: fail early instead of hitting an unbound model
            raise ValueError(
                "Conflicting model selection: use `use_best_from_run=True` alone, or set "
                "`use_best_from_run=False` with exactly one of `use_env_model` "
                "('Staging' or 'Production') and `use_version`"
            )

        model = mlflow.xgboost.load_model(model_uri=model_uri)
        model_info = mlflow.models.get_model_info(model_uri)
        
        # feature names, in training order, taken from the logged model signature
        input_features = model_info.signature.inputs.input_names()
        # look the run up by id, so it is found whatever experiment it belongs to
        artifact_uri = client.get_run(model_info.run_id).info.artifact_uri
        # access categorical features used in the training iterations
        categorical_features = mlflow.artifacts.load_dict(
            artifact_uri=artifact_uri+"/categorical_features.json"
        )["categorical_features"]
        
        if len(categorical_features) > 0: 
            input_data = pd.get_dummies(input_data, columns=categorical_features)
            # a category absent from `input_data` yields no dummy column: add it as all zeros
            dummy_prefixes = tuple(f"{col}_" for col in categorical_features)
            missing_dummies = [
                name for name in input_features
                if name not in input_data.columns and str(name).startswith(dummy_prefixes)
            ]
            if missing_dummies:
                input_data = input_data.reindex(
                    columns=list(input_data.columns) + missing_dummies,
                    fill_value=0
                )
        
        # any other missing feature still raises a KeyError here
        X = input_data[input_features]

        y_pred = model.predict(X)

        return y_pred

In [29]:
# Forecaster with the default experiment, artifact path and registry model names.
xgb = XGBoostForecaster()

In [12]:
# Run the tuning study; a single trial is enough to check the pipeline end to end.
xgb.train_model(
    train_df=train_df,
    target_col="target",
    exclude_cols=not_feature_columns+to_drop_cols,
    categorical_features=cat_columns,
    n_trials=1
)

[I 2023-12-30 11:04:35,938] A new study created in memory with name: xgboost
[I 2023-12-30 11:05:21,351] Trial 0 finished with value: 162.28998810887705 and parameters: {'n_estimators': 121, 'eta': 0.25968501651282105, 'max_depth': 3, 'min_child_weight': 4, 'colsample_bytree': 0.2652496376759476, 'colsample_bylevel': 0.4424804764191636, 'colsample_bynode': 0.2738969595234697, 'subsample': 0.927727492754704, 'lambda': 7.155682161754866, 'alpha': 0.03417952912061012}. Best is trial 0 with value: 162.28998810887705.


### Prediction Test

In [25]:
# Read the raw example test files.
test = pd.read_csv("../../input/example_test_files/test.csv")
revealed_targets = pd.read_csv("../../input/example_test_files/revealed_targets.csv")
client = pd.read_csv("../../input/example_test_files/client.csv")
ee = pd.read_csv("../../input/example_test_files/electricity_prices.csv")
gas = pd.read_csv("../../input/example_test_files/gas_prices.csv")
fcst_weather = pd.read_csv(
    "../../input/example_test_files/forecast_weather.csv",
    parse_dates=['origin_datetime', 'forecast_datetime']
)

# Build the prediction frame from the raw files through the project's dataset helper.
test_df = dataset.prepare_enefit_new_data(
    new_df=test,
    revealed_targets=revealed_targets,
    df_client=client,
    df_electricity=ee,
    df_gas=gas,
    df_weather_fc=fcst_weather,
    train_df=train_df
)

# Quick look at the size and the first rows of the prepared test data.
print(test_df.shape)
test_df.head()

(12480, 80)


Unnamed: 0,county,is_business,product_type,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,date,time,year,datediff_in_days,hour,hour_sine,hour_cosine,dayofweek,dayofweek_sine,dayofweek_cosine,week,week_sine,week_cosine,month,month_sine,month_cosine,target_2_days_ago,euros_per_mwh,lowest_price_per_mwh,highest_price_per_mwh,eic_count,installed_capacity,temperature_min_f,temperature_mean_f,temperature_max_f,temperature_std_f,dewpoint_min_f,dewpoint_mean_f,dewpoint_max_f,dewpoint_std_f,cloudcover_high_min_f,cloudcover_high_mean_f,cloudcover_high_max_f,cloudcover_high_std_f,cloudcover_low_min_f,cloudcover_low_mean_f,cloudcover_low_max_f,cloudcover_low_std_f,cloudcover_mid_min_f,cloudcover_mid_mean_f,cloudcover_mid_max_f,cloudcover_mid_std_f,cloudcover_total_min_f,cloudcover_total_mean_f,cloudcover_total_max_f,cloudcover_total_std_f,10_metre_u_wind_component_min_f,10_metre_u_wind_component_mean_f,10_metre_u_wind_component_max_f,10_metre_u_wind_component_std_f,10_metre_v_wind_component_min_f,10_metre_v_wind_component_mean_f,10_metre_v_wind_component_max_f,10_metre_v_wind_component_std_f,direct_solar_radiation_min_f,direct_solar_radiation_mean_f,direct_solar_radiation_max_f,direct_solar_radiation_std_f,surface_solar_radiation_downwards_min_f,surface_solar_radiation_downwards_mean_f,surface_solar_radiation_downwards_max_f,surface_solar_radiation_downwards_std_f,snowfall_min_f,snowfall_mean_f,snowfall_max_f,snowfall_std_f,total_precipitation_min_f,total_precipitation_mean_f,total_precipitation_max_f,total_precipitation_std_f,target_1_days_ago,target_7_days_ago
0,0,0,1,0,2023-05-28,634,2005872,0,2023-05-28,00:00:00,2023,0,0,0.0,1.0,6,-0.781831,0.62349,21,0.663123,-0.748511,5,0.866025,-0.5,2.675,87.54,28.3,34.1,507,4960.215,5.969629,8.281689,10.380518,1.621562,-2.081885,0.808301,4.113916,2.567398,0.011505,0.151587,0.413666,0.134898,0.0,0.0,0.0,0.0,0.0,0.002127,0.007644,0.003439,0.018768,0.153369,0.413666,0.134676,-0.63899,0.782006,2.180101,0.892457,2.240726,4.062454,6.556399,1.529462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.821,3.77
1,0,0,1,1,2023-05-28,634,2005873,0,2023-05-28,00:00:00,2023,0,0,0.0,1.0,6,-0.781831,0.62349,21,0.663123,-0.748511,5,0.866025,-0.5,471.887,87.54,28.3,34.1,507,4960.215,5.969629,8.281689,10.380518,1.621562,-2.081885,0.808301,4.113916,2.567398,0.011505,0.151587,0.413666,0.134898,0.0,0.0,0.0,0.0,0.0,0.002127,0.007644,0.003439,0.018768,0.153369,0.413666,0.134676,-0.63899,0.782006,2.180101,0.892457,2.240726,4.062454,6.556399,1.529462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,537.429,588.634
2,0,0,2,0,2023-05-28,634,2005874,1,2023-05-28,00:00:00,2023,0,0,0.0,1.0,6,-0.781831,0.62349,21,0.663123,-0.748511,5,0.866025,-0.5,0.0,87.54,28.3,34.1,11,34.0,5.969629,8.281689,10.380518,1.621562,-2.081885,0.808301,4.113916,2.567398,0.011505,0.151587,0.413666,0.134898,0.0,0.0,0.0,0.0,0.0,0.002127,0.007644,0.003439,0.018768,0.153369,0.413666,0.134676,-0.63899,0.782006,2.180101,0.892457,2.240726,4.062454,6.556399,1.529462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,2,1,2023-05-28,634,2005875,1,2023-05-28,00:00:00,2023,0,0,0.0,1.0,6,-0.781831,0.62349,21,0.663123,-0.748511,5,0.866025,-0.5,5.414,87.54,28.3,34.1,11,34.0,5.969629,8.281689,10.380518,1.621562,-2.081885,0.808301,4.113916,2.567398,0.011505,0.151587,0.413666,0.134898,0.0,0.0,0.0,0.0,0.0,0.002127,0.007644,0.003439,0.018768,0.153369,0.413666,0.134676,-0.63899,0.782006,2.180101,0.892457,2.240726,4.062454,6.556399,1.529462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.593,3.817
4,0,0,3,0,2023-05-28,634,2005876,2,2023-05-28,00:00:00,2023,0,0,0.0,1.0,6,-0.781831,0.62349,21,0.663123,-0.748511,5,0.866025,-0.5,13.899,87.54,28.3,34.1,1516,15977.56,5.969629,8.281689,10.380518,1.621562,-2.081885,0.808301,4.113916,2.567398,0.011505,0.151587,0.413666,0.134898,0.0,0.0,0.0,0.0,0.0,0.002127,0.007644,0.003439,0.018768,0.153369,0.413666,0.134676,-0.63899,0.782006,2.180101,0.892457,2.240726,4.062454,6.556399,1.529462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.473,19.662


In [31]:
# Predict with registered version 3 of the model;
# `use_best_from_run` must be switched off to select a registry version.
preds = xgb.predict(
    input_data=test_df,
    use_best_from_run=False,
    use_version=3
)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 284.39it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 401.60it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 227.21it/s]


In [32]:
# Sanity check: compare with the number of rows of `test_df` printed above.
len(preds)

12480

In [33]:
# Display the predictions as a table.
pd.DataFrame(preds)

Unnamed: 0,0
0,239.794586
1,560.375122
2,-24.540211
3,108.391579
4,700.190308
...,...
12475,314.661804
12476,-25.649927
12477,107.281860
12478,149.229828


### Testing access to mlflow's dictionary artifact

In [13]:
# Registered version 3 of the forecaster's model: load the model and its metadata from one URI.
registered_model_uri = f"models:/{xgb.model_name}/3"
model = mlflow.xgboost.load_model(registered_model_uri)
model_info = mlflow.models.get_model_info(registered_model_uri)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 223.48it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 316.72it/s]


In [14]:
# Artifact root of the run that logged the registered model.
# The experiment name comes from the forecaster instead of being hard-coded.
artifact_uri = mlflow.search_runs(
    experiment_names=[xgb.experiment_name],
    filter_string=f"run_id='{model_info.run_id}'"
)["artifact_uri"][0]

artifact_uri

'mlflow-artifacts:/927635481385254895/a3691d7a82e0462eb76cdd1df574bc7b/artifacts'

In [18]:
# Load the JSON artifact that `train_model` logs with `mlflow.log_dict`.
cat_feats_dict = mlflow.artifacts.load_dict(artifact_uri=artifact_uri+"/categorical_features.json")
cat_feats_dict

Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 203.16it/s]


{'categorical_features': ['county', 'product_type']}

In [22]:
# The list of categorical features that `predict` re-encodes with `pd.get_dummies`.
cat_feats_dict["categorical_features"]

['county', 'product_type']