# Setting up the environment

In [None]:
%reload_ext kedro.ipython

In [None]:
import pandas as pd
import numpy as np
# import shap

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from feature_engine.imputation import CategoricalImputer,ArbitraryNumberImputer,MeanMedianImputer
from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
from feature_engine.creation import MathFeatures, RelativeFeatures
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.selection import (
    DropDuplicateFeatures, 
    DropConstantFeatures,
    DropHighPSIFeatures,
    SelectByInformationValue,
    SmartCorrelatedSelection,
    RecursiveFeatureElimination,
    RecursiveFeatureAddition,
    SelectByShuffling
)

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
from scipy.stats import ks_2samp

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow

import warnings
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import (
    RandomForestClassifier, ExtraTreesClassifier,
    HistGradientBoostingClassifier, GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from olist_project.utils.model import mlflow_experiment_run_cv
from category_encoders.cat_boost import CatBoostEncoder
from catboost import Pool, EShapCalcType, EFeaturesSelectionAlgorithm

from olist_project.utils.model import (
    mlflow_experiment_run_cv, ModelType, get_model,
    MetricType, objective
)

import copy

import optuna
from optuna.storages import JournalStorage, JournalFileStorage
from optuna.storages.journal import JournalFileBackend
from optuna.samplers import TPESampler

In [None]:
# Let pandas render up to 110 columns and 110 rows before truncating output.
pd.options.display.max_columns = 110
pd.options.display.max_rows = 110

In [None]:
# MLflow tracking lives in the `mlflow` folder below the Kedro project path.
# NOTE(review): `context` is not defined in this notebook — presumably injected
# by the kedro.ipython extension loaded above; confirm.
mlflow_tracking_uri = f"{context.project_path.as_uri()}/mlflow"
experiment_name = 'hyperparameters_tuning'
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment(experiment_name)

[1m<[0m[1;95mExperiment:[0m[39m [0m[33martifact_location[0m[39m=[0m[32m'file:///home/bruno/Documents/Programming/Programming_projects/olist_project/mlflow/661832313454193859'[0m[39m, [0m[33mcreation_time[0m[39m=[0m[1;36m1735658302073[0m[39m, [0m[33mexperiment_id[0m[39m=[0m[32m'661832313454193859'[0m[39m, [0m[33mlast_update_time[0m[39m=[0m[1;36m1735658302073[0m[39m, [0m[33mlifecycle_stage[0m[39m=[0m[32m'active'[0m[39m, [0m[33mname[0m[39m=[0m[32m'feature_selection'[0m[39m, [0m[33mtags[0m[39m=[0m[1;39m{[0m[1;39m}[0m[1m>[0m

# Functions

In [None]:
def _get_trial_by_id(study, trial_id):
    """Return the first trial of ``study`` whose ``_trial_id`` equals ``trial_id``.

    Args:
        study: Object exposing a ``trials`` iterable.
        trial_id: Identifier compared against each trial's ``_trial_id``.

    Returns:
        The first matching trial, in ``study.trials`` order.

    Raises:
        ValueError: If no trial carries the requested id.
    """
    found = next(
        (candidate for candidate in study.trials if candidate._trial_id == trial_id),
        None,
    )
    if found is None:
        raise ValueError(f'Trial ID {trial_id} not found')
    return found

def _get_top_n_trials(study, n_top_trials, feature_selection, validation_type):
    """Pick the trials of ``study`` that should be logged.

    Trials without objective values (``trial.values is None``) are always
    skipped. Of the remaining trials:

    * with ``feature_selection`` only trials whose ``n_features`` parameter is
      greater than 0 are kept;
    * otherwise only trials whose objective values are all below 1 are kept.

    Args:
        study: Object exposing ``trials``; each trial needs ``values`` and
            ``params``.
        n_top_trials: Number of best trials to return in single-objective
            mode. Ignored in the all-metrics mode.
        feature_selection: Whether the trials sampled an ``n_features``
            parameter.
        validation_type: ``'all'`` or an enum member named ``ALL`` selects the
            multi-objective mode; anything else is single-objective.

    Returns:
        list: In all-metrics mode, the distinct trials holding the highest
        strictly positive score of each objective, ordered by objective index.
        Otherwise the ``n_top_trials`` best trials in ascending order of their
        value (best last); an empty list when ``n_top_trials <= 0``.
    """
    # Duck-typed so both the plain string and an enum member called ``ALL``
    # (the rest of the file compares against ``MetricType.ALL``) are accepted.
    multi_objective = (
        validation_type == 'all'
        or getattr(validation_type, 'name', None) == 'ALL'
    )

    candidates = []
    for trial in study.trials:
        # ``values`` is used instead of ``value`` because the latter raises on
        # multi-objective trials.
        values = trial.values
        if values is None:
            continue
        if feature_selection:
            # ``.get`` tolerates trials that never sampled ``n_features``.
            keep = trial.params.get('n_features', 0) > 0
        else:
            keep = all(value < 1 for value in values)
        if keep:
            candidates.append(trial)

    if multi_objective:
        best_by_metric = {}
        for trial in candidates:
            for metric_idx, score in enumerate(trial.values):
                # Only strictly positive scores can win; ties keep the first trial.
                if not score > 0:
                    continue
                holder = best_by_metric.get(metric_idx)
                if holder is None or score > holder.values[metric_idx]:
                    best_by_metric[metric_idx] = trial
        top_trials = []
        for metric_idx in sorted(best_by_metric):
            winner = best_by_metric[metric_idx]
            # One trial may be the best for several objectives; log it once.
            if not any(winner is kept for kept in top_trials):
                top_trials.append(winner)
        return top_trials

    # ``[-0:]`` would return the whole list, so handle the empty request here.
    if n_top_trials <= 0:
        return []
    ranked = sorted(candidates, key=lambda trial: trial.values[0])
    return ranked[-n_top_trials:]

In [None]:
def _optimize(validation_type,
              X_dev,
              y_dev,
              cohort_dev,
              features,
              feat_set_name,
              with_std_penalization,
              feature_selection,
              base_model_type,
              get_model_func,
              min_n_features,
              n_folds,
              random_state,
              n_trials,
              performance_group=None):
    """Create (or resume) an Optuna study and run the tuning trials.

    The study is persisted in a journal file under ``./tmp`` whose name is
    derived from the run name, so calling again with the same settings resumes
    the existing study (``load_if_exists=True``).

    Args:
        validation_type: Forwarded to ``objective``. ``MetricType.ALL`` (or the
            string ``'all'``) creates a study with four maximised objectives;
            anything else a single maximised objective.
        X_dev: Development features; only the ``features`` columns are used.
        y_dev: Development target, forwarded to ``objective``.
        cohort_dev: Development cohorts, forwarded to ``objective``.
        features: Column names selected from ``X_dev``.
        feat_set_name: Label of the feature set, used in the run name.
        with_std_penalization: Forwarded to ``objective``; also in the run name.
        feature_selection: Forwarded to ``objective``; also in the run name.
        base_model_type: Enum-like object; its ``.value`` is used in the run
            name and the object itself is forwarded as ``model_type``.
        get_model_func: Forwarded to ``objective`` as ``get_model_function``.
        min_n_features: Forwarded to ``objective``; also in the run name when
            ``feature_selection`` is set.
        n_folds: Forwarded to ``objective`` as ``cv_n_folds``.
        random_state: Seed of the TPE sampler.
        n_trials: Trial budget (see the multi-objective note in the body).
        performance_group: Forwarded to ``objective``.

    Returns:
        tuple: ``(name_run, study)``.
    """
    from pathlib import Path

    posfix_std_pen = '_with_std_penalization' if with_std_penalization else ''
    posfix_feat = f'_with_feature_selection_min{min_n_features}' if feature_selection else ''
    name_run = f'{validation_type}_{feat_set_name}{posfix_std_pen}{posfix_feat}_opt_{base_model_type.value}'.lower()
    print(name_run)

    # The journal backend needs its directory to exist; create it up front.
    journal_file = f"./tmp/optuna-journal-{name_run}.log"
    Path(journal_file).parent.mkdir(parents=True, exist_ok=True)
    storage = JournalStorage(JournalFileBackend(journal_file))
    sampler = TPESampler(seed=random_state)

    # Single source of truth for the mode, so the study's directions and the
    # trial budget can never disagree (enum member and plain string accepted).
    is_multi_objective = validation_type in (MetricType.ALL, 'all')
    directions = ['maximize'] * 4 if is_multi_objective else ['maximize']
    study = optuna.create_study(study_name=f'study_{name_run}',
                                storage=storage,
                                directions=directions,
                                load_if_exists=True,
                                sampler=sampler)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X_dev_new = X_dev[features].copy()

        def obj_func(trial):
            return objective(trial,
                             X_dev_new, y_dev, cohort_dev,
                             validation_type=validation_type,
                             with_std_penalization=with_std_penalization,
                             feature_selection=feature_selection,
                             model_type=base_model_type,
                             get_model_function=get_model_func,
                             min_n_features=min_n_features,
                             cv_n_folds=n_folds,
                             performance_group=performance_group)

        # NOTE(review): multi-objective studies only get 70% of the requested
        # budget; the rationale is not visible in this notebook.
        trial_budget = n_trials * 7 // 10 if is_multi_objective else n_trials
        study.optimize(obj_func, n_trials=trial_budget)

    return name_run, study

def _save_top_n_trials_to_mlflow(
    study,
    save_n_top_trials,
    mlflow_log,
    nested_run,
    parent_run_name,
    parent_run_id,
    child_run_name,
    feature_selection,
    base_model_type,
    validation_type,
    n_percentiles,
    shap_plots,
    X_dev,
    y_dev,
    cohort_dev,
    X_oot,
    y_oot,
    cohort_oot,
    features,
    n_folds,
    get_model_func,
    group_dev=None,
    group_oot=None
):
    """Rebuild the best trials' models and evaluate/log them via ``mlflow_experiment_run_cv``.

    Args:
        study: Optuna study the trials are taken from.
        save_n_top_trials: Number of trials requested from ``_get_top_n_trials``.
        mlflow_log: Forwarded to ``mlflow_experiment_run_cv``; together with
            ``nested_run`` it also decides whether a parent run is opened here.
        nested_run: Forwarded to ``mlflow_experiment_run_cv``.
        parent_run_name: Name of the parent run opened when logging nested runs.
        parent_run_id: Id passed to ``mlflow.start_run`` for the parent run.
        child_run_name: Base name; each trial is logged as
            ``{child_run_name}_id_{trial id}``.
        feature_selection: When set, each trial's ``user_attrs['sel_features']``
            is used instead of ``features``.
        base_model_type: Forwarded to ``get_model_func`` as ``model_type``.
        validation_type: ``MetricType.ALL`` (or ``'all'``) marks a
            multi-objective study, for which no study is passed on.
        n_percentiles, shap_plots, n_folds: Forwarded to
            ``mlflow_experiment_run_cv``.
        X_dev, y_dev, cohort_dev: Development data.
        X_oot, y_oot, cohort_oot: Out-of-time data.
        features: Columns used when ``feature_selection`` is off.
        get_model_func: Callable ``(X, params, model_type=...)`` returning the model.
        group_dev, group_oot: Forwarded to ``mlflow_experiment_run_cv``.
    """
    from contextlib import nullcontext

    top_n_trials = _get_top_n_trials(study, save_n_top_trials, feature_selection, validation_type)

    # Same mode test as `_optimize`: enum member and plain string both count.
    is_multi_objective = validation_type in (MetricType.ALL, 'all')
    optuna_study = None if is_multi_objective else study

    # As a context manager the parent run is closed even when a trial fails
    # (MLflow then marks it as failed instead of leaving it active).
    parent_run = (
        mlflow.start_run(run_name=parent_run_name, run_id=parent_run_id)
        if mlflow_log and nested_run
        else nullcontext()
    )
    with parent_run:
        for best_trial in top_n_trials:
            trial_id = best_trial._trial_id
            # Copy first: popping from `best_trial.params` directly could
            # alter the trial object itself.
            best_params = dict(best_trial.params)
            best_params.pop('n_features', None)
            # Built from the unmodified base name so suffixes of earlier
            # trials do not pile up (`..._id_1_id_2`).
            trial_run_name = f'{child_run_name}_id_{trial_id}'
            if feature_selection:
                sel_features = best_trial.user_attrs['sel_features']
            else:
                sel_features = features
            X_dev_new = X_dev[sel_features].copy()
            X_oot_new = X_oot[sel_features].copy()
            model = get_model_func(X_dev[sel_features], best_params, model_type=base_model_type)
            print(f'\033[92mTrial {trial_id}\033[0m')
            mlflow_experiment_run_cv(model, X_dev_new, X_oot_new,
                                     y_dev, y_oot,
                                     cohort_dev, cohort_oot,
                                     n_percentiles=n_percentiles,
                                     n_folds=n_folds,
                                     optuna_study=optuna_study,
                                     metric_plots=True,
                                     shap_plots=shap_plots,
                                     mlflow_log=mlflow_log,
                                     run_name=trial_run_name,
                                     nested_run=nested_run,
                                     log_datasets=False,
                                     log_model=True,
                                     features=None,
                                     group_dev=group_dev,
                                     group_oot=group_oot)

In [None]:
def download_mlflow_model(run_id):
    """Load the model stored under the ``model`` artifact path of MLflow run ``run_id``.

    The model is loaded with ``mlflow.sklearn.load_model`` from the URI
    ``runs:/<run_id>/model``.
    """
    return mlflow.sklearn.load_model(model_uri=f'runs:/{run_id}/model')

def get_feature_names_in_model(run_id):
    """Return, as a list, the ``feature_names_in_`` of the model logged for ``run_id``."""
    fitted_model = download_mlflow_model(run_id)
    return list(fitted_model.feature_names_in_)

# Retrieving runs

In [None]:
# Experiment and parent runs whose child runs hold the candidate feature sets.
exp_ids = ['52d22bf5-de12-4351-a2b4-87190001e5d7']
# NOTE(review): `filter_string` is defined but never passed to `search_runs`.
filter_string = "tags.mlflow.runName = 'dcf95_ddf_psi10_smartRFECorr80_lgbm'"
feature_sel_parent_run_id = [
    "88f05110-afd1-4cb1-ae63-9c9ffe138482",
    "69cb1e1e-09b3-47fe-abbe-b32a8a167119"
]

# Keep only the runs whose parent is one of the runs listed above, then build
# a {run_id: run name} mapping from them.
_all_runs = mlflow.search_runs(experiment_ids=exp_ids)
_child_runs = _all_runs.query(f'`tags.mlflow.parentRunId`.isin({feature_sel_parent_run_id})')
feature_sel_child_run_id = (
    _child_runs[['tags.mlflow.runName', 'run_id']]
    .set_index('run_id')['tags.mlflow.runName']
    .to_dict()
)
feature_sel_child_run_id

# Load data

In [None]:
# Parameters read from the Kedro catalog.
random_state = catalog.load('params:random_state')
id_col = catalog.load('params:audience_building.id_col')
cohort_col = catalog.load('params:audience_building.cohort_col')
target_name = catalog.load('params:modeling.target')

# Development sample: features, target column and cohort parsed from a
# year-month (`%Y%m`) value.
X_dev = catalog.load("X_train")
y_dev = catalog.load("y_train")[target_name]
id_model_dev = catalog.load("id_model_train")
cohort_dev = pd.to_datetime(id_model_dev[cohort_col], format='%Y%m')

# Out-of-time sample, loaded the same way.
X_oot = catalog.load("X_test_oot")
y_oot = catalog.load("y_test_oot")[target_name]
id_model_oot = catalog.load("id_model_test_oot")
cohort_oot = pd.to_datetime(id_model_oot[cohort_col], format='%Y%m')

# Tuning

In [None]:
# --- Tuning configuration -------------------------------------------------
WITH_STD_PENALIZATION = False
FEATURE_SELECTION = False
min_n_features = 10
base_model_type = ModelType.LGBM
validation_type = MetricType.TEST_CV_PREDICT
mlflow_log = True
n_trials = 70
top_n_trials = 1
parent_run_name = "model_optimization"
nested_run = False
shap_plots = True
n_percentiles = 5
n_folds = 5
get_model_func = get_model

# The group mappings do not depend on the feature set, so they are computed
# once, before any (expensive) optimisation starts.
# NOTE(review): `map_tn` is not defined anywhere in the visible notebook and
# must exist for this cell to run — confirm where it comes from.
group_dev = X_dev.CAD_TIPO_NEGOCIO.map(map_tn)
group_oot = X_oot.CAD_TIPO_NEGOCIO.map(map_tn)

# One study per feature set: tune on the features of the logged model, then
# log the best trial(s) to MLflow.
for run_id, feat_set_name in feature_sel_child_run_id.items():
    features = get_feature_names_in_model(run_id)
    child_run_name, study = _optimize(
        X_dev=X_dev,
        y_dev=y_dev,
        cohort_dev=cohort_dev,
        features=features,
        validation_type=validation_type,
        with_std_penalization=WITH_STD_PENALIZATION,
        feat_set_name=feat_set_name,
        feature_selection=FEATURE_SELECTION,
        base_model_type=base_model_type,
        get_model_func=get_model_func,
        min_n_features=min_n_features,
        n_folds=n_folds,
        random_state=random_state,
        n_trials=n_trials
    )
    _save_top_n_trials_to_mlflow(
        study=study,
        save_n_top_trials=top_n_trials,
        mlflow_log=mlflow_log,
        nested_run=nested_run,
        parent_run_name=parent_run_name,
        parent_run_id='',
        child_run_name=child_run_name,
        feature_selection=FEATURE_SELECTION,
        base_model_type=base_model_type,
        validation_type=validation_type,
        n_percentiles=n_percentiles,
        shap_plots=shap_plots,
        X_dev=X_dev,
        y_dev=y_dev,
        cohort_dev=cohort_dev,
        X_oot=X_oot,
        y_oot=y_oot,
        cohort_oot=cohort_oot,
        features=features,
        n_folds=n_folds,
        get_model_func=get_model_func,
        group_dev=group_dev,
        group_oot=group_oot
    )