**Setup**

- Import the required libraries and helper functions
- Load configuration settings

In [None]:
import sys
import os
sys.path.append(os.path.abspath('../..'))

import yaml
import itertools
import mlflow
from src.data.data_loader import load_data, prepare_data, load_queries
from src.data.data_preparation import feature_selection
from src.models.clustering import AnomalyDetection
from src.models.utils import log_artifact
from src.models.explainability import ModelExplainability
from src.visuals.plots import kde_group

In [None]:
# parse the YAML settings file with the safe loader
with open('config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

**Data Loading and Feature Engineering**
- Load raw data from the paths specified in the configuration file and preprocess it for modeling
- Extract visitor-level features and apply a feature selection process

In [None]:
# load the raw inputs and pass them straight through the preparation step
data = prepare_data(
    data=load_data(data_paths=config['data_loader']),
    config=config['data_preparation'],
)

In [None]:
# run the feature queries on the prepared data and keep the 'visitor' table
data_features = load_queries(data_paths=config['features'], data=data)
features_visitor = data_features['visitor']

# store the outcome of feature selection next to the anomaly detection settings
selected_features = feature_selection(dataframe=features_visitor)
config['model']['anomaly_detection']['features'] = selected_features

# drop the names of the intermediate objects that are no longer used
del data
del data_features

**Model Evaluation and Explainability**
- Fit the Isolation Forest model using grid search over multiple parameter combinations
- Leverage MLflow for tracking runs, logging parameters and artifacts, and storing models

In [None]:
# expand the configured grid into one {name: value} dict per combination
params_space = config['model']['anomaly_detection']['isolation_forest']['params']
param_names = tuple(params_space)
value_grid = itertools.product(*params_space.values())
param_combinations = [dict(zip(param_names, values)) for values in value_grid]

In [None]:
# Grid search for the Isolation Forest anomaly detector, tracked with MLflow.
# One parent run groups one nested run per entry in `param_combinations`; each
# nested run logs the config, the params, the fitted model and a set of
# datasets / stats / visuals through `log_artifact`.

# set up MLflow tracking
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name='Anomaly_Detection')
# autologging is switched off; everything below is logged explicitly
mlflow.autolog(disable=True)

# resolve the selected feature columns once instead of on every use
feature_columns = config['model']['anomaly_detection']['features']
# the model input is sliced once for all runs; this assumes the selected columns
# do not include 'anomaly_label' / 'anomaly_score', which are (re)written on
# `features_visitor` inside the loop -- TODO confirm against feature_selection
model_input = features_visitor[feature_columns]

# open a parent run for the Isolation Forest algorithm
with mlflow.start_run(run_name='Isolation_Forest'):
    for params in param_combinations:
        # select isolation forest algo and respective params
        if_Model = AnomalyDetection(method='isolation_forest', **params)

        # each param combination is logged under a new nested run
        with mlflow.start_run(nested=True):
            # train algorithm and infer prediction label and its score
            if_Model.fit(model_input)
            features_visitor['anomaly_label'] = if_Model.predict(model_input)
            features_visitor['anomaly_score'] = if_Model.scoring(model_input)

            # logging config, params and model
            mlflow.log_dict(dictionary=config, artifact_file="config.yml")
            mlflow.log_params(params=params)
            signature = mlflow.models.infer_signature(
                model_input=model_input,
                model_output=features_visitor['anomaly_label'],
            )
            mlflow.sklearn.log_model(
                sk_model=if_Model.model,
                artifact_path='model_instance',
                signature=signature,
            )

            # set model explainability for the model instance
            if_ModelExplainability = ModelExplainability(model=if_Model, data=model_input)

            # temporarily save visuals locally so they can be logged as artifacts
            kde_group(
                features_visitor,
                measure='anomaly_score',
                column_group='anomaly_label',
                xlabel='Anomaly Score',
                save_path='kde_anomaly_score',
            )
            if_ModelExplainability.shap_values.plot(method='global', save_path='shap_feature_importance')
            if_ModelExplainability.shap_values.plot(method='instance', save_path='shap_example_instance')
            if_ModelExplainability.tree_estimator(save_path='example_estimator')

            # compute the naive importance once, so the logged table and the
            # per-feature stats below are guaranteed to come from the same result
            naive_importance = if_ModelExplainability.feature_importance()
            # first ten entries of the importance index drive the per-feature stats
            top10_features = naive_importance.iloc[:10,].index.to_list()
            groups_by_label = features_visitor.groupby(by=['anomaly_label'])

            # logging artifacts: each entry is (artifact_path, df, df_name, image_name);
            # dataframe entries leave image_name as None and image entries leave df/df_name as None
            artifacts = [
                ('datasets', features_visitor, 'visitor_features', None),
                ('visuals', None, None, 'kde_anomaly_score'),
                (
                    'stats',
                    groups_by_label['anomaly_score'].describe().round(2).reset_index(),
                    'anomaly_score_stats',
                    None,
                ),
                ('feature_importance', naive_importance, 'naive_feature_importance', None),
                (
                    'feature_importance',
                    if_ModelExplainability.shap_values.importance_values(),
                    'shap_feature_importance',
                    None,
                ),
                ('visuals', None, None, 'shap_feature_importance'),
                ('visuals', None, None, 'shap_example_instance'),
                ('visuals', None, None, 'example_estimator'),
                (
                    'stats',
                    groups_by_label.agg(
                        {col: ['min', 'median', 'max', 'std'] for col in top10_features}
                    ).round(2).T,
                    'naive_top10_features_stats',
                    None,
                ),
            ]
            for artifact_path, df, df_name, image_name in artifacts:
                log_artifact(artifact_path, df, df_name, image_name)

**Model Explainability**

In [None]:
# plot 1 or 2 features
#kde_group(dataframe = features_visitor, measure = 'num_views', column_group='anomaly_label', xlabel='num_views')