**Load libraries and config**

In [None]:
import sys
import os

# add the root of the project (two levels above the working directory) to the
# Python path so the `src.*` imports below can be resolved; the membership check
# keeps the cell idempotent, so re-running it does not append a duplicate entry
PROJECT_ROOT = os.path.abspath('../..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import yaml
from src.data.data_loader import load_data, prepare_data
from src.data.features import visitor_features
from src.data.data_preparation import feature_selection
import mlflow
import itertools
from src.models.clustering import AnomalyDetection
from src.models.utils import predictions_correction, log_artifact
from src.models.explainability import ModelExplainability
from src.visuals.plots import kde_group

In [None]:
# read config
# The encoding is passed explicitly so the file is decoded as UTF-8 regardless of
# the platform's default text encoding; safe_load is the shorthand for
# yaml.load(..., Loader=yaml.SafeLoader).
with open('config.yml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)
# drop the (already closed) file handle from the notebook namespace
del file

**Load visitorid features**

In [None]:
# load the data from the configured paths and pass it straight through the preparation step
data = prepare_data(
    data=load_data(data_paths=config['data_loader']),
    config=config['data_preparation'],
)

In [None]:
# build the visitorid-level feature table (called with drop_bouncers=True)
features_visitor = visitor_features(
    data=data,
    config=config['data_preparation'],
    drop_bouncers=True,
)
# run feature selection and keep its result in the config under
# model -> anomaly_detection -> features, where the modelling cells read it
config['model']['anomaly_detection']['features'] = feature_selection(dataframe=features_visitor)
# release the reference to the prepared data; only features_visitor is used from here on
del data

**Fit Isolation Forest model**

In [None]:
# MLflow tracking: use the tracking server at 127.0.0.1:5000 and the
# 'Anomaly_Detection' experiment
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment('Anomaly_Detection')
# autologging is disabled; params, model and artifacts are logged explicitly
mlflow.autolog(disable=True)

# Expand the configured parameter space into one dict per combination
# (cartesian product of the value collections, keyed by parameter name)
params_space = config['model']['anomaly_detection']['isolation_forest']['params']
param_combinations = [
    dict(zip(params_space, values))
    for values in itertools.product(*params_space.values())
]

# The selected feature columns do not depend on the parameter combination,
# so look them up once instead of on every use inside the loop.
feature_cols = config['model']['anomaly_detection']['features']

# one parent run groups the search; each param combination is logged under a new nested run
with mlflow.start_run(run_name='Isolation_Forest'):
    for params in param_combinations:
        with mlflow.start_run(nested=True):

            # Slice the model inputs once per run; the same frame is used for
            # fitting, prediction, scoring and explainability.
            features_matrix = features_visitor[feature_cols]

            # select isolation forest algo and respective params
            if_Model = AnomalyDetection(method='isolation_forest', **params)

            # train algorithm and infer prediction label and its score.
            # Both columns are overwritten on every iteration, so once the loop
            # finishes features_visitor and if_Model reflect the last combination only.
            if_Model.fit(features_matrix)
            features_visitor['anomaly_label'] = if_Model.predict(features_matrix)
            features_visitor['anomaly_score'] = if_Model.scoring(features_matrix)
            # predictions correction (rule-based)
            #features_visitor = predictions_correction(dataframe = features_visitor)

            if_ModelExplainability = ModelExplainability(model=if_Model, data=features_matrix)

            # Compute the naive importance once, so the logged importance table and
            # the top-10 feature stats are derived from the same result.
            naive_importance = if_ModelExplainability.feature_importance()
            # first 10 index entries -- presumably ordered by importance; verify
            # against ModelExplainability.feature_importance
            top10_features = naive_importance.iloc[:10,].index.to_list()

            # logging
            mlflow.log_dict(dictionary=config, artifact_file="config.yml")
            mlflow.log_params(params=params)
            mlflow.sklearn.log_model(sk_model=if_Model.model, artifact_path='model_instance')
            # each entry is (artifact_path, df, df_name, image_name), unpacked into log_artifact below
            artifacts = [
                ('visuals', None, None, 'kde_anomaly_score'),
                (
                    'stats',
                    features_visitor.groupby(by=['anomaly_label'])['anomaly_score'].describe().round(2).reset_index(),
                    'anomaly_score_stats',
                    None,
                ),
                ('feature_importance', naive_importance, 'naive_feature_importance', None),
                (
                    'feature_importance',
                    if_ModelExplainability.shap_values.importance_values(),
                    'shap_feature_importance',
                    None,
                ),
                (
                    'stats',
                    features_visitor.groupby(by=['anomaly_label']).agg({
                        col: ['min', 'median', 'max', 'std'] for col in top10_features
                    }).round(2).T,
                    'naive_top10_features_stats',
                    None,
                ),
            ]
            # draw the score distribution per anomaly label; called with
            # save_path='kde_anomaly_score', the image name listed in `artifacts`
            kde_group(features_visitor, measure='anomaly_score', column_group='anomaly_label', xlabel='Anomaly Score', save_path='kde_anomaly_score')
            for artifact_path, df, df_name, image_name in artifacts:
                log_artifact(artifact_path, df, df_name, image_name)

**Model Explainability**

In [None]:
# Build the explainability helper for the model currently bound to if_Model,
# using the selected feature columns of features_visitor as its data
if_ModelExplainability = ModelExplainability(
    model=if_Model,
    data=features_visitor[config['model']['anomaly_detection']['features']],
)

# optional inspections, left disabled
#if_ModelExplainability.shap_values.plot(method='global')
#if_ModelExplainability.tree_estimator()

In [None]:
# plot 1 or 2 features
#kde_group(dataframe = features_visitor, measure = 'num_views', column_group='anomaly_label', xlabel='num_views')