**Setup**

- Import the required libraries and helper functions
- Load configuration settings

In [None]:
import sys
import os
sys.path.append(os.path.abspath('../..'))

import numpy as np
import pandas as pd
import yaml
import itertools
import mlflow
from src.data.data_loader import load_data, prepare_data, load_queries
from src.data.data_preparation import feature_selection
from src.data.utils import downcast_cols, normalization
from src.models.clustering import AnomalyDetection
from src.models.utils import log_artifact
from sklearn.metrics import silhouette_score

In [None]:
# parse the YAML configuration file with PyYAML's safe loader
with open('config.yml', 'r') as file:
    config = yaml.safe_load(file)

**Data Loading and Feature Engineering**
- Load raw data from the paths specified in the configuration file and preprocess it for modeling
- Extract visitor-level features and apply a feature selection process

In [None]:
# read the raw inputs listed under config['data_loader'] and pass them straight
# through the preparation step configured under config['data_preparation']
data = prepare_data(
    data=load_data(data_paths=config['data_loader']),
    config=config['data_preparation'],
)

In [None]:
# build the feature tables from the prepared data and keep the visitor-level one
data_features = load_queries(data_paths=config['features'], data=data)
features_visitor = data_features['visitor']

# store the selected feature names in the config so later steps read them from there
config['model']['anomaly_detection']['features'] = feature_selection(dataframe=features_visitor)

# min-max normalize the selected columns, then downcast the resulting frame
features_visitor = downcast_cols(
    dataframe=normalization(
        dataframe=features_visitor[config['model']['anomaly_detection']['features']],
        method='min_max',
    )
)

# drop references that are no longer needed
del data, file, data_features

**Model Evaluation and Explainability**
- Fit the HDBSCAN model using grid search over multiple parameter combinations
- Leverage MLflow for tracking runs, logging metrics, and storing models

In [None]:
# expand the configured parameter grid into one dict per combination
# (cartesian product of the value lists, keyed by parameter name)
params_space = config['model']['anomaly_detection']['hdbscan']['params']
param_combinations = [
    dict(zip(params_space, values))
    for values in itertools.product(*params_space.values())
]

In [None]:
# set up MLflow tracking
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name='Anomaly_Detection')
mlflow.autolog(disable=True)

# Columns the model is trained and evaluated on. The loop below appends
# label/score columns to features_visitor, so every model call must select
# these columns explicitly; otherwise the outputs of one parameter
# combination would be fed in as training features for the next one.
feature_cols = config['model']['anomaly_detection']['features']

# open a parent run for the HDBSCAN algorithm
with mlflow.start_run(run_name='HDBSCAN'):
    for params in param_combinations:
        # select hdbscan algo and respective params
        hdbscan_Model = AnomalyDetection(method='hdbscan', **params)

        # each param combination is logged under a new nested run
        with mlflow.start_run(nested=True):

            # train on the feature columns only, then infer label and scores
            hdbscan_Model.fit(np.array(features_visitor[feature_cols]))
            features_visitor['anomaly_label'] = hdbscan_Model.predict(features_visitor[feature_cols])
            features_visitor['anomaly_score'] = hdbscan_Model.scoring(features_visitor[feature_cols])
            features_visitor['outlier_score'] = hdbscan_Model.model.outlier_scores_

            # logging config, params and model
            mlflow.log_dict(dictionary=config, artifact_file="config.yml")
            mlflow.log_params(params=params)
            signature = mlflow.models.infer_signature(
                model_input=features_visitor[feature_cols],
                model_output=features_visitor['anomaly_label'],
            )
            mlflow.sklearn.log_model(
                sk_model=hdbscan_Model.model,
                artifact_path='model_instance',
                signature=signature,
            )

            # logging artifacts: (artifact_path, dataframe, dataframe name, image name)
            artifacts = [
                ('datasets', features_visitor, 'visitor_features', None),
                ('stats', hdbscan_Model.model.condensed_tree_.to_pandas(), 'condensed_tree', None),
                ('stats', hdbscan_Model.model.single_linkage_tree_.to_pandas(), 'single_linkage_tree', None),
                (
                    'stats',
                    pd.DataFrame(hdbscan_Model.model.cluster_persistence_, columns=['persistence']),
                    'cluster persistence',
                    None,
                ),
            ]
            for artifact_path, df, df_name, image_name in artifacts:
                log_artifact(artifact_path, df, df_name, image_name)

            # logging metrics: silhouette on rows whose label is not -1
            # (presumably the HDBSCAN noise label - confirm against AnomalyDetection.predict)
            clustered = features_visitor[features_visitor['anomaly_label'] != -1]
            n_clusters = clustered['anomaly_label'].nunique()

            # silhouette_score needs between 2 and n_samples - 1 distinct labels and
            # raises ValueError otherwise; skip the metric for such combinations so
            # one degenerate clustering does not abort the whole grid search
            if 2 <= n_clusters < len(clustered):
                mlflow.log_metric(
                    key='silhouette_score_sample',
                    value=silhouette_score(
                        X=np.array(clustered[feature_cols]),
                        labels=clustered['anomaly_label'],
                        sample_size=100000,
                    ),
                )