**Setup**

- Import the required libraries and helper functions
- Load configuration settings

In [None]:
import sys
import os
sys.path.append(os.path.abspath('../..'))

import numpy as np
import yaml
import itertools
import mlflow
from src.data.data_loader import load_data, prepare_data
from src.data.features import visitor_features
from src.data.data_preparation import feature_selection
from src.data.utils import downcast_cols, normalization
from src.models.clustering import AnomalyDetection
from src.models.utils import predictions_correction, log_artifact
from src.models.explainability import ModelExplainability
from src.visuals.plots import kde_group

In [None]:
# Parse the notebook's YAML settings into `config` using PyYAML's safe loader.
# The handle keeps the name `file` because a later cell deletes that name.
with open('config.yml', 'r') as file:
    config = yaml.safe_load(file)

**Data Loading and Feature Engineering**
- Load raw data from the paths specified in the configuration file and preprocess it for modeling
- Extract visitor-level features and apply a feature selection process

In [None]:
# Load the raw data from the locations under config['data_loader'] ...
loader_paths = config['data_loader']
data = load_data(data_paths=loader_paths)

# ... then pass it through prepare_data with the 'data_preparation' settings.
preparation_cfg = config['data_preparation']
data = prepare_data(data=data, config=preparation_cfg)

In [None]:
# Build the visitor-level feature table (called with drop_bouncers=True).
features_visitor = visitor_features(
    data=data,
    config=config['data_preparation'],
    drop_bouncers=True,
)

# Run feature selection and record the chosen features in the config so that
# later cells can look them up under config['model']['anomaly_detection'].
selected_features = feature_selection(dataframe=features_visitor)
config['model']['anomaly_detection']['features'] = selected_features

# Keep only the selected features, apply 'min_max' normalization, downcast the
# columns, and finally convert the result to a NumPy array.
features_visitor = normalization(
    dataframe=features_visitor[selected_features],
    method='min_max',
)
features_visitor = downcast_cols(dataframe=features_visitor)
features_visitor = np.array(features_visitor)

# Drop names that are no longer needed (the selected features stay available
# through the config entry written above).
del data, file, selected_features

**Model Evaluation and Explainability**
- Fit the HDBSCAN model using grid search over multiple parameter combinations
- Leverage MLflow for tracking runs, logging metrics, and storing models

In [None]:
# Expand the HDBSCAN parameter space from the config into a list of dicts, one
# per point of the Cartesian product of the candidate values.
params_space = config['model']['anomaly_detection']['hdbscan']['params']
param_names = tuple(params_space)
value_grid = itertools.product(*params_space.values())
param_combinations = [dict(zip(param_names, values)) for values in value_grid]

In [None]:
# Grid search: fit one HDBSCAN-based AnomalyDetection model for every parameter
# combination prepared in `param_combinations`. Each fitted model is stored
# next to the parameters that produced it so the candidates can be compared
# afterwards.
hdbscan_models = []
for params in param_combinations:
    hdbscan_Model = AnomalyDetection(method='hdbscan', **params)

    # `features_visitor` is already a NumPy array holding only the selected,
    # normalized features, so it is passed as is. Indexing an ndarray with a
    # list of column names would raise IndexError.
    hdbscan_Model.fit(features_visitor)
    hdbscan_models.append((params, hdbscan_Model))

# After the loop, `hdbscan_Model` refers to the model fitted with the last
# parameter combination.
# TODO: infer anomaly labels and scores (presumably via hdbscan_Model.predict /
# hdbscan_Model.scoring - confirm against AnomalyDetection) and keep them in
# separate arrays; `features_visitor` is an ndarray and cannot take named columns.