In [1]:
import os
import sys

# add the root of the project to the Python path
sys.path.append(os.path.abspath('..'))

import yaml

from src.data.data_loader import load_data, prepare_data
from src.data.features import visitor_features
from src.data.data_preparation import feature_selection
from src.models.clustering import AnomalyDetection
from src.models.explainability import ModelExplainability
from src.visuals.plots import kde_group

In [None]:
# Parse the YAML configuration with the safe loader.
# NOTE(review): the path is relative to the working directory, while the import
# cell adds '..' to sys.path -- confirm 'config/config.yml' resolves from where
# this notebook is run.
with open('config/config.yml', 'r') as file:
    config = yaml.safe_load(file)

# Load the inputs described by the 'data_loader' section and pass the result
# straight to prepare_data together with the 'data_preparation' settings.
data = prepare_data(
    data=load_data(data_paths=config['data_loader']),
    config=config['data_preparation'],
)

In [None]:
# Build the visitorid-level feature table (with drop_bouncers=True) and hand it
# directly to the feature selection step.
features_visitor = feature_selection(
    dataframe=visitor_features(
        data=data,
        config=config['data_preparation'],
        drop_bouncers=True,
    )
)

# Drop names that are not used again: the prepared data and the config file
# handle left over from the `with` block.
del data
del file

In [None]:
# Instantiate the detector with the isolation forest method; its parameters
# come from the 'model' -> 'isolation_forest' -> 'params' config entry.
if_Model = AnomalyDetection(
    method='isolation_forest',
    **config['model']['isolation_forest']['params']
)

# Fit on the visitor feature table.
if_Model.fit(features_visitor)

# Keep this as a single tuple assignment: both predict() and scoring() are
# evaluated before either column is written, so neither call sees the new
# 'anomaly_label' / 'anomaly_score' columns in its input.
(
    features_visitor['anomaly_label'],
    features_visitor['anomaly_score'],
) = (
    if_Model.predict(features_visitor),
    if_Model.scoring(features_visitor),
)

In [None]:
# Wrap the fitted model for explanation. The two output columns added after
# prediction are dropped so only the feature columns are passed in.
if_ModelExplainability = ModelExplainability(
    model=if_Model,
    data=features_visitor.drop(columns=['anomaly_label', 'anomaly_score']),
)

# NOTE(review): in a notebook only the final expression's value is echoed; the
# earlier calls are presumably run for the plots/outputs they produce -- confirm.
if_ModelExplainability.feature_importance()
if_ModelExplainability.shap_values.plot(method='global')
if_ModelExplainability.shap_values.importance_values()
if_ModelExplainability.tree_estimator()

In [None]:
# Plot 'anomaly_score' grouped by 'anomaly_label'.
# kde_group is imported in the notebook's first (imports) cell so that every
# dependency is declared in one place and surfaces on a fresh kernel run.
kde_group(
    dataframe=features_visitor,
    measure='anomaly_score',
    column_group='anomaly_label',
    xlabel='Anomaly Score',
)

In [None]:
# Summarise min / mean / max of selected features for each anomaly label.
# 'numevents_1824h' and 'numevents_0006h' are currently excluded from the summary.
features_visitor.groupby(by=['anomaly_label']).agg({
    column: ['min', 'mean', 'max']
    for column in (
        'min_view_delta',
        'mean_view_delta',
        'max_view_delta',
        'num_views',
        'repetitive_action_count',
    )
})