In [None]:
import sys
import os


# Make the project root (the parent of the current working directory) importable
# so that the `src.*` packages resolve. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import yaml
from src.data.data_loader import load_data, prepare_data
from src.data.features import visitor_features
from src.models.clustering import AnomalyDetection
from src.models.explainability import ModelExplainability

In [None]:
# Parse the YAML configuration with the safe loader.
# NOTE(review): this path is relative to the current working directory, while the
# import cell adds '..' to sys.path as the project root - confirm that
# 'config/config.yml' resolves from wherever the notebook is launched.
with open('config/config.yml', 'r') as file:
    config = yaml.safe_load(file)

# Load the data described by the 'data_loader' section of the config ...
data = load_data(data_paths=config['data_loader'])
# ... then pass it through the preparation step configured under 'data_preparation'.
# `data` is rebound to the prepared result.
data = prepare_data(data=data, config=config['data_preparation'])

In [None]:
# Derive the features at visitorid level from the prepared data; the
# drop_bouncers flag is switched on.
features_visitor = visitor_features(
    data=data,
    config=config['data_preparation'],
    drop_bouncers=True,
)

# Capture the column index as it stands now. Later cells add model outputs as
# extra columns to `features_visitor` and use this snapshot to select only the
# original feature columns.
modeling_columns = features_visitor.columns

# Drop the reference to the prepared data; only the feature table is used from here on
# in the cells visible in this notebook.
del data

In [None]:
# Instantiate the anomaly detector in isolation-forest mode, forwarding the
# hyper-parameters stored in the config.
if_params = config['model']['isolation_forest']['params']
if_model = AnomalyDetection(method='isolation_forest', **if_params)

# Fit on the feature columns only.
if_model.fit(features_visitor[modeling_columns])

# Both right-hand sides are evaluated before either column is written, so the
# prediction and the score are computed from the same feature columns.
features_visitor['anomaly_label'], features_visitor['anomaly_score'] = (
    if_model.predict(features_visitor[modeling_columns]),
    if_model.scoring(features_visitor[modeling_columns]),
)

# Count anomalous vs. normal points; points labelled -1 are the anomalous ones.
print(features_visitor['anomaly_label'].value_counts())

In [None]:
# Instantiate the anomaly detector in DBSCAN mode, forwarding the
# hyper-parameters stored in the config.
dbscan_params = config['model']['dbscan']['params']
dbscan_model = AnomalyDetection(method='dbscan', **dbscan_params)

# Fit on the feature columns only, then store the labels exposed by the wrapped
# model's `labels_` attribute as a new column. No score is produced in this cell.
# NOTE(review): presumably `.model` is a scikit-learn DBSCAN estimator, in which
# case label -1 marks noise points - confirm against AnomalyDetection.
dbscan_model.fit(features_visitor[modeling_columns])
features_visitor['dbscan_label'] = dbscan_model.model.labels_

In [None]:
from sklearn.metrics import silhouette_score

# Evaluate the DBSCAN clustering on the same feature matrix it was fitted on.
# Dropping only 'dbscan_label' would leave the isolation-forest outputs
# ('anomaly_label', 'anomaly_score') in X - and, when this cell is re-run,
# 'dbscan_score' itself - so the distances behind the score would be computed on
# columns the clustering never saw. Selecting `modeling_columns` avoids both
# problems and makes the cell idempotent.
#
# silhouette_score returns a single scalar for the whole clustering, so this
# assignment writes that same value into every row of 'dbscan_score'.
# NOTE(review): if DBSCAN marks noise as -1, those points are treated here as one
# more cluster - confirm that is the intended evaluation.
features_visitor['dbscan_score'] = silhouette_score(
    X=features_visitor[modeling_columns],
    labels=features_visitor['dbscan_label'],
)

In [None]:
# Build the explainability helper for the isolation-forest model, giving it only
# the feature columns (model outputs added to the frame are excluded).
if_ModelExplainability = ModelExplainability(
    model=if_model,
    data=features_visitor[modeling_columns],
)

# Disabled exploratory calls, kept for reference:
# if_ModelExplainability.feature_importance()
# if_ModelExplainability.shap_values.plot(method='global')
# if_ModelExplainability.shap_values.importance_values()

In [None]:
from src.visuals.plots import kde_group

# Plot the 'anomaly_score' measure grouped by 'anomaly_label', with the x-axis
# labelled 'Anomaly Score'.
kde_group(
    dataframe=features_visitor,
    measure='anomaly_score',
    column_group='anomaly_label',
    xlabel='Anomaly Score',
)

In [None]:
# Per anomaly label, report the min / mean / max of each listed feature.
# The comprehension builds the same column -> ['min', 'mean', 'max'] mapping,
# in the same column order, without repeating the statistic list by hand.
features_visitor.groupby(by=['anomaly_label']).agg({
    column: ['min', 'mean', 'max']
    for column in (
        'min_view_delta',
        'mean_view_delta',
        'max_view_delta',
        # 'numevents_1824h',  (currently disabled)
        # 'numevents_0006h',  (currently disabled)
        'total_events',
        'num_views',
        'repetitive_action_count',
    )
})