In [None]:
import sys
import os


# Make the project's `src` package importable by appending the parent directory to sys.path.
# os.path.abspath('..') is resolved against the current working directory,
# not against the location of this notebook file.
sys.path.append(os.path.abspath('..'))

import yaml
from src.data.data_loader import load_data, prepare_data
from src.data.features import visitor_features
from src.models.clustering import AnomalyDetection

In [None]:
# Parse the YAML configuration with the safe loader (plain data only, no arbitrary objects).
# NOTE(review): this path is relative to the working directory, just like the '..'
# entry appended to sys.path earlier - confirm both resolve as intended.
with open('config/config.yml') as file:
    config = yaml.safe_load(file)

# Read the raw inputs listed under the 'data_loader' section ...
data = load_data(data_paths=config['data_loader'])
# ... then rebind `data` to the result of the preparation step.
data = prepare_data(
    data=data,
    config=config['data_preparation'],
)

In [None]:
# Build the visitor feature table from the prepared data
# (the 'data_preparation' config section is passed here as well).
features_visitor = visitor_features(
    data=data,
    config=config['data_preparation'],
)

# Drop the reference to the prepared data. Because of this, re-running this
# cell requires re-running the data loading cell first.
del data

In [None]:
# Hyper-parameters passed as keyword arguments to AnomalyDetection
# (presumably forwarded to the underlying isolation forest - confirm in src.models.clustering).
params = {
    'n_estimators': 100,
    'max_samples': 'auto',
    'contamination': 'auto',
    'max_features': 1.0,
    'bootstrap': False,
    'n_jobs': -1,
    'random_state': 42,
    'verbose': 0,
}
model = AnomalyDetection(method='isolation_forest', **params)

# Columns written by this cell. They are excluded from the training columns so
# that re-running the cell never feeds earlier predictions back in as features.
prediction_cols = ('anomaly_label', 'anomaly_score')
model_cols = [col for col in features_visitor.columns if col not in prediction_cols]

# train algorithm
# Slice once: the same feature frame is used for fitting and for scoring, and it
# is unaffected by the prediction columns added to features_visitor below.
features_model = features_visitor[model_cols]
model.fit(features_model)

# inference phase with prediction label and its score
features_visitor['anomaly_label'] = model.predict(features_model)
features_visitor['anomaly_score'] = model.get_scores(features_model)

# find the number of anomalies and normal points here points classified -1 are anomalous
print(features_visitor['anomaly_label'].value_counts())

In [None]:
from src.visuals.plots import kde_group

# Plot the anomaly score with kde_group, grouped by the predicted label.
kde_group(
    dataframe=features_visitor,
    measure='anomaly_score',
    column_group='anomaly_label',
    xlabel='Anomaly Score',
)

In [None]:
# Compare min / mean / max of selected features between the predicted classes.
# The grouping key is 'anomaly_label', the label column written after model
# inference; no 'anomaly' column is created anywhere in this notebook.
summary_stats = ['min', 'mean', 'max']
summary_features = [
    'min_view_delta',
    'mean_view_delta',
    'max_view_delta',
    # 'numevents_1824h',  # currently excluded from the summary
    # 'numevents_0006h',  # currently excluded from the summary
    'total_events',
    'num_views',
    'repetitive_action_count',
]
features_visitor.groupby(by=['anomaly_label']).agg(
    {feature_name: summary_stats for feature_name in summary_features}
)

In [None]:
import numpy as np
import pandas as pd

# Split-frequency importance: for every feature, count how many tree nodes
# across the ensemble split on it, then normalise the counts to shares.
# This measures how often a feature is used, not path lengths or score impact.
model_features = features_visitor[model_cols]  # slice once and reuse below
n_features = model_features.shape[1]
feature_importances = np.zeros(n_features)

# Access 'estimators_' through the wrapper's accessor
estimators = model.get_model_attribute("estimators_")
for tree in estimators:
    node_features = tree.tree_.feature
    # Keep only non-negative indices; np.bincount rejects negatives.
    # Presumably these are scikit-learn trees, where non-split (leaf) nodes
    # carry a negative placeholder - confirm against the wrapped model.
    # NOTE(review): assumes each tree indexes features in model_cols order,
    # which holds when every tree sees all features - TODO confirm.
    node_features = node_features[node_features >= 0]
    # One vectorised count per tree instead of one pass per feature.
    feature_importances += np.bincount(node_features, minlength=n_features)[:n_features]

# Normalize to get relative feature contributions; skip when no splits were
# found at all so the result stays at zero instead of turning into NaN.
total_splits = feature_importances.sum()
if total_splits > 0:
    feature_importances /= total_splits

feature_contributions = pd.Series(feature_importances, index=model_features.columns)
display("Feature Contributions to Anomaly Scores:",
        pd.DataFrame(feature_contributions, columns=['weight']).sort_values(by='weight', ascending=False))

In [None]:
import shap

# SHAP Explanation for Tree-based Models
explainer = shap.TreeExplainer(model.model)

# Slice the model features once; every SHAP call below uses the same frame.
shap_features = features_visitor[model_cols]

# Calculate SHAP values for each instance
shap_values = explainer.shap_values(shap_features)

# SHAP values give us a per-instance, per-feature contribution to the anomaly score
# For a specific instance, view SHAP values and plot them
instance_index = 0  # Choose an index to explain
shap.initjs()
# force_plot returns a visualisation object. Jupyter only auto-renders the last
# expression of a cell, so it has to be displayed explicitly here; otherwise
# the plot is silently discarded.
display(
    shap.force_plot(
        explainer.expected_value,
        shap_values[instance_index],
        shap_features.iloc[instance_index],
    )
)

# Global summary plot to view the overall feature importance for anomaly detection
shap.summary_plot(shap_values, shap_features, plot_type="bar")

# Optional: Use a SHAP dependence plot for specific features
shap.dependence_plot("mean_view_delta", shap_values, shap_features)