In [90]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import shap

In [72]:

# Set random seed for reproducibility
np.random.seed(42)

# Create a sample DataFrame
n_samples = 200
n_anomalies = 10

# Generate normal data
data = {
    'feature1': np.random.normal(loc=50, scale=10, size=n_samples),  # Normal distribution
    'feature2': np.random.normal(loc=30, scale=5, size=n_samples),   # Normal distribution
    'feature3': np.random.normal(loc=100, scale=20, size=n_samples)  # Normal distribution
}

# Create a DataFrame
df = pd.DataFrame(data)

# Introduce anomalies (outliers)
anomaly_indices = np.random.choice(df.index, size=n_anomalies, replace=False)

# Add anomalies to the DataFrame
df.loc[anomaly_indices, 'feature1'] += np.random.uniform(30, 50, size=n_anomalies)  # Large increase
df.loc[anomaly_indices, 'feature2'] += np.random.uniform(10, 20, size=n_anomalies)  # Large increase
df.loc[anomaly_indices, 'feature3'] += np.random.uniform(50, 100, size=n_anomalies)  # Large increase

# Display the sample DataFrame
display(df.head(10))


Unnamed: 0,feature1,feature2,feature3
0,54.967142,31.788937,68.111447
1,48.617357,32.803923,88.0125
2,56.476885,35.415256,100.104874
3,65.230299,35.26901,100.939612
4,47.658466,23.111653,90.998691
5,97.149576,42.299333,188.940083
6,65.792128,32.575176,78.647591
7,57.674347,32.56893,97.15241
8,90.781947,44.39631,197.434528
9,55.4256,49.263657,110.288777


In [73]:
# Standardize the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)


In [74]:
# Fit KMeans with a specified number of clusters
n_clusters = 6  # Choose the number of clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(df_scaled)

# Predict cluster labels
df['cluster'] = kmeans.labels_


In [75]:
cluster_counts.nsmallest(2).index.tolist()

[2, 3]

In [76]:
# Find the smallest cluster
cluster_counts = df['cluster'].value_counts()
anomaly_cluster = cluster_counts.idxmin()

# Label anomalies
df['is_anomaly'] = df['cluster'].isin(cluster_counts.nsmallest(2).index.tolist())


In [77]:
# Calculate cluster centers
cluster_centers = kmeans.cluster_centers_

# Function to find significant changes and generate rules
def generate_rules(row, cluster_centers, anomaly_cluster):
    if row['is_anomaly']:
        anomaly_center = cluster_centers[anomaly_cluster]
        other_centers = np.delete(cluster_centers, anomaly_cluster, axis=0)
        deviations = np.abs(other_centers - row[:-2].values)  # Exclude 'cluster' and 'is_anomaly' columns
        
        rules = []
        for i, feature in enumerate(df.columns[:-2]):  # Exclude 'cluster' and 'is_anomaly' columns
            feature_value = row[feature]
            mean_non_anomaly = np.mean(other_centers[:, i])
            std_non_anomaly = np.std(other_centers[:, i])
            
            if feature_value < mean_non_anomaly - std_non_anomaly:
                rule = f"{feature} was below {mean_non_anomaly - std_non_anomaly:.2f}"
                rules.append(rule)
            elif feature_value > mean_non_anomaly + std_non_anomaly:
                rule = f"{feature} was above {mean_non_anomaly + std_non_anomaly:.2f}"
                rules.append(rule)
                
        return ', '.join(rules)
    return np.nan

# Apply the function to generate rules for each row
df['anomaly_rules'] = df.apply(generate_rules, axis=1, cluster_centers=cluster_centers, anomaly_cluster=anomaly_cluster)


In [78]:
# Display anomalies and their generated rules
anomalies = df[df['is_anomaly']]
display(anomalies[['is_anomaly', 'anomaly_rules']])


Unnamed: 0,is_anomaly,anomaly_rules
5,True,"feature1 was above 0.42, feature2 was above 0...."
8,True,"feature1 was above 0.42, feature2 was above 0...."
9,True,"feature1 was above 0.42, feature2 was above 0...."
10,True,"feature1 was above 0.42, feature2 was above 0...."
11,True,"feature1 was above 0.42, feature2 was above 0...."
13,True,"feature1 was above 0.42, feature2 was above 0...."
20,True,"feature1 was above 0.42, feature2 was above 0...."
25,True,"feature1 was above 0.42, feature2 was above 0...."
34,True,"feature1 was above 0.42, feature2 was above 0...."
35,True,"feature1 was above 0.42, feature2 was above 0...."


In [82]:
anomalies.iloc[0]['anomaly_rules']

'feature1 was above 0.42, feature2 was above 0.53, feature3 was above 0.56'

In [79]:
anomalies.index

Index([  5,   8,   9,  10,  11,  13,  20,  25,  34,  35,  38,  41,  43,  51,
        53,  56,  66,  81,  88,  92,  94, 103, 116, 121, 123, 127, 128, 152,
       153, 160, 161, 171, 172, 174, 176, 188, 193, 194, 199],
      dtype='int64')

In [80]:
anomaly_indices

array([  8,  11,  41,  38,   5,  92, 153,  81,  43, 128], dtype=int64)

In [81]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# True labels: 1 for original anomalies, 0 for normal samples
true_labels = np.zeros(len(df), dtype=int)  # Initialize all to normal
true_labels[anomaly_indices] = 1  # Set original anomalies to 1

# Predicted labels: 1 for detected anomalies, 0 for normal samples
predicted_labels = df['is_anomaly'].astype(int).values  # Convert boolean to int (True=1, False=0)

# Calculate metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Accuracy: 0.85
Precision: 0.26
Recall: 1.00
F1 Score: 0.41


In [85]:
from sklearn.tree import DecisionTreeClassifier, export_text

# Prepare labels: 1 for anomalies, 0 for normal samples
labels = df['is_anomaly'].astype(int).values

# Fit a Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(df_scaled, labels)

In [89]:
rules = export_text(dt_classifier, feature_names=list(df.columns[:3]))
print(rules)

|--- feature2 <= 0.24
|   |--- feature3 <= 1.07
|   |   |--- class: 0
|   |--- feature3 >  1.07
|   |   |--- feature2 <= -0.05
|   |   |   |--- class: 0
|   |   |--- feature2 >  -0.05
|   |   |   |--- class: 1
|--- feature2 >  0.24
|   |--- feature3 <= -0.05
|   |   |--- feature1 <= -0.78
|   |   |   |--- feature2 <= 0.84
|   |   |   |   |--- class: 0
|   |   |   |--- feature2 >  0.84
|   |   |   |   |--- class: 1
|   |   |--- feature1 >  -0.78
|   |   |   |--- class: 0
|   |--- feature3 >  -0.05
|   |   |--- feature3 <= -0.04
|   |   |   |--- feature2 <= 0.37
|   |   |   |   |--- class: 0
|   |   |   |--- feature2 >  0.37
|   |   |   |   |--- class: 1
|   |   |--- feature3 >  -0.04
|   |   |   |--- feature2 <= 0.46
|   |   |   |   |--- feature2 <= 0.39
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- feature2 >  0.39
|   |   |   |   |   |--- class: 0
|   |   |   |--- feature2 >  0.46
|   |   |   |   |--- class: 1



In [103]:
# Initialize SHAP explainer
explainer = shap.TreeExplainer(dt_classifier)
shap_values = explainer.shap_values(df_scaled)

shap_values_class1 = shap_values[:, :, 1]
shap_values_abs_class1 = np.abs(shap_values_class1)

# Determine the most important feature for each sample
most_important_features = np.argmax(shap_values_abs_class1, axis=1)
most_important_feature_names = [df.columns[i] for i in most_important_features]

# Add the most important feature to the DataFrame
df['most_important_feature'] = most_important_feature_names

# Display the DataFrame with the new column
display(df.head(10))

Unnamed: 0,feature1,feature2,feature3,cluster,is_anomaly,anomaly_rules,most_important_feature
0,54.967142,31.788937,68.111447,1,False,,feature3
1,48.617357,32.803923,88.0125,1,False,,feature3
2,56.476885,35.415256,100.104874,1,False,,feature3
3,65.230299,35.26901,100.939612,1,False,,feature3
4,47.658466,23.111653,90.998691,0,False,,feature2
5,97.149576,42.299333,188.940083,2,True,"feature1 was above 0.42, feature2 was above 0....",feature2
6,65.792128,32.575176,78.647591,1,False,,feature3
7,57.674347,32.56893,97.15241,1,False,,feature3
8,90.781947,44.39631,197.434528,2,True,"feature1 was above 0.42, feature2 was above 0....",feature2
9,55.4256,49.263657,110.288777,3,True,"feature1 was above 0.42, feature2 was above 0....",feature2


In [95]:
shap_values.shape

(200, 3, 2)

In [101]:
shap_values[0][1]

array([ 0.0554122, -0.0554122])