# imports

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist
import plotly.graph_objects as go
from scipy.spatial.distance import euclidean
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import skfuzzy as fuzz
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA
import hdbscan
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors

# load data

In [None]:
# Load the raw report, keep the columns this analysis needs, parse the
# timestamps and build a single "<switch_number>:<interface_name>" key.
df = (
    pd.read_csv("report.csv")[['date', 'switch_number', 'interface_name', 'broadcast']]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(switch_interface=lambda frame: frame['switch_number'].astype(str) + ":" + frame['interface_name'])
    [['date', 'switch_interface', 'broadcast']]
)


def _has_broadcast_traffic(group):
    """Return True when the summed ``broadcast`` readings of the group are positive."""
    return group['broadcast'].sum() > 0


# Discard interfaces whose broadcast readings sum to zero (or less)
df = df.groupby('switch_interface').filter(_has_broadcast_traffic)

def is_constant_difference(x):
    """Tell whether consecutive ``broadcast`` readings always change by the same amount.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one interface; must contain a ``broadcast`` column.

    Returns
    -------
    bool
        True when the row-to-row differences take exactly one distinct value.
        A frame with fewer than two rows has no differences and yields False.
    """
    # The first diff is always NaN; nunique() ignores NaN by default
    steps = x['broadcast'].diff()
    return steps.nunique() == 1

# Sort chronologically FIRST: every diff-based step below (including
# is_constant_difference) is only meaningful when rows are in time order.
df = df.sort_values(by=['switch_interface', 'date'])

# Drop interfaces whose counter changes by the same amount at every step
df = df.groupby('switch_interface').filter(lambda x: not is_constant_difference(x))

# Per-interface change of the broadcast counter between consecutive samples;
# the first sample of each interface has no predecessor and is set to 0.
df['broadcast_delta'] = df.groupby('switch_interface')['broadcast'].diff().fillna(0)

# Interfaces whose delta is 0 at every sample carry no information.
# Testing "all zero" (not "sum == 0") keeps interfaces whose positive and
# negative deltas merely cancel out.
_all_zero_delta = df['broadcast_delta'].eq(0).groupby(df['switch_interface']).all()
interfaces_to_drop = _all_zero_delta[_all_zero_delta].index.to_numpy()

# Remove these interfaces from the main dataframe df
df = df[~df['switch_interface'].isin(interfaces_to_drop)]

# Drop the first row of every interface: its delta is the artificial 0 from fillna.
df = df[df.groupby('switch_interface').cumcount() > 0].reset_index(drop=True)

df

# drop interfaces

These interfaces are dropped because, for an unknown reason, they do not have data for every day.

In [None]:
# Number of distinct calendar days present anywhere in the data
total_days = df['date'].dt.date.nunique()


def has_data_everyday(x):
    """Return True when the rows in ``x`` cover every calendar day counted in ``total_days``."""
    days_seen = x['date'].dt.date.nunique()
    return days_seen == total_days


# All rows that belong to interfaces missing at least one day
incomplete_interfaces = df.groupby('switch_interface').filter(
    lambda x: not has_data_everyday(x)
)

# Names of those interfaces (shown for reference, then used for the drop)
incomplete_interface_list = incomplete_interfaces['switch_interface'].unique()
print(incomplete_interface_list)

# Keep only interfaces with complete daily coverage
_is_incomplete = df['switch_interface'].isin(incomplete_interface_list)
df = df[~_is_incomplete]

## add nulls where data is missing

### show amount of time gaps

In [None]:
# Time elapsed since the previous sample of the same interface
df['time_diff'] = df.groupby('switch_interface')['date'].diff()

# Get counts of each unique time difference (NaT is excluded by value_counts)
time_diff_counts = df['time_diff'].value_counts()

# Convert to DataFrame and sort by index (which is the time difference in timedelta format)
time_diff_table = time_diff_counts.sort_index().reset_index()
time_diff_table.columns = ['Time Gap', 'Frequency']


def _format_gap_mmss(gap):
    """Format a Timedelta as total minutes and seconds, e.g. ``'1445:00'`` for 1 day 5 min.

    ``Timedelta.seconds`` only holds the sub-day component, so using it would
    silently drop whole days from long gaps; ``total_seconds()`` does not.
    """
    minutes, seconds = divmod(int(gap.total_seconds()), 60)
    return f'{minutes}:{seconds:02d}'


# Convert the 'Time Gap' column to MM:SS format
time_diff_table['Time Gap (MM:SS)'] = time_diff_table['Time Gap'].apply(_format_gap_mmss)

# Drop the original 'Time Gap' column and reorder the columns for display
time_diff_table = time_diff_table[['Time Gap (MM:SS)', 'Frequency']]

print(time_diff_table)


In [None]:
# Ensure the data is sorted by interface and date
df = df.sort_values(by=['switch_interface', 'date'])

# Convert broadcast to int64 to avoid potential overflow
df['broadcast'] = df['broadcast'].astype(np.int64)

dfs = []  # this will store each interface's dataframe with missing data added

for interface in df['switch_interface'].unique():
    subset = df[df['switch_interface'] == interface].set_index('date')

    # Regular 5-minute grid spanning this interface's own first and last sample
    full_range = pd.date_range(start=subset.index.min(), end=subset.index.max(), freq='5min')

    # Forward-fill every column onto the grid: a grid point without a sample
    # repeats the most recent earlier row.
    padded = subset.reindex(full_range, method='pad')

    # The cumulative counter may be repeated, but a repeated *delta* would count
    # the same traffic again on every synthetic row. Recompute the delta from
    # the padded counter so filled rows contribute 0; the first row keeps its
    # original delta because it has no predecessor on the grid.
    if 'broadcast_delta' in padded.columns:
        recomputed_delta = padded['broadcast'].diff()
        recomputed_delta.iloc[0] = padded['broadcast_delta'].iloc[0]
        padded['broadcast_delta'] = recomputed_delta

    # Every row on this grid belongs to the current interface
    padded['switch_interface'] = interface

    subset = padded.rename_axis('date').reset_index()
    dfs.append(subset)

# Combine back into a single dataframe
df = pd.concat(dfs).sort_values(by=['switch_interface', 'date']).reset_index(drop=True)

# Spacing between consecutive rows per interface after filling
df['time_gap'] = df.groupby('switch_interface')['date'].diff()

df

# data exploration

In [None]:
# Interfaces to draw; currently every interface present in df
selected_switch_interfaces = df['switch_interface'].unique()

_is_selected = df['switch_interface'].isin(selected_switch_interfaces)
subset = df[_is_selected]

# One line per interface, plotting the per-sample delta rather than the raw counter
fig = px.line(
    subset,
    x='date',
    y='broadcast_delta',
    color='switch_interface',
    title='Broadcast Delta Trend for Selected Switch-Interfaces Over Time',
)

fig.show()

In [None]:
# ISO week number of every sample
df['week_number'] = df['date'].dt.isocalendar().week

# Sum the broadcast deltas per (week, interface)
broadcasts_delta_per_week = (
    df.groupby(['week_number', 'switch_interface'])['broadcast_delta']
    .sum()
    .reset_index()
)

# Construct the week label
broadcasts_delta_per_week['week_label'] = 'Week ' + broadcasts_delta_per_week['week_number'].astype(str)

# Interfaces as rows, weeks as columns. pivot() must be called with keywords:
# positional arguments are rejected by pandas >= 2.0.
heatmap_data = broadcasts_delta_per_week.pivot(
    index='switch_interface', columns='week_label', values='broadcast_delta'
)

# Order the columns numerically ("Week 9" before "Week 10"), not alphabetically
sorted_columns = sorted(heatmap_data.columns, key=lambda x: int(x.split()[1]))
heatmap_data = heatmap_data[sorted_columns]

plt.figure(figsize=(20, 15))
sns.heatmap(heatmap_data, cmap='YlGnBu', linewidths=.5, annot=True, fmt=".0f")
plt.title('Heatmap of Broadcast Delta by Week and Interface')
plt.xlabel('Week In 2023')
plt.show()


In [None]:
# Small-multiples view: one line plot of broadcast_delta over time per interface,
# five panels per row (col_wrap), with rotated tick labels and the interface as title.
g = (
    sns.FacetGrid(data=df, col="switch_interface", col_wrap=5, height=3, aspect=1.5)
    .map(plt.plot, "date", "broadcast_delta", marker=".")
    .set_xticklabels(rotation=45)
    .set_titles("{col_name}")
)

plt.tight_layout()
plt.show()

In [None]:
# Name of the weekday for every sample
df['day_of_week'] = df['date'].dt.day_name()

# Plotting order of the days (week starts on Sunday)
days_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Base palette: one viridis colour per day
palette = sns.color_palette("viridis", 7)

# Friday and Saturday share one highlight colour; other days keep their viridis colour
_highlight = sns.color_palette("rocket")[2]
weekend_palette = []
for _position, _day in enumerate(days_order):
    if _day in ('Friday', 'Saturday'):
        weekend_palette.append(_highlight)
    else:
        weekend_palette.append(palette[_position])

# Per-interface box plots with a red diamond point plot drawn on top
g = sns.FacetGrid(df, col="switch_interface", col_wrap=3, height=6, sharey=False)
g.map(sns.boxplot, 'day_of_week', 'broadcast_delta', order=days_order, palette=weekend_palette)
g.map(sns.pointplot, 'day_of_week', 'broadcast_delta', order=days_order, color='red', markers='D', scale=0.7)

# Facet titles, axis labels and tick rotation
g.set_titles("{col_name}")
g.set_axis_labels("", "Broadcasts")
g.set_xticklabels(rotation=30)

plt.tight_layout()
plt.show()


In [None]:
# Keep rows ordered by interface, then time
df = df.sort_values(by=['switch_interface', 'date'])

# Copy of the data with every timestamp moved 7 days forward, so that joining on
# (interface, date) pairs each sample with the sample taken one week earlier.
df_lagged = df.assign(date=df['date'] + pd.Timedelta(days=7))

# Inner join: columns coming from the week-old sample get the '_lagged' suffix
merged = df.merge(df_lagged, on=['switch_interface', 'date'], suffixes=('', '_lagged'))

# Scatter of this week's delta against last week's delta, one panel per interface
g = sns.FacetGrid(merged, col="switch_interface", col_wrap=4, height=4, sharex=False, sharey=False)
g = g.map(plt.scatter, "broadcast_delta_lagged", "broadcast_delta")

plt.show()

In [None]:
# Recompute the delta from the counter in chronological order.
# NOTE: this overwrites broadcast_delta; the first sample of every interface becomes 0.
df = df.sort_values(by=['switch_interface', 'date'])
df['broadcast_delta'] = df.groupby('switch_interface')['broadcast'].diff().fillna(0)

# FacetGrid with KDE for each interface.
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn.kdeplot.
g = sns.FacetGrid(df, col="switch_interface", col_wrap=4, height=3, sharex=False, sharey=False)
g.map(sns.kdeplot, 'broadcast_delta', fill=True)

# Adjusting the subplots for readability
g.set_titles("{col_name} interface")
g.set_axis_labels("Delta Broadcast", "Density")

plt.tight_layout()
plt.show()



In [None]:
# Descriptive statistics of broadcast_delta, one row per interface
grouped = df.groupby('switch_interface')['broadcast_delta']

stats_df = pd.DataFrame({
    'Mean': grouped.mean(),
    'Median': grouped.median(),
    'Min': grouped.min(),
    'Max': grouped.max(),
    # mode() can return several values; keep the smallest (first) one
    'Mode': grouped.apply(lambda x: x.mode().iloc[0]),
    'Skewness': grouped.skew(),
    'Standard Deviation': grouped.std(),
    'Kurtosis': grouped.apply(pd.Series.kurt),
    '25th Percentile': grouped.quantile(0.25),
    '75th Percentile': grouped.quantile(0.75)
})

# Styler.set_precision() was removed in pandas 2.0; an explicit format string
# renders two decimals on every pandas version.
styled = stats_df.style.background_gradient(cmap='viridis').format("{:.2f}")
display(styled)


# Modeling

## normalization

In [None]:
# Rescale broadcast_delta with a min-max scaler fitted on all interfaces together
# (swap in StandardScaler() for z-score normalization).
_delta_column = df[['broadcast_delta']]
scaler = MinMaxScaler().fit(_delta_column)
df['normalized_broadcast_delta'] = scaler.transform(_delta_column)

### fuzzy-Kmeans

model and scores

In [None]:
# Fixed seed: fuzzy c-means starts from a random partition, so without it the
# chosen parameters and the scores change on every run.
CMEANS_SEED = 42

# List of unique interfaces
interfaces = df['switch_interface'].unique()

# Best parameters and quality scores, keyed by interface
interface_params = {}
interface_metrics = {}

# Define range of clusters and fuzziness coefficients to test
clusters_range = range(2, 3)  # starting from 2 because 1 cluster is trivial
m_values = np.linspace(1.5, 3, 6)  # dividing the range [1.5,3] into 6 values

for interface in interfaces:
    interface_data = df[df['switch_interface'] == interface]
    # Shape (n_samples, 1) for scikit-learn; cmeans receives the transposed (1, n_samples) view
    data_to_cluster = interface_data['normalized_broadcast_delta'].values.reshape(-1, 1)

    best_obj_func = float('inf')
    best_params = {}
    best_membership = None

    # 1. Grid search: keep the fit with the lowest final objective-function value
    for n_clusters in clusters_range:
        for m in m_values:
            _, u, _, _, jm, _, _ = fuzz.cluster.cmeans(
                data_to_cluster.T, c=n_clusters, m=m, error=0.005, maxiter=1000, seed=CMEANS_SEED
            )
            if jm[-1] < best_obj_func:
                best_obj_func = jm[-1]
                best_params = {'clusters': n_clusters, 'm': m}
                best_membership = u

    interface_params[interface] = best_params

    # 2. Score the winning partition itself (no refit, so the scores describe
    #    exactly the model that was selected above).
    silhouette = float('nan')
    davies_bouldin = float('nan')
    if best_membership is not None:
        labels = np.argmax(best_membership, axis=0).flatten()
        # Both scores are undefined when every point falls into one cluster
        if np.unique(labels).size > 1:
            silhouette = silhouette_score(data_to_cluster, labels)
            davies_bouldin = davies_bouldin_score(data_to_cluster, labels)

    interface_metrics[interface] = {
        'best_params': best_params,
        'silhouette_score': silhouette,
        'davies_bouldin_score': davies_bouldin
    }

# Mean of the scores across all interfaces (interfaces without a valid score are ignored)
num_interfaces = len(interface_metrics)
_silhouettes = [metrics['silhouette_score'] for metrics in interface_metrics.values()]
_davies_bouldins = [metrics['davies_bouldin_score'] for metrics in interface_metrics.values()]

mean_silhouette = float(np.nanmean(_silhouettes)) if num_interfaces else float('nan')
mean_davies_bouldin = float(np.nanmean(_davies_bouldins)) if num_interfaces else float('nan')

for interface, metrics in interface_metrics.items():
    print(f"For interface {interface}, best parameters are: {metrics['best_params']}")
    print(f"Silhouette Score: {metrics['silhouette_score']:.4f}")
    print(f"Davies-Bouldin Index: {metrics['davies_bouldin_score']:.4f}")
    print('-'*50)

# Print the mean scores
print("Mean Silhouette Score across all interfaces:", mean_silhouette)
print("Mean Davies-Bouldin Score across all interfaces:", mean_davies_bouldin)


evaluation

In [None]:
# Pull the interface names and both scores out of interface_metrics
interface_names = list(interface_metrics)
silhouette_scores = [interface_metrics[name]['silhouette_score'] for name in interface_names]
davies_bouldin_scores = [interface_metrics[name]['davies_bouldin_score'] for name in interface_names]

# Reorder all three lists by ascending silhouette score
sorted_indices = np.argsort(silhouette_scores)
interface_names = [interface_names[i] for i in sorted_indices]
silhouette_scores = [silhouette_scores[i] for i in sorted_indices]
davies_bouldin_scores = [davies_bouldin_scores[i] for i in sorted_indices]

# One bar chart per metric, sharing the same interface order
for _scores, _colour, _metric_label in (
    (silhouette_scores, 'blue', 'Silhouette Score'),
    (davies_bouldin_scores, 'red', 'Davies-Bouldin Index'),
):
    plt.figure(figsize=(15, 7))
    plt.bar(interface_names, _scores, color=_colour)
    plt.xlabel('Interface')
    plt.ylabel(_metric_label)
    plt.title(f'{_metric_label} per Interface')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

In [None]:
# Same fixed seed as used when the parameters were scored, so the labels drawn
# here are reproducible.
CMEANS_SEED = 42

df['early_kmeans_cluster_label'] = -1  # -1 marks rows that never received a label

# 3. Assign cluster labels to each point using best parameters
for interface in interfaces:
    # Interfaces without stored parameters keep the -1 placeholder
    if interface not in interface_params:
        continue

    interface_rows = df['switch_interface'] == interface
    # Cluster the normalized column, i.e. the same data the parameters were tuned and scored on
    data_to_cluster = df.loc[interface_rows, 'normalized_broadcast_delta'].values.reshape(-1, 1)

    best_params = interface_params[interface]
    cntr, u, _, _, _, _, _ = fuzz.cluster.cmeans(
        data_to_cluster.T, c=best_params['clusters'], m=best_params['m'],
        error=0.005, maxiter=1000, seed=CMEANS_SEED
    )

    # c-means numbers its clusters arbitrarily. Re-number them by ascending
    # centre so label 0 is always the lowest-delta cluster on every interface.
    rank_of_cluster = np.argsort(np.argsort(cntr[:, 0]))
    labels = rank_of_cluster[np.argmax(u, axis=0)].flatten()

    # Assign cluster labels to the corresponding data points in the original DataFrame
    df.loc[interface_rows, 'early_kmeans_cluster_label'] = labels

# Number of distinct labels and a matching qualitative palette
num_clusters = len(df['early_kmeans_cluster_label'].unique())
custom_palette = sns.color_palette("Set1", num_clusters)


# Create a larger facet grid with individual plots for each interface
g = sns.FacetGrid(df, col="switch_interface", col_wrap=3, height=5, sharey=False)
g.map_dataframe(sns.scatterplot, x="date", y="broadcast_delta", hue="early_kmeans_cluster_label", palette="Set1", alpha=0.5)
g.add_legend(title="Cluster")
g.set_axis_labels("Date", "Broadcast Delta")
g.set_titles("Interface {col_name}")

# Change legend labels to 'Cluster A', 'Cluster B', 'Cluster C', etc.
new_legend_labels = [f"Cluster {chr(65 + i)}" for i in range(len(df['early_kmeans_cluster_label'].unique()))]
g._legend.set_title("Cluster")
for t, l in zip(g._legend.texts, new_legend_labels):
    t.set_text(l)

# Show the facet grid plots
plt.show()

### HDBSCAN

In [None]:
# 1. Preprocess the data: z-score the deltas of all interfaces together
scaled_data = StandardScaler().fit_transform(df['broadcast_delta'].values.reshape(-1, 1))

# 2. Apply HDBSCAN clustering
# Adjust min_cluster_size to a larger value to reduce the number of small clusters.
clusterer = hdbscan.HDBSCAN(min_samples=100, min_cluster_size=100000, gen_min_span_tree=True)
cluster_labels = clusterer.fit_predict(scaled_data)
df['hdbscan_cluster'] = cluster_labels

# 3. Visualization
# Histogram of cluster assignments. The labels must be passed as `x=`: the first
# positional parameter of countplot is `data`, not the values to count.
plt.figure(figsize=(10, 5))
sns.countplot(x=cluster_labels)
plt.title('Cluster Sizes with HDBSCAN')
plt.xlabel('Cluster Label')
plt.ylabel('Number of Points')
plt.show()

# Scatter plot for each interface's broadcast delta with cluster labels
g = sns.FacetGrid(df, col="switch_interface", col_wrap=3, height=5, sharey=False, hue='hdbscan_cluster', palette='Spectral')
g.map(plt.scatter, "date", "broadcast_delta", alpha=0.5).add_legend()
g.set_axis_labels("Time", "Broadcast Delta")
g.set_titles("Interface {col_name}")
plt.show()


### isolation tree

In [None]:
# Custom hyperparameters for specific interfaces, keyed by interface name
custom_hyperparameters = {
    'FastEthernet14': {'contamination': 0.15},
    'FastEthernet30': {'contamination': 0.15},
    'FastEthernet43': {'contamination': 0.15},
    'GigabitEthernet1': {'contamination': 0.15},
    'GigabitEthernet2': {'contamination': 0.15},
    'GigabitEthernet36': {'contamination': 0.03},
    'GigabitEthernet37': {'contamination': 0.03},
    'GigabitEthernet39': {'contamination': 0.03},
    'GigabitEthernet1/1/4': {'contamination': 0.03},
}

# Rows flagged as anomalous, collected per interface and concatenated once at the end
# (DataFrame.append was removed in pandas 2.0).
_anomaly_frames = []

# Loop through each unique interface
for interface in df['switch_interface'].unique():

    # Extract data corresponding to the current interface
    subset = df[df['switch_interface'] == interface].copy()

    # Defaults; random_state makes the flagged points reproducible between runs
    isolation_forest_params = {
        'contamination': 0.05,
        'random_state': 42,
    }

    # switch_interface has the form "<switch_number>:<interface_name>", while the
    # overrides above are keyed by the bare interface name. Look up the name
    # part first, then the full key, so either spelling of a key is honoured.
    interface_name = interface.split(':', 1)[-1]
    for lookup_key in (interface_name, interface):
        if lookup_key in custom_hyperparameters:
            isolation_forest_params.update(custom_hyperparameters[lookup_key])

    # Define the Isolation Forest with parameters
    clf = IsolationForest(**isolation_forest_params)

    # fit_predict returns 1 for inliers and -1 for outliers
    subset['early_ist_anomaly'] = clf.fit_predict(subset[['broadcast_delta']])
    df.loc[subset.index, 'early_ist_anomaly'] = subset['early_ist_anomaly']

    _anomaly_frames.append(subset[subset['early_ist_anomaly'] == -1])

anomalies_df = pd.concat(_anomaly_frames) if _anomaly_frames else pd.DataFrame()

# Plotting the results using FacetGrid
g = sns.FacetGrid(df, col="switch_interface", col_wrap=4, height=4, sharey=False)
g.map_dataframe(sns.lineplot, x="date", y="broadcast_delta", hue="early_ist_anomaly", palette={-1: "r", 1: "b"})
g.set_axis_labels("Time", "Broadcast Delta")
g.set_titles(col_template="{col_name} interface")
g.set(xticks=[])
plt.show()

## hybrid with Threshold-based Anomaly Detection

### detecting anomalies on daily basis

Calculate which days contain anomalies.

In [None]:
# List of unique interfaces
unique_interfaces = df['switch_interface'].unique()

window_size = 12  # rolling window length, in days
multiplier = 3    # a day is anomalous above mean + multiplier * std

# Add a new column 'date_only' which contains only the date without the time component
df['date_only'] = df['date'].dt.date

# Midnight timestamp of every sample. This is the lookup key for the daily
# results: resample('D') labels its bins with Timestamps, and datetime.date
# objects (date_only) are not reliably matched against a DatetimeIndex.
_day_start = df['date'].dt.normalize()

# Result columns: boolean flag plus float rolling statistics
df['day_anomaly'] = False
df['daily_rolling_mean'] = np.nan
df['daily_rolling_std'] = np.nan

for interface in unique_interfaces:
    interface_rows = df['switch_interface'] == interface
    subset_df = df[interface_rows].copy()

    # Aggregate on a daily basis. Only broadcast_delta is summed: the other
    # columns (strings, dates, timedeltas) cannot be meaningfully added up.
    subset_daily = subset_df.set_index('date')[['broadcast_delta']].resample('D').sum().reset_index()

    # Compute rolling metrics (NaN until window_size days are available)
    _daily_window = subset_daily['broadcast_delta'].rolling(window=window_size)
    subset_daily['daily_rolling_mean'] = _daily_window.mean()
    subset_daily['daily_rolling_std'] = _daily_window.std()

    # Detect anomalies; comparisons against NaN are False, so warm-up days are never flagged
    subset_daily['anomaly'] = subset_daily['broadcast_delta'] > (
        subset_daily['daily_rolling_mean'] + multiplier * subset_daily['daily_rolling_std']
    )

    # Map the rolling metrics and anomaly flags back to the original dataframe
    daily_lookup = subset_daily.set_index('date')
    interface_days = _day_start[interface_rows]
    # .eq(True) turns an unmatched day (NaN) into False and keeps the column boolean
    df.loc[interface_rows, 'day_anomaly'] = interface_days.map(daily_lookup['anomaly']).eq(True)
    df.loc[interface_rows, 'daily_rolling_mean'] = interface_days.map(daily_lookup['daily_rolling_mean'])
    df.loc[interface_rows, 'daily_rolling_std'] = interface_days.map(daily_lookup['daily_rolling_std'])


Plot the interfaces in which a daily anomaly was detected.

In [None]:
# One figure per interface that has at least one flagged row: the delta series
# as a line, with every run of consecutive flagged rows shaded in red.
for interface in unique_interfaces:
    subset_df = df[df['switch_interface'] == interface].copy()

    # Nothing to highlight for this interface
    if not subset_df['day_anomaly'].any():
        continue

    trace0 = go.Scatter(
        x=subset_df['date'],
        y=subset_df['broadcast_delta'],
        mode='lines',
        name='Broadcast Delta Daily',
    )

    # How many days to pull the window start back; integer division by 1000
    # makes this 0 for any window_size below 1000.
    SHIFT_PERIODS = window_size // 1000

    anomaly_windows = []
    in_anomaly_window = False
    window_start = None
    previous_window_end = None

    # Walk the rows in order and record (start, end) for each flagged run
    for _flagged, _stamp in zip(subset_df['day_anomaly'], subset_df['date']):
        if bool(_flagged) == in_anomaly_window:
            # Still inside (or still outside) a run: no boundary here
            continue

        if _flagged:
            # A run begins
            in_anomaly_window = True
            window_start = _stamp - pd.Timedelta(days=SHIFT_PERIODS)

            # Keep the new window from overlapping the previous one
            if previous_window_end and window_start <= previous_window_end:
                window_start = previous_window_end + pd.Timedelta(days=1)
        else:
            # A run ends one day after the first unflagged row
            in_anomaly_window = False
            window_end = _stamp + pd.Timedelta(days=1)
            anomaly_windows.append((window_start, window_end))
            previous_window_end = window_end

    # The data finished while a run was still open
    if in_anomaly_window:
        anomaly_windows.append((window_start, subset_df['date'].iloc[-1] + pd.Timedelta(days=1)))

    # Red rectangles from y=0 up to the series maximum
    _y_top = subset_df['broadcast_delta'].max()
    shapes = [
        {
            'type': 'rect',
            'x0': start,
            'x1': end,
            'y0': 0,
            'y1': _y_top,
            'fillcolor': 'red',
            'opacity': 0.4,
            'line_width': 0,
        }
        for start, end in anomaly_windows
    ]

    layout = go.Layout(
        title=f'Daily Broadcast Delta with Anomalies for Interface {interface}',
        xaxis=dict(title='Date'),
        yaxis=dict(title='Broadcast Delta Daily'),
        shapes=shapes,
    )

    fig = go.Figure(data=[trace0], layout=layout)
    fig.show()


In [None]:
unique_interfaces = df['switch_interface'].unique()

window_size_5min = 24  # This corresponds to 2 hours of data if each record is 5 minutes apart
multiplier_5min = 2

# Per-interface copy of the data with rolling statistics and the anomaly flag
anomalies_data_5min = {}

for interface in unique_interfaces:
    subset_df = df[df['switch_interface'] == interface].copy()

    # Rolling mean / std over the last window_size_5min rows
    _window = subset_df['broadcast_delta'].rolling(window=window_size_5min)
    subset_df['rolling_mean_5min'] = _window.mean()
    subset_df['rolling_std_5min'] = _window.std()

    # A row is anomalous when it exceeds mean + multiplier * std of its window
    _threshold = subset_df['rolling_mean_5min'] + multiplier_5min * subset_df['rolling_std_5min']
    subset_df['anomaly_5min'] = subset_df['broadcast_delta'] > _threshold

    anomalies_data_5min[interface] = subset_df

# Initialize columns in the original df
df['anomaly_5min'] = False
df['rolling_mean_5min'] = np.nan
df['rolling_std_5min'] = np.nan

# Copy the three computed columns back, interface by interface (positional, via .values)
for interface, subset_df in anomalies_data_5min.items():
    idx = df['switch_interface'] == interface
    for _column in ('anomaly_5min', 'rolling_mean_5min', 'rolling_std_5min'):
        df.loc[idx, _column] = subset_df[_column].values


# Plotting
# One figure per interface that has at least one row flagged in 'anomaly_5min':
# the broadcast delta as a line, with each run of flagged rows shaded in red.
for interface, subset_df in anomalies_data_5min.items():
    
    # Skip plotting if no anomalies detected for this interface
    if not subset_df['anomaly_5min'].any():
        continue

    # Create traces for broadcast delta
    trace0 = go.Scatter(
        x = subset_df['date'],
        y = subset_df['broadcast_delta'],
        mode = 'lines',
        name = 'Broadcast Delta'
    )
    
    # Get anomaly windows for shaded regions
    anomaly_windows = []        # list of (start, end) timestamps
    in_anomaly_window = False   # True while walking through a run of flagged rows
    window_start = None

    # Constant to shift the anomaly window to better capture the start of the spike
    # (number of 5-minute periods; 2 when window_size_5min is 24)
    SHIFT_PERIODS = window_size_5min // 10

    previous_window_end = None

    # Row-by-row state machine: open a window on the first flagged row of a run,
    # close it on the first unflagged row that follows.
    for idx, row in subset_df.iterrows():
        if row['anomaly_5min'] and not in_anomaly_window:
            in_anomaly_window = True
            # Start the window SHIFT_PERIODS * 5 minutes before the first flagged row
            window_start = row['date'] - pd.Timedelta(minutes=5*SHIFT_PERIODS)
            
            # Ensure no overlap with previous window
            if previous_window_end and window_start <= previous_window_end:
                window_start = previous_window_end + pd.Timedelta(minutes=5)

        elif not row['anomaly_5min'] and in_anomaly_window:
            in_anomaly_window = False
            # The window ends 5 minutes after the first unflagged row
            window_end = row['date'] + pd.Timedelta(minutes=5)
            anomaly_windows.append((window_start, window_end))
            previous_window_end = window_end

    # In case data ends within an anomaly window
    if in_anomaly_window:
        anomaly_windows.append((window_start, subset_df['date'].iloc[-1] + pd.Timedelta(minutes=5)))

    # One red rectangle per window, from y=0 up to the maximum delta of this interface
    shapes = []
    for start, end in anomaly_windows:
        shapes.append({
            'type': 'rect',
            'x0': start,
            'x1': end,
            'y0': 0,
            'y1': subset_df['broadcast_delta'].max(),
            'fillcolor': 'red',
            'opacity': 0.4,
            'line_width': 0,
        })
    
    layout = go.Layout(
        title=f'Broadcast Delta with 5-min Anomalies for Interface {interface}',
        xaxis=dict(title='Date'),
        yaxis=dict(title='Broadcast Delta'),
        shapes=shapes
    )
    
    fig = go.Figure(data=[trace0], layout=layout)
    fig.show()


# Reset the 5-minute anomaly flag on the main dataframe
df['anomaly_5min'] = False

# Write the flags back per interface, matching rows by timestamp
for interface, subset_df in anomalies_data_5min.items():
    # timestamp -> flag for this interface (a later duplicate timestamp wins)
    anomaly_dict_5min = {
        _stamp: _flag
        for _stamp, _flag in zip(subset_df['date'], subset_df['anomaly_5min'])
    }

    _interface_rows = df['switch_interface'] == interface
    df.loc[_interface_rows, 'anomaly_5min'] = df['date'].map(anomaly_dict_5min)

### add anomalies by the hour

In [None]:
unique_interfaces = df['switch_interface'].unique()

# Hourly window: with one record every 5 minutes, 12 records span an hour.
window_size_hour = 12
multiplier_hour = 2

for interface in unique_interfaces:
    # Index labels of the rows that belong to this interface
    subset_indices = df.index[df['switch_interface'] == interface]

    # Rolling mean / std of the delta over the last window_size_hour rows
    _deltas = df.loc[subset_indices, 'broadcast_delta']
    _hour_window = _deltas.rolling(window=window_size_hour)
    df.loc[subset_indices, 'rolling_mean_hour'] = _hour_window.mean()
    df.loc[subset_indices, 'rolling_std_hour'] = _hour_window.std()

    # Flag rows that exceed mean + multiplier * std of their window
    _threshold = (
        df.loc[subset_indices, 'rolling_mean_hour']
        + multiplier_hour * df.loc[subset_indices, 'rolling_std_hour']
    )
    df.loc[subset_indices, 'anomaly_hour'] = _deltas > _threshold


In [None]:
# Plotting
# One figure per interface that has at least one row flagged in 'anomaly_hour':
# the broadcast delta as a line, with each run of flagged rows shaded in red.
for interface in unique_interfaces:
    subset_df = df[df['switch_interface'] == interface].copy()

    # Skip plotting if no anomalies detected for this interface
    if not subset_df['anomaly_hour'].any():
        continue

    # Create traces for broadcast delta
    trace0 = go.Scatter(
        x=subset_df['date'],
        y=subset_df['broadcast_delta'],
        mode='lines',
        name='Broadcast Delta Hourly'
    )

    # Get anomaly windows for shaded regions
    anomaly_windows = []        # list of (start, end) timestamps
    in_anomaly_window = False   # True while walking through a run of flagged rows
    window_start = None
    
    # Using one-tenth the window size as shift. Adjust based on observation.
    # The shift is applied in hours below (1 when window_size_hour is 12).
    SHIFT_PERIODS = window_size_hour // 10  
    previous_window_end = None

    # Row-by-row state machine: open a window on the first flagged row of a run,
    # close it on the first unflagged row that follows.
    for idx, row in subset_df.iterrows():
        if row['anomaly_hour'] and not in_anomaly_window:
            in_anomaly_window = True
            # Start the window SHIFT_PERIODS hours before the first flagged row
            window_start = row['date'] - pd.Timedelta(hours=SHIFT_PERIODS)
            
            # Ensure no overlap with previous window
            if previous_window_end and window_start <= previous_window_end:
                window_start = previous_window_end + pd.Timedelta(hours=1)

        elif not row['anomaly_hour'] and in_anomaly_window:
            in_anomaly_window = False
            # The window ends one hour after the first unflagged row
            window_end = row['date'] + pd.Timedelta(hours=1)
            anomaly_windows.append((window_start, window_end))
            previous_window_end = window_end

    # In case data ends within an anomaly window
    if in_anomaly_window:
        anomaly_windows.append((window_start, subset_df['date'].iloc[-1] + pd.Timedelta(hours=1)))

    # One red rectangle per window, from y=0 up to the maximum delta of this interface
    shapes = []
    for start, end in anomaly_windows:
        shapes.append({
            'type': 'rect',
            'x0': start,
            'x1': end,
            'y0': 0,
            'y1': subset_df['broadcast_delta'].max(),
            'fillcolor': 'red',
            'opacity': 0.4,
            'line_width': 0,
        })

    layout = go.Layout(
        title=f'Hourly Broadcast Delta with Anomalies for Interface {interface}',
        xaxis=dict(title='Date'),
        yaxis=dict(title='Broadcast Delta Hourly'),
        shapes=shapes
    )

    fig = go.Figure(data=[trace0], layout=layout)
    fig.show()


### IsolationForest

In [None]:
unique_interfaces = df['switch_interface'].unique()

# Hyperparameters for the Isolation Forest
n_estimators = 100
max_samples = 'auto'
contamination = 0.05  # proportion of outliers in the data set, adjust based on your domain knowledge

# Result columns: float decision score and the raw -1 / 1 prediction
df['anomaly_iforest_score'] = 0.0
df['anomaly_iforest'] = 0

# Threshold-based flags fed to the forest next to the normalized delta
_flag_columns = ['day_anomaly', 'anomaly_5min', 'anomaly_hour']

for interface in unique_interfaces:
    interface_rows = df['switch_interface'] == interface
    subset_df = df[interface_rows].copy()

    # Explicit copy: the flag columns are rewritten below, which must not be
    # done on a slice of subset_df.
    features = subset_df[['normalized_broadcast_delta'] + _flag_columns].copy()

    # Flags become 0/1 integers; .eq(True) treats a missing flag (NaN) as "not anomalous"
    for _column in _flag_columns:
        features[_column] = features[_column].eq(True).astype(int)

    # Normalize the data
    scaler = StandardScaler()
    features_normalized = scaler.fit_transform(features)

    # Train the Isolation Forest
    clf = IsolationForest(n_estimators=n_estimators, max_samples=max_samples, contamination=contamination, random_state=42)
    clf.fit(features_normalized)

    # Predict anomalies and store them in the main dataframe
    df.loc[interface_rows, 'anomaly_iforest_score'] = clf.decision_function(features_normalized)
    df.loc[interface_rows, 'anomaly_iforest'] = clf.predict(features_normalized)

# Convert the anomaly flags: -1 means anomaly and 1 means normal data in Isolation Forest
df['anomaly_iforest'] = df['anomaly_iforest'] == -1

In [None]:
# One figure per interface: the delta series as a line, with the points the
# Isolation Forest flagged drawn on top as red markers.
for interface in unique_interfaces:
    subset_df = df[df['switch_interface'] == interface].copy()
    _flagged_rows = subset_df[subset_df['anomaly_iforest']]

    # Full series
    trace0 = go.Scatter(
        x=subset_df['date'],
        y=subset_df['broadcast_delta'],
        mode='lines',
        name='Broadcast Delta',
    )

    # Flagged points only
    trace1 = go.Scatter(
        x=_flagged_rows['date'],
        y=_flagged_rows['broadcast_delta'],
        mode='markers',
        name='Anomalies by Isolation Forest',
        marker=dict(color='red', size=8),
    )

    layout = go.Layout(
        title=f'Broadcast Delta with Anomalies by Isolation Forest for Interface {interface}',
        xaxis=dict(title='Date'),
        yaxis=dict(title='Broadcast Delta'),
    )

    fig = go.Figure(data=[trace0, trace1], layout=layout)
    fig.show()

### One-class SVM

In [None]:

# Fit one One-Class SVM per interface and store a boolean anomaly flag per row.

# Parameter for One-Class SVM. You might need to adjust based on performance.
nu_val = 0.05  # An upper bound on the fraction of margin errors and a lower bound of the fraction of support vectors

# Initialise as boolean (like 'anomaly_lof') so the column never mixes ints and
# bools and can be used directly as a boolean mask by the plotting cell.
df['anomaly_ocsvm'] = False


for interface in unique_interfaces:
    subset_df = df[df['switch_interface'] == interface].copy()

    # Prepare the data: normalized_broadcast_delta and anomaly flags.
    # NOTE(review): unlike the Isolation Forest cell, 'anomaly_hour' is not used
    # here and the features are not scaled - confirm this is intentional.
    X = subset_df[['normalized_broadcast_delta', 'day_anomaly', 'anomaly_5min']]

    # Train One-Class SVM
    ocsvm = OneClassSVM(nu=nu_val, kernel="rbf", gamma='auto')
    ocsvm.fit(X)

    # Get predictions (-1 for outliers, 1 for inliers)
    preds = ocsvm.predict(X)

    # Convert predictions to a boolean anomaly flag (True if anomaly, else False)
    subset_df['anomaly_ocsvm'] = preds == -1

    # Update the original DataFrame (aligned on the subset's index labels)
    df.loc[subset_df.index, 'anomaly_ocsvm'] = subset_df['anomaly_ocsvm']



In [None]:
# Plot the One-Class SVM results: one figure per interface, with the
# broadcast-delta line and the flagged rows as red markers.

for interface in unique_interfaces:
    subset_df = df[df['switch_interface'] == interface].copy()
    flagged_df = subset_df[subset_df['anomaly_ocsvm']]

    # Full series
    trace0 = go.Scatter(x=subset_df['date'], y=subset_df['broadcast_delta'], mode='lines', name='Broadcast Delta')

    # Rows flagged by One-Class SVM
    trace1 = go.Scatter(
        x=flagged_df['date'],
        y=flagged_df['broadcast_delta'],
        mode='markers',
        name='Anomalies by One-Class SVM',
        marker=dict(color='red', size=8),
    )

    layout = go.Layout(
        title=f'Broadcast Delta with Anomalies by One-Class SVM for Interface {interface}',
        xaxis=dict(title='Date'),
        yaxis=dict(title='Broadcast Delta'),
    )

    fig = go.Figure(data=[trace0, trace1], layout=layout)
    fig.show()


### LOF

In [None]:
# Local Outlier Factor, fitted separately for every interface; the result is a
# boolean 'anomaly_lof' column on the main dataframe.
df['anomaly_lof'] = False

unique_interfaces = df['switch_interface'].unique()

for interface in unique_interfaces:
    rows_for_interface = df['switch_interface'] == interface
    subset_df = df.loc[rows_for_interface].copy()

    # Feature matrix for this interface
    X = subset_df[['normalized_broadcast_delta', 'day_anomaly', 'anomaly_5min', 'anomaly_hour']]

    # Fit and predict in one call; fit_predict returns -1 for outliers, 1 for inliers
    lof = LocalOutlierFactor(n_neighbors=20, contamination=0.05)
    lof_predictions = lof.fit_predict(X)

    # Boolean flag: True for outliers, False for inliers
    subset_df['anomaly_lof'] = (lof_predictions == -1)

    # Write this interface's flags back into the main DataFrame
    df.loc[rows_for_interface, 'anomaly_lof'] = subset_df['anomaly_lof']


In [None]:
# One figure per interface that has at least one LOF anomaly; interfaces
# without any flagged row produce no figure.
for interface in unique_interfaces:

    subset_df = df[df['switch_interface'] == interface].copy()

    # Line trace with the original broadcast_delta (not the normalized one)
    trace1 = go.Scatter(x=subset_df['date'], y=subset_df['broadcast_delta'], mode='lines', name='Data')

    # Rows flagged by LOF
    anomalies = subset_df[subset_df['anomaly_lof'].eq(True)]

    # Nothing to highlight for this interface: skip the figure entirely
    if anomalies.empty:
        continue

    trace2 = go.Scatter(
        x=anomalies['date'],
        y=anomalies['broadcast_delta'],
        mode='markers',
        name='Anomaly',
        marker=dict(color='red', size=10),
    )

    layout = go.Layout(
        title=f'Broadcast Delta with LOF Anomalies for Interface {interface}',
        xaxis=dict(title='Date'),
        yaxis=dict(title='Broadcast Delta'),
    )

    fig = go.Figure(data=[trace1, trace2], layout=layout)
    fig.show()


### fuzzy kmeans with tagged anomalies

In [None]:
# Per interface: pick fuzzy c-means parameters (number of clusters, fuzziness m)
# by lowest final objective value, then score the resulting hard labelling with
# the silhouette and Davies-Bouldin indices.

# List of unique interfaces
interfaces = df['switch_interface'].unique()

# This will store the best parameters for each interface
interface_params = {}
interface_metrics = {}

# Define range of clusters and fuzziness coefficients to test
clusters_range = range(2, 3)  # starting from 2 because 1 cluster is trivial
m_values = np.linspace(1.5, 3, 6)  # dividing the range [1.5,3] into 6 values

# Features used for clustering (shared by both steps so they cannot drift apart)
cmeans_feature_cols = ['normalized_broadcast_delta', 'day_anomaly', 'anomaly_hour', 'rolling_mean_5min', 'rolling_std_5min']

# Fixed seed so the random c-means initialisation, and therefore the selected
# parameters and scores, are reproducible between runs.
cmeans_seed = 42

# 1. Determine best parameters for each interface
for interface in interfaces:
    interface_data = df[df['switch_interface'] == interface]

    # Drop NaN rows here exactly as step 2 does. With NaNs in the input the
    # objective would be NaN, `jm[-1] < best_obj_func` would never be True and
    # best_params would stay empty, making step 2 fail with a KeyError.
    # The float cast gives cmeans a numeric array even if the flag columns are bool.
    data_to_cluster = interface_data[cmeans_feature_cols].dropna().astype(float).values.T

    best_obj_func = float('inf')
    best_params = {}

    # NOTE(review): the objective J_m is being compared across different m
    # values; since memberships are raised to the power m this comparison may
    # systematically favour larger m - confirm this selection rule is intended.
    for n_clusters in clusters_range:
        for m in m_values:
            _, _, _, _, jm, _, _ = fuzz.cluster.cmeans(data_to_cluster, c=n_clusters, m=m, error=0.005, maxiter=1000, seed=cmeans_seed)
            if jm[-1] < best_obj_func:
                best_obj_func = jm[-1]
                best_params = {'clusters': n_clusters, 'm': m}

    interface_params[interface] = best_params

# 2. Compute silhouette and Davies-Bouldin scores using best parameters
for interface in interfaces:
    interface_data = df[df['switch_interface'] == interface]

    # Same rows and columns as in step 1 (NaN rows removed)
    data_to_cluster_df = interface_data[cmeans_feature_cols].dropna().astype(float)
    data_to_cluster = data_to_cluster_df.values

    best_params = interface_params[interface]
    _, u, _, _, _, _, _ = fuzz.cluster.cmeans(data_to_cluster.T, c=best_params['clusters'], m=best_params['m'], error=0.005, maxiter=1000, seed=cmeans_seed)

    # Harden the fuzzy membership matrix: each sample gets its highest-membership cluster
    labels = np.argmax(u, axis=0)
    labels = labels.flatten()
    silhouette = silhouette_score(data_to_cluster, labels)
    davies_bouldin = davies_bouldin_score(data_to_cluster, labels)

    interface_metrics[interface] = {
        'best_params': best_params,
        'silhouette_score': silhouette,
        'davies_bouldin_score': davies_bouldin
    }

# Compute the mean of the scores across all interfaces
num_interfaces = len(interface_metrics)
total_silhouette = sum(metrics['silhouette_score'] for metrics in interface_metrics.values())
total_davies_bouldin = sum(metrics['davies_bouldin_score'] for metrics in interface_metrics.values())

mean_silhouette = total_silhouette / num_interfaces
mean_davies_bouldin = total_davies_bouldin / num_interfaces

for interface, metrics in interface_metrics.items():
    print(f"For interface {interface}, best parameters are: {metrics['best_params']}")
    print(f"Silhouette Score: {metrics['silhouette_score']:.4f}")
    print(f"Davies-Bouldin Index: {metrics['davies_bouldin_score']:.4f}")
    print('-'*50)

# Print the mean scores
print("Mean Silhouette Score across all interfaces:", mean_silhouette)
print("Mean Davies-Bouldin Score across all interfaces:", mean_davies_bouldin)


In [None]:
# Pull interface names and both scores out of interface_metrics for plotting
interface_names = list(interface_metrics)
silhouette_scores = [entry['silhouette_score'] for entry in interface_metrics.values()]
davies_bouldin_scores = [entry['davies_bouldin_score'] for entry in interface_metrics.values()]

# Order everything by ascending silhouette score (both charts share this order)
sorted_indices = np.argsort(silhouette_scores)
interface_names = [interface_names[i] for i in sorted_indices]
silhouette_scores = [silhouette_scores[i] for i in sorted_indices]
davies_bouldin_scores = [davies_bouldin_scores[i] for i in sorted_indices]

# Two bar charts with identical layout: silhouette first, then Davies-Bouldin
for bar_values, bar_color, y_label, chart_title in (
    (silhouette_scores, 'blue', 'Silhouette Score', 'Silhouette Score per Interface'),
    (davies_bouldin_scores, 'red', 'Davies-Bouldin Index', 'Davies-Bouldin Index per Interface'),
):
    plt.figure(figsize=(15, 7))
    plt.bar(interface_names, bar_values, color=bar_color)
    plt.xlabel('Interface')
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()

In [None]:
# Assign a hard fuzzy c-means cluster label to every row (per interface) and
# show the labelled broadcast deltas in a facet grid, one panel per interface.
df['kmeans_cluster_label'] = -1  # Initialize to -1

# Fixed seed so the random c-means initialisation - and with it the cluster
# labels and plot colours - is the same on every run.
cmeans_label_seed = 42

# 3. Assign cluster labels to each point using best parameters
for interface in interfaces:
    interface_rows = df['switch_interface'] == interface
    interface_data = df[interface_rows]

    # NOTE(review): clustering here uses only 'broadcast_delta', while the
    # parameters in interface_params were selected on a five-feature matrix -
    # confirm this is intended.
    data_to_cluster = interface_data['broadcast_delta'].values.reshape(-1, 1)

    # Check if the interface exists in the dictionary
    if interface in interface_params:
        best_params = interface_params[interface]
        _, u, _, _, _, _, _ = fuzz.cluster.cmeans(data_to_cluster.T, c=best_params['clusters'], m=best_params['m'], error=0.005, maxiter=1000, seed=cmeans_label_seed)

        # Highest-membership cluster per sample
        labels = np.argmax(u, axis=0)
        labels = labels.flatten()

        # Assign cluster labels to the corresponding data points in the original DataFrame
        df.loc[interface_rows, 'kmeans_cluster_label'] = labels

# Number of distinct labels present (includes -1 if any row was left unassigned)
num_clusters = len(df['kmeans_cluster_label'].unique())
# Palette sized to the label count; not passed to the plot below, which uses the "Set1" name
custom_palette = sns.color_palette("Set1", num_clusters)


# Create a larger facet grid with individual plots for each interface
g = sns.FacetGrid(df, col="switch_interface", col_wrap=3, height=5, sharey=False)
g.map_dataframe(sns.scatterplot, x="date", y="broadcast_delta", hue="kmeans_cluster_label", palette="Set1", alpha=0.5)
g.add_legend(title="Cluster")
g.set_axis_labels("Date", "Broadcast Delta")
g.set_titles("Interface {col_name}")

# Change legend labels to 'Cluster A', 'Cluster B', 'Cluster C', etc.
new_legend_labels = [f"Cluster {chr(65 + i)}" for i in range(num_clusters)]
g._legend.set_title("Cluster")
for t, l in zip(g._legend.texts, new_legend_labels):
    t.set_text(l)

# Show the facet grid plots
plt.show()

## ensemble

In [None]:
# Combine the individual detectors into one weighted ensemble score and flag
# rows whose score reaches the threshold.

# Normalize the iForest score to [0,1] with HIGHER = MORE ANOMALOUS.
# IsolationForest.decision_function is lower (negative) for more abnormal
# samples, so the sign is flipped before rescaling; the previous
# (score + 1) / 2 gave normal points the larger contribution to the ensemble.
# NOTE(review): the threshold below was chosen against the old, inverted score -
# re-check it after this fix.
df['norm_iforest_score'] = (1 - df['anomaly_iforest_score']) / 2

# Weights
weights = {
    'norm_iforest_score': 3,
    'anomaly_ocsvm': 1,
    'anomaly_lof': 1,
    'early_ist_anomaly': 0.5
}

# Calculate weighted ensemble score (flags are cast to float so bool/object
# columns contribute 0.0 or 1.0 times their weight)
df['ensemble_score'] = sum(df[col].astype(float) * weight for col, weight in weights.items())

# Thresholding the ensemble score to determine anomalies
threshold = 2.5
df['ensemble_anomaly'] = df['ensemble_score'] >= threshold


### fix manually downfall anomalies

In [None]:
def remove_downfall_and_low_anomalies(df, threshold=5):
    """Clear anomaly flags on rows that are not local spikes of 'broadcast_delta'.

    Rows are examined per 'switch_interface', in the order they appear in `df`
    (assumes each interface's rows are already in chronological order - verify
    against the caller). The first and last row of every interface are left
    untouched. For every other row:

    * if its 'broadcast_delta' is strictly greater than both the previous and
      the next row of the same interface, nothing changes;
    * otherwise 'early_ist_anomaly', 'anomaly_iforest', 'anomaly_ocsvm' and
      'anomaly_lof' are set to False, and if the delta is also <= `threshold`
      the 'ensemble_score' is reset to 0.0.

    Neighbours are taken by position inside each interface. The earlier version
    looked them up by index label (idx - 1 / idx + 1) and compared labels with
    the subset length, which skipped most rows and mixed interfaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'switch_interface', 'broadcast_delta', the four flag
        columns listed above and 'ensemble_score'. Modified in place.
    threshold : numeric, default 5
        Deltas at or below this value also get their 'ensemble_score' cleared.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, after modification.
    """
    flag_columns = ['early_ist_anomaly', 'anomaly_iforest', 'anomaly_ocsvm', 'anomaly_lof']

    delta = df['broadcast_delta']
    per_interface = delta.groupby(df['switch_interface'], sort=False)

    # Neighbouring deltas within the same interface (NaN at the group edges)
    prev_delta = per_interface.shift(1)
    next_delta = per_interface.shift(-1)

    # Interior rows: neither the first nor the last row of their interface
    is_interior = (per_interface.cumcount() > 0) & (per_interface.cumcount(ascending=False) > 0)

    # A spike is strictly higher than both neighbours
    is_spike = (delta > prev_delta) & (delta > next_delta)
    not_spike = (is_interior & ~is_spike).to_numpy()
    low_and_not_spike = not_spike & (delta <= threshold).to_numpy()

    df.loc[not_spike, flag_columns] = False
    # Written as 0.0 (not False) so the score column keeps a numeric dtype.
    # NOTE(review): only 'ensemble_score' is reset here, as before;
    # 'ensemble_anomaly' is not touched - confirm that is the intended column.
    df.loc[low_and_not_spike, 'ensemble_score'] = 0.0

    return df

# Cast the two flag columns to bool before the cleanup runs
for bool_flag_col in ('early_ist_anomaly', 'anomaly_ocsvm'):
    df[bool_flag_col] = df[bool_flag_col].astype(bool)

# Run the cleanup and keep the returned dataframe
df = remove_downfall_and_low_anomalies(df)


In [None]:
# Hand-picked minimum broadcast_delta per interface: rows of an interface whose
# delta is below its value here lose every anomaly flag listed in anomaly_columns.
interface_thresholds = {
    '1:FastEthernet0/10': 25,
    '1:FastEthernet0/14': 25,
    '1:FastEthernet0/2': 25,
    '1:FastEthernet0/22': 50,
    '1:FastEthernet0/30': 2,
    '1:FastEthernet0/36': 25,
    '1:FastEthernet0/37': 20,
    '1:FastEthernet0/43': 40,
    '1:FastEthernet0/45': 35,
    '1:GigabitEthernet0/1': 50,
    '1:GigabitEthernet0/2': 0,
    '2:GigabitEthernet1/0/36': 5,
    '2:GigabitEthernet1/0/37': 5,
    '2:GigabitEthernet1/0/38': 0,
    '2:GigabitEthernet1/0/39': 0,
    '2:GigabitEthernet1/0/40': 0,
    '2:TenGigabitEthernet1/1/4': 0,
}

anomaly_columns = ['ensemble_anomaly', 'anomaly_iforest', 'anomaly_ocsvm', 'anomaly_lof', 'early_ist_anomaly']

for interface, threshold in interface_thresholds.items():
    on_interface = df['switch_interface'].eq(interface)
    below_threshold = df['broadcast_delta'].lt(threshold)
    mask = on_interface & below_threshold

    # Clear each flag column on the selected rows
    for col in anomaly_columns:
        df.loc[mask, col] = False


In [None]:

# Re-plot the Isolation Forest flags for every interface, now reflecting the
# manual clean-up applied to the flag columns in the preceding cells.
for interface in unique_interfaces:
    subset_df = df[df['switch_interface'] == interface].copy()
    flagged_df = subset_df[subset_df['anomaly_iforest']]

    # Full series
    trace0 = go.Scatter(x=subset_df['date'], y=subset_df['broadcast_delta'], mode='lines', name='Broadcast Delta')

    # Rows still flagged in 'anomaly_iforest'
    trace1 = go.Scatter(
        x=flagged_df['date'],
        y=flagged_df['broadcast_delta'],
        mode='markers',
        name='Anomalies by iforest',
        marker=dict(color='red', size=8),
    )

    layout = go.Layout(
        title=f'Broadcast Delta with Anomalies by iforest for Interface {interface}',
        xaxis=dict(title='Date'),
        yaxis=dict(title='Broadcast Delta'),
    )

    fig = go.Figure(data=[trace0, trace1], layout=layout)
    fig.show()


In [None]:
def plot_kde_normal(data, color, label):
    """Draw the KDE of 'broadcast_delta' for rows where 'anomaly_iforest' is False."""
    is_normal = data['anomaly_iforest'] == False
    sns.kdeplot(data=data.loc[is_normal, 'broadcast_delta'], color=color, label=label)


def plot_kde_anomaly(data, color, label):
    """Draw the KDE of 'broadcast_delta' for rows where 'anomaly_iforest' is True."""
    is_anomaly = data['anomaly_iforest'] == True
    sns.kdeplot(data=data.loc[is_anomaly, 'broadcast_delta'], color=color, label=label)


# One facet per interface with independent axes; both densities share a panel
g = sns.FacetGrid(df, col="switch_interface", col_wrap=4, height=4, sharex=False, sharey=False)
g.map_dataframe(plot_kde_normal, color='b', label='Normal Data')
g.map_dataframe(plot_kde_anomaly, color='r', label='Anomaly')
g.add_legend()

plt.show()


In [None]:
# Correlation between the individual anomaly taggings, shown as a heatmap.

# Column -> display label. Using a mapping keeps every label tied to the column
# it describes; the previous positional list labelled 'early_ist_anomaly' as
# 'ensemble'.
# NOTE(review): if the ensemble result was meant to be compared here, add
# 'ensemble_anomaly' to this mapping.
tag_labels = {
    'anomaly_iforest': 'iforest',
    'anomaly_ocsvm': 'ocsvm',
    'anomaly_lof': 'lof',
    'early_ist_anomaly': 'early_ist',
}

# Subset of dataframe with just the anomaly tags, as floats so corr() treats
# every flag as numeric regardless of its stored dtype
anomaly_tags = df[list(tag_labels)].astype(float)

# Compute correlation matrix and apply the display labels to both axes
correlation_matrix = anomaly_tags.corr().rename(index=tag_labels, columns=tag_labels)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='RdBu_r', center=0)
plt.title('Correlation Heatmap of Anomaly Taggings')
plt.show()

In [None]:
# Treat the Isolation Forest flag as a two-cluster labelling of 'broadcast_delta'
# and compute, per interface, the silhouette and Davies-Bouldin scores of that
# labelling. 'anomaly_iforest' is a boolean column at this point.
interfaces = df['switch_interface'].unique()

silhouette_scores = []
db_scores = []

for interface in interfaces:
    subset = df[df['switch_interface'] == interface]
    if len(subset['anomaly_iforest'].unique()) > 1:  # Need at least 2 clusters to compute these scores
        sil_score = silhouette_score(subset[['broadcast_delta']], subset['anomaly_iforest'])
        db_score = davies_bouldin_score(subset[['broadcast_delta']], subset['anomaly_iforest'])
        silhouette_scores.append(sil_score)
        db_scores.append(db_score)
    else:
        # NaN (not None) so the bar charts below receive purely numeric heights;
        # the interface keeps its slot on the x axis without a bar.
        silhouette_scores.append(np.nan)
        db_scores.append(np.nan)

# Plotting: both scores side by side, one bar per interface
plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plt.bar(interfaces, silhouette_scores, color='blue')
plt.title('Silhouette Scores per Interface')
plt.xticks(rotation=90)

plt.subplot(1, 2, 2)
plt.bar(interfaces, db_scores, color='red')
plt.title('Davies-Bouldin Scores per Interface')
plt.xticks(rotation=90)

plt.tight_layout()
plt.show()


A high silhouette score could imply that anomalies are well-separated from the normal data. However, it's important to note that not all anomaly detection problems will have clear separations, especially with contextual anomalies.

A low Davies-Bouldin Index could suggest that the anomalies form a tight cluster that is well separated from the normal data.