In [None]:
%pip install pandas

In [None]:
import pandas as pd;
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [13]:
# Read files
df_train = pd.read_csv('preprocessed/preprocessed_openstack_train.csv')

# Calculate request frequency by IP
ip_activity_train = df_train['internal_ip'].value_counts().reset_index()
ip_activity_train.columns = ['internal_ip', 'request_count']

In [11]:
# Set threshold for anomaly detection
threshold = ip_activity_train['request_count'].quantile(0.95)
anomalous_ips = ip_activity_train[ip_activity_train['request_count'] > threshold]

print(f"Anomalous IPs (95th percentile):\n{anomalous_ips}")

Anomalous IPs (95th percentile):
         internal_ip  request_count  is_anomalous
0   INTERNAL_PROCESS          74381             1
1         10.11.10.1          49430             1
2         10.11.10.2            306             0
3        10.11.28.99             26             0
4        10.11.28.34             26             0
..               ...            ...           ...
56       10.11.26.41             18             0
57      10.11.24.210             18             0
58      10.11.25.250             18             0
59        10.11.25.6             18             0
60       10.11.24.46             18             0

[61 rows x 3 columns]


In [14]:
# Standardize request count for clustering
scaler = StandardScaler()
scaled_counts = scaler.fit_transform(ip_activity_train[['request_count']])

# Apply DBSCAN
db = DBSCAN(eps=1.5, min_samples=5).fit(scaled_counts)
ip_activity_train['is_anomalous'] = (db.labels_ == -1).astype(int)

# View anomalous IPs
anomalous_ips_dbscan = ip_activity_train[ip_activity_train['is_anomalous'] == 1]
print(f"Anomalous IPs (DBSCAN):\n{anomalous_ips_dbscan}")


Anomalous IPs (DBSCAN):
        internal_ip  request_count  is_anomalous
0  INTERNAL_PROCESS          74381             1
1        10.11.10.1          49430             1


In [16]:
# Error Analysis

# Summarize anomalies
if 'error_rate' in df_train.columns:
    anomalous_summary = anomalous_ips[['internal_ip', 'request_count']].merge(
        df_train[['internal_ip', 'error_rate']].drop_duplicates(), on='internal_ip', how='left'
    )
else:
    anomalous_summary = anomalous_ips[['internal_ip', 'request_count']]

print("Anomalous IP Summary:")
print(anomalous_summary)

Anomalous IP Summary:
         internal_ip  request_count
0   INTERNAL_PROCESS          74381
1         10.11.10.1          49430
2         10.11.10.2            306
3        10.11.28.99             26
4        10.11.28.34             26
..               ...            ...
56       10.11.26.41             18
57      10.11.24.210             18
58      10.11.25.250             18
59        10.11.25.6             18
60       10.11.24.46             18

[61 rows x 2 columns]
