In [1]:
import pandas as pd
import numpy as np

# Seed the legacy global NumPy RNG so the synthetic dataset is reproducible.
np.random.seed(42)


def _simulate_traffic(n_rows, packet_size_range, num_packets_range, label):
    """Build one class of synthetic traffic as a column-name -> values mapping.

    The draw order (packet_size, then num_packets, then protocol_type) is
    fixed so the seeded random stream yields the same values on every run.

    Parameters
    ----------
    n_rows : int
        Number of rows to generate.
    packet_size_range, num_packets_range : tuple of (int, int)
        Half-open ``[low, high)`` bounds handed to ``np.random.randint``.
    label : int
        Class label stored as a scalar; ``pd.DataFrame`` broadcasts it to
        every row.

    Returns
    -------
    dict
        Keys ``packet_size``, ``num_packets``, ``protocol_type``, ``label``.
    """
    size_low, size_high = packet_size_range
    count_low, count_high = num_packets_range
    packet_size = np.random.randint(size_low, size_high, size=n_rows)
    num_packets = np.random.randint(count_low, count_high, size=n_rows)
    protocol_type = np.random.choice([0, 1], size=n_rows)  # 0 = TCP, 1 = UDP
    return {
        "packet_size": packet_size,
        "num_packets": num_packets,
        "protocol_type": protocol_type,
        "label": label,
    }


# Normal traffic (label 0): small packet sizes, few packets.
normal_traffic = _simulate_traffic(1000, (50, 150), (1, 10), label=0)

# Anomalous traffic (label 1): large packet sizes, many packets.
anomalous_traffic = _simulate_traffic(50, (500, 1000), (20, 50), label=1)

df_normal = pd.DataFrame(normal_traffic)
df_anomalous = pd.DataFrame(anomalous_traffic)

# Stack both classes into one dataset, shuffle the rows with a fixed seed so
# the anomalies are not all at the end, then renumber the index from 0.
df = (
    pd.concat([df_normal, df_anomalous], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(df.head())

   packet_size  num_packets  protocol_type  label
0           79            5              0      0
1           88            4              0      0
2          107            9              0      0
3           67            5              0      0
4          108            8              1      0


In [3]:
# Score every row with the model; its output uses -1 for an anomaly and 1 for
# normal traffic.
# NOTE(review): `model` and `X` are created outside the cells visible here --
# confirm the cell that defines them runs before this one.
raw_predictions = model.predict(X)
df["anomaly"] = raw_predictions

# Put the ground-truth label next to the raw model output for the first rows.
print(df.loc[:, ["label", "anomaly"]].head())

   label  anomaly
0      0        1
1      0        1
2      0        1
3      0        1
4      0        1


In [4]:
from sklearn.metrics import classification_report

# Convert the model's raw output (-1 = anomaly, anything else = normal) to the
# same encoding as the ground-truth label (1 = anomaly, 0 = normal).
# The comparison is vectorized rather than calling a Python lambda once per
# row; casting to int64 keeps the dtype the element-wise version produced.
df["anomaly_binary"] = df["anomaly"].eq(-1).astype("int64")

# Print per-class precision / recall / F1 of the predictions against the
# true labels.
print(classification_report(df["label"], df["anomaly_binary"]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       0.94      1.00      0.97        50

    accuracy                           1.00      1050
   macro avg       0.97      1.00      0.98      1050
weighted avg       1.00      1.00      1.00      1050



In [5]:
import time

# Simulate real-time data streaming: walk the frame in consecutive 10-row
# batches, report the rows the model flags, and pause between batches.
for batch_number, start in enumerate(range(0, len(df), 10), start=1):
    batch = df.iloc[start:start + 10]  # The final batch may hold fewer rows

    # Remove the true label and the earlier prediction columns so only the
    # remaining columns are handed to the model.
    features = batch.drop(columns=["label", "anomaly", "anomaly_binary"])
    predictions = model.predict(features)

    # A prediction of -1 marks a row as anomalous.
    flagged = predictions == -1
    anomalies = batch.loc[flagged]
    if len(anomalies) > 0:
        print(f"Anomalies detected in batch {batch_number}:")
        print(anomalies[["packet_size", "num_packets", "protocol_type"]])

    time.sleep(1)  # Wait 1 second to simulate real-time

Anomalies detected in batch 2:
    packet_size  num_packets  protocol_type
15          560           39              1
Anomalies detected in batch 5:
    packet_size  num_packets  protocol_type
45           50            1              0
48          565           27              1
Anomalies detected in batch 6:
    packet_size  num_packets  protocol_type
50          971           38              0
Anomalies detected in batch 8:
    packet_size  num_packets  protocol_type
78          826           40              1
Anomalies detected in batch 11:
     packet_size  num_packets  protocol_type
106          844           31              0
Anomalies detected in batch 12:
     packet_size  num_packets  protocol_type
119          941           27              0
Anomalies detected in batch 14:
     packet_size  num_packets  protocol_type
132          611           49              0
Anomalies detected in batch 15:
     packet_size  num_packets  protocol_type
144          527           43        