# In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Cluster the records of synthetic_cyber_dataset.csv into two groups with
# K-Means on (login_attempts, src_bytes), name the groups "normal" and
# "malicious", and draw them together with the cluster centroids.

# Columns used as clustering features; the order also fixes the column order
# of X_scaled and of the centroid arrays below (0 = login_attempts,
# 1 = src_bytes).
FEATURE_COLUMNS = ["login_attempts", "src_bytes"]

# Load dataset
df = pd.read_csv("synthetic_cyber_dataset.csv")

# Clip negatives (optional): negative values in either feature are raised to 0
df[FEATURE_COLUMNS] = df[FEATURE_COLUMNS].clip(lower=0)

# Scale features so both contribute comparably to the Euclidean distances
# K-Means minimises
features = df[FEATURE_COLUMNS]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# Run K-Means.
# n_init is pinned explicitly: the scikit-learn default changed from 10 to
# "auto" in 1.4 (and 1.2-1.3 emit a FutureWarning when it is omitted), so
# leaving it out lets the resulting clusters vary with the installed version
# even though random_state is fixed.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
df["cluster"] = kmeans.fit_predict(X_scaled)

# Decide which cluster is "malicious" vs "normal" based on centroid positions:
# the cluster whose centroid has the higher login_attempts coordinate is
# treated as malicious (cluster 1 on a tie). The centroids are in scaled
# units here, which does not affect the comparison within one feature.
centroids = kmeans.cluster_centers_
malicious_cluster = 0 if centroids[0][0] > centroids[1][0] else 1
label_map = {
    malicious_cluster: "malicious",
    1 - malicious_cluster: "normal",
}

# Map numeric cluster labels to names
df["cluster_label"] = df["cluster"].map(label_map)

# Plot using renamed cluster labels
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x="login_attempts",
    y="src_bytes",
    hue="cluster_label",
    palette={"normal": "blue", "malicious": "red"},
    s=60,
)

# Show centroids (in original scale): undo the standardisation so the markers
# land in the same coordinate system as the plotted data points
centroids_original = scaler.inverse_transform(centroids)
plt.scatter(
    centroids_original[:, 0],
    centroids_original[:, 1],
    c="black",
    s=200,
    marker="X",
    edgecolors="white",
    label="Centroids",
)

# Plot formatting
plt.title("K-Means Clustering: Normal vs Malicious Behavior")
plt.xlabel("Login Attempts")
plt.ylabel("Source Bytes Transferred")
plt.legend()  # rebuilt here so the "Centroids" entry is included
plt.grid(True)
plt.show()