# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, silhouette_score

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering

# =========================
# 2. Load datasets
# =========================
# Read the three source CSV files (paths are relative to the working
# directory) into one DataFrame each, in the order listed.
intrusion_df, cloud_df, global_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "cybersecurity_intrusion_data.csv",
        "CloudWatch_Traffic_Web_Attack.csv",
        "Global_Cybersecurity_Threats_2015-2024.csv",
    )
)

# =========================
# 3. Supervised Learning (Intrusion dataset)
# =========================
# Target is the attack flag; features are every other column except the
# session identifier.
y = intrusion_df["attack_detected"]
X = intrusion_df.drop(columns=["attack_detected", "session_id"])

# Columns sent through StandardScaler.
num_cols = [
    "network_packet_size",
    "login_attempts",
    "session_duration",
    "ip_reputation_score",
    "failed_logins",
    "unusual_time_access",
]
# Columns sent through OneHotEncoder.
cat_cols = [
    "protocol_type",
    "encryption_used",
    "browser_type",
]

# Shared preprocessing stage reused by every model pipeline below.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Candidate classifiers, keyed by the label used when reporting metrics.
# Insertion order is the order in which they are trained and printed.
models = dict(
    LogReg=LogisticRegression(max_iter=200),
    KNN=KNeighborsClassifier(n_neighbors=5),
    SVM_Linear=SVC(kernel="linear", probability=True),
    SVM_RBF=SVC(kernel="rbf", probability=True),
    NaiveBayes=GaussianNB(),
    DecisionTree=DecisionTreeClassifier(max_depth=10, random_state=42),
    RandomForest=RandomForestClassifier(n_estimators=200, random_state=42),
    NeuralNet=MLPClassifier(
        hidden_layer_sizes=(64, 32), max_iter=300, random_state=42
    ),
)

# Fit every candidate on the training split and score it on the held-out split.
#
# GaussianNB needs dense input, so it gets a copy of the preprocessor built
# with sparse_threshold=0, which makes ColumnTransformer always return a dense
# array. The previous bare-lambda step was not a valid Pipeline stage
# (intermediate steps must be transformers exposing fit/transform), and its
# .toarray() call would also break whenever the preprocessor output was
# already dense.
dense_preprocess = ColumnTransformer(preprocess.transformers, sparse_threshold=0)

results = []  # one (name, accuracy, precision, recall, f1) tuple per model
for name, model in models.items():
    prep = dense_preprocess if name == "NaiveBayes" else preprocess
    pipe = Pipeline([("prep", prep), ("clf", model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # Weighted averaging accounts for class imbalance; zero_division=0 avoids
    # warnings when a class is never predicted.
    pr, rc, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="weighted", zero_division=0
    )
    results.append((name, acc, pr, rc, f1))
    print(f"{name}: Acc={acc:.3f}, Prec={pr:.3f}, Rec={rc:.3f}, F1={f1:.3f}")

# Confusion matrix for Random Forest
# Build a dedicated pipeline around the Random Forest entry of `models`, fit it
# on the training split, and plot test-set counts as an annotated heatmap.
rf = Pipeline(
    steps=[("prep", preprocess), ("clf", models["RandomForest"])]
).fit(X_train, y_train)
cm = confusion_matrix(y_true=y_test, y_pred=rf.predict(X_test))
sns.heatmap(data=cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - Random Forest")
plt.show()

# =========================
# 4. Anomaly Detection (CloudWatch dataset)
# =========================
# Fit an Isolation Forest on the two byte-count columns and store its
# per-row prediction; rows labelled -1 are the ones counted as anomalies below.
iso = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
cloud_df["anomaly_label"] = iso.fit_predict(
    cloud_df.loc[:, ["bytes_in", "bytes_out"]]
)
print("Isolation Forest anomaly rate:", cloud_df["anomaly_label"].eq(-1).mean())

# =========================
# 5. Clustering (Global Threats dataset)
# =========================
# Cluster on two numeric columns; rows missing either value are excluded.
Xg = global_df[["Financial Loss (in Million $)", "Number of Affected Users"]].dropna()

# K-Means
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans_labels = kmeans.fit_predict(Xg)
# Labels cover only the rows kept in Xg. Wrapping them in a Series indexed by
# Xg.index lets pandas align them back onto global_df, so rows removed by
# dropna() get NaN instead of triggering a length-mismatch error.
global_df["kmeans_cluster"] = pd.Series(kmeans_labels, index=Xg.index)
# Score against the label array itself so it always matches Xg row-for-row.
print("K-Means silhouette:", silhouette_score(Xg, kmeans_labels))

# Hierarchical
agg = AgglomerativeClustering(n_clusters=4, linkage="ward")
hier_labels = agg.fit_predict(Xg)
global_df["hier_cluster"] = pd.Series(hier_labels, index=Xg.index)

# Colour points by the K-Means label array (same length and order as Xg).
plt.scatter(Xg.iloc[:, 0], Xg.iloc[:, 1], c=kmeans_labels, cmap="tab10", s=10)
plt.xlabel("Financial Loss ($M)")
plt.ylabel("Affected Users")
plt.title("K-Means Clustering of Global Threats")
plt.show()

# Stale notebook output from an earlier run (the loading code now reads .csv files):
# FileNotFoundError: [Errno 2] No such file or directory: 'cybersecurity_intrusion_data.xlsx'