In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 1. Load datasets

In [2]:

# Read the three CSV files (paths are relative to the notebook's working
# directory) into one DataFrame each, in the order the names are unpacked.
cloud_df, intrusion_df, global_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "CloudWatch_Traffic_Web_Attack.csv",
        "cybersecurity_intrusion_data.csv",
        "Global_Cybersecurity_Threats_2015-2024.csv",
    )
)


# 2. Supervised dataset (intrusion)

In [None]:
# Target: the `attack_detected` column.
y = intrusion_df["attack_detected"]

# Features: every remaining column except `session_id`
# (presumably a per-row identifier with no predictive value -- TODO confirm).
x = intrusion_df.drop(["attack_detected", "session_id"], axis="columns")


# Preprocessing

In [4]:

# Columns routed through StandardScaler.
num_cols = [
    "network_packet_size",
    "login_attempts",
    "session_duration",
    "ip_reputation_score",
    "failed_logins",
    "unusual_time_access",
]

# Columns routed through OneHotEncoder.
cat_cols = [
    "protocol_type",
    "encryption_used",
    "browser_type",
]

# Scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown="ignore" keeps transform() from raising on a category
# that was not present when the encoder was fitted.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)


# 3. Train/test split

In [None]:

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)




# 4. Logistic Regression pipeline

In [None]:

# Baseline linear model: preprocessing followed by logistic regression.
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing `preprocess` directly would
# leave one fitted ColumnTransformer shared with every other pipeline built
# from that object, and refitting any of them would silently change the rest.
logreg = Pipeline([
    ("prep", clone(preprocess)),
    ("clf", LogisticRegression(max_iter=200)),
])
logreg.fit(x_train, y_train)

# Accuracy on the held-out split.
y_pred = logreg.predict(x_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression Accuracy: 0.7285115303983228




# 5. Random Forest

In [None]:

# Random forest on the same preprocessed features.
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline.fit fits its steps in place, so passing `preprocess` directly would
# leave one fitted ColumnTransformer shared with every other pipeline built
# from that object, and refitting any of them would silently change the rest.
rf = Pipeline([
    ("prep", clone(preprocess)),
    ("clf", RandomForestClassifier(n_estimators=200, random_state=42)),
])
rf.fit(x_train, y_train)

# Per-class precision / recall / F1 on the held-out split.
rf_pred = rf.predict(x_test)
print("Random Forest Report:\n", classification_report(y_test, rf_pred))

Random Forest Report:
               precision    recall  f1-score   support

           0       0.83      0.99      0.90      1055
           1       0.99      0.75      0.85       853

    accuracy                           0.88      1908
   macro avg       0.91      0.87      0.88      1908
weighted avg       0.90      0.88      0.88      1908





# 6. Isolation Forest on CloudWatch traffic

In [8]:

# Unsupervised anomaly detection on the two traffic-volume columns.
# The feature frame is selected once and reused for fitting and scoring.
traffic_features = cloud_df[["bytes_in", "bytes_out"]]

# contamination=0.05 sets the decision threshold so that roughly 5% of the
# fitted rows are flagged as outliers.
iso = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
iso.fit(traffic_features)

# decision_function: lower scores are more anomalous (negative means outlier).
# predict: -1 for outliers, 1 for inliers.
cloud_df["anomaly_score"] = iso.decision_function(traffic_features)
cloud_df["anomaly_label"] = iso.predict(traffic_features)

# 7. K-Means clustering on global threats


In [9]:
# Cluster the global-threat records on financial loss and affected-user count.
# KMeans uses Euclidean distance and the two columns are in different units
# (million $ vs. user counts), so they are standardised first; otherwise the
# larger-magnitude column alone would decide the cluster assignment.
cluster_features = ["Financial Loss (in Million $)", "Number of Affected Users"]
global_scaled = StandardScaler().fit_transform(global_df[cluster_features])

# n_init is pinned because its default differs between scikit-learn releases.
# Note: kmeans.cluster_centers_ are expressed in standardised units.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
global_df["cluster"] = kmeans.fit_predict(global_scaled)