In [None]:
import os

# Project folder, relative to the directory the runtime starts in.
PROJECT_DIR = "drive/MyDrive/aics"

# A relative os.chdir() is not idempotent: running this cell a second time
# would look for drive/MyDrive/aics *inside* the project folder and raise
# FileNotFoundError. Only change directory when we are not already there;
# a genuinely missing folder still raises as before.
_project_suffix = os.sep + os.path.normpath(PROJECT_DIR)
if not os.path.normpath(os.getcwd()).endswith(_project_suffix):
    os.chdir(PROJECT_DIR)
print(os.listdir())

['datasets', '.ipynb_checkpoints', 'requirements.txt', 'network_threats.db']


In [None]:
# Install the packages listed in requirements.txt (resolved relative to the
# current working directory) through a shell escape.
!pip install -r requirements.txt

Collecting asttokens (from -r requirements.txt (line 1))
  Downloading asttokens-3.0.0-py3-none-any.whl.metadata (4.7 kB)
Collecting colorama (from -r requirements.txt (line 2))
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting comm (from -r requirements.txt (line 3))
  Downloading comm-0.2.2-py3-none-any.whl.metadata (3.7 kB)
Collecting exceptiongroup (from -r requirements.txt (line 8))
  Downloading exceptiongroup-1.2.2-py3-none-any.whl.metadata (6.6 kB)
Collecting executing (from -r requirements.txt (line 9))
  Downloading executing-2.2.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting jedi (from -r requirements.txt (line 13))
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting pure_eval (from -r requirements.txt (line 32))
  Downloading pure_eval-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting stack-data (from -r requirements.txt (line 41))
  Downloading stack_data-0.6.3-py3-none-any.whl.metadata (18 kB)
Downloading asttokens

In [None]:
import pathlib
import sqlite3
import ipaddress
import joblib
from contextlib import closing

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, log_loss, f1_score, roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, label_binarize
from sklearn.utils import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import networkx as nx
from collections import defaultdict, Counter

In [None]:
# Columns that train_model() standardises with StandardScaler.
# "source_ip"/"destination_ip" are the integer-encoded addresses produced by
# preprocess_data(); the degree/activity columns come from
# calculate_graph_features() and enrich_data().
numeric_features = [
    "Flow Duration",
    "Total Fwd Packets",
    "Total Backward Packets",
    "out_degree",
    "in_degree",
    "active_days",
    "activity_span",
    "degree_centrality",
    "days_since_last_seen",
    "source_ip",
    "destination_ip",
    "Source Port",
    "Destination Port",
]

# Columns that train_model() one-hot encodes.
categorical_features = [
    "Protocol",
    "Hour",
    "DayOfWeek",
    "Source_Internal",
    "Dest_Internal",
]

# Load data
DATASET_DIR = pathlib.Path("datasets")
# glob() yields files in filesystem order, which differs between machines and
# runs. Sorting keeps the reference header (taken from the first file) and the
# row order - and therefore the seeded train/test split - reproducible.
csv_files = sorted(DATASET_DIR.glob("*.csv"))
DB_NAME = "network_threats.db"

In [None]:
def load_data(files):
    """Load several CSV files and combine them into one cleaned DataFrame.

    Column names are stripped of surrounding whitespace per file *before* the
    headers are compared, so files whose headers differ only in padding are
    treated as compatible and line up correctly when concatenated. The first
    file that parses successfully defines the reference header; files with a
    different header are skipped with a message. Files that fail to parse are
    reported and skipped (best effort).

    Args:
        files: iterable of paths accepted by ``pd.read_csv``.

    Returns:
        DataFrame with stripped column names, exact duplicate rows removed and
        every row containing a missing value dropped. The index is not reset
        after the drops.

    Raises:
        ValueError: if no file could be loaded.
    """
    frames = []
    header = None  # stripped column names of the first successfully parsed file

    for f in files:
        print(f"Loading {f}...")
        try:
            df = pd.read_csv(f, low_memory=False, encoding="utf-8", on_bad_lines="skip")
            # Normalise names up front so the comparison below and the final
            # concat both work on the cleaned header.
            df.columns = [str(col).strip() for col in df.columns]
            columns = df.columns.tolist()
            # Check if the header is consistent across files
            if header is None:
                header = columns
            elif header != columns:
                print(f"Header mismatch in {f}. Skipping this file.")
                continue
            # Drop a repeated header row if it shows up as the first data row.
            # The length guard keeps header-only files from raising IndexError.
            if len(df) and [str(value).strip() for value in df.iloc[0].tolist()] == header:
                df = df.iloc[1:]
            frames.append(df)
            print(f"Loaded {f} with {len(df)} rows.")
        except Exception as e:
            # Deliberately broad: one unreadable file should not abort the load.
            print(f"Error reading {f}: {e}")

    if not frames:
        raise ValueError("No CSV files could be loaded; nothing to concatenate.")

    # Concatenate all dataframes (headers are already cleaned and identical)
    df = pd.concat(frames, ignore_index=True)

    # drop duplicates and nans
    df = df.drop_duplicates()
    df = df.dropna()

    return df


In [None]:
def preprocess_data(df):
    """Add time, IP and label features to the flow table.

    The frame is modified in place (new columns are assigned on ``df``) and
    also returned.

    Columns written:
        Timestamp        parsed with ``pd.to_datetime`` (unparseable -> NaT)
        Source_Internal  1 if "Source IP" lies in 10.0.0.0/8, 172.16.0.0/12
        Dest_Internal      or 192.168.0.0/16, else 0
        Hour, DayOfWeek  taken from the parsed Timestamp
        source_ip        integer value of the address; non-string values are
        destination_ip     passed through unchanged
        label_encoded    ``LabelEncoder`` codes of "Label"

    Args:
        df: DataFrame with "Timestamp", "Source IP", "Destination IP" and
            "Label" columns.

    Returns:
        (df, label_encoder): the same frame and the fitted LabelEncoder.

    Raises:
        ValueError: if an address string cannot be parsed by ``ipaddress``
            during the integer conversion.
    """
    # Function-scope import keeps this cell self-contained.
    from functools import lru_cache

    print("Starting preprocessing...")
    # Convert timestamp
    print("Converting Timestamp...")
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
    print("Timestamp conversion complete.")

    # The three private IPv4 ranges the original octet checks encoded.
    private_networks = tuple(
        ipaddress.ip_network(cidr)
        for cidr in ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16")
    )

    # Flow tables repeat a small set of addresses many times; caching means
    # each distinct address is parsed (and, if invalid, reported) only once.
    @lru_cache(maxsize=None, typed=True)
    def is_internal(ip):
        # check if ip address is string
        if not isinstance(ip, str):
            print(f"Invalid IP address: {ip}")
            return 0
        try:
            address = ipaddress.ip_address(ip)
        except ValueError:
            print(f"Invalid IP address: {ip}")
            return 0
        # An IPv6 address is never "in" an IPv4 network, so it yields 0 here
        # instead of crashing the way splitting on "." and int() did.
        return int(any(address in network for network in private_networks))

    @lru_cache(maxsize=None, typed=True)
    def ip_to_int(ip):
        return int(ipaddress.ip_address(ip)) if isinstance(ip, str) else ip

    print("Categorizing IP addresses...")
    # Feature engineering
    df["Source_Internal"] = df["Source IP"].apply(is_internal)
    df["Dest_Internal"] = df["Destination IP"].apply(is_internal)
    df["Hour"] = df["Timestamp"].dt.hour
    df["DayOfWeek"] = df["Timestamp"].dt.dayofweek
    # convert IP addresses to integers
    df["source_ip"] = df["Source IP"].apply(ip_to_int)
    df["destination_ip"] = df["Destination IP"].apply(ip_to_int)
    print("IP categorization complete.")

    print("Entering categorical columns...")
    label_encoder = LabelEncoder()
    df["label_encoded"] = label_encoder.fit_transform(df["Label"])

    print("Categorical columns entered.")
    return df, label_encoder

In [None]:
def initialize_database():
    """Create the SQLite schema in DB_NAME if it is not already present.

    Creates the ``flows`` and ``ip_metadata`` tables and the two lookup
    indexes on ``flows``. Every statement uses IF NOT EXISTS, so calling this
    on an existing database leaves it unchanged.
    """
    # One row per network flow.
    flows_table = """CREATE TABLE IF NOT EXISTS flows (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            timestamp DATETIME,
            src_ip TEXT,
            src_port INTEGER,
            dst_ip TEXT,
            dst_port INTEGER,
            protocol INTEGER,
            duration REAL,
            fwd_packets INTEGER,
            bwd_packets INTEGER,
            label INTEGER,
            src_internal INTEGER,
            dst_internal INTEGER
        )"""

    # One row per IP address, filled by update_ip_metadata().
    ip_metadata_table = """CREATE TABLE IF NOT EXISTS ip_metadata (
            ip TEXT PRIMARY KEY,
            internal INTEGER,
            degree_centrality REAL,
            last_seen DATETIME)"""

    # Indexes on both endpoints of a flow.
    index_statements = (
        """CREATE INDEX IF NOT EXISTS idx_src_ip ON flows(src_ip)""",
        """CREATE INDEX IF NOT EXISTS idx_dst_ip ON flows(dst_ip)""",
    )

    with closing(sqlite3.connect(DB_NAME)) as conn:
        cursor = conn.cursor()
        for statement in (flows_table, ip_metadata_table, *index_statements):
            cursor.execute(statement)
        conn.commit()


def update_ip_metadata():
    """Rebuild the per-IP rows of ``ip_metadata`` from the ``flows`` table.

    For every address seen as source or destination this stores: the maximum
    of its internal flag, the number of flows it appears in (outgoing plus
    incoming, written to ``degree_centrality``) and the latest timestamp.
    Existing rows for the same IP are replaced.
    """
    # Per-source aggregate: outgoing flow count and latest timestamp.
    source_summary = """
        CREATE TEMPORARY TABLE IF NOT EXISTS temp_src_ips AS
        SELECT
            src_ip AS ip,
            MAX(src_internal) AS internal,
            COUNT(*) AS out_degree,
            MAX(timestamp) AS last_seen
        FROM flows
        GROUP BY src_ip
        """

    # Per-destination aggregate: incoming flow count and latest timestamp.
    destination_summary = """
        CREATE TEMPORARY TABLE IF NOT EXISTS temp_dst_ips AS
        SELECT
            dst_ip AS ip,
            MAX(dst_internal) AS internal,
            COUNT(*) AS in_degree,
            MAX(timestamp) AS last_seen
        FROM flows
        GROUP BY dst_ip
        """

    # Fold both views into a single row per address.
    combined_summary = """
            CREATE TEMPORARY TABLE IF NOT EXISTS temp_combined_ips AS
            SELECT
                ip,
                MAX(internal) AS internal,
                SUM(out_degree) AS out_degree,
                SUM(in_degree) AS in_degree,
                MAX(last_seen) AS last_seen
            FROM (
                SELECT ip, internal, out_degree, 0 AS in_degree, last_seen FROM temp_src_ips
                UNION ALL
                SELECT ip, internal, 0 AS out_degree, in_degree, last_seen FROM temp_dst_ips
            )
            GROUP BY ip
            """

    # Upsert into the metadata table.
    upsert_metadata = """
        INSERT OR REPLACE INTO ip_metadata
        SELECT
            ip,
            internal,
            (COALESCE(out_degree, 0) + (COALESCE(in_degree, 0))) AS degree_centrality,
            last_seen
        FROM temp_combined_ips
        """

    # Temp tables are dropped explicitly once the upsert is done.
    cleanup = (
        "DROP TABLE IF EXISTS temp_src_ips",
        "DROP TABLE IF EXISTS temp_dst_ips",
        "DROP TABLE IF EXISTS temp_combined_ips",
    )

    pipeline = (source_summary, destination_summary, combined_summary, upsert_metadata, *cleanup)
    with closing(sqlite3.connect(DB_NAME)) as conn:
        for statement in pipeline:
            conn.execute(statement)
        conn.commit()


def store_in_database(df):
    """Append the flow columns of ``df`` to the ``flows`` table.

    The DataFrame columns are renamed to the database column names, the
    twelve flow columns are appended to ``flows`` in DB_NAME, and the IP
    metadata table is refreshed afterwards. ``df`` itself is not modified.
    Rows are appended on every call.
    """
    # DataFrame column -> database column, in table column order.
    column_map = {
        "Timestamp": "timestamp",
        "Source IP": "src_ip",
        "Source Port": "src_port",
        "Destination IP": "dst_ip",
        "Destination Port": "dst_port",
        "Protocol": "protocol",
        "Flow Duration": "duration",
        "Total Fwd Packets": "fwd_packets",
        "Total Backward Packets": "bwd_packets",
        "Label": "label",
        "Source_Internal": "src_internal",
        "Dest_Internal": "dst_internal",
    }
    flow_columns = list(column_map.values())
    flow_data = df.rename(columns=column_map)

    with closing(sqlite3.connect(DB_NAME)) as conn:
        # Store flows
        flow_data[flow_columns].to_sql("flows", conn, if_exists="append", index=False)
        conn.commit()

        # Update IP metadata with latest information
        update_ip_metadata()


def calculate_graph_features(ip_filter=None):
    """Compute per-IP degree and activity features from the ``flows`` table.

    Args:
        ip_filter: optional collection of source IPs (list, set, NumPy array,
            ...). When given and non-empty, the temporal features
            (``active_days``, ``activity_span``) are only reported for these
            source IPs. ``None`` or an empty collection means "no filter".
            The degree counts are always computed over every flow in the
            database; the filter does not apply to them.

    Returns:
        DataFrame with one row per IP seen as source or destination and the
        columns ``ip``, ``out_degree``, ``in_degree``, ``active_days`` and
        ``activity_span`` (days between the first and last flow sent by the
        IP). Missing temporal values are filled with 0.
    """
    # Degree features
    degree_query = """
        SELECT ip,
            SUM(out_degree) AS out_degree,
            SUM(in_degree) AS in_degree
        FROM (
            SELECT src_ip AS ip, 1 AS out_degree, 0 AS in_degree FROM flows
            UNION ALL
            SELECT dst_ip AS ip, 0 AS out_degree, 1 AS in_degree FROM flows
        )
        GROUP BY ip
        """

    # Temporal features, one row per source IP.
    temporal_query = """
            SELECT
                src_ip AS ip,
                COUNT(DISTINCT DATE(timestamp)) AS active_days,
                JULIANDAY(MAX(timestamp)) - JULIANDAY(MIN(timestamp)) AS activity_span
            FROM flows
            GROUP BY src_ip
            """

    with closing(sqlite3.connect(DB_NAME)) as conn:
        degrees = pd.read_sql(degree_query, conn)
        temporal = pd.read_sql(temporal_query, conn)

    # Filter in pandas rather than with "WHERE src_ip IN (?, ?, ...)": binding
    # one parameter per IP hits SQLite's host-parameter limit for large IP
    # sets, and the previous len(ip_filter) crashed on the default None.
    # Grouping by src_ip and then keeping the wanted IPs gives the same rows
    # as filtering before the grouping.
    if ip_filter is not None and len(ip_filter) > 0:
        temporal = temporal[temporal["ip"].isin(ip_filter)]

    return degrees.merge(temporal, on="ip", how="left").fillna(0)


def enrich_data(df, graph_features):
    """Join graph features and IP metadata onto each flow by its source IP.

    Args:
        df: flow DataFrame with a "Source IP" column.
        graph_features: per-IP frame keyed by an ``ip`` column, as returned by
            ``calculate_graph_features``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index containing the columns of
        ``df``, the graph feature columns, and ``degree_centrality`` /
        ``days_since_last_seen`` read from the ``ip_metadata`` table. Source
        IPs without metadata get ``degree_centrality`` 0 and
        ``days_since_last_seen`` 365. Graph feature columns are left as NaN
        for source IPs that are missing from ``graph_features``. The helper
        ``ip`` join keys are removed. An empty ``df`` yields an empty
        DataFrame.
    """
    with closing(sqlite3.connect(DB_NAME)) as conn:
        ip_meta = pd.read_sql(
            """
            SELECT
                ip,
                degree_centrality,
                JULIANDAY('now') - JULIANDAY(last_seen) AS days_since_last_seen
            FROM ip_metadata
            """,
            conn,
        )

    # Merge in chunks. Each merged chunk is collected in a list and
    # concatenated once at the end: re-concatenating the growing result on
    # every iteration copied all previous rows again (quadratic) and started
    # from an empty DataFrame, which pandas deprecates in concat.
    chunk_size = 10000
    merged_chunks = []
    for chunk_number, start in enumerate(range(0, len(df), chunk_size), start=1):
        chunk = df.iloc[start : start + chunk_size]
        merged_chunks.append(
            chunk.merge(graph_features, left_on="Source IP", right_on="ip", how="left")
            .merge(ip_meta, left_on="Source IP", right_on="ip", how="left")
            .fillna({"degree_centrality": 0, "days_since_last_seen": 365})
        )
        print(f"Processed chunk {chunk_number}")

    if merged_chunks:
        enriched_df = pd.concat(merged_chunks, ignore_index=True)
    else:
        # No rows at all: keep returning an empty frame, as before.
        enriched_df = pd.DataFrame()

    # Both merges bring in an "ip" key column; pandas suffixes them _x/_y.
    return enriched_df.drop(columns=["ip_x", "ip_y"], errors="ignore")


In [None]:
def train_model(X_train, y_train):
    """Build the preprocessing + random-forest pipeline, plot its learning curve and fit it.

    Args:
        X_train: DataFrame containing the columns listed in
            ``numeric_features`` and ``categorical_features``.
        y_train: class labels aligned with ``X_train``.

    Returns:
        The Pipeline fitted on the full training data.

    Side effects:
        Writes ``learning_curve.png`` to the working directory.
    """
    print("Defining features and labels...")

    # Numeric columns are standardised, categorical ones one-hot encoded;
    # categories unseen during fit are ignored at transform time.
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    preprocessor = ColumnTransformer(
        [
            ("num", StandardScaler(), numeric_features),
            ("cat", one_hot, categorical_features),
        ]
    )

    # "balanced" weights computed once on the whole training set and handed
    # to the forest as an explicit {label: weight} mapping.
    labels = np.unique(y_train)
    weights = compute_class_weight(class_weight="balanced", classes=labels, y=y_train)
    weight_by_label = dict(zip(labels, weights))

    print("Preprocessing pipeline defined.")

    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=8,
        min_samples_split=5,
        max_features="sqrt",
        class_weight=weight_by_label,
        random_state=42,
    )
    model = Pipeline([("preprocessor", preprocessor), ("classifier", forest)])

    # A GridSearchCV sweep (cv=3, accuracy) over
    #   classifier__n_estimators in [180, 150, 100],
    #   classifier__max_depth in [7, 8, 5],
    #   classifier__min_samples_split in [3, 5]
    # used to live here; it is not run as part of training.

    # Generate learning curves
    print("Computing learning curves...")
    train_sizes, train_scores, val_scores = learning_curve(
        model,
        X_train,
        y_train,
        cv=5,
        n_jobs=-1,
        scoring="accuracy",
        random_state=42,
    )

    def draw_curve(scores, label, marker):
        """Plot the mean score per training size with a +/- one std band."""
        mean = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(train_sizes, mean, label=label, marker=marker)
        plt.fill_between(train_sizes, mean - spread, mean + spread, alpha=0.1)

    # Plot
    plt.figure(figsize=(10, 6))
    draw_curve(train_scores, "Training Accuracy", "o")
    draw_curve(val_scores, "Validation Accuracy", "s")
    plt.title("Learning Curve")
    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig("learning_curve.png")
    plt.close()

    # Fit final model
    print("Fitting final model...")
    model.fit(X_train, y_train)

    return model

In [None]:
def plot_graph(df, name, max_nodes=100, desired_clusters=20):
    """Draw the source -> destination IP graph, laid out in subnet clusters.

    Args:
        df: DataFrame with "Source IP" and "Destination IP" columns.
        name: suffix of the output file ``knowledge_graph_{name}.png``.
        max_nodes: keep only this many highest-degree nodes.
        desired_clusters: target number of subnet groups; the prefix length
            between /16 and /28 whose group count is closest is used.

    Side effects:
        Saves the figure to the working directory.
    """

    def network_address(ip, mask):
        # strict=False lets a host address stand in for its network.
        return ipaddress.ip_network(f"{ip}/{mask}", strict=False).network_address

    def get_subnet(ip, mask):
        try:
            return str(network_address(ip, mask)) + f"/{mask}"
        except ValueError:
            return "invalid"

    # Try to pick best subnet mask based on IP distribution
    def choose_subnet_mask(ips, desired_clusters):
        def count_clusters(mask):
            try:
                return len({str(network_address(ip, mask)) for ip in ips})
            except ValueError:
                # Unparseable address: make this mask lose every comparison.
                return float("inf")

        best_mask, best_diff = 24, float("inf")
        for mask in range(16, 29):  # Test from /16 to /28
            diff = abs(count_clusters(mask) - desired_clusters)
            if diff < best_diff:
                best_mask, best_diff = mask, diff
        return best_mask

    # One directed edge per flow; repeated pairs collapse into one edge.
    G = nx.DiGraph()
    for src, dst in zip(df["Source IP"], df["Destination IP"]):
        G.add_edge(src, dst)

    # Trim to top N active nodes
    degree_by_node = dict(G.degree)
    busiest = sorted(degree_by_node, key=degree_by_node.get, reverse=True)[:max_nodes]
    G = G.subgraph(busiest).copy()

    # Choose the mask from the IPs that survived the trim
    subnet_mask = choose_subnet_mask(list(G.nodes), desired_clusters)

    # Build subnet groups
    subnet_groups = defaultdict(set)
    for node in G.nodes:
        subnet_groups[get_subnet(node, subnet_mask)].add(node)

    # Lay out each subnet group on its own, then shift it along x so the
    # clusters sit side by side.
    pos = {}
    spacing = 5
    for cluster_index, members in enumerate(subnet_groups.values()):
        cluster_pos = nx.spring_layout(G.subgraph(members), k=0.5, seed=42)
        x_shift = cluster_index * spacing
        for node in cluster_pos:
            cluster_pos[node][0] += x_shift
        pos.update(cluster_pos)

    # Plot
    plt.figure(figsize=(14, 10))
    nx.draw(
        G,
        pos,
        with_labels=True,
        node_size=700,
        node_color="lightblue",
        font_size=9,
        font_weight="bold",
        edge_color="gray",
    )
    plt.title(f"Knowledge Graph (Subnet Clustered - /{subnet_mask})")
    plt.savefig(f"knowledge_graph_{name}.png")
    plt.close()

In [None]:
# Read every CSV found in the dataset folder into one cleaned DataFrame.
df = load_data(csv_files)

# Optional row shuffle, currently disabled:
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Quick look at the combined raw data.
print(f"Total rows after concatenation: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print(df.head())

# Add the engineered columns and encode the labels. `df` is rebound to the
# returned frame; `encoder` is the fitted LabelEncoder used later to map class
# ids back to label names.
print("Preprocessing data...")
df, encoder = preprocess_data(df)
print("Data preprocessing complete.")

print(df.head())

Loading datasets/Friday-WorkingHours-Morning.pcap_ISCX.csv...
Loaded datasets/Friday-WorkingHours-Morning.pcap_ISCX.csv with 191033 rows.
Loading datasets/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv...
Loaded datasets/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv with 225745 rows.
Loading datasets/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv...
Loaded datasets/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv with 288602 rows.
Loading datasets/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv...
Loaded datasets/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv with 286467 rows.
Loading datasets/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv...
Loaded datasets/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv with 458968 rows.
Loading datasets/Wednesday-workingHours.pcap_ISCX.csv...
Loaded datasets/Wednesday-workingHours.pcap_ISCX.csv with 208788 rows.
Loading datasets/Tuesday-WorkingHours.pcap_ISCX.csv...
Loaded datasets/Tuesday-Worki

In [None]:
# 60/40 split, stratified on the encoded label and seeded for repeatability.
train_df, test_df = train_test_split(
    df, test_size=0.4, random_state=42, stratify=df["label_encoded"]
)
# import matplotlib.pyplot as plt
# import seaborn as sns

# plt.figure(figsize=(10, 6))
# sns.histplot(data=train_df, x="label_encoded", bins=30, kde=True)
# plt.title("Distribution of Encoded Labels")
# plt.xlabel("Encoded Labels")
# plt.ylabel("Frequency")

# NOTE(review): the two calls below are disabled, so the query that follows
# reads whatever is already stored in the database file - presumably filled
# by an earlier run; confirm before relying on a fresh runtime.
# initialize_database()
# store_in_database(train_df)

# Restrict the temporal graph features to source IPs seen in the training split.
train_ips = train_df["Source IP"].unique()
graph_features = calculate_graph_features(ip_filter=train_ips)

In [None]:
# Full list of model inputs, numeric columns first.
feature_columns = numeric_features + categorical_features

# Enrich datasets: both splits are joined against the same graph features.
X_train = enrich_data(train_df, graph_features)
X_test = enrich_data(test_df, graph_features)

# without enrichment
# X_train = train_df.copy()
# X_test = test_df.copy()

# print("Data enrichment complete.")
# print("X_train")
# print(X_train.head())
# print("X_test")
# print(X_test.head())

# Split the target off the enriched frames; the enriched frames come back
# with a fresh index, so labels and features stay aligned row by row.
y_train = X_train["label_encoded"]
y_test = X_test["label_encoded"]
X_train = X_train.drop(columns=["label_encoded"])
X_test = X_test.drop(columns=["label_encoded"])

# Only the declared feature columns are passed to training.
model = train_model(X_train[feature_columns], y_train)
print("Model training complete.")


Processed chunk 1
Processed chunk 2
Processed chunk 3
Processed chunk 4
Processed chunk 5
Processed chunk 6
Processed chunk 7
Processed chunk 8
Processed chunk 9
Processed chunk 10
Processed chunk 11
Processed chunk 12
Processed chunk 13
Processed chunk 14
Processed chunk 15
Processed chunk 16
Processed chunk 17
Processed chunk 18
Processed chunk 19
Processed chunk 20
Processed chunk 21
Processed chunk 22
Processed chunk 23
Processed chunk 24
Processed chunk 25
Processed chunk 26
Processed chunk 27
Processed chunk 28
Processed chunk 29
Processed chunk 30
Processed chunk 31
Processed chunk 32
Processed chunk 33
Processed chunk 34
Processed chunk 35
Processed chunk 36
Processed chunk 37
Processed chunk 38
Processed chunk 39
Processed chunk 40
Processed chunk 41
Processed chunk 42
Processed chunk 43
Processed chunk 44
Processed chunk 45
Processed chunk 46
Processed chunk 47
Processed chunk 48
Processed chunk 49
Processed chunk 50
Processed chunk 51
Processed chunk 52
Processed chunk 53
Pr

In [None]:
  # Evaluate the fitted pipeline on the training split, then on the held-out
  # test split, and compare the two per class.
  #
  # Changes from the earlier version of this cell:
  #   * the classification-report frames no longer reuse the names train_df /
  #     test_df, which silently replaced the data splits and broke any re-run
  #     of the enrichment cell;
  #   * test predictions select feature_columns, like every other model call;
  #   * the three plot routines that were copy-pasted for train and test are
  #     shared helpers;
  #   * modules already imported in the import cell are not re-imported.

  def _plot_pr_curves(y_true, proba, classes, out_path):
      """Save one-vs-rest precision-recall curves, one line per class."""
      plt.figure(figsize=(10, 6))
      for i, class_name in enumerate(classes):
          is_class = (y_true == class_name).astype(int)
          precision, recall, _ = precision_recall_curve(is_class, proba[:, i])
          average_precision = average_precision_score(is_class, proba[:, i])
          plt.plot(recall, precision, label=f"{class_name} (AP={average_precision:.2f})")
      plt.xlabel("Recall")
      plt.ylabel("Precision")
      plt.title("Precision-Recall Curve")
      plt.legend()
      plt.savefig(out_path)
      plt.close()

  def _plot_roc_curves(y_true, proba, classes, out_path):
      """Save one-vs-rest ROC curves, one line per class, plus the chance diagonal."""
      plt.figure(figsize=(10, 6))
      y_bin = label_binarize(y_true, classes=classes)
      for i, class_name in enumerate(classes):
          fpr, tpr, _ = roc_curve(y_bin[:, i], proba[:, i])
          roc_auc = auc(fpr, tpr)
          plt.plot(fpr, tpr, label=f"{class_name} (AUC={roc_auc:.2f})")
      plt.plot([0, 1], [0, 1], "k--")
      plt.xlabel("False Positive Rate")
      plt.ylabel("True Positive Rate")
      plt.title("ROC Curve")
      plt.legend()
      plt.savefig(out_path)
      plt.close()

  def _plot_confusion_matrix(y_true, y_pred, class_names, out_path):
      """Save a confusion-matrix heatmap whose ticks show the original label names."""
      plt.figure(figsize=(10, 6))
      # The matrix is computed on the numeric class ids; names are only tick labels.
      cm = confusion_matrix(y_true, y_pred)
      sns.heatmap(
          cm,
          annot=True,
          fmt="d",
          cmap="Blues",
          cbar=False,
          xticklabels=class_names,
          yticklabels=class_names,
      )
      plt.xlabel("Predicted")
      plt.ylabel("True")
      plt.title("Confusion Matrix")
      plt.tight_layout()  # Ensure labels fit properly
      plt.savefig(out_path)
      plt.close()

  # Evaluate performance on training data (before validation)
  # print("\n===== TRAINING PERFORMANCE (BEFORE VALIDATION) =====")
  y_train_pred = model.predict(X_train[feature_columns])
  y_train_proba = model.predict_proba(X_train[feature_columns])

  print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
  print("Training F1 Score:", f1_score(y_train, y_train_pred, average="weighted"))
  print("Training ROC AUC Score:", roc_auc_score(y_train, y_train_proba, multi_class="ovr"))
  print("Training Log Loss:", log_loss(y_train, y_train_proba))
  print("Training Average Precision Score:", average_precision_score(y_train, y_train_proba))
  print("Training Classification Report:")
  print(classification_report(y_train, y_train_pred))
  print("Training Confusion Matrix:")
  print(confusion_matrix(y_train, y_train_pred))

  # Numeric class ids in the order of the predict_proba columns, and the
  # original label names from the encoder.
  classes = model.classes_
  class_names = encoder.classes_

  _plot_pr_curves(y_train, y_train_proba, classes, "precision_recall_curve_before_validation.png")
  _plot_roc_curves(y_train, y_train_proba, classes, "roc_curve_before_validation.png")
  _plot_confusion_matrix(y_train, y_train_pred, class_names, "confusion_matrix_before_validation.png")

  # load model from file
  # model = joblib.load("threat_model.pkl")

  print("\n===== TEST PERFORMANCE (AFTER VALIDATION) =====")
  print("Evaluating model...")
  y_pred = model.predict(X_test[feature_columns])
  y_proba = model.predict_proba(X_test[feature_columns])
  print("Model evaluation complete.")

  # dump the model to a file
  joblib.dump(model, "threat_model.pkl")
  print("Model saved as threat_model.pkl")

  print("Test Accuracy:", accuracy_score(y_test, y_pred))
  print(classification_report(y_test, y_pred))
  print("F1 Score:", f1_score(y_test, y_pred, average="weighted"))
  print("Confusion Matrix:")
  print(confusion_matrix(y_test, y_pred))
  print("ROC AUC Score:", roc_auc_score(y_test, y_proba, multi_class="ovr"))
  print("Log Loss:", log_loss(y_test, y_proba))
  print("Average Precision Score:", average_precision_score(y_test, y_proba))

  _plot_pr_curves(y_test, y_proba, classes, "precision_recall_curve_after_validation.png")
  _plot_roc_curves(y_test, y_proba, classes, "roc_curve_after_validation.png")
  _plot_confusion_matrix(y_test, y_pred, class_names, "confusion_matrix_after_validation.png")

  # Add feature importance analysis
  print("\n===== FEATURE IMPORTANCE ANALYSIS =====")
  importances = model.named_steps["classifier"].feature_importances_

  # Get feature names from preprocessor: numeric columns keep their names,
  # the one-hot encoder (second transformer) reports its expanded names.
  num_features = numeric_features
  cat_features = model.named_steps["preprocessor"].transformers_[1][1].get_feature_names_out(categorical_features).tolist()
  feature_names = num_features + cat_features

  # Create DataFrame of feature importances
  feature_importances = pd.DataFrame({
      "Feature": feature_names,
      "Importance": importances,
  }).sort_values(by="Importance", ascending=False)

  print("Feature importances:")
  print(feature_importances)
  print("Feature importance analysis complete.")

  # Visualize feature importance
  plt.figure(figsize=(12, 8))
  sns.barplot(x="Importance", y="Feature", data=feature_importances.head(15))
  plt.title("Top 15 Most Important Features")
  plt.tight_layout()
  plt.savefig("feature_importance.png")
  plt.close()

  # Per-class performance comparison
  print("\n===== PER-CLASS PERFORMANCE COMPARISON =====")
  train_report = classification_report(y_train, y_train_pred, output_dict=True)
  test_report = classification_report(y_test, y_pred, output_dict=True)

  # Convert to DataFrames for easier handling. Distinct names on purpose:
  # train_df / test_df are the data splits created earlier in the notebook.
  train_report_df = pd.DataFrame(train_report).transpose()
  test_report_df = pd.DataFrame(test_report).transpose()

  # Filter to keep only class data (remove accuracy, macro avg, weighted avg)
  train_classes = train_report_df.iloc[:-3].copy()
  test_classes = test_report_df.iloc[:-3].copy()

  # Add dataset identifier
  train_classes['dataset'] = 'Training'
  test_classes['dataset'] = 'Test'

  # Combine for plotting
  combined_classes = pd.concat([train_classes, test_classes])
  combined_classes = combined_classes.reset_index().rename(columns={'index': 'class'})

  # Map numeric class labels to actual label names
  class_name_mapping = dict(zip(range(len(encoder.classes_)), encoder.classes_))
  combined_classes['class_name'] = combined_classes['class'].map(lambda x: class_name_mapping.get(int(x), x))

  # Plot F1 scores per class using actual label names
  plt.figure(figsize=(14, 8))
  sns.barplot(x='class_name', y='f1-score', hue='dataset', data=combined_classes)
  plt.title('F1 Score by Class: Training vs Test')
  plt.xlabel('Class')
  plt.ylabel('F1 Score')
  plt.xticks(rotation=45)
  plt.grid(axis='y', linestyle='--', alpha=0.7)
  plt.tight_layout()
  plt.savefig('f1_score_by_class.png')
  plt.close()

  # Print detailed per-class comparison with actual label names
  print("\nPer-class F1 scores:")
  class_comparison = pd.DataFrame({
      'Class': [class_name_mapping.get(int(c), c) for c in combined_classes['class'].unique()[:len(classes)]],
      'Training F1': train_classes['f1-score'].values,
      'Test F1': test_classes['f1-score'].values,
      'Difference': train_classes['f1-score'].values - test_classes['f1-score'].values
  }).sort_values(by='Difference', ascending=False)
  print(class_comparison)

Training Accuracy: 0.9945015865431973
Training F1 Score: 0.995491470332102
Training ROC AUC Score: 0.9999818947213694
Training Log Loss: 0.06751720199043325
Training Average Precision Score: 0.9968999537207119
Training Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1025426
           1       0.70      1.00      0.82      1180
           2       1.00      1.00      1.00     76816
           3       1.00      0.92      0.96     75847
           4       0.36      1.00      0.53      3299
           5       0.85      0.94      0.89      3478
           6       1.00      1.00      1.00      4763
           7       0.92      1.00      0.96        22
           8       1.00      1.00      1.00     95358
           9       0.96      1.00      0.98      1784
          10       0.99      1.00      0.99       904
          11       0.87      1.00      0.93        13
          12       0.99      0.99      0.99       391



In [None]:
# Cross-validation: score the pipeline with 5-fold CV on the training
# features, using weighted F1 and all available cores.
print("\n===== CROSS-VALIDATION =====")
cv_scores = cross_val_score(model, X_train[feature_columns], y_train, cv=5, scoring="f1_weighted", n_jobs=-1)
print("Cross-validation F1 scores:", cv_scores)
print("Mean cross-validation F1 score:", np.mean(cv_scores))
print("Cross-validation complete.")

# print("Feature importances:")
# importances = model.named_steps["classifier"].feature_importances_
# feature_names = (
#     numeric_features + model.named_steps["preprocessor"].transformers_[1][1].get_feature_names_out(categorical_features).tolist()
# )
# feature_importances = pd.DataFrame(
#     {
#         "Feature": feature_names,
#         "Importance": importances,
#     }
# ).sort_values(by="Importance", ascending=False)
# print(feature_importances)
# print("Feature importances computed.")


===== CROSS-VALIDATION =====
Cross-validation F1 scores: [0.99685539 0.99748492 0.99762467 0.99792112 0.9959969 ]
Mean cross-validation F1 score: 0.9971765981567728
Cross-validation complete.


In [None]:
# Persist the fitted pipeline and the label encoder next to the notebook so
# predictions can later be mapped back to label names.
import joblib

_artifacts = (
    ("Model", model, "threat_model.pkl"),
    ("Label encoder", encoder, "label_encoder.pkl"),
)
for _title, _artifact, _filename in _artifacts:
    joblib.dump(_artifact, _filename)
    print(f"{_title} saved as {_filename}")

# Save the graph features, to a db
# GRAPH_FEATURES_DB = "graph_features.db"
# if pathlib.Path(GRAPH_FEATURES_DB).exists():
#     pathlib.Path(GRAPH_FEATURES_DB).unlink()
# with closing(sqlite3.connect(GRAPH_FEATURES_DB)) as conn:
#     graph_features.to_sql("graph_features", conn, if_exists="replace", index=False)
#     conn.commit()