# Install packages

In [None]:
# Install dependencies
# !pip3 uninstall numpy
!pip3 install --upgrade numpy==2.0.0
!pip3 install pandas
!pip3 install scikit-learn mlflow seaborn shap
!pip3 install bayesian-optimization


# Import package

In [26]:
# Import library

import pandas as pd
import numpy as np

import mlflow
from mlflow import MlflowClient

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.inspection import permutation_importance

import seaborn as sns
import matplotlib.pyplot as plt
import shap
import os

# Prepare dataset
Because test dataset not have label, we must split train dataset to 2 parts. One for train and one for validate. We just do this on the first time

In [3]:
# Read dataset
# df = pd.read_csv('data/data.csv')
# train_test_data, validate_data = train_test_split(df, test_size=0.2, random_state=42)
# train_test_data.to_csv('data/train_.csv', index=False, header=True)
# validate_data.to_csv('data/validate.csv', index=False, header=True)

Now we check some information of dataset

In [4]:
# Check dataset
df = pd.read_csv('data/train_.csv')
df.head()
print("Dataset column")
print(df.columns)
print("Summary of dataset info")
print(df.info)
print("view dimensions of dataset")
df.shape

# for col in df.columns:
#   if df[col].dtype != 'object':  # Exclude non-numeric columns
#     min_val = df[col].min()
#     max_val = df[col].max()
#     print(f"Column: {col}")
#     print(f"Minimum: {min_val}")
#     print(f"Maximum: {max_val}")
#     print()

print(df.isnull().sum())


Dataset column
Index(['ID', 'flow_duration', 'Header_Length', 'Protocol type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'Label'],
      dtype='object')
Summary of dataset info
<bound method DataFrame.info of               ID  flow_duration  Header_Length  Protocol type  Duration  \
0        2696539       4.789412         108.00           6.00     64.00   
1        1487915      89.817633      122260.90           8.70    113.70   
2        2033215    1496.467448      211355.40           8.20     88.70   
3         13142

We see some cell have null value, we can not drop which rows have null cell because it to much. So we just fill all null value = -1

In [5]:
# Fill all null data as -1
print(df.isnull().sum())
data_n_null = df.fillna(-1, inplace=False)
print(data_n_null.isnull().sum())

data_n_null.head()
print(data_n_null.duplicated().sum())
print(data_n_null['Label'].unique().tolist())


ID                      0
flow_duration           0
Header_Length      155801
Protocol type      155810
Duration           156043
Rate               156180
Srate              156075
Drate              156049
fin_flag_number         0
syn_flag_number         0
rst_flag_number    156030
psh_flag_number    156006
ack_flag_number         0
ece_flag_number    155889
cwr_flag_number    156119
ack_count          156078
syn_count          156278
fin_count               0
urg_count               0
rst_count               0
HTTP               155993
HTTPS              156399
DNS                     0
Telnet             156044
SMTP               155832
SSH                156261
IRC                     0
TCP                156010
UDP                     0
DHCP                    0
ARP                156189
ICMP               155988
IPv                     0
LLC                     0
Tot sum            155800
Min                156138
Max                155961
AVG                     0
Std         

For tracking during training, we using MLflow. The software defined by container in mlflow folder

In [6]:
# Set mlflow as tracking server
mlflow.set_tracking_uri("http://localhost:5000")

# Train model
We train with some model with these steps
- We training with small part of dataset (0.2 or 0.3): dataset_frac
- We log artifacts, we see some column less contribute in  Feature Importance Score, so we delete it
- We train with full dataset, verify droped column is correct and need modify or not
- We use RandomizedSearchCV to search parameter
- We save best parameter to mlflow. With mlflow.sklearn.autolog, model and its metrics was save to model registry. We just download it and use

## Decision Tree

1. We training with small dataset, use RandomizedSearchCV and get this artifacts
- Run Overview
    ![decisiontree-run-overview](./imgs/decision_tree/run_overview.png)

- Feature importance
    ![decisiontree-feature-importance](./imgs/decision_tree/feature_importance.png)

- Confusion matrix

    ![decisiontree-confusion-matrix](./imgs/decision_tree/training_confusion_matrix.png)

2. Then, we train with full dataset. Best model save on models folder. Picture bellow show some difference 2 runs.

![](./imgs/decision_tree/difference.png) 


In [None]:
# Decision Tree Classification: https://gist.github.com/pb111/af439e4affb1dd94879579cfd6793770
mlflow.set_experiment("decision_tree")

tags = {
    "dataset_frac": 1.0,
    "random_state": 42,
    "test_size" : 0.2,
    # "droped_column" : ['ID'],
    "droped_column" : ['ID','IPv','DNS','IRC','DHCP','ARP','SMTP','cwr_flag_number','ece_flag_number','Telnet','Drate','psh_flag_number','rst_flag_number','LLC', 'TCP','SSH','HTTPS','ack_flag_number','Std','Tot size', 'ack_count'],
    "author": "Son Nguyen",
    "parameter" : {
        "max_features" : [int(x) for x in np.linspace(2, 30, num = 10)],
        "criterion" : ['gini','entropy','log_loss'],
        "max_depth" : [int(x) for x in np.linspace(10, 110, num = 11)],
        "min_samples_split" : [int(x) for x in np.random.randint(2, 10, 5)],
        "min_samples_leaf" : [int(x) for x in np.random.randint(2, 20, 5)]
    }
}
tags['parameter']['max_depth'].append(None)

# Prepare data
data = data_n_null.drop(columns=tags['droped_column'])
data_sample = data.sample(frac=tags['dataset_frac'])
X = data_sample.drop(columns=['Label'])
y = data_sample['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = tags['test_size'], random_state = tags['random_state'])

# Pick model
tree = DecisionTreeClassifier(random_state=tags['random_state'])
random_search = RandomizedSearchCV(
    estimator=tree,
    param_distributions =tags['parameter'],
    cv=5,
    random_state= tags['random_state'],
    scoring='f1_weighted',  
    verbose=2,
    error_score='raise',
    n_jobs=-1  # Use all available CPU cores
)


with mlflow.start_run():
    mlflow.sklearn.autolog()
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    y_pred = best_model.predict(X_test)

    # Create some artifact
    feature_scores = pd.Series(best_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
    plt.figure(figsize=(20, 20))
    sns.barplot(x=feature_scores, y=feature_scores.index)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title("Visualizing Important Features")
    feature_importance_plot = "feature_importance.png"
    plt.savefig(feature_importance_plot, bbox_inches='tight')

    report_dict = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    report_df.reset_index(inplace=True)
    report_df.rename(columns={"index": "Attack Type"}, inplace=True)
    report_filename = "classification_report.csv"
    report_df.to_csv(report_filename, index=False)

    mlflow.log_artifact(feature_importance_plot)
    mlflow.log_artifact(report_filename)
    os.remove(feature_importance_plot)
    os.remove(report_filename)

    for key, value in tags.items():
        if key != "parameter":
            mlflow.set_tag(key, value)
    mlflow.set_tag("best_param", best_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END criterion=log_loss, max_depth=60, max_features=2, min_samples_leaf=15, min_samples_split=9; total time=   6.9s
[CV] END criterion=log_loss, max_depth=60, max_features=2, min_samples_leaf=15, min_samples_split=9; total time=   6.4s
[CV] END criterion=log_loss, max_depth=60, max_features=2, min_samples_leaf=15, min_samples_split=9; total time=   6.7s
[CV] END criterion=log_loss, max_depth=60, max_features=2, min_samples_leaf=15, min_samples_split=9; total time=   5.8s
[CV] END criterion=log_loss, max_depth=60, max_features=2, min_samples_leaf=15, min_samples_split=9; total time=   6.9s
[CV] END criterion=gini, max_depth=40, max_features=14, min_samples_leaf=8, min_samples_split=9; total time=  16.2s
[CV] END criterion=gini, max_depth=40, max_features=14, min_samples_leaf=8, min_samples_split=9; total time=  16.5s
[CV] END criterion=gini, max_depth=40, max_features=14, min_samples_leaf=8, min_samples_split=9; total time

2024/12/02 23:36:13 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


🏃 View run sassy-lamb-941 at: http://localhost:5000/#/experiments/5/runs/98207c11a7b045b3819468a3f8455d97
🧪 View experiment at: http://localhost:5000/#/experiments/5


## Naive Bayes
Now, we do same thing with GaussianNB

In [37]:
# # Naive Bayes https://gist.github.com/pb111/9e3816d2584a85ef7bff8d70bed20b1b
mlflow.set_experiment("naive-bayes")

tags = {
    "dataset_frac": 1.0,
    "random_state": 42,
    "test_size" : 0.2,
    "droped_column" : ['ID'],
    # "droped_column" : ['ID','IRC','DHCP','LLC','IPv','DNS','ece_flag_number','Drate','SMTP','Telnet','cwr_flag_number','ARP','SSH'],
    "author": "Son Nguyen",
    "parameter" : {
        # "var_smoothing": np.logspace(-9, -1, num=50)
        "var_smoothing": np.logspace(0,-9, num=100)
    }
}

# Prepare data
data = data_n_null.drop(columns=tags['droped_column'])
data_sample = data.sample(frac=tags['dataset_frac'])
X = data_sample.drop(columns=['Label'])
y = data_sample['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = tags['test_size'], random_state = tags['random_state'])

# Pick model
gnb = GaussianNB()
random_search = RandomizedSearchCV(
    estimator=gnb,
    param_distributions =tags['parameter'],
    cv=5,
    random_state= tags['random_state'],
    scoring='f1_weighted',  
    verbose=2,
    error_score='raise',
    n_jobs=None
)


with mlflow.start_run():
    mlflow.sklearn.autolog()
    random_search.fit(X_train, y_train)
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    y_pred = best_model.predict(X_test)
 

    # Create some artifact
    # perm_importance = permutation_importance(best_model, X_train, y_train, scoring="f1_weighted", n_repeats=10, random_state=42)
    # feature_scores = pd.Series(perm_importance.importances_mean, index=X_train.columns).sort_values(ascending=False)
    # plt.figure(figsize=(20, 20))
    # sns.barplot(x=feature_scores, y=feature_scores.index)
    # plt.xlabel('Feature Importance Score')
    # plt.ylabel('Features')
    # plt.title("Visualizing Important Features")
    # feature_importance_plot = "feature_importance.png"
    # plt.savefig(feature_importance_plot, bbox_inches='tight')

    report_dict = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    report_df.reset_index(inplace=True)
    report_df.rename(columns={"index": "Attack Type"}, inplace=True)
    report_filename = "classification_report.csv"
    report_df.to_csv(report_filename, index=False)

    # mlflow.log_artifact(feature_importance_plot)
    mlflow.log_artifact(report_filename)
    # os.remove(feature_importance_plot)
    os.remove(report_filename)

    for key, value in tags.items():
        if key != "parameter":
            mlflow.set_tag(key, value)
    mlflow.set_tag("best_param", best_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   5.9s
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   5.8s
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   5.9s
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   5.9s
[CV] END ................var_smoothing=2.848035868435799e-08; total time=   6.0s
[CV] END ...............var_smoothing=1.5199110829529332e-05; total time=   5.9s
[CV] END ...............var_smoothing=1.5199110829529332e-05; total time=   6.0s
[CV] END ...............var_smoothing=1.5199110829529332e-05; total time=   6.1s
[CV] END ...............var_smoothing=1.5199110829529332e-05; total time=   5.9s
[CV] END ...............var_smoothing=1.5199110829529332e-05; total time=   6.0s
[CV] END ...............var_smoothing=4.3287612810830526e-07; total time=   5.9s
[CV] END ...............var_smoothing=4.32876128

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024/12/03 19:53:20 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run shivering-cat-250 at: http://localhost:5000/#/experiments/7/runs/331f50ef4eba4125b34fe15666892090
🧪 View experiment at: http://localhost:5000/#/experiments/7


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
# # # # Random forest follow this: https://gist.github.com/pb111/88545fa33780928694388779af23bf58
# # # Turning hyper parameter follow this: https://www.geeksforgeeks.org/random-forest-hyperparameter-tuning-in-python/
mlflow.set_experiment("random-forest")

tags = {
    "dataset_frac": 0.2,
    "random_state": 42,
    "test_size" : 0.2,
    # "droped_column" : ['ID'],
    "droped_column" : ['ID','IRC','DHCP','LLC','IPv','DNS','ece_flag_number','Drate','SMTP','Telnet','cwr_flag_number','ARP','SSH'],
    "author": "Son Nguyen",
    "parameter" : {
        "n_estimators" : [int(x) for x in np.linspace(start = 50, stop = 1000, num = 10)],
        "criterion" : ['gini','entropy','log_loss'],
        "max_depth" : [int(x) for x in np.linspace(10, 1000, num = 20)],
        "min_samples_split" : [int(x) for x in np.random.randint(2, 10, 5)],
        "min_samples_leaf" : [int(x) for x in np.random.randint(1, 20, 5)],
        "max_features" : [int(x) for x in np.random.randint(2, 30, 10)],
        "max_leaf_nodes" : [int(x) for x in np.linspace(10, 100, num = 13)],
        "bootstrap" : [True],
        "max_samples" : np.random.rand(5),
        "class_weight" : ['balanced','balanced_subsample',None]
    }
}
tags['parameter']['max_depth'].append(None)
tags['parameter']['max_leaf_nodes'].append(None)

# Prepare data
data = data_n_null.drop(columns=tags['droped_column'])
data_sample = data.sample(frac=tags['dataset_frac'])
X = data_sample.drop(columns=['Label'])
y = data_sample['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = tags['test_size'], random_state = tags['random_state'])

# Pick model
forest = RandomForestClassifier(random_state = tags['random_state'])
# random_search = RandomizedSearchCV(
#     estimator=forest,
#     param_distributions = tags['parameter'],
#     cv=5,
#     random_state= tags['random_state'],
#     scoring='f1_weighted',  
#     verbose=2,
#     error_score='raise',
#     n_jobs=None  # Use all available CPU cores
# )

with mlflow.start_run():
    mlflow.sklearn.autolog()
    forest.fit(X_train, y_train)
    # best_model = random_search.best_estimator_
    # best_params = random_search.best_params_
    y_pred = forest.predict(X_test)

    # Create some artifact
    feature_scores = pd.Series(forest.feature_importances_, index=X_train.columns).sort_values(ascending=False)
    plt.figure(figsize=(20, 20))
    sns.barplot(x=feature_scores, y=feature_scores.index)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title("Visualizing Important Features")
    feature_importance_plot = "feature_importance.png"
    plt.savefig(feature_importance_plot, bbox_inches='tight')

    report_dict = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    report_df.reset_index(inplace=True)
    report_df.rename(columns={"index": "Attack Type"}, inplace=True)
    report_filename = "classification_report.csv"
    report_df.to_csv(report_filename, index=False)

    mlflow.log_artifact(feature_importance_plot)
    mlflow.log_artifact(report_filename)
    os.remove(feature_importance_plot)
    os.remove(report_filename)

    for key, value in tags.items():
        if key != "parameter":
            mlflow.set_tag(key, value)
    # mlflow.set_tag("best_param", best_params)

🏃 View run blushing-lark-657 at: http://localhost:5000/#/experiments/3/runs/fae831c3dc3642b8ae809d7a3553b792
🧪 View experiment at: http://localhost:5000/#/experiments/3
