In [None]:
# Install dependencies
# !pip3 uninstall numpy
!pip3 install --upgrade numpy==2.0.0
!pip3 install pandas
!pip3 install scikit-learn mlflow seaborn shap





[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python310.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python310.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python310.exe -m pip install --upgrade pip


In [None]:
# Import library

import pandas as pd
import mlflow
from mlflow import MlflowClient

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold

import seaborn as sns
import matplotlib.pyplot as plt
import shap
import os

In [2]:
# Read dataset
df = pd.read_csv('data/train.csv')

In [3]:
# Check dataset
df.head()
print("Dataset column")
print(df.columns)
print("Summary of dataset info")
print(df.info)
print("view dimensions of dataset")
df.shape

# for col in df.columns:
#   if df[col].dtype != 'object':  # Exclude non-numeric columns
#     min_val = df[col].min()
#     max_val = df[col].max()
#     print(f"Column: {col}")
#     print(f"Minimum: {min_val}")
#     print(f"Maximum: {max_val}")
#     print()

print(df.isnull().sum())


Dataset column
Index(['ID', 'flow_duration', 'Header_Length', 'Protocol type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'Label'],
      dtype='object')
Summary of dataset info
<bound method DataFrame.info of               ID  flow_duration  Header_Length  Protocol type  Duration  \
0         769866       0.000000          54.00           6.00     64.00   
1        1859874       0.000892          54.58            NaN     64.00   
2         396092       0.000000           0.00           1.00     64.00   
3         17970

In [4]:
# Fill all null data as -1
print(df.isnull().sum())
data_n_null = df.fillna(-1, inplace=False)
print(data_n_null.isnull().sum())

data_n_null.head()
print(data_n_null.duplicated().sum())


ID                      0
flow_duration           0
Header_Length      195013
Protocol type      195013
Duration           195013
Rate               195013
Srate              195013
Drate              195013
fin_flag_number         0
syn_flag_number         0
rst_flag_number    195013
psh_flag_number    195013
ack_flag_number         0
ece_flag_number    195013
cwr_flag_number    195013
ack_count          195013
syn_count          195013
fin_count               0
urg_count               0
rst_count               0
HTTP               195013
HTTPS              195013
DNS                     0
Telnet             195013
SMTP               195013
SSH                195013
IRC                     0
TCP                195013
UDP                     0
DHCP                    0
ARP                195013
ICMP               195013
IPv                     0
LLC                     0
Tot sum            195013
Min                195013
Max                195013
AVG                     0
Std         

In [5]:
# Set mlflow as tracking server
mlflow.set_tracking_uri("http://localhost:5000")

In [7]:
# # Random forest follow this: https://gist.github.com/pb111/88545fa33780928694388779af23bf58
mlflow.set_experiment("random_forest")


data = data_n_null.drop(columns=['ID'])
data_sample = data.sample(frac=0.5)

X = data_sample.drop(columns=['Label'])
y = data_sample['Label']
# X = data.drop(columns=['Label'])
# y = data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

tags = {
    "dataset_frac": 0.5,
    "random_state": 42,
    "n_estimators" : 100,
    "test_size" : 0.2,
    "author": "Son Nguyen"
}

model = RandomForestClassifier(n_estimators=100,random_state=42)
mlflow.sklearn.autolog()

# Write artifact to mlflow
with mlflow.start_run() as run:

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    # explainer = shap.TreeExplainer(model)
    # shap_values = explainer.shap_values(X_train)
    # mlflow.shap.log_explanation(shap_values, X_train)
    
    # Draw Feature Importance Score
    feature_scores = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)
    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature_scores, y=feature_scores.index)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title("Visualizing Important Features")

    feature_importance_plot = "feature_importance.png"
    plt.savefig(feature_importance_plot, bbox_inches='tight')


    # Draw Confusion Matrix
    # cm = confusion_matrix(y_test, y_pred)
    # plt.figure(figsize=(20, 20))
    # sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
    # plt.xlabel('Predicted Labels')
    # plt.ylabel('True Labels')
    # plt.title('Confusion Matrix Heatmap')
    # confusion_matrix_plot = "confusion_matrix.png"
    # plt.savefig(confusion_matrix_plot, bbox_inches='tight')

    # Create classification report
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    report_df.reset_index(inplace=True)
    report_df.rename(columns={"index": "Attack Type"}, inplace=True)
    report_filename = "classification_report.csv"
    report_df.to_csv(report_filename, index=False)


    mlflow.log_artifact(feature_importance_plot)
    # mlflow.log_artifact(confusion_matrix_plot)
    mlflow.log_artifact(report_filename)

    os.remove(feature_importance_plot)
    os.remove(report_filename)


    new_f1_score = f1_score(y_test, y_pred, average="weighted")
    client = mlflow.tracking.MlflowClient()
    model_name = "random_forest"
    latest_ = client.get_latest_versions(model_name, stages=None)[0]

    if latest_:
        previous_f1_score = client.get_metric_history(latest_.run_id, "training_f1_score")[-1].value

        # Compare F1 scores
        if new_f1_score > previous_f1_score:
            print("New F1 score is higher. Registering the new model version.")
            model_uri = "random_forest" 
            mlflow.sklearn.log_model(model, model_uri)
            registered_model = mlflow.register_model(f"runs:/{run.info.run_id}/{model_uri}", model_name)
            client.update_registered_model(
                name=model_name,
                description="A Random Forest Classifier trained.",
            )

            for key, value in tags.items():
                client.set_model_version_tag(
                    name=model_name,
                    version=registered_model.version,
                    key=key,
                    value=value
                )

            client.set_registered_model_alias(
                name=model_name,
                alias="test",
                version=registered_model.version,
            )
        else:
            print("New F1 score is not higher. Model will not be registered.")

    else:
        print("No previous model version found. Registering the new model as the first version.")
        model_uri = "random_forest" 
        mlflow.sklearn.log_model(model, model_uri)
        registered_model = mlflow.register_model(f"runs:/{run.info.run_id}/{model_uri}", model_name)
        client.update_registered_model(
            name=model_name,
            description="A Random Forest Classifier trained.",
        )

        for key, value in tags.items():
            client.set_model_version_tag(
                name=model_name,
                version=registered_model.version,
                key=key,
                value=value
            )

        client.set_registered_model_alias(
            name=model_name,
            alias="test",
            version=registered_model.version,
        )

  latest_ = client.get_latest_versions(model_name, stages=None)[0]


New F1 score is not higher. Model will not be registered.
🏃 View run mercurial-stag-700 at: http://localhost:5000/#/experiments/2/runs/e6f627a78c3e43188597afcb65a2d838
🧪 View experiment at: http://localhost:5000/#/experiments/2


In [None]:
# Mlflow check
client = mlflow.tracking.MlflowClient()
model_name = "random_forest"
latest_mv = client.get_latest_versions(model_name, stages=None)[0]
client.get_metric_history(latest_mv.run_id, "training_f1_score")[-1].value
