In [8]:
import numpy as np
import pandas as pd
from shared.utils import load_data
from datasets import preprocess_dataset, datasets_types
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from scipy.stats import beta as beta_dist
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, recall_score, roc_auc_score


In [9]:
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "4"

In [None]:
load_dataset = True
name = "CIC-IDS_2017_2"
seed = 42
if not load_dataset:
    # Preprocesar el dataset
    """      """
    df = load_data(
        [
            "./shared/data/CIC_2019/DrDoS_MSSQL.csv"
        ],
        seed
    )
    print("Dataset cargado")
    df_preprocessed = preprocess_dataset(
        df, save=True, dataset_type="CIC_2019", seed=seed, load=load_dataset, name_save=name, name_load=name)
    print("Dataset Preprocesado")
else:
    df_preprocessed = preprocess_dataset(
        pd.DataFrame(), save=True, dataset_type="CIC_2017", seed=seed, load=load_dataset, name_save=name, name_load=name)
    print("Dataset Preprocesado")

Loading existing data
Dataset Preprocesado


In [28]:
class MultiArmedBanditThompsonSampling:

    def __init__(self, n_arms, n_clusters):
        self.n_arms = n_arms
        self.n_clusters = n_clusters
        self.arms = [RandomForestClassifier(), DecisionTreeClassifier(),
                     GaussianNB(), LogisticRegression(), MLPClassifier()]
        self.cluster_centers = None
        self.cluster_assignments = None
        self.reward_sums = {}
        for cluster in range(n_clusters):
            self.reward_sums[cluster] = np.zeros(n_arms)
        self.alpha = np.ones(self.n_arms)
        self.beta = np.ones(self.n_arms)

    def train(self, X_train, y_train):
        kmeans = KMeans(n_clusters=self.n_clusters)
        self.cluster_assignments = kmeans.fit_predict(X_train)
        self.cluster_centers = kmeans.cluster_centers_
        # Print the number of samples in each cluster

        for i in range(self.n_clusters):
            print('Cluster {}: {}'.format(
                i, np.sum(self.cluster_assignments == i)))
            cluster_mask = self.cluster_assignments == i
            cluster_X_train = X_train[cluster_mask]
            cluster_y_train = y_train[cluster_mask]
            for arm in range(self.n_arms):
                print('Training arm {} on cluster {}'.format(arm, i))
                # arm_mask = cluster_y_train == arm
                arm_mask = (cluster_y_train.values == arm).ravel()
                print("cluster_X_train shape:", cluster_X_train.shape)  # Số dòng, số cột của X
                print("cluster_y_train shape:", cluster_y_train.shape)  # Số dòng của y
                print("arm_mask shape:", arm_mask.shape)  # Kích thước của mask

                arm_X_train = cluster_X_train[arm_mask]
                arm_y_train = cluster_y_train[arm_mask]
                if len(arm_X_train) > 0 and len(np.unique(arm_y_train)) > 1:
                    self.arms[arm].fit(arm_X_train, arm_y_train)
                else:
                    self.arms[arm].fit(X_train, y_train)

        # Set the arms rewards for each cluster
        for i in range(self.n_clusters):
            cluster_mask = self.cluster_assignments == i
            cluster_X_test = X_train[cluster_mask]
            cluster_y_test = y_train[cluster_mask]
            for arm in range(self.n_arms):
                print('Setting reward_sums arm {} on cluster {}'.format(arm, i))
                arm_mask = (cluster_y_test.values == arm).ravel()
                arm_X_test = cluster_X_test[arm_mask]
                arm_y_test = cluster_y_test[arm_mask]
                if len(arm_X_test) > 0:
                    arm_y_pred = self.arms[arm].predict(arm_X_test)
                    self.reward_sums[i][arm] = np.mean(
                        arm_y_pred ==  arm_y_test.values.ravel())

    def select_arm(self, cluster):
        # Select the arm with the highest reward
        theta = np.zeros(self.n_arms)
        for arm in range(self.n_arms):
            theta[arm] = np.random.beta(self.alpha[arm] + self.reward_sums[cluster]
                                        [arm], self.beta[arm] + 1 - self.reward_sums[cluster][arm])
        return np.argmax(theta)

    def predict(self, X_test):
        # Select the arm for each sample
        arms = np.zeros(len(X_test))
        for i in range(len(X_test)):
            cluster = np.argmin(np.linalg.norm(
                self.cluster_centers - X_test[i], axis=1))
            arms[i] = self.select_arm(cluster)
        # Predict using the selected arm
        y_pred = np.zeros(len(X_test))
        for arm in range(self.n_arms):
            arm_mask = arms == arm
            arm_X_test = X_test[arm_mask]
            if len(arm_X_test) > 0:
                y_pred[arm_mask] = self.arms[arm].predict(arm_X_test)
        return y_pred, arms

In [32]:
# Train the MAB
mab = MultiArmedBanditThompsonSampling(n_arms=5, n_clusters=3)
mab.train(df_preprocessed.x_train, df_preprocessed.y_train)

Cluster 0: 92469
Training arm 0 on cluster 0
cluster_X_train shape: (92469, 69)
cluster_y_train shape: (92469, 1)
arm_mask shape: (92469,)


  return fit_method(estimator, *args, **kwargs)


Training arm 1 on cluster 0
cluster_X_train shape: (92469, 69)
cluster_y_train shape: (92469, 1)
arm_mask shape: (92469,)
Training arm 2 on cluster 0
cluster_X_train shape: (92469, 69)
cluster_y_train shape: (92469, 1)
arm_mask shape: (92469,)


  y = column_or_1d(y, warn=True)


Training arm 3 on cluster 0
cluster_X_train shape: (92469, 69)
cluster_y_train shape: (92469, 1)
arm_mask shape: (92469,)


  y = column_or_1d(y, warn=True)


Training arm 4 on cluster 0
cluster_X_train shape: (92469, 69)
cluster_y_train shape: (92469, 1)
arm_mask shape: (92469,)


  y = column_or_1d(y, warn=True)


Cluster 1: 204140
Training arm 0 on cluster 1
cluster_X_train shape: (204140, 69)
cluster_y_train shape: (204140, 1)
arm_mask shape: (204140,)


  return fit_method(estimator, *args, **kwargs)


Training arm 1 on cluster 1
cluster_X_train shape: (204140, 69)
cluster_y_train shape: (204140, 1)
arm_mask shape: (204140,)
Training arm 2 on cluster 1
cluster_X_train shape: (204140, 69)
cluster_y_train shape: (204140, 1)
arm_mask shape: (204140,)


  y = column_or_1d(y, warn=True)


Training arm 3 on cluster 1
cluster_X_train shape: (204140, 69)
cluster_y_train shape: (204140, 1)
arm_mask shape: (204140,)


  y = column_or_1d(y, warn=True)


Training arm 4 on cluster 1
cluster_X_train shape: (204140, 69)
cluster_y_train shape: (204140, 1)
arm_mask shape: (204140,)


  y = column_or_1d(y, warn=True)


Cluster 2: 195662
Training arm 0 on cluster 2
cluster_X_train shape: (195662, 69)
cluster_y_train shape: (195662, 1)
arm_mask shape: (195662,)


  return fit_method(estimator, *args, **kwargs)


Training arm 1 on cluster 2
cluster_X_train shape: (195662, 69)
cluster_y_train shape: (195662, 1)
arm_mask shape: (195662,)
Training arm 2 on cluster 2
cluster_X_train shape: (195662, 69)
cluster_y_train shape: (195662, 1)
arm_mask shape: (195662,)


  y = column_or_1d(y, warn=True)


Training arm 3 on cluster 2
cluster_X_train shape: (195662, 69)
cluster_y_train shape: (195662, 1)
arm_mask shape: (195662,)


  y = column_or_1d(y, warn=True)


Training arm 4 on cluster 2
cluster_X_train shape: (195662, 69)
cluster_y_train shape: (195662, 1)
arm_mask shape: (195662,)


  y = column_or_1d(y, warn=True)


Setting reward_sums arm 0 on cluster 0
Setting reward_sums arm 1 on cluster 0
Setting reward_sums arm 2 on cluster 0
Setting reward_sums arm 3 on cluster 0
Setting reward_sums arm 4 on cluster 0
Setting reward_sums arm 0 on cluster 1
Setting reward_sums arm 1 on cluster 1
Setting reward_sums arm 2 on cluster 1
Setting reward_sums arm 3 on cluster 1
Setting reward_sums arm 4 on cluster 1
Setting reward_sums arm 0 on cluster 2
Setting reward_sums arm 1 on cluster 2
Setting reward_sums arm 2 on cluster 2
Setting reward_sums arm 3 on cluster 2
Setting reward_sums arm 4 on cluster 2


In [34]:
# Kiểm tra nếu y_test là DataFrame, lấy cột đầu tiên
if isinstance(df_preprocessed.y_test, pd.DataFrame):
    y_test_series = df_preprocessed.y_test.iloc[:, 0]  # Lấy cột đầu tiên
else:
    y_test_series = df_preprocessed.y_test  # Nếu là Series thì giữ nguyên

# Xử lý giá trị trong y_test
y_test_series = y_test_series.astype(str).str.strip()  # Xóa khoảng trắng
y_test_series = y_test_series[y_test_series.str.isnumeric()]  # Giữ lại giá trị số
y_test = np.array(y_test_series.astype(int))  # Chuyển thành mảng số nguyên

# Test the MAB
y_pred, selected_arms = mab.predict(df_preprocessed.x_test)

# Chuyển đổi y_pred sang kiểu số nguyên
y_pred = np.array([int(y) for y in y_pred])

# In các giá trị duy nhất
print("Unique values in y_pred:", np.unique(y_pred))
print("Unique values in y_test:", np.unique(y_test))

# Đánh giá mô hình
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 Score:", f1_score(y_test, y_pred, average='macro'))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))


Unique values in y_pred: [0 1]
Unique values in y_test: [0 1]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99    124297
           1       0.98      0.99      0.98     86677

    accuracy                           0.99    210974
   macro avg       0.98      0.99      0.98    210974
weighted avg       0.99      0.99      0.99    210974

Accuracy: 0.9852446273000465
Recall: 0.9858259337935356
F1 Score: 0.9847919437202677
ROC AUC Score: 0.9858259337935357
