# CICIDS2017 - NayveBayes Multiclass Classifier

The preprocessing is the same as in LSTM.ipynb.

In [1]:
from sklearn.preprocessing import QuantileTransformer
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from math import ceil
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

In [2]:
dfv3 = pd.read_csv('feature_selected_cicids2017.csv')

qt = QuantileTransformer(random_state=10) # number of quantiles can be set, default n_quantiles=1000

labels = dfv3.loc[:, "Label"]

binary_labels = dfv3.loc[:, "Traffic type"]

dfv3.drop(["Label", "Traffic type"], axis=1, inplace=True) # drop categorical columns

dfv3_scalled = qt.fit_transform(dfv3)

dfv3_scalled

array([[3.00586076e-04, 4.42942943e-01, 8.26826827e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.01916103e-04, 4.42942943e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.02248610e-04, 4.42942943e-01, 8.26826827e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [3.16219702e-01, 9.42335051e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [9.45149941e-01, 9.92141282e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [4.34898670e-01, 7.19219219e-01, 2.55755756e-01, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

### Train Test Split

In [3]:
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels  = train_test_split(dfv3_scalled, labels, random_state=10, train_size=0.7) # 70/30 train test split
train_features, validation_features, train_labels, validation_labels = train_test_split(train_features, train_labels, random_state=10, train_size=0.8)

labels_count = train_labels.value_counts()
all_samples = labels_count.sum()
print(labels_count)
print("Total: {}".format(all_samples))

Label
BENIGN              280308
DoS Hulk             96802
DDoS                 71851
PortScan             50862
DoS GoldenEye         5717
FTP-Patator           3299
DoS slowloris         3013
DoS Slowhttptest      2942
SSH-Patator           1816
Bot                   1079
Brute Force            800
XSS                    376
Infiltration            20
Sql Injection           12
Heartbleed               5
Name: count, dtype: int64
Total: 518902


### OverSampling

In [4]:
# Parameters
min_threshold = 0.005  # Minimum percentage threshold for resampling
min_samples_small_class = 2  # Minimum samples required for small classes

# Ensure DataFrame/Series compatibility
def ensure_dataframe(features, labels):
    if isinstance(features, np.ndarray):
        features = pd.DataFrame(features)
    if isinstance(labels, np.ndarray):
        labels = pd.Series(labels)
    return features.reset_index(drop=True), labels.reset_index(drop=True)

# Handle small classes separately
def replicate_small_classes(features, labels, min_samples=2):
    features, labels = ensure_dataframe(features, labels)
    small_classes = labels.value_counts()[labels.value_counts() < min_samples].index
    replicated_features, replicated_labels = [], []
    for cls in small_classes:
        cls_features = features[labels == cls]
        cls_labels = labels[labels == cls]
        replicated_features.append(resample(cls_features, replace=True, n_samples=min_samples, random_state=10))
        replicated_labels.append(resample(cls_labels, replace=True, n_samples=min_samples, random_state=10))
    if replicated_features:
        replicated_features = pd.concat(replicated_features, ignore_index=True)
        replicated_labels = pd.concat(replicated_labels, ignore_index=True)
    else:
        replicated_features = pd.DataFrame(columns=features.columns)
        replicated_labels = pd.Series(dtype=labels.dtype)
    return replicated_features, replicated_labels

# Main oversampling function
def oversample_data(features, labels):
    features, labels = ensure_dataframe(features, labels)
    labels_count = labels.value_counts()
    all_samples = labels_count.sum()

    # Handle classes with fewer than `min_samples_small_class`
    small_classes = labels_count[labels_count < min_samples_small_class].index
    small_features = features[labels.isin(small_classes)]
    small_labels = labels[labels.isin(small_classes)]
    replicated_features, replicated_labels = replicate_small_classes(small_features, small_labels, min_samples=min_samples_small_class)

    # Determine valid k_neighbors dynamically for SMOTE
    smallest_majority_class_size = labels_count[labels_count >= min_samples_small_class].min()
    k_neighbors = max(1, min(5, smallest_majority_class_size - 1))  # SMOTE requires k_neighbors < samples in class

    # Create SMOTE sampling strategy
    smote_strategy = {
        cls: max(count, ceil(min_threshold * all_samples))
        for cls, count in labels_count.items()
        if count >= min_samples_small_class
    }

    # Apply SMOTE
    smote = SMOTE(random_state=10, k_neighbors=k_neighbors, sampling_strategy=smote_strategy)
    try:
        over_features, over_labels = smote.fit_resample(features, labels)
    except ValueError as e:
        print(f"SMOTE failed with ValueError: {e}")
        return features, labels  # Return original data if SMOTE fails

    # Combine SMOTE results with small classes
    final_features = pd.concat([pd.DataFrame(over_features), replicated_features], ignore_index=True)
    final_labels = pd.concat([pd.Series(over_labels), replicated_labels], ignore_index=True)

    return final_features, final_labels

# Apply oversampling
over_train_features, over_train_labels = oversample_data(train_features, train_labels)
over_validation_features, over_validation_labels = oversample_data(validation_features, validation_labels)

# Print results
print("Oversampled Training Labels Distribution:")
print(over_train_labels.value_counts())
print("Oversampled Validation Labels Distribution:")
print(over_validation_labels.value_counts())


  final_features = pd.concat([pd.DataFrame(over_features), replicated_features], ignore_index=True)


Oversampled Training Labels Distribution:
BENIGN              280308
DoS Hulk             96802
DDoS                 71851
PortScan             50862
DoS GoldenEye         5717
FTP-Patator           3299
DoS slowloris         3013
DoS Slowhttptest      2942
SSH-Patator           2595
Brute Force           2595
Bot                   2595
XSS                   2595
Sql Injection         2595
Infiltration          2595
Heartbleed            2595
Name: count, dtype: int64
Oversampled Validation Labels Distribution:
Label
BENIGN              70305
DoS Hulk            24154
DDoS                17932
PortScan            12559
DoS GoldenEye        1420
FTP-Patator           831
DoS slowloris         757
DoS Slowhttptest      723
Brute Force           649
SSH-Patator           649
Bot                   649
XSS                   649
Infiltration          649
Sql Injection         649
Heartbleed              3
Name: count, dtype: int64


### One-hot encoding

In [5]:
from sklearn.preprocessing import OneHotEncoder

test_labels_rshped = test_labels.values.reshape(-1,1)
over_train_labels_rshped = over_train_labels.values.reshape(-1,1)
over_validation_rshped = over_validation_labels.values.reshape(-1,1)

ohenc = OneHotEncoder()


test_labels_enc = ohenc.fit_transform(test_labels_rshped).toarray()  # one-hot encoded test set lbls
over_train_labels_enc = ohenc.fit_transform(over_train_labels_rshped).toarray()  # one-hot encoded upsampled train set lbls
over_validation_labels_enc = ohenc.fit_transform(over_validation_rshped).toarray()  # one-hot encoded upsampled train set lbls for neural nets predicting upsampled traffic

print("Shape of train features", over_train_features.shape)
print("Shape of validation features", over_validation_features.shape)
print("Shape of test features", test_features.shape)

Shape of train features (532959, 36)
Shape of validation features (132578, 36)
Shape of test features (277984, 36)


### NayveBayes

In [6]:
# Input shape for the data
num_classes = over_train_labels_enc.shape[1]  # Number of classes

# Convert one-hot encoded labels back to categorical for Naive Bayes
train_labels_categorical = np.argmax(over_train_labels_enc, axis=1)
validation_labels_categorical = np.argmax(over_validation_labels_enc, axis=1)
test_labels_categorical = np.argmax(test_labels_enc, axis=1)

# Create a Naive Bayes classifier
naive_bayes_model = GaussianNB()

# Train the Naive Bayes model
naive_bayes_model.fit(over_train_features, train_labels_categorical)

# Evaluate on the validation set
validation_predictions = naive_bayes_model.predict(over_validation_features)
validation_accuracy = accuracy_score(validation_labels_categorical, validation_predictions)
print(f"Validation Accuracy: {validation_accuracy}")

# Evaluate on the test set
test_predictions = naive_bayes_model.predict(test_features)
test_accuracy = accuracy_score(test_labels_categorical, test_predictions)
print(f"Test Accuracy: {test_accuracy}")

# Classification report
print("Classification Report on Test Data:")
print(classification_report(test_labels_categorical, test_predictions))

# Save the Naive Bayes model
import joblib
joblib.dump(naive_bayes_model, 'naive_bayes_multiclass_model.pkl')

Validation Accuracy: 0.901559836473623
Test Accuracy: 0.9004439104408887
Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.99      0.87      0.92    150258
           1       0.05      1.00      0.10       602
           2       0.55      0.86      0.67       459
           3       0.98      1.00      0.99     38231
           4       0.31      0.99      0.47      3149
           5       0.89      0.88      0.89     51890
           6       1.00      0.60      0.75      1563
           7       0.60      0.95      0.74      1615
           8       0.98      1.00      0.99      1801
           9       1.00      0.80      0.89         5
          10       1.00      0.90      0.95        10
          11       1.00      0.98      0.99     27273
          12       0.89      0.92      0.91       926
          13       0.29      1.00      0.45         5
          14       0.68      0.94      0.79       197

    accuracy             

['naive_bayes_multiclass_model.pkl']