## Import Dependencies

In [2]:
import pandas as pd
import numpy as np
from pycaret.datasets import get_data
from pycaret.classification import *
import os

## Define functions

In [3]:
dataset_prefix = '0_01-partial-dataset'

In [4]:
def save_pre_processed_dataset(df: pd.DataFrame, name):
    """Save as the dataset pre-processed as csv in the dir root/Datasets/CSE-CIC/IDS2018/pre-processed/"""
    file_name = dataset_prefix + '-' + name + '.csv'
    
    # Define the path to save the dataset pre-processed
    file_path = os.path.join(os.getcwd(), '..', 'Datasets', 'CSE-CIC-IDS2018', 'pre-processed', file_name)

    # Save the dataset pre-processed
    df.to_csv(file_path ,index=False)

In [6]:
def test_and_save_pre_processing_approach(approach_name, setup):
    # Test different algorithms
    setup.compare_models(include = ['ada', 'gbc', 'et','xgboost', 'rf', 'dt', 'lightgbm'])
    # setup.compare_models()
    
    # Getting the resulting df of the models_comparison
    df_models_comparison = pull().copy()
    
    # Getting the dataset_pre_processed by the autoML
    df_dataset_pre_processed = setup.get_config('dataset_transformed').copy()

    # Save as the dataset pre-processed as csv in the dir root/Datasets/CSE-CIC/IDS2018/pre-processed/
    save_pre_processed_dataset(df_dataset_pre_processed, approach_name)

    return df_models_comparison, df_dataset_pre_processed


## Set display unlimited number of lines

In [7]:
pd.set_option('display.max_rows', None, 'display.max_columns', None)

## Load datasets in pandas dataframes

In [8]:
# Define the path to the folder containing the CSV files
file_path = os.path.join(os.getcwd(), '..', 'Datasets', 'CSE-CIC-IDS2018', 'raw', '0_01-partial-dataset.csv')
# Import csv to pandas
dataset = pd.read_csv(file_path)

# Pre-processing

## 1. First steps for pycaret be able to consume

#### Replace -inf/+inf for Nan

In [9]:
dataset.replace([np.inf, -np.inf], np.nan, inplace=True)

#### Convert Timestamp values to pandas date and time datetime64 format

In [10]:
dataset['Timestamp'] = pd.to_datetime(dataset['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='coerce')

## 2. Pre-processing using Pycaret

### 2.1 Generic pre-processing specifications techniques
The autoML choose the following parameters automatically:
- inputation_type: simple
    - numeric_imputation: mean
    - categorical_imputation: mode
- fold_strategy: stratifiedkfold
    - fold: 10

In [19]:
generic_pre_processing_setup = setup(
                                dataset, 
                                target = 'Label',
                                imputation_type = 'simple',
                                numeric_imputation = 'mean',
                                categorical_imputation = 'mode',
                                fold_strategy = 'stratifiedkfold',
                                fold = 10
                                )

Unnamed: 0,Description,Value
0,Session id,5543
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(31480, 82)"
5,Transformed train set shape,"(22036, 82)"
6,Transformed test set shape,"(9444, 82)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [20]:
result = test_and_save_pre_processing_approach('generic_pre_processing', generic_pre_processing_setup)
models_comparison_generic = result[0]
dataset_generic_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9999,1.0,0.9,1.0,0.9333,0.9333,0.9414,1.974
et,Extra Trees Classifier,0.9998,1.0,0.8167,1.0,0.88,0.8799,0.8937,0.666
xgboost,Extreme Gradient Boosting,0.9998,0.9999,0.7833,0.9,0.8267,0.8266,0.834,0.755
rf,Random Forest Classifier,0.9996,0.9997,0.6333,0.9,0.7167,0.7166,0.7405,1.157
dt,Decision Tree Classifier,0.9994,0.9164,0.8333,0.7,0.7505,0.7502,0.7581,0.422
knn,K Neighbors Classifier,0.9992,0.8998,0.45,0.7,0.5067,0.5064,0.5389,0.803
gbc,Gradient Boosting Classifier,0.9991,0.8809,0.65,0.675,0.639,0.6386,0.6501,8.827
ridge,Ridge Classifier,0.999,0.9571,0.2333,0.3667,0.2633,0.2632,0.2807,0.263
qda,Quadratic Discriminant Analysis,0.9989,0.838,0.0,0.0,0.0,0.0,0.0,0.409
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,0.153


### 2.2 Specifyng date feature pre-processing
- Discarding year
- Adding second, day and month features

This discarding is necessary beacause the year of the network package must not be taken as a variable to determine if the package is malicious or not

In [24]:
date_specific_pre_processing_setup = setup(
                                    dataset, 
                                    target = 'Label',
                                    date_features = ['Timestamp'],
                                    create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                    imputation_type = 'simple',
                                    numeric_imputation = 'mean',
                                    categorical_imputation = 'mode',
                                    fold_strategy = 'stratifiedkfold',
                                    fold = 10
                                )

Unnamed: 0,Description,Value
0,Session id,5991
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(31480, 84)"
5,Transformed train set shape,"(22036, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [25]:
result = test_and_save_pre_processing_approach('date_specific_pre_processing', date_specific_pre_processing_setup)
models_comparison_date_specific = result[0]
dataset_date_specific_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,1.0,1.0,0.9667,1.0,0.98,0.98,0.9816,1.971
xgboost,Extreme Gradient Boosting,0.9998,0.9997,0.7833,1.0,0.86,0.8599,0.8753,0.712
et,Extra Trees Classifier,0.9996,0.9999,0.6333,0.9,0.7233,0.7232,0.744,0.514
rf,Random Forest Classifier,0.9995,0.9998,0.5,0.8,0.5933,0.5932,0.6201,0.983
dt,Decision Tree Classifier,0.9993,0.8248,0.65,0.7567,0.6571,0.6568,0.6792,0.33
knn,K Neighbors Classifier,0.9992,0.8665,0.3833,0.4333,0.4,0.3999,0.4039,0.711
ridge,Ridge Classifier,0.9992,0.9383,0.35,0.5,0.3967,0.3965,0.41,0.171
gbc,Gradient Boosting Classifier,0.9992,0.8747,0.6167,0.65,0.6033,0.603,0.6177,8.946
lr,Logistic Regression,0.9991,0.9285,0.35,0.4,0.3667,0.3664,0.3705,3.814
qda,Quadratic Discriminant Analysis,0.9989,0.8598,0.0,0.0,0.0,0.0,0.0,0.425


### 2.3 Missing values pre-processing

In [11]:
simple_imputation_pre_processing_setup = setup(
                                                dataset, 
                                                target = 'Label',
                                                date_features = ['Timestamp'],
                                                create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                                imputation_type = 'simple',
                                                numeric_imputation = 'knn',
                                                categorical_imputation = 'mode',
                                                fold = 10
                                                )

Unnamed: 0,Description,Value
0,Session id,3903
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(31480, 84)"
5,Transformed train set shape,"(22036, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [12]:
result = test_and_save_pre_processing_approach('simple_imputation_pre_processing', simple_imputation_pre_processing_setup)
models_comparison_simple_imputation = result[0]
dataset_simple_imputation_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9998,0.992,0.8667,0.9667,0.8933,0.8932,0.9047,3.309
et,Extra Trees Classifier,0.9997,1.0,0.7667,0.9,0.8,0.7999,0.8154,0.791
xgboost,Extreme Gradient Boosting,0.9997,0.9999,0.75,1.0,0.83,0.8299,0.8514,0.791
rf,Random Forest Classifier,0.9995,0.9998,0.5667,0.9,0.6667,0.6665,0.6982,1.119
dt,Decision Tree Classifier,0.9994,0.8582,0.7167,0.6767,0.6783,0.678,0.687,0.581
gbc,Gradient Boosting Classifier,0.9991,0.9201,0.6167,0.6433,0.6017,0.6012,0.6151,7.24
lightgbm,Light Gradient Boosting Machine,0.9987,0.8995,0.7333,0.5595,0.6015,0.601,0.622,0.988


In [26]:
iterative_imputation_pre_processing_setup = setup(
                                                dataset, 
                                                target = 'Label',
                                                date_features = ['Timestamp'],
                                                create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                                imputation_type = 'iterative',
                                                fold_strategy = 'stratifiedkfold',
                                                fold = 10
                                                )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005555 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12901
[LightGBM] [Info] Number of data points in the train set: 21925, number of used features: 70
[LightGBM] [Info] Start training from score 301342.783134
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12901
[LightGBM] [Info] Number of data points in the train set: 21925, number of used features: 70
[LightGBM] [Info] Start training from score 28209.183952
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015280 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Inf

Unnamed: 0,Description,Value
0,Session id,3091
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(31480, 84)"
5,Transformed train set shape,"(22036, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [27]:
result = test_and_save_pre_processing_approach('iterative_imputation_pre_processing', iterative_imputation_pre_processing_setup)
models_comparison_iterative_imputation = result[0]
dataset_iterative_imputation_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,1.0,1.0,0.95,1.0,0.9667,0.9666,0.9707,7.725
et,Extra Trees Classifier,0.9997,1.0,0.75,1.0,0.83,0.8299,0.8514,7.245
xgboost,Extreme Gradient Boosting,0.9997,0.9998,0.7,1.0,0.7967,0.7965,0.8221,7.185
rf,Random Forest Classifier,0.9995,0.9996,0.55,1.0,0.6933,0.6931,0.7322,5.306
gbc,Gradient Boosting Classifier,0.9995,0.9464,0.75,0.8,0.7433,0.7431,0.7582,21.545
dt,Decision Tree Classifier,0.9994,0.8998,0.8,0.79,0.7505,0.7502,0.7718,4.905
knn,K Neighbors Classifier,0.9993,0.8832,0.3333,0.7,0.4467,0.4465,0.4798,6.018
ridge,Ridge Classifier,0.9993,0.9762,0.3333,0.7,0.4467,0.4465,0.4798,4.398
lr,Logistic Regression,0.999,0.9056,0.3333,0.4,0.3305,0.3301,0.3474,10.1
qda,Quadratic Discriminant Analysis,0.9989,0.8746,0.0,0.0,0.0,0.0,0.0,4.64


#### Conclusion
Analyzing mainly the Recall and F1 metrics it was possible do see:
- The Decision Tree and Extra Trees algorithms were improved
- The Ada Boost and Extreme Gradiend Boosting were deteriorated

### 2.4 Fix imbalance pre-processing

In [34]:
fix_imbalance_pre_processing_setup = setup(
                                                dataset, 
                                                target = 'Label',
                                                date_features = ['Timestamp'],
                                                create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                                imputation_type = 'simple',
                                                numeric_imputation = 'mean',
                                                categorical_imputation = 'mode',
                                                fold_strategy = 'stratifiedkfold',
                                                fold = 10,
                                                fix_imbalance = True,
                                                fix_imbalance_method = 'SMOTE'
                                                )

Unnamed: 0,Description,Value
0,Session id,476
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(53468, 84)"
5,Transformed train set shape,"(44024, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [35]:
result = test_and_save_pre_processing_approach('fix_imbalance_pre_processing', fix_imbalance_pre_processing_setup)
models_comparison_fix_imbalance = result[0]
dataset_fix_imbalance_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,1.0,1.0,0.9667,1.0,0.98,0.98,0.9816,5.221
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.013
xgboost,Extreme Gradient Boosting,1.0,1.0,0.95,1.0,0.9667,0.9666,0.9707,1.017
et,Extra Trees Classifier,0.9998,1.0,0.7833,0.9,0.8167,0.8166,0.8284,1.48
lightgbm,Light Gradient Boosting Machine,0.9998,0.9648,0.8667,0.9667,0.8933,0.8932,0.9047,1.369
rf,Random Forest Classifier,0.9997,1.0,0.75,1.0,0.83,0.8299,0.8514,3.2
dt,Decision Tree Classifier,0.9996,0.85,0.7,0.825,0.7324,0.7322,0.7466,0.997
qda,Quadratic Discriminant Analysis,0.9992,0.9077,0.3333,0.6,0.4167,0.4166,0.4405,0.666
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,0.241
knn,K Neighbors Classifier,0.9961,0.9406,0.8833,0.216,0.3403,0.3391,0.4289,1.518


#### Conclusion
Analyzing mainly the Recall and F1 metrics it was possible do see:
- The SMOTE method proved be the most suitable
- The Ada, xgboost, gbc and rf were improved
- The Decision Tree were deteriorated

### 2.5 Remove Outliers pre-processing

In [39]:
remove_outliers_pre_processing_setup = setup(
                                            dataset, 
                                            target = 'Label',
                                            date_features = ['Timestamp'],
                                            create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                            imputation_type = 'simple',
                                            numeric_imputation = 'mean',
                                            categorical_imputation = 'mode',
                                            fold_strategy = 'stratifiedkfold',
                                            fold = 10,
                                            fix_imbalance = True,
                                            fix_imbalance_method = 'SMOTE',
                                            remove_outliers = True,
                                            outliers_method = 'iforest'
                                            )

Unnamed: 0,Description,Value
0,Session id,7987
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(51274, 84)"
5,Transformed train set shape,"(41830, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [40]:
result = test_and_save_pre_processing_approach('remove_outliers_pre_processing', remove_outliers_pre_processing_setup)
models_comparison_remove_outliers = result[0]
dataset_remove_outliers_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,0.9997,0.9993,0.8,0.9167,0.8267,0.8265,0.8416,1.501
dt,Decision Tree Classifier,0.9996,0.8749,0.75,0.9,0.7833,0.7831,0.8031,0.753
ada,Ada Boost Classifier,0.9996,0.9806,0.7167,0.9333,0.7833,0.7831,0.8031,4.561
gbc,Gradient Boosting Classifier,0.9996,0.9997,0.7167,0.95,0.8,0.7998,0.8161,17.957
et,Extra Trees Classifier,0.9996,0.9985,0.6667,0.85,0.7267,0.7265,0.7416,1.721
rf,Random Forest Classifier,0.9995,0.9992,0.5833,0.8,0.6567,0.6565,0.6733,2.903
lightgbm,Light Gradient Boosting Machine,0.9995,0.8931,0.7167,0.8833,0.7667,0.7665,0.7824,1.673
qda,Quadratic Discriminant Analysis,0.9993,0.7922,0.4,0.9,0.55,0.5498,0.5973,0.849
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,0.515
knn,K Neighbors Classifier,0.9959,0.8571,0.7167,0.1794,0.2803,0.279,0.3507,1.414


In [42]:
remove_outliers_2_pre_processing_setup = setup(
                                            dataset, 
                                            target = 'Label',
                                            date_features = ['Timestamp'],
                                            create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                            imputation_type = 'simple',
                                            numeric_imputation = 'mean',
                                            categorical_imputation = 'mode',
                                            fold_strategy = 'stratifiedkfold',
                                            fold = 10,
                                            fix_imbalance = True,
                                            fix_imbalance_method = 'SMOTE',
                                            remove_outliers = True,
                                            outliers_method = 'ee'
                                            )

Unnamed: 0,Description,Value
0,Session id,7027
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(51264, 84)"
5,Transformed train set shape,"(41820, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [43]:
result = test_and_save_pre_processing_approach('remove_outliers_2_pre_processing', remove_outliers_2_pre_processing_setup)
models_comparison_remove_outliers_2 = result[0]
dataset_remove_outliers_2_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,1.0,1.0,0.95,1.0,0.9667,0.9666,0.9707,7.646
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.561
rf,Random Forest Classifier,0.9998,0.9997,0.7833,1.0,0.85,0.8499,0.8698,6.984
et,Extra Trees Classifier,0.9998,1.0,0.9,0.975,0.9157,0.9156,0.9259,4.265
dt,Decision Tree Classifier,0.9997,0.9166,0.8333,0.875,0.8424,0.8422,0.848,4.515
xgboost,Extreme Gradient Boosting,0.9997,0.9995,0.9333,0.8667,0.8933,0.8932,0.8965,4.297
qda,Quadratic Discriminant Analysis,0.9992,0.9518,0.25,0.4,0.2967,0.2966,0.31,4.249
lightgbm,Light Gradient Boosting Machine,0.9992,0.9993,0.8833,0.7461,0.794,0.7937,0.8037,4.903
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,3.791
knn,K Neighbors Classifier,0.9958,0.9574,0.9167,0.2023,0.329,0.3278,0.427,4.496


In [44]:
remove_outliers_3_pre_processing_setup = setup(
                                            dataset, 
                                            target = 'Label',
                                            date_features = ['Timestamp'],
                                            create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                            imputation_type = 'simple',
                                            numeric_imputation = 'mean',
                                            categorical_imputation = 'mode',
                                            fold_strategy = 'stratifiedkfold',
                                            fold = 10,
                                            fix_imbalance = True,
                                            fix_imbalance_method = 'SMOTE',
                                            remove_outliers = True,
                                            outliers_method = 'lof'
                                            )

Unnamed: 0,Description,Value
0,Session id,288
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(51286, 84)"
5,Transformed train set shape,"(41842, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [45]:
result = test_and_save_pre_processing_approach('remove_outliers_3_pre_processing', remove_outliers_3_pre_processing_setup)
models_comparison_remove_outliers_3 = result[0]
dataset_remove_outliers_3_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.9996,0.9999,0.6667,0.9667,0.7567,0.7565,0.7854,22.798
dt,Decision Tree Classifier,0.9995,0.7916,0.5833,0.825,0.6624,0.6622,0.6826,2.617
rf,Random Forest Classifier,0.9995,0.9048,0.5333,0.8,0.6267,0.6265,0.646,4.997
ada,Ada Boost Classifier,0.9995,0.946,0.5833,0.875,0.679,0.6789,0.7033,6.681
et,Extra Trees Classifier,0.9995,0.9986,0.4833,0.7,0.56,0.5599,0.5753,3.399
xgboost,Extreme Gradient Boosting,0.9995,0.9857,0.5833,0.825,0.6624,0.6622,0.6826,3.393
lightgbm,Light Gradient Boosting Machine,0.9994,0.7925,0.5833,0.775,0.6457,0.6455,0.6618,3.746
qda,Quadratic Discriminant Analysis,0.999,0.6992,0.15,0.3,0.2,0.1999,0.2121,2.628
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,2.298
knn,K Neighbors Classifier,0.9945,0.7645,0.5333,0.1021,0.1696,0.1682,0.2296,3.247


#### Conclusion
Analyzing mainly the Recall and F1 metrics it was possible do see:
- The ee method proved be the most suitable
- The only algortihm that has been improved was the Extra trees
- All the others algorithms has been deteriorated

So we wil not use remove_outliers method

### 2.6 Normalize pre-processing

In [46]:
normalize_pre_processing_setup = setup(
                                    dataset, 
                                    target = 'Label',
                                    date_features = ['Timestamp'],
                                    create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                    imputation_type = 'simple',
                                    numeric_imputation = 'mean',
                                    categorical_imputation = 'mode',
                                    fold_strategy = 'stratifiedkfold',
                                    fold = 10,
                                    fix_imbalance = True,
                                    fix_imbalance_method = 'SMOTE',
                                    normalize = True,
                                    # Test different zscore
                                    normalize_method = 'zscore'
                                    )

Unnamed: 0,Description,Value
0,Session id,1744
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(53468, 84)"
5,Transformed train set shape,"(44024, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [47]:
result = test_and_save_pre_processing_approach('normalize_pre_processing', normalize_pre_processing_setup)
models_comparison_normalize = result[0]
dataset_normalize_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,1.0,1.0,0.9667,1.0,0.98,0.98,0.9816,4.527
gbc,Gradient Boosting Classifier,0.9999,1.0,1.0,0.9333,0.96,0.96,0.9633,17.273
xgboost,Extreme Gradient Boosting,0.9999,0.9999,0.9167,0.9667,0.9333,0.9333,0.9373,1.149
rf,Random Forest Classifier,0.9998,0.9997,0.8333,0.95,0.8667,0.8666,0.8784,2.921
et,Extra Trees Classifier,0.9998,0.9999,0.7833,1.0,0.85,0.8499,0.8698,1.43
lightgbm,Light Gradient Boosting Machine,0.9997,0.9937,0.8667,0.8167,0.8333,0.8332,0.8373,1.587
dt,Decision Tree Classifier,0.9994,0.8582,0.7167,0.7333,0.7,0.6997,0.712,0.978
qda,Quadratic Discriminant Analysis,0.9992,0.9407,0.3,0.6,0.3833,0.3832,0.4145,1.17
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,0.321
knn,K Neighbors Classifier,0.993,0.822,0.65,0.0947,0.1645,0.1629,0.2449,1.335


In [48]:
normalize_2_pre_processing_setup = setup(
                                    dataset, 
                                    target = 'Label',
                                    date_features = ['Timestamp'],
                                    create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                    imputation_type = 'simple',
                                    numeric_imputation = 'mean',
                                    categorical_imputation = 'mode',
                                    fold_strategy = 'stratifiedkfold',
                                    fold = 10,
                                    fix_imbalance = True,
                                    fix_imbalance_method = 'SMOTE',
                                    normalize = True,
                                    # Test different zscore
                                    normalize_method = 'minmax'
                                    )

Unnamed: 0,Description,Value
0,Session id,4303
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(53468, 84)"
5,Transformed train set shape,"(44024, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [49]:
result = test_and_save_pre_processing_approach('normalize_2_pre_processing', normalize_2_pre_processing_setup)
models_comparison_normalize_2 = result[0]
dataset_normalize_2_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9999,0.999,0.85,0.9,0.8667,0.8666,0.8707,2.172
ada,Ada Boost Classifier,0.9999,0.9989,0.9167,0.95,0.93,0.9299,0.9316,3.697
gbc,Gradient Boosting Classifier,0.9999,0.9999,0.95,0.9167,0.93,0.9299,0.9316,18.665
dt,Decision Tree Classifier,0.9998,0.9583,0.9167,0.9267,0.9017,0.9016,0.9114,0.722
et,Extra Trees Classifier,0.9998,0.9999,0.8,0.9,0.8333,0.8333,0.8414,1.68
xgboost,Extreme Gradient Boosting,0.9998,0.9996,0.9167,0.9333,0.9067,0.9066,0.9156,1.286
lightgbm,Light Gradient Boosting Machine,0.9996,0.9649,0.8667,0.8517,0.844,0.8439,0.8513,2.1
qda,Quadratic Discriminant Analysis,0.9992,0.9524,0.2667,0.6,0.3667,0.3665,0.3982,0.689
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,0.324
knn,K Neighbors Classifier,0.9915,0.8972,0.7333,0.0982,0.1674,0.1659,0.2578,1.137


In [50]:
normalize_3_pre_processing_setup = setup(
                                    dataset, 
                                    target = 'Label',
                                    date_features = ['Timestamp'],
                                    create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                    imputation_type = 'simple',
                                    numeric_imputation = 'mean',
                                    categorical_imputation = 'mode',
                                    fold_strategy = 'stratifiedkfold',
                                    fold = 10,
                                    fix_imbalance = True,
                                    fix_imbalance_method = 'SMOTE',
                                    normalize = True,
                                    # Test different zscore
                                    normalize_method = 'maxabs'
                                    )

Unnamed: 0,Description,Value
0,Session id,1648
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(53468, 84)"
5,Transformed train set shape,"(44024, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [51]:
result = test_and_save_pre_processing_approach('normalize_3_pre_processing', normalize_3_pre_processing_setup)
models_comparison_normalize_3 = result[0]
dataset_normalize_3_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
xgboost,Extreme Gradient Boosting,1.0,1.0,1.0,0.9667,0.98,0.98,0.9816,1.013
rf,Random Forest Classifier,0.9999,0.9999,0.9167,1.0,0.9467,0.9466,0.9523,2.971
ada,Ada Boost Classifier,0.9999,1.0,1.0,0.9417,0.9657,0.9657,0.9682,4.639
gbc,Gradient Boosting Classifier,0.9998,1.0,1.0,0.8833,0.9267,0.9266,0.9339,17.906
et,Extra Trees Classifier,0.9998,1.0,0.8333,1.0,0.8933,0.8932,0.9047,1.276
lightgbm,Light Gradient Boosting Machine,0.9998,1.0,0.9167,0.8833,0.89,0.8899,0.8948,1.277
dt,Decision Tree Classifier,0.9997,0.9416,0.8833,0.8833,0.8667,0.8665,0.8747,1.087
qda,Quadratic Discriminant Analysis,0.999,0.9519,0.05,0.1,0.0667,0.0666,0.0707,0.772
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,0.35
knn,K Neighbors Classifier,0.9949,0.9143,0.7333,0.1559,0.2542,0.2528,0.3333,2.049


In [52]:
normalize_4_pre_processing_setup = setup(
                                    dataset, 
                                    target = 'Label',
                                    date_features = ['Timestamp'],
                                    create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                    imputation_type = 'simple',
                                    numeric_imputation = 'mean',
                                    categorical_imputation = 'mode',
                                    fold_strategy = 'stratifiedkfold',
                                    fold = 10,
                                    fix_imbalance = True,
                                    fix_imbalance_method = 'SMOTE',
                                    normalize = True,
                                    # Test different zscore
                                    normalize_method = 'robust'
                                    )

Unnamed: 0,Description,Value
0,Session id,4117
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(53468, 84)"
5,Transformed train set shape,"(44024, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [53]:
result = test_and_save_pre_processing_approach('normalize_4_pre_processing', normalize_4_pre_processing_setup)
models_comparison_normalize_4 = result[0]
dataset_normalize_4_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9999,1.0,0.95,0.9667,0.9467,0.9466,0.9523,3.98
et,Extra Trees Classifier,0.9999,1.0,0.9167,1.0,0.9467,0.9466,0.9523,1.303
gbc,Gradient Boosting Classifier,0.9998,1.0,1.0,0.9067,0.9371,0.9371,0.9448,16.616
xgboost,Extreme Gradient Boosting,0.9998,0.9999,0.9,0.9083,0.879,0.8789,0.8912,1.001
rf,Random Forest Classifier,0.9997,1.0,0.6833,0.9,0.75,0.7499,0.7698,2.793
dt,Decision Tree Classifier,0.9996,0.9166,0.8333,0.8917,0.8257,0.8255,0.8435,0.937
lightgbm,Light Gradient Boosting Machine,0.9996,0.9998,0.9,0.8317,0.8395,0.8394,0.8521,1.288
qda,Quadratic Discriminant Analysis,0.9992,0.9454,0.25,0.4,0.2967,0.2966,0.31,0.616
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,0.305
knn,K Neighbors Classifier,0.9955,0.8902,0.6833,0.1503,0.24,0.2387,0.3131,1.188


#### Conclusion

Analyzing mainly the Recall and F1 Score it was possible to see:

- The maxabs was the most suitable normalize method
- This normalization generally improved the Recall and F1 Score of the main algorithms
- The only exceptions were the F1 Score of the ada and gbc algorithms, that slightly decreased 

So this method could be a good choice for pre-processing


### 2.7 Feature Transform pre-processing

In [55]:
transformation_feature_pre_processing_setup = setup(
                                                dataset, 
                                                target = 'Label',
                                                date_features = ['Timestamp'],
                                                create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                                imputation_type = 'simple',
                                                numeric_imputation = 'mean',
                                                categorical_imputation = 'mode',
                                                fold_strategy = 'stratifiedkfold',
                                                fold = 10,
                                                fix_imbalance = True,
                                                fix_imbalance_method = 'SMOTE',
                                                transformation = True,
                                                transformation_method = 'yeo-johnson'
                                                )

Unnamed: 0,Description,Value
0,Session id,4119
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(53468, 84)"
5,Transformed train set shape,"(44024, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [56]:
result = test_and_save_pre_processing_approach('transformation_feature_pre_processing', transformation_feature_pre_processing_setup)
models_comparison_transformation_feature = result[0]
dataset_transformation_feature_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9999,0.9997,0.9667,0.9667,0.96,0.96,0.9633,7.595
gbc,Gradient Boosting Classifier,0.9999,1.0,1.0,0.9,0.94,0.9399,0.9449,23.842
xgboost,Extreme Gradient Boosting,0.9999,0.9999,0.95,0.95,0.95,0.95,0.95,3.502
qda,Quadratic Discriminant Analysis,0.9998,0.9933,0.8333,0.9,0.86,0.86,0.8633,2.436
et,Extra Trees Classifier,0.9998,1.0,0.7833,1.0,0.86,0.8599,0.8753,3.461
rf,Random Forest Classifier,0.9997,1.0,0.75,1.0,0.84,0.8399,0.857,4.521
lightgbm,Light Gradient Boosting Machine,0.9997,0.9639,0.8167,0.85,0.8181,0.8179,0.8254,3.777
dt,Decision Tree Classifier,0.9996,0.9249,0.85,0.8,0.8214,0.8212,0.823,2.158
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,2.132
knn,K Neighbors Classifier,0.9986,0.9081,0.8167,0.4308,0.5512,0.5506,0.585,2.729


In [57]:
transformation_feature_2_pre_processing_setup = setup(
                                                dataset, 
                                                target = 'Label',
                                                date_features = ['Timestamp'],
                                                create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                                imputation_type = 'simple',
                                                numeric_imputation = 'mean',
                                                categorical_imputation = 'mode',
                                                fold_strategy = 'stratifiedkfold',
                                                fold = 10,
                                                fix_imbalance = True,
                                                fix_imbalance_method = 'SMOTE',
                                                transformation = True,
                                                transformation_method = 'quantile'
                                                )

Unnamed: 0,Description,Value
0,Session id,3524
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(53468, 84)"
5,Transformed train set shape,"(44024, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [58]:
result = test_and_save_pre_processing_approach('transformation_feature_2_pre_processing', transformation_feature_2_pre_processing_setup)
models_comparison_transformation_feature_2 = result[0]
dataset_transformation_feature_2_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,1.0,1.0,0.9667,1.0,0.98,0.98,0.9816,5.489
gbc,Gradient Boosting Classifier,0.9999,1.0,0.9667,0.95,0.9467,0.9466,0.9523,23.866
rf,Random Forest Classifier,0.9998,0.9999,0.85,1.0,0.9067,0.9066,0.9156,3.564
xgboost,Extreme Gradient Boosting,0.9998,1.0,0.9,0.9667,0.9267,0.9266,0.9299,1.457
lightgbm,Light Gradient Boosting Machine,0.9998,0.9888,0.9167,0.9417,0.919,0.919,0.9239,2.002
et,Extra Trees Classifier,0.9997,0.999,0.8333,0.9333,0.86,0.8599,0.8713,1.606
dt,Decision Tree Classifier,0.9996,0.9166,0.8333,0.875,0.8257,0.8255,0.8395,1.145
dummy,Dummy Classifier,0.9989,0.5,0.0,0.0,0.0,0.0,0.0,0.659
knn,K Neighbors Classifier,0.9985,0.8745,0.7,0.41,0.5011,0.5005,0.5262,1.543
lr,Logistic Regression,0.9915,0.9932,0.85,0.1242,0.2035,0.202,0.3056,2.351


#### Conclusion

Analyzing mainly the Recall and F1 Score it was possible to see:

- This method brign only deterioration to the mainly algortithms that we are considering

So we will not use the Transformation feature method

### 2.8 Polynomial Features pre-processing

In [59]:
polynomial_features_pre_processing_setup = setup(
                                                dataset, 
                                                target = 'Label',
                                                date_features = ['Timestamp'],
                                                create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                                                imputation_type = 'simple',
                                                numeric_imputation = 'mean',
                                                categorical_imputation = 'mode',
                                                fold_strategy = 'stratifiedkfold',
                                                fold = 10,
                                                fix_imbalance = True,
                                                fix_imbalance_method = 'SMOTE',
                                                normalize = True,
                                                normalize_method = 'maxabs',
                                                polynomial_features = True
                                                )

Unnamed: 0,Description,Value
0,Session id,1726
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(53468, 3570)"
5,Transformed train set shape,"(44024, 3570)"
6,Transformed test set shape,"(9444, 3570)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [62]:
result = test_and_save_pre_processing_approach('polynomial_features_pre_processing', polynomial_features_pre_processing_setup)
models_comparison_polynomial_features = result[0]
dataset_polynomial_features_pre_processing = result[1]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)


Processing:   0%|          | 0/33 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
my_setup = setup(
                    dataset, 
                    target = 'Label',
                    categorical_imputation= 'drop',
                    date_features = ['Timestamp'],
                    create_date_columns = ['hour', 'minute', 'second', 'day', 'month'],
                    remove_outliers = True,
                    normalize = True,
                    transformation= True
                    )

Unnamed: 0,Description,Value
0,Session id,4391
1,Target,Label
2,Target type,Binary
3,Original data shape,"(31480, 80)"
4,Transformed data shape,"(30378, 84)"
5,Transformed train set shape,"(20934, 84)"
6,Transformed test set shape,"(9444, 84)"
7,Numeric features,78
8,Date features,1
9,Rows with missing values,0.5%


In [9]:
dataset_pre_processed = get_config('dataset_transformed')


In [10]:
dataset_pre_processed.head()

Unnamed: 0,Dst Port,Protocol,Timestamp_month,Timestamp_day,Timestamp_second,Timestamp_minute,Timestamp_hour,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,Bwd Pkt Len Max,Bwd Pkt Len Min,Bwd Pkt Len Mean,Bwd Pkt Len Std,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Tot,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Tot,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Len,Bwd Header Len,Fwd Pkts/s,Bwd Pkts/s,Pkt Len Min,Pkt Len Max,Pkt Len Mean,Pkt Len Std,Pkt Len Var,FIN Flag Cnt,SYN Flag Cnt,RST Flag Cnt,PSH Flag Cnt,ACK Flag Cnt,URG Flag Cnt,CWE Flag Count,ECE Flag Cnt,Down/Up Ratio,Pkt Size Avg,Fwd Seg Size Avg,Bwd Seg Size Avg,Fwd Byts/b Avg,Fwd Pkts/b Avg,Fwd Blk Rate Avg,Bwd Byts/b Avg,Bwd Pkts/b Avg,Bwd Blk Rate Avg,Subflow Fwd Pkts,Subflow Fwd Byts,Subflow Bwd Pkts,Subflow Bwd Byts,Init Fwd Win Byts,Init Bwd Win Byts,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
10086,-0.875631,1.406264,0.0,0.0,0.863434,-0.102546,0.980888,-0.973095,-1.236807,-0.20433,0.147012,0.151641,0.247316,1.541111,0.627952,-0.781132,0.192203,1.516131,0.416272,-0.698808,1.340328,0.97429,-0.85912,-0.915938,-0.939333,0.24436,-1.244248,-1.254543,-0.806812,-1.249574,-1.172718,-0.768606,-0.769539,-0.694583,-0.769579,-0.693568,-0.19343,0.0,0.0,0.0,-1.190932,-0.230214,0.962451,1.391014,1.544722,0.070441,0.433691,-0.31914,-0.345151,-0.077816,-0.19343,-0.375339,-0.73558,-0.695283,-0.213302,0.0,-0.375339,1.091241,0.566859,0.627952,0.416272,0.0,0.0,0.0,0.0,0.0,0.0,-1.236807,0.147012,-0.20433,0.151641,-1.277618,-0.803023,-0.84475,-1.406042,-0.253069,-0.212689,-0.253069,-0.253069,-0.419368,-0.240185,-0.419368,-0.419368,0
30615,0.030027,-0.548838,0.0,0.0,0.358858,-1.600531,-1.587104,-1.219234,-1.236807,0.301853,-1.294371,-1.186369,-1.302111,-0.664841,-1.34843,-0.781132,-1.194271,-0.648506,-1.223592,-0.698808,-1.258318,1.325888,-1.348909,-0.339868,-1.318863,-0.359314,-1.244248,-1.254543,-0.806812,-1.249574,-1.172718,0.199512,0.292813,-0.694583,0.241704,0.865372,-0.19343,0.0,0.0,0.0,-0.562648,0.525507,1.216049,1.606536,-0.662541,-1.310286,-1.34399,-1.261251,-1.255271,12.850792,-0.19343,-0.375339,-0.73558,1.438263,-0.213302,0.0,-0.375339,1.795683,-1.359465,-1.34843,-1.223592,0.0,0.0,0.0,0.0,0.0,0.0,-1.236807,-1.294371,0.301853,-1.186369,-1.085851,-0.297542,-0.84475,0.655312,-0.253069,-0.212689,-0.253069,-0.253069,-0.419368,-0.240185,-0.419368,-0.419368,0
7573,0.030027,-0.548838,0.0,0.0,-0.356932,0.893848,0.980888,0.078511,1.257074,1.343627,1.052833,1.80648,0.827502,-0.664841,0.761665,1.264898,1.409982,-0.648506,2.06637,1.521344,1.081562,0.186143,-0.220381,0.794364,-0.018013,-1.538304,0.341854,0.205626,1.085462,0.263187,-0.237991,1.15424,1.086528,1.336146,1.180924,-0.270723,-0.19343,0.0,0.0,0.0,1.134869,1.24962,0.174517,0.781669,-0.662541,1.456828,2.22226,1.708773,1.70189,-0.077816,-0.19343,-0.375339,1.359472,-0.695283,-0.213302,0.0,-0.375339,1.091241,2.197314,0.761665,2.06637,0.0,0.0,0.0,0.0,0.0,0.0,1.257074,1.052833,1.343627,1.80648,0.823387,0.714746,1.335903,0.655312,-0.253069,-0.212689,-0.253069,-0.253069,-0.419368,-0.240185,-0.419368,-0.419368,0
15416,-0.875631,1.406264,0.0,0.0,-1.343991,0.120308,0.476759,-0.670184,-1.236807,-0.20433,0.060633,0.095777,0.149924,1.516577,0.454266,-0.781132,0.130588,1.496691,0.329538,-0.698808,0.978157,0.621181,-0.485872,-0.915938,-0.615312,0.599075,-1.244248,-1.254543,-0.806812,-1.249574,-1.172718,-0.768606,-0.769539,-0.694583,-0.769579,-0.693568,-0.19343,0.0,0.0,0.0,-1.190932,-0.230214,0.617584,1.15279,1.520521,0.004498,0.300963,-0.31914,-0.345151,-0.077816,-0.19343,-0.375339,-0.73558,-0.695283,-0.213302,0.0,-0.375339,1.091241,0.420519,0.454266,0.329538,0.0,0.0,0.0,0.0,0.0,0.0,-1.236807,0.060633,-0.20433,0.095777,-1.277618,-0.803023,-0.84475,-1.406042,-0.253069,-0.212689,-0.253069,-0.253069,-0.419368,-0.240185,-0.419368,-0.419368,0
13540,-0.875631,1.406264,0.0,0.0,-1.194437,-1.427111,1.231868,-0.722267,-1.236807,-0.20433,0.187351,0.362231,0.292992,1.55127,0.712111,-0.781132,0.427413,1.573576,0.759846,-0.698808,1.163391,0.683717,-0.548896,-0.915938,-0.670828,0.542881,-1.244248,-1.254543,-0.806812,-1.249574,-1.172718,-0.768606,-0.769539,-0.694583,-0.769579,-0.693568,-0.19343,0.0,0.0,0.0,-1.190932,-0.230214,0.679665,1.198849,1.554735,0.325964,0.674652,0.32211,0.326095,-0.077816,-0.19343,-0.375339,-0.73558,-0.695283,-0.213302,0.0,-0.375339,1.091241,0.835004,0.712111,0.759846,0.0,0.0,0.0,0.0,0.0,0.0,-1.236807,0.187351,-0.20433,0.362231,-1.277618,-0.803023,-0.84475,-1.406042,-0.253069,-0.212689,-0.253069,-0.253069,-0.419368,-0.240185,-0.419368,-0.419368,0


In [11]:
file_path = file_path = os.path.join(os.getcwd(), '..', 'Datasets', 'CSE-CIC-IDS2018', 'pre-processed','0_01-dataset-pre-processed-by-pycaret.csv')
dataset_pre_processed.to_csv(file_path, index=False)

In [12]:
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.9996,0.9641,0.6333,0.9,0.7267,0.7265,0.746,2.639
xgboost,Extreme Gradient Boosting,0.9995,0.9986,0.4667,0.7,0.5433,0.5432,0.5624,1.324
dt,Decision Tree Classifier,0.9994,0.7999,0.6,0.6583,0.6157,0.6155,0.6222,1.274
gbc,Gradient Boosting Classifier,0.9994,0.9001,0.65,0.7333,0.6733,0.6731,0.6821,6.778
et,Extra Trees Classifier,0.9993,0.9986,0.3667,0.6,0.4333,0.4332,0.4568,1.32
rf,Random Forest Classifier,0.9992,0.9736,0.2833,0.5,0.3467,0.3466,0.3677,1.647
lr,Logistic Regression,0.999,0.9829,0.1167,0.3,0.1667,0.1666,0.1861,2.173
knn,K Neighbors Classifier,0.999,0.7165,0.05,0.1,0.0667,0.0666,0.0707,1.457
svm,SVM - Linear Kernel,0.9989,0.5906,0.0,0.0,0.0,0.0,0.0,1.176
ridge,Ridge Classifier,0.9989,0.9913,0.0,0.0,0.0,0.0,0.0,1.066
