## Import Dependencies

In [1]:
import pandas as pd
import numpy as np
from pycaret.datasets import get_data
from pycaret.classification import *
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os

## Define functions

In [2]:
# Prefix prepended to the name of every CSV written by save_pre_processed_dataset.
dataset_prefix = 'one-day-dataset'

In [3]:
def save_pre_processed_dataset(df: pd.DataFrame, name):
    """Save a pre-processed dataset as a CSV file.

    The file is written to ``<cwd>/../Datasets/CSE-CIC-IDS2018/pre-processed/``
    and is named ``<dataset_prefix>-<name>.csv``. The index is not written.

    Args:
        df: The pre-processed data to persist.
        name: Identifier of the pre-processing approach; becomes part of the file name.
    """
    output_dir = os.path.join(os.getcwd(), '..', 'Datasets', 'CSE-CIC-IDS2018', 'pre-processed')

    # to_csv does not create missing folders, so make sure the target exists first.
    os.makedirs(output_dir, exist_ok=True)

    file_name = dataset_prefix + '-' + name + '.csv'
    df.to_csv(os.path.join(output_dir, file_name), index=False)

In [4]:
def test_and_save_pre_processing_approach(approach_name, setup):
    """Benchmark a fixed set of classifiers on an experiment and save its transformed data.

    Args:
        approach_name: Label of the pre-processing approach; forwarded to
            save_pre_processed_dataset to build the CSV file name.
        setup: The experiment object returned by pycaret's ``setup(...)``.
            Inside this body the parameter shadows the pycaret ``setup`` function.

    Returns:
        A tuple ``(df_models_comparison, df_dataset_pre_processed)``: a copy of the
        score grid produced by ``compare_models`` and a copy of the experiment's
        transformed dataset.
    """
    # Benchmark only this subset of estimators
    # (calling compare_models() without `include` would try every available model).
    setup.compare_models(include=['ada', 'gbc', 'et', 'xgboost', 'rf', 'dt', 'lightgbm'])

    # Read the score grid from the experiment that was passed in. The module-level
    # pull() reports on the most recently created experiment, which is not
    # necessarily `setup` when cells are run out of order.
    df_models_comparison = setup.pull().copy()

    # Dataset as transformed by the experiment's pre-processing pipeline.
    df_dataset_pre_processed = setup.get_config('dataset_transformed').copy()

    # Persist the transformed dataset as CSV under the pre-processed folder.
    save_pre_processed_dataset(df_dataset_pre_processed, approach_name)

    return df_models_comparison, df_dataset_pre_processed


In [5]:
# Colours for the radar curves; the n-th frame of a comparison uses the n-th colour.
colors_list = [
    '#1982C4',  # Blue
    '#B4436C',  # Purple
    '#F2C14E',  # Yellow
    '#ED217C',  # Pink
    '#5FAD56',  # Green
    '#F78154',  # Orange
    '#4D9078',  # Blue-green
]

def plot_radar_curves(subplot, column, dfs_dict, angles):
    """Draw one closed, filled curve per frame on a polar axis.

    Args:
        subplot: Axis-like object providing ``plot`` and ``fill``.
        column: Name of the column whose values are plotted.
        dfs_dict: Mapping of legend label -> frame; ``frame[column]`` supplies the values.
        angles: Angle of every point, already closed (first angle repeated at the end).
    """
    palette_size = len(colors_list)
    for index, (name, df) in enumerate(dfs_dict.items()):
        # Repeat the first value at the end so the polygon closes on itself.
        values = df[column].tolist()
        values += values[:1]

        # Wrap around the palette so more frames than colours does not raise IndexError.
        color = colors_list[index % palette_size]
        subplot.plot(angles, values, color=color, linewidth=2, linestyle='solid', label=name)
        subplot.fill(angles, values, color=color, alpha=0.25)

def compare_metrics(dfs_dict, chart_name):
    """Draw side-by-side radar charts of Recall and F1 for several score grids.

    The figure is saved as ``imgs/<chart_name>.png`` and then shown.

    Args:
        dfs_dict: Mapping of legend label -> score grid. Each frame must be indexed
            by model alias and contain the 'Recall' and 'F1' columns. Neither the
            dictionary nor its frames are modified.
        chart_name: Base name (without extension) of the PNG file.
    """
    # Expose the model aliases (the frame index) as a 'Model Alias' column. This is
    # done on local copies: writing the result back into the caller's dictionary
    # would add a second 'Model Alias' column the next time the same dictionary is used.
    frames = {
        name: df.rename_axis('Model Alias').reset_index()
        for name, df in dfs_dict.items()
    }

    # The first frame defines the axes of the radar.
    labels = list(frames.values())[0]['Model Alias'].tolist()
    num_vars = len(labels)

    # Re-order every frame to that axis order so each value is drawn on the axis of
    # its own model even when the score grids list the models in different orders.
    # A model missing from a frame becomes NaN.
    frames = {
        name: frame.set_index('Model Alias').reindex(labels).reset_index()
        for name, frame in frames.items()
    }

    # One angle per model, with the first angle repeated to close the circle.
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7), subplot_kw=dict(polar=True))

    # Left panel: Recall. Right panel: F1.
    for ax, column, title in ((ax1, 'Recall', 'Revocação'), (ax2, 'F1', 'Medida-F1')):
        plot_radar_curves(ax, column, frames, angles)
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels, fontsize=14)
        ax.set_title(title, fontsize=25)
        ax.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))

    plt.tight_layout()

    # savefig does not create missing folders.
    os.makedirs('imgs', exist_ok=True)
    file_path = os.path.join('imgs', f'{chart_name}.png')
    plt.savefig(file_path, dpi=300)

    plt.show()

## Set display unlimited number of lines

In [6]:
# Lift pandas' display limits: show every row and every column of a displayed frame.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Load datasets in pandas dataframes

In [7]:
# # Define the path to the folder containing the CSV files
# file_path = os.path.join(os.getcwd(), '..', 'Datasets', 'CSE-CIC-IDS2018', 'raw', 'original', '02-14-2018.csv')
# # Import csv to pandas
# dataset = pd.read_csv(file_path)

In [8]:
# dataset

In [9]:
# Path of the source CSV, relative to the notebook's working directory.
file_pattern = './../Datasets/CSE-CIC-IDS2018/raw/original/02-14-2018.csv'
# Load the whole file into a single DataFrame.
dataset = pd.read_csv(file_pattern)

# Pre-processing

## 1. First steps for pycaret be able to consume

In [10]:
# Number of rows per distinct value of the Label column, as {label: count}.
target_classes = dataset["Label"].value_counts().to_dict()

In [11]:
classes = target_classes.keys()

# Encode the target as binary unless it already holds exactly the classes 0 and 1:
# "Benign" becomes 0 and every other label becomes 1.
if not (0 in classes and 1 in classes and len(classes) == 2):
    label_encoding = {target: 0 if target == "Benign" else 1 for target in target_classes}

    # Rewrite the Label column only, in a single pass. A frame-wide replace would
    # also rewrite any feature value that happens to equal one of the label values
    # and would scan every column once per class.
    # infer_objects() turns the resulting column of Python ints into an integer dtype.
    dataset["Label"] = dataset["Label"].replace(label_encoding).infer_objects()

In [12]:
# Show the class distribution of the target column.
dataset["Label"].value_counts()

Label
0    667626
1    380949
Name: count, dtype: int64

In [13]:
# Array of the frame's column names with the last entry removed.
# NOTE(review): `columns` is not read anywhere else in the visible notebook — confirm it is still needed.
columns = dataset.columns.values
columns = np.delete(columns, -1)

In [14]:
# Turn infinities into NaN, drop the Timestamp column, then drop every row that
# still contains a missing value.
# errors='ignore' keeps the cell re-runnable: on a second run 'Timestamp' is
# already gone and a plain drop would raise KeyError.
dataset = (
    dataset
    .replace([np.inf, -np.inf], np.nan)
    .drop(columns=['Timestamp'], errors='ignore')
    .dropna()
)

In [15]:
# Remove the entry at position 2 from the column-name array.
# NOTE(review): presumably this mirrors the 'Timestamp' drop on the frame — confirm the position; a lookup by name would be safer.
columns = np.delete(columns, 2)

In [16]:
# Convert every text-typed feature column to numeric; entries that cannot be
# parsed become NaN. The target column is left untouched.
# A single pass over the columns is enough: the frame's keys are its columns, so
# an outer loop over dataset.keys() would only repeat the same work once per column.
for col in dataset.columns:
    if dataset[col].dtype == 'object' and col != 'Label':
        dataset[col] = pd.to_numeric(dataset[col], errors='coerce')

### Pegar dataset parcial

In [17]:
# Keep a stratified 50% sample of the rows; the other half is discarded.
# random_state pins the sample so every run of the notebook works on the same rows.
dataset, _ = train_test_split(dataset, test_size=0.5, stratify=dataset['Label'], random_state=42)


## 2. Pre-processing using Pycaret

### 2.1 Generic pre-processing specifications techniques
The autoML chooses the following parameters automatically:
- imputation_type: simple
    - numeric_imputation: mean
    - categorical_imputation: mode
- fold_strategy: stratifiedkfold
    - fold: 10

In [18]:
# %store generic_pre_processing_setup

In [None]:
# Baseline experiment: simple imputation (mean for numeric, mode for categorical)
# evaluated with stratified k-fold cross-validation.
generic_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    n_jobs = -1,
    # Fixed seed: without it every setup() call draws a new random train/test split,
    # so score differences between experiments could come from the split alone.
    session_id = 42,
)

In [39]:
# NOTE(review): this cell carries execution count 39 and reads a variable that is only assigned in the cell below — it fails on a fresh top-to-bottom run; confirm its position.
models_comparison_generic

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,7.743
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,18.867
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.411
xgboost,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.324
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.003
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.449
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20.808


In [None]:
# Benchmark the baseline experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='generic_pre_processing',
    setup=generic_pre_processing_setup,
)
models_comparison_generic, dataset_generic_pre_processing = result

In [None]:
# Unpack the (score grid, transformed dataset) pair held in `result`.
models_comparison_generic, dataset_generic_pre_processing = result

### 2.3 Missing values pre-processing

In [None]:
# Simple imputation, numeric strategy: KNN.
simple_imputation_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'knn',
    categorical_imputation = 'mode',
    fold = 10,
    n_jobs = -1,
    # Fixed seed so the train/test split and folds are reproducible and do not
    # differ from one imputation experiment to the next.
    session_id = 42,
)

In [None]:
# Benchmark the KNN-imputation experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='simple_imputation_pre_processing',
    setup=simple_imputation_pre_processing_setup,
)
models_comparison_simple_imputation, dataset_simple_imputation_pre_processing = result

In [None]:
# Simple imputation, numeric strategy: mean.
simple_imputation_2_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold = 10,
    n_jobs = -1,
    # Fixed seed so the train/test split and folds are reproducible and do not
    # differ from one imputation experiment to the next.
    session_id = 42,
)

In [None]:
# Benchmark the mean-imputation experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='simple_imputation_2_pre_processing',
    setup=simple_imputation_2_pre_processing_setup,
)
models_comparison_simple_imputation_2, dataset_simple_imputation_2_pre_processing = result

In [None]:
# Simple imputation, numeric strategy: median.
simple_imputation_3_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'median',
    categorical_imputation = 'mode',
    fold = 10,
    n_jobs = -1,
    # Fixed seed so the train/test split and folds are reproducible and do not
    # differ from one imputation experiment to the next.
    session_id = 42,
)

# Benchmark the experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach('simple_imputation_3_pre_processing', simple_imputation_3_pre_processing_setup)
models_comparison_simple_imputation_3 = result[0]
dataset_simple_imputation_3_pre_processing = result[1]

In [None]:
# Simple imputation, numeric strategy: drop.
simple_imputation_4_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'drop',
    categorical_imputation = 'mode',
    fold = 10,
    n_jobs = -1,
    # Fixed seed so the train/test split and folds are reproducible and do not
    # differ from one imputation experiment to the next.
    session_id = 42,
)

# Benchmark the experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach('simple_imputation_4_pre_processing', simple_imputation_4_pre_processing_setup)
models_comparison_simple_imputation_4 = result[0]
dataset_simple_imputation_4_pre_processing = result[1]

In [None]:
# Simple imputation, numeric strategy: mode.
simple_imputation_5_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mode',
    categorical_imputation = 'mode',
    fold = 10,
    n_jobs = -1,
    # Fixed seed so the train/test split and folds are reproducible and do not
    # differ from one imputation experiment to the next.
    session_id = 42,
)

# Benchmark the experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach('simple_imputation_5_pre_processing', simple_imputation_5_pre_processing_setup)
models_comparison_simple_imputation_5 = result[0]
dataset_simple_imputation_5_pre_processing = result[1]

In [None]:
# Score grids of the five simple numeric-imputation strategies, keyed by the
# legend label shown on the radar chart.
KNN_X_Mean = {
    "Remoção": models_comparison_simple_imputation_4,
    "KNN": models_comparison_simple_imputation,
    "Média": models_comparison_simple_imputation_2,
    "Mediana": models_comparison_simple_imputation_3,
    "Moda": models_comparison_simple_imputation_5,
}

# NOTE(review): 'missign' looks like a typo of 'missing'; it is kept because it is the output file name.
compare_metrics(dfs_dict=KNN_X_Mean, chart_name='missign_values_types')

In [None]:
# Iterative imputation, with lightgbm as the numeric imputer.
iterative_imputation_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'iterative',
    numeric_iterative_imputer = 'lightgbm',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    n_jobs = -1,
    # Fixed seed so the train/test split and folds are reproducible and match
    # those of the simple-imputation experiments this run is compared against.
    session_id = 42,
)

In [None]:
# Benchmark the iterative-imputation experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='iterative_imputation_pre_processing',
    setup=iterative_imputation_pre_processing_setup,
)
models_comparison_iterative_imputation, dataset_iterative_imputation_pre_processing = result

In [None]:
# Simple (median) imputation against iterative (lightgbm) imputation.
Simples_X_Iterativo = {
    "Simples (Mediana)": models_comparison_simple_imputation_3,
    "Iterativo (lightgbm)": models_comparison_iterative_imputation,
}

compare_metrics(dfs_dict=Simples_X_Iterativo, chart_name='missing_values')

#### Conclusion
Analyzing mainly the Recall and F1 metrics, it was possible to see:
- The Decision Tree and Extra Trees algorithms improved
- The Ada Boost and Extreme Gradient Boosting algorithms deteriorated

### 2.4 Fix imbalance pre-processing

In [None]:
# Mean/mode imputation plus class re-balancing of the target with SMOTE.
fix_imbalance_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    # Fixed seed so the train/test split and folds are reproducible and match
    # those of the experiments this run is compared against.
    session_id = 42,
)

In [None]:
# Benchmark the SMOTE experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='fix_imbalance_pre_processing',
    setup=fix_imbalance_pre_processing_setup,
)
models_comparison_fix_imbalance, dataset_fix_imbalance_pre_processing = result

In [None]:
# Unbalanced target against SMOTE-balanced target.
# NOTE(review): the unbalanced baseline is the KNN-imputed run, while the balanced
# run uses mean imputation — confirm whether models_comparison_simple_imputation_2
# (mean) is the intended baseline.
Fix_Imbalance = {
    "Coluna Alvo desbalanceada": models_comparison_simple_imputation,
    "Coluna Alvo balanceada": models_comparison_fix_imbalance,
}

compare_metrics(dfs_dict=Fix_Imbalance, chart_name='fix_imbalance')

#### Conclusion
Analyzing mainly the Recall and F1 metrics, it was possible to see:
- The SMOTE method proved to be the most suitable
- The ada, xgboost, gbc and rf algorithms improved
- The Decision Tree algorithm deteriorated

### 2.5 Remove Outliers pre-processing

In [None]:
# SMOTE-balanced experiment with outlier removal by Isolation Forest.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
remove_outliers_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    remove_outliers = True,
    outliers_method = 'iforest',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

In [None]:
# Benchmark the Isolation Forest experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='remove_outliers_pre_processing',
    setup=remove_outliers_pre_processing_setup,
)
models_comparison_remove_outliers, dataset_remove_outliers_pre_processing = result

In [None]:
# SMOTE-balanced experiment with outlier removal by Elliptic Envelope.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
remove_outliers_2_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    remove_outliers = True,
    outliers_method = 'ee',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

In [None]:
# Benchmark the Elliptic Envelope experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='remove_outliers_2_pre_processing',
    setup=remove_outliers_2_pre_processing_setup,
)
models_comparison_remove_outliers_2, dataset_remove_outliers_2_pre_processing = result

In [None]:
# SMOTE-balanced experiment with outlier removal by Local Outlier Factor.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
remove_outliers_3_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    remove_outliers = True,
    outliers_method = 'lof',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

In [None]:
# Benchmark the Local Outlier Factor experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='remove_outliers_3_pre_processing',
    setup=remove_outliers_3_pre_processing_setup,
)
models_comparison_remove_outliers_3, dataset_remove_outliers_3_pre_processing = result

In [None]:
# The three outlier-removal methods side by side.
Remove_Outliers_Types = {
    # Disabled entry (the previous case, without outlier removal):
    # "Caso anterior": models_comparison_fix_imbalance,
    "Isolation Forest": models_comparison_remove_outliers,
    "Elliptic Envelope": models_comparison_remove_outliers_2,
    "Local Outlier Factor": models_comparison_remove_outliers_3,
}

compare_metrics(dfs_dict=Remove_Outliers_Types, chart_name='remove_outliers_types')

In [None]:
# SMOTE-only run against the Elliptic Envelope outlier-removal run.
Remove_Outliers = {
    "Sem remoção de outliers": models_comparison_fix_imbalance,
    "Elliptic Envelope": models_comparison_remove_outliers_2,
}

compare_metrics(dfs_dict=Remove_Outliers, chart_name='remove_outliers')

#### Conclusion
Analyzing mainly the Recall and F1 metrics, it was possible to see:
- The ee (Elliptic Envelope) method proved to be the most suitable
- The only algorithm that improved was Extra Trees
- All the other algorithms deteriorated

So we will not use the remove_outliers method

### 2.6 Normalize pre-processing

In [None]:
# SMOTE-balanced experiment with z-score normalization.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
normalize_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    normalize = True,
    normalize_method = 'zscore',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

In [None]:
# Benchmark the z-score experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='normalize_pre_processing',
    setup=normalize_pre_processing_setup,
)
models_comparison_normalize, dataset_normalize_pre_processing = result

In [None]:
# SMOTE-balanced experiment with min-max normalization.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
normalize_2_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    normalize = True,
    normalize_method = 'minmax',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

In [None]:
# Benchmark the min-max experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='normalize_2_pre_processing',
    setup=normalize_2_pre_processing_setup,
)
models_comparison_normalize_2, dataset_normalize_2_pre_processing = result

In [None]:
# SMOTE-balanced experiment with max-abs normalization.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
normalize_3_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    normalize = True,
    normalize_method = 'maxabs',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

In [None]:
# Benchmark the max-abs experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='normalize_3_pre_processing',
    setup=normalize_3_pre_processing_setup,
)
models_comparison_normalize_3, dataset_normalize_3_pre_processing = result











In [None]:
# SMOTE-balanced experiment with robust normalization.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
normalize_4_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    normalize = True,
    normalize_method = 'robust',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

In [None]:
# Benchmark the robust-scaling experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='normalize_4_pre_processing',
    setup=normalize_4_pre_processing_setup,
)
models_comparison_normalize_4, dataset_normalize_4_pre_processing = result

In [None]:
# The four normalization methods side by side.
Normalize_Types = {
    "Z-score": models_comparison_normalize,
    "Min-Max": models_comparison_normalize_2,
    "MaxAbs": models_comparison_normalize_3,
    "Robust": models_comparison_normalize_4,
}

compare_metrics(dfs_dict=Normalize_Types, chart_name='normalize_types')

In [None]:
# SMOTE-only run against the min-max normalized run.
Normalize = {
    "Sem normalização": models_comparison_fix_imbalance,
    "Normalizado (MinMax)": models_comparison_normalize_2,
}

compare_metrics(dfs_dict=Normalize, chart_name='normalize')

#### Conclusion

Analyzing mainly the Recall and F1 Score it was possible to see:

- maxabs was the most suitable normalization method
- This normalization generally improved the Recall and F1 Score of the main algorithms
- The only exceptions were the F1 Scores of the ada and gbc algorithms, which decreased slightly

So this method could be a good choice for pre-processing


### 2.7 Feature Transform pre-processing

In [None]:
# SMOTE-balanced experiment with the Yeo-Johnson feature transformation.
transformation_feature_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    transformation = True,
    transformation_method = 'yeo-johnson',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

In [None]:
# Benchmark the Yeo-Johnson experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='transformation_feature_pre_processing',
    setup=transformation_feature_pre_processing_setup,
)
models_comparison_transformation_feature, dataset_transformation_feature_pre_processing = result

In [None]:
# SMOTE-balanced experiment with the quantile feature transformation.
transformation_feature_2_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    transformation = True,
    transformation_method = 'quantile',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

In [None]:
# Benchmark the quantile-transformation experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='transformation_feature_2_pre_processing',
    setup=transformation_feature_2_pre_processing_setup,
)
models_comparison_transformation_feature_2, dataset_transformation_feature_2_pre_processing = result

In [None]:
# Yeo-Johnson against quantile transformation.
Yeo_X_Quantile = {
    "Yeo-Johnson": models_comparison_transformation_feature,
    "Quantile": models_comparison_transformation_feature_2,
}

compare_metrics(dfs_dict=Yeo_X_Quantile, chart_name='yeo_x_quantile')

In [None]:
# Min-max normalization against the Yeo-Johnson transformation.
Transformation_Feature_X_Normalize = {
    "Normalização (Min-Max)": models_comparison_normalize_2,
    "Yeo-Johnson": models_comparison_transformation_feature,
    # Disabled entry (neither normalization nor transformation):
    # "Nenhum": models_comparison_fix_imbalance
}

compare_metrics(dfs_dict=Transformation_Feature_X_Normalize, chart_name='transformation_feature_X_normalize')

#### Conclusion

Analyzing mainly the Recall and F1 Score it was possible to see:

- This method brought only deterioration to the main algorithms that we are considering

So we will not use the Transformation feature method

# SMOTE-balanced experiment with stratified 10-fold cross-validation.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
fold_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

### 2.8 Data Split Stratification

In [None]:
# Control experiment for the feature-selection runs: SMOTE plus Yeo-Johnson
# transformation, without feature selection.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
control_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    transformation = True,
    transformation_method = 'yeo-johnson',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)


In [None]:
# Benchmark the control experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach(
    approach_name='control_pre_processing',
    setup=control_pre_processing_setup,
)
models_comparison_control, dataset_control_pre_processing = result

In [None]:
# Control configuration plus univariate feature selection.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
univariate_feature_selection_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    transformation = True,
    transformation_method = 'yeo-johnson',
    feature_selection = True,
    feature_selection_method = 'univariate',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

# Benchmark the experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach('univariate_feature_selection_pre_processing', univariate_feature_selection_pre_processing_setup)
models_comparison_univariate_feature_selection = result[0]
dataset_univariate_feature_selection_pre_processing = result[1]

In [None]:
# Control configuration plus classic feature selection.
# No date features are declared: 'Timestamp' is dropped from `dataset` in the first
# pre-processing steps, so naming it here would reference a missing column.
classic_feature_selection_pre_processing_setup = setup(
    dataset,
    target = 'Label',
    imputation_type = 'simple',
    numeric_imputation = 'mean',
    categorical_imputation = 'mode',
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    fix_imbalance = True,
    fix_imbalance_method = 'SMOTE',
    transformation = True,
    transformation_method = 'yeo-johnson',
    feature_selection = True,
    feature_selection_method = 'classic',
    # Fixed seed so the train/test split and folds are reproducible across experiments.
    session_id = 42,
)

# Benchmark the experiment; keep the score grid and the transformed dataset.
result = test_and_save_pre_processing_approach('classic_feature_selection_pre_processing', classic_feature_selection_pre_processing_setup)
models_comparison_classic_feature_selection = result[0]
dataset_classic_feature_selection_pre_processing = result[1]

In [None]:
# Experiment: feature selection with the 'sequential' method.
# Only feature_selection_method differs from the 'classic' run above.
sequential_feature_selection_pre_processing_setup = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='simple',
    numeric_imputation='mean',
    categorical_imputation='mode',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='yeo-johnson',
    # variant under test
    feature_selection=True,
    feature_selection_method='sequential',
    # cross-validation
    fold_strategy='stratifiedkfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'sequential_feature_selection_pre_processing',
    sequential_feature_selection_pre_processing_setup,
)
models_comparison_sequential_feature_selection, dataset_sequential_feature_selection_pre_processing = result

In [None]:

# Comparison tables to pass to compare_metrics, keyed by display name.
# Insertion order is preserved. The univariate entry is disabled and the
# sequential result computed above is not part of this comparison.
Feature_Selection = {}
Feature_Selection["Seleção de Colunas"] = models_comparison_classic_feature_selection
# Feature_Selection["Univariável"] = models_comparison_univariate_feature_selection
Feature_Selection["Sem Seleção"] = models_comparison_control

compare_metrics(Feature_Selection, 'Feature_Selection')

### 2.9 Remoção de colinearidade

In [None]:
# Experiment: enable remove_multicollinearity on top of the
# simple-imputation / SMOTE / yeo-johnson configuration.
remove_multicollinearity_pre_processing_setup = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='simple',
    numeric_imputation='mean',
    categorical_imputation='mode',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='yeo-johnson',
    # variant under test
    remove_multicollinearity=True,
    # cross-validation
    fold_strategy='stratifiedkfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'remove_multicollinearity_pre_processing',
    remove_multicollinearity_pre_processing_setup,
)
models_comparison_remove_multicollinearity, dataset_remove_multicollinearity_pre_processing = result

In [None]:
# Comparison tables to pass to compare_metrics, keyed by display name:
# the multicollinearity-removal run versus the control run.
Remove_Multicollinearity = dict([
    ("Remoção de Multicolinearidade", models_comparison_remove_multicollinearity),
    ("Sem Remoção", models_comparison_control),
])

compare_metrics(Remove_Multicollinearity, 'Remove_Multicollinearity')

### 2.10 PCA

In [None]:
# Experiment: PCA with pca_method='linear' and 70 components on top of the
# simple-imputation / SMOTE / yeo-johnson configuration.
linear_pca_pre_processing_setup = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='simple',
    numeric_imputation='mean',
    categorical_imputation='mode',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='yeo-johnson',
    # variant under test
    pca=True,
    pca_method='linear',
    pca_components=70,
    # cross-validation
    fold_strategy='stratifiedkfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'linear_pca_pre_processing',
    linear_pca_pre_processing_setup,
)
models_comparison_linear_pca, dataset_linear_pca_pre_processing = result

In [None]:
# Experiment: PCA with pca_method='kernel' and 70 components.
# Only pca_method differs from the linear PCA run above.
kernel_pca_pre_processing_setup = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='simple',
    numeric_imputation='mean',
    categorical_imputation='mode',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='yeo-johnson',
    # variant under test
    pca=True,
    pca_method='kernel',
    pca_components=70,
    # cross-validation
    fold_strategy='stratifiedkfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'kernel_pca_pre_processing',
    kernel_pca_pre_processing_setup,
)
models_comparison_kernel_pca, dataset_kernel_pca_pre_processing = result

In [None]:
# Number of columns in the control run's transformed dataset.
dataset_control_pre_processing.shape[1]

In [None]:
# Experiment: PCA with pca_method='incremental' and 70 components.
# Only pca_method differs from the linear and kernel PCA runs above.
incremental_pca_pre_processing_setup = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='simple',
    numeric_imputation='mean',
    categorical_imputation='mode',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='yeo-johnson',
    # variant under test
    pca=True,
    pca_method='incremental',
    pca_components=70,
    # cross-validation
    fold_strategy='stratifiedkfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'incremental_pca_pre_processing',
    incremental_pca_pre_processing_setup,
)
models_comparison_incremental_pca, dataset_incremental_pca_pre_processing = result

In [None]:
# Comparison tables to pass to compare_metrics, keyed by display name:
# the linear and incremental PCA runs versus the control run.
# The control label used to read "Sem Remoção", copied from the
# multicollinearity comparison; here the control is the run without PCA.
# The kernel PCA entry is disabled.
PCA = {
    "Linear": models_comparison_linear_pca,
    # "Kernel": models_comparison_kernel_pca,
    "Incremental": models_comparison_incremental_pca,
    "Sem PCA": models_comparison_control,
}


compare_metrics(PCA, 'PCA')

### 2.11 Data Split Stratification

In [None]:
# Experiment: explicitly enable data_split_shuffle and data_split_stratify on
# top of the iterative-imputation (lightgbm) / SMOTE / yeo-johnson configuration.
data_split_pre_processing_setup = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='iterative',
    numeric_iterative_imputer='lightgbm',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='yeo-johnson',
    # variant under test
    data_split_shuffle=True,
    data_split_stratify=True,
    # cross-validation
    fold_strategy='stratifiedkfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'data_split_pre_processing',
    data_split_pre_processing_setup,
)
models_comparison_data_split, dataset_data_split_pre_processing = result

In [None]:
# Comparison tables to pass to compare_metrics, keyed by display name:
# the explicitly stratified split versus the transformation-feature run.
# NOTE(review): the baseline is labelled as non-stratified; depending on the
# installed PyCaret version, data_split_stratify may already default to True.
# Confirm the baseline setup really differs from the stratified run.
Data_Stratify = {}
Data_Stratify["Estratificado"] = models_comparison_data_split
Data_Stratify["Não estratificado"] = models_comparison_transformation_feature

compare_metrics(Data_Stratify, 'data_stratify')

### 2.12 Fold Strategy

In [None]:
# Experiment: fold_strategy='kfold' (10 folds) on top of the
# iterative-imputation (lightgbm) / SMOTE / yeo-johnson configuration.
kfold_pre_processing_setup = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='iterative',
    numeric_iterative_imputer='lightgbm',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='yeo-johnson',
    # variant under test: cross-validation strategy
    fold_strategy='kfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'kfold_pre_processing',
    kfold_pre_processing_setup,
)
models_comparison_kfold, dataset_kfold_pre_processing = result

In [None]:
# Test run: iterative imputation (lightgbm) / SMOTE with
# transformation_method='quantile' and stratified 10-fold cross-validation.
# Its result is not included in the Fold_Strategy comparison below.
teste_pre_processing_setup = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='iterative',
    numeric_iterative_imputer='lightgbm',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='quantile',
    # cross-validation
    fold_strategy='stratifiedkfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'teste_pre_processing',
    teste_pre_processing_setup,
)
models_comparison_teste, dataset_teste_pre_processing = result

In [None]:
# Comparison tables to pass to compare_metrics, keyed by display name:
# the kfold run versus the transformation-feature run, which is labelled
# here as the stratifiedkfold baseline.
Fold_Strategy = dict([
    ("Kfold", models_comparison_kfold),
    ("Stratifiedkfold", models_comparison_transformation_feature),
])

compare_metrics(Fold_Strategy, 'fold_strategy')

## Final Pre Processing

In [None]:
# Final candidate A: iterative imputation (lightgbm), SMOTE, yeo-johnson
# transformation and plain 10-fold kfold cross-validation. These are the same
# arguments as the kfold experiment above.
setup_pre_processing_A = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='iterative',
    numeric_iterative_imputer='lightgbm',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='yeo-johnson',
    # cross-validation
    fold_strategy='kfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'A_pre_processing',
    setup_pre_processing_A,
)
models_comparison_A, pre_processed_dataset_A = result

In [None]:
# Final candidate B: iterative imputation (lightgbm), SMOTE, quantile
# transformation and stratified 10-fold cross-validation.
setup_pre_processing_B = setup(
    dataset,
    target='Label',
    # timestamp expansion
    date_features=['Timestamp'],
    create_date_columns=['hour', 'minute', 'second', 'day', 'month'],
    # missing-value imputation
    imputation_type='iterative',
    numeric_iterative_imputer='lightgbm',
    # class-imbalance handling
    fix_imbalance=True,
    fix_imbalance_method='SMOTE',
    # feature transformation
    transformation=True,
    transformation_method='quantile',
    # cross-validation
    fold_strategy='stratifiedkfold',
    fold=10,
)

# Compare the candidate models and save the transformed dataset; the helper
# returns (comparison table, transformed dataset).
result = test_and_save_pre_processing_approach(
    'B_pre_processing',
    setup_pre_processing_B,
)
models_comparison_B, pre_processed_dataset_B = result

In [None]:
# Final comparison passed to compare_metrics, keyed by display name:
# candidates A and B versus the generic run.
Teste_Final = dict([
    ("Pre-processamento A", models_comparison_A),
    ("Pre-processamento B", models_comparison_B),
    ("Generica", models_comparison_generic),
])

compare_metrics(Teste_Final, 'teste-final')

In [None]:
# Save both final transformed datasets as CSV under explicit names. The helper
# prefixes each file name with dataset_prefix and appends '.csv'.
save_pre_processed_dataset(df=pre_processed_dataset_A, name="pre_processed_dataset_A")
save_pre_processed_dataset(df=pre_processed_dataset_B, name="pre_processed_dataset_B")