In [3]:
import pandas as pd
import numpy as np
from pycaret.datasets import get_data
from pycaret.classification import *
import matplotlib.pyplot as plt
import os

In [4]:
pd.set_option('display.max_rows', None, 'display.max_columns', None)

dir_path = os.path.join('..', 'Datasets', 'CSE-CIC-IDS2018', 'pre-processed')

# Define the path to the folder containing the CSV files
train_file_path = os.path.join(os.getcwd(), dir_path,'train_dataset_treated.parquet')
test_file_path = os.path.join(os.getcwd(), dir_path, 'test_dataset_treated.parquet')
# Import csv to pandas
train_dataset = pd.read_parquet(train_file_path)
test_dataset = pd.read_parquet(test_file_path)

In [5]:
len(train_dataset)

8399452

In [None]:
setup = setup(
    train_dataset, 
    target = 'Label',
    test_data = test_dataset,
    preprocess=False,
    fold_strategy = 'stratifiedkfold',
    fold = 10,
    index=False,
    n_jobs=-1
    )

In [None]:
from sklearn.metrics import precision_recall_curve, auc, fbeta_score, confusion_matrix

def pr_auc(y_true, y_pred, **kwargs):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

def fbeta(y_true, y_pred, beta=1, **kwargs):
    return fbeta_score(y_true, y_pred, beta=beta)


def fpr(y_true, y_pred, **kwargs):
    # Calcula a matriz de confusão
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # Calcula o FPR
    return fp / (fp + tn)

In [None]:
setup.add_metric('f2', 'F2', fbeta, greater_is_better=True, beta=2)
setup.add_metric('pr_auc', 'PR-AUC', pr_auc, greater_is_better=True)
setup.add_metric('fpr', 'FPR', fpr, greater_is_better=False)

In [None]:
best_models = compare_models(exclude = ['catboost', 'knn'], n_select = 5, sort = 'F2')

In [None]:
best_models

In [19]:
%store setup

Stored 'setup' (ClassificationExperiment)


In [34]:
df_models_comparison_B = pull().copy()

In [35]:
df_models_comparison_B

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,F2,PR-AUC,FPR,TT (Sec)
dt,Decision Tree Classifier,0.969,0.974,0.9635,0.9743,0.9688,0.9381,0.9382,0.9688,0.978,0.0254,18.75
lightgbm,Light Gradient Boosting Machine,0.9669,0.9919,0.9438,0.9895,0.9661,0.9337,0.9347,0.9661,0.9807,0.01,58.133
gbc,Gradient Boosting Classifier,0.9651,0.9863,0.94,0.9898,0.9642,0.9303,0.9315,0.9642,0.9799,0.0097,374.776
ada,Ada Boost Classifier,0.9538,0.9794,0.9374,0.9691,0.953,0.9075,0.908,0.953,0.9689,0.0299,77.964
lr,Logistic Regression,0.9438,0.962,0.9326,0.954,0.9432,0.8876,0.8879,0.9432,0.9602,0.0449,17.5
svm,SVM - Linear Kernel,0.9117,0.956,0.8725,0.9468,0.9081,0.8234,0.826,0.9081,0.9415,0.049,4.593
qda,Quadratic Discriminant Analysis,0.8915,0.9498,0.9263,0.8662,0.8952,0.7829,0.7849,0.8952,0.9147,0.1434,8.778
ridge,Ridge Classifier,0.8881,0.9522,0.8565,0.9143,0.8845,0.7762,0.7778,0.8845,0.9213,0.0803,2.36
lda,Linear Discriminant Analysis,0.8862,0.9479,0.8594,0.9081,0.883,0.7724,0.7735,0.883,0.9189,0.087,5.125
nb,Naive Bayes,0.8137,0.898,0.8602,0.787,0.8219,0.6273,0.6301,0.8219,0.8585,0.2329,2.616


In [45]:
names_list = ["dt", 'lgbm', 'gradient_boosting', 'ada_boost', 'logistic_regression']

In [48]:
# Salvar cada modelo individualmente
for i, model in enumerate(best_models):
    save_model(model, f'{names_list[i]}_model')

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Saved


In [18]:
# Define the path to save the dataset pre-processed
file_path = os.path.join(os.getcwd(), '..', 'Scripts', 'models_comparison', 'pycaret-models-comparison.csv')

# Save the dataset pre-processed
df_models_comparison_B.to_csv(file_path ,index=False)

## Otimização de hiperparâmetros

In [51]:
dt_model = best_models[0]

In [1]:
dt_model_tuned = tune_model(dt_model, optimize = 'F2')

NameError: name 'tune_model' is not defined

## Cálculo do Score

In [49]:
df = df_models_comparison_B.copy()

df['Score'] = df.apply(lambda row: 0.1*row['Recall'] + 0.1*row['Prec.'] + 0.2*(1-row['FPR']) + 0.3*row['F2'] + 0.3*row['PR-AUC'], axis=1)
df = df[['Prec.','Recall', 'FPR', 'PR-AUC', 'F2', 'Score']]

def compute_score(fbeta, pr_auc, fpr, recall, precision):
    return 0.3*fbeta + 0.3*pr_auc + 0.2*(1-fpr) + 0.1*recall + 0.1*precision


def evaluate_algorithms(df):
    df.rename(columns={'Prec.': 'Precision'}, inplace=True)
    df['Score'] = df.apply(
        lambda row: compute_score(
            row['F2'],
            row['PR-AUC'],
            row['FPR'],
            row['Recall'],
            row['Precision']
        ),
        axis = 1 
    )

    df = df[['Model', 'Precision', 'Recall', 'FPR', 'PR-AUC', 'F2', 'Score']]

    return df.copy()

df_results_B = evaluate_algorithms(df_models_comparison_B)
df_results_B = df_results_B.sort_values(by=['Score'], ascending=False)
df_final = df_results_B.copy()
# Define the path to save the dataset pre-processed
file_path = os.path.join(os.getcwd(), '..', 'Scripts', 'models_comparison', 'pycaret-models-comparison.csv')

# Save the dataset pre-processed
df_final.to_csv(file_path ,index=False)