In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import make_scorer, fbeta_score, precision_score, precision_recall_curve, auc, roc_curve
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import os

In [2]:
dir_path = '../Datasets/CSE-CIC-IDS2018/pre-processed/2024-07-28_22-48-48/'
os.listdir(dir_path)

['test_dataset_treated.parquet',
 'train_dataset_treated.parquet',
 '.ipynb_checkpoints']

In [3]:
train_df = pd.read_parquet(os.path.join(dir_path, 'train_dataset_treated.parquet'))
test_df = pd.read_parquet(os.path.join(dir_path, 'test_dataset_treated.parquet'))

In [4]:
# Separação das features e do target para o dataset de treino
X_train = train_df.drop('Label', axis=1)
y_train = train_df['Label']

In [5]:
# Separação das features e do target para o dataset de teste
X_test = test_df.drop('Label', axis=1)
y_test = test_df['Label']

In [6]:
# Definição da métrica F2
f2_scorer = make_scorer(fbeta_score, beta=2)

In [7]:
# Configuração da validação cruzada estratificada
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
def grid_search(model, param_grid):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f2_scorer, cv=cv, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Melhores parâmetros e melhor score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f'Melhores parâmetros: {best_params}')
    print(f'Melhor F2 score durante a validação cruzada: {best_score}')

    return grid_search
    

In [None]:
def predict_test(grid_search):
    # Avaliação do modelo no conjunto de teste
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    
    return y_pred, y_pred_proba

In [None]:
def get_metrics(y_pred):
    metrics = {}
    # Cálculo das métricas
    metrics['f2_test_score'] = fbeta_score(y_test, y_pred, beta=2)
    metrics['precision'] = precision_score(y_test, y_pred)
    metrics['precision_vals'], metrics['recall_vals'], _ = precision_recall_curve(y_test, y_pred_proba)
    metrics['prauc'] = auc(metrics['recall_vals'], metrics['precision_vals'])
    metrics['fpr'], metrics['tpr'], _ = roc_curve(y_test, y_pred_proba)
    metrics['false_positive_rate'] =  metrics['fpr'][0]
    return metrics

## Definições das matrizes de hiperparâmetros

In [10]:
# Definição do Random Forest e dos hiperparâmetros para Grid Search
rf = RandomForestClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    # 'max_depth': [None, 10, 20, 30],
    # 'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4]
}

In [None]:
# Definição do AdaBoost e dos hiperparâmetros para Grid Search
adb = AdaBoostClassifier(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    # 'learning_rate': [0.01, 0.1, 1.0]
}

In [None]:
# Definição do XGBoost e dos hiperparâmetros para Grid Search
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

## Otimização de Hiperparâmetros

In [None]:
gs_rf = grid_search(rf, param_grid)

In [29]:
gs_rf.best_params_

{'n_estimators': 300}

In [30]:
gs_rf.best_score_

0.9627046480129355

In [44]:
y_pred_rf, y_pred_proba_rf = predict_test(gs_rf)

In [45]:
metrics_rf = get_metrics(y_pred_rf)

{'f2_test_score': 0.9401357532558307,
 'precision': 0.9602992652391914,
 'precision_vals': array([0.26318772, 0.28925582, 0.30221248, ..., 0.99219528, 0.99217579,
        1.        ]),
 'recall_vals': array([1.        , 0.99979884, 0.99979884, ..., 0.00917504, 0.00915201,
        0.        ]),
 'prauc': 0.9803298398223443,
 'fpr': array([0.00000000e+00, 2.57796422e-05, 2.57796422e-05, ...,
        8.24578859e-01, 8.77511047e-01, 1.00000000e+00]),
 'tpr': array([0.        , 0.00915201, 0.00917504, ..., 0.99979884, 0.99979884,
        1.        ]),
 'false_positive_rate': 0.0}

In [None]:
gs_adb = grid_search(adb, param_grid)

In [18]:
y_pred_adb, y_pred_proba_adb = predict_test(gs_adb)

In [21]:
gs_adb.best_params_

{'n_estimators': 200}

In [19]:
metrics_adb = get_metrics(y_pred_adb)
metrics_adb

{'f2_test_score': 0.9386160875941832,
 'precision': 0.9518063817519583,
 'precision_vals': array([0.26318772, 0.26318782, 0.26318793, ..., 1.        , 1.        ,
        1.        ]),
 'recall_vals': array([1.        , 1.        , 1.        , ..., 0.07397312, 0.03722381,
        0.        ]),
 'prauc': 0.975866486810271,
 'fpr': array([0.        , 0.        , 0.        , ..., 0.99999561, 0.99999835,
        1.        ]),
 'tpr': array([0.        , 0.03722381, 0.07397312, ..., 1.        , 1.        ,
        1.        ]),
 'false_positive_rate': 0.0}

In [None]:
gs_xgb = grid_search(xgb, param_grid)

In [None]:
gs_xgb.best_params_

In [None]:
gs_xgb.best_score_

In [None]:
y_pred_xgb, y_pred_proba_xgb = predict_test(gs_xgb)

In [None]:
metrics_xgb = get_metrics(y_pred_xgb)