# Libraries

In [None]:
#Classification Methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score

#Tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from scipy.sparse import csr_matrix 
import string 
import time as tm
import spacy 
import os

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Feature importance
from yellowbrick.model_selection import FeatureImportances

import warnings
warnings.filterwarnings('ignore')

# Functions

In [None]:
def classifier_metrics(X_train,X_test,y_train,y_test,CV=True,models=None):
    """Fit, score and optionally cross-validate a collection of classifiers.

    For every estimator the function prints the training and prediction
    times, the hold-out metrics (accuracy, weighted F1 / recall / precision,
    MSE on the class codes, ROC-AUC and Cohen's kappa) and, when ``CV`` is
    truthy, the mean and standard deviation of a 10-fold cross-validated
    accuracy computed on the train and test rows stacked together.

    Parameters
    ----------
    X_train, X_test : array-like or sparse matrix
        Feature matrices; they are converted with ``csr_matrix``.
    y_train, y_test : array-like
        Class labels for the two splits.
    CV : bool, default True
        Whether to run the 10-fold cross-validation step.
    models : iterable of estimators, optional
        Estimators to evaluate. When omitted, the notebook-level
        ``classifiers`` list is used, as before.

    Returns
    -------
    None
        All results are printed.
    """
    # Local import: stacks sparse matrices without going through a dense copy.
    from scipy.sparse import vstack

    # Estimators whose repr contains one of these names are fed dense arrays.
    dense_only = ("GaussianNB", "LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis")

    estimators = classifiers if models is None else models

    class_values = np.unique(np.asarray(y_train))
    # isinstance also recognises numpy.str_ labels, which the previous
    # ``type(...).__name__ == 'str'`` comparison missed.
    labels_are_text = isinstance(class_values[0], str)
    is_multiclass = len(class_values) > 2

    # Convert once instead of rebinding the arguments on every iteration.
    X_train_sparse = csr_matrix(X_train)
    X_test_sparse = csr_matrix(X_test)

    def evaluate(model, X_tr, X_te, dense):
        """Print hold-out (and optional CV) metrics for one estimator."""
        print("\nHold-Out in process...")
        start_time = tm.time()
        model.fit(X_tr, y_train)
        print("Time, Training: {0:.4f} [seconds]".format(tm.time() - start_time))

        start_time = tm.time()
        y_pred = model.predict(X_te)
        print("Time, Prediction: {0:.4f} [seconds]".format(tm.time() - start_time))

        print('accuracy_score: {0:.4f}'.format(accuracy_score(y_test, y_pred)))
        print('f1_score: {0:.4f}'.format(f1_score(y_test, y_pred, average='weighted')))
        print('recall_score: {0:.4f}'.format(recall_score(y_test, y_pred, average='weighted')))
        print('precision_score: {0:.4f}'.format(precision_score(y_test, y_pred, average='weighted')))

        if labels_are_text:
            # MSE needs numbers, so text classes are mapped to integer codes.
            le = LabelEncoder()
            le.fit(class_values)
            mse_s = MSE(le.transform(y_test), le.transform(y_pred))
        else:
            mse_s = MSE(y_test, y_pred)
        print('MSE: {0:.4f}'.format(mse_s))

        y_pred_proba = model.predict_proba(X_te)
        if not is_multiclass:
            # Binary case: score with the second probability column only.
            y_pred_proba = y_pred_proba[:, 1]
        roc_s = roc_auc_score(y_test, y_pred_proba, multi_class='ovo', average='weighted')
        print('ROC_AUC: {0:.4f}'.format(roc_s))

        print('CK: {0:.4f}'.format(cohen_kappa_score(y_test, y_pred)))

        if CV:
            print('\nCross-Validation in process...')
            start_time = tm.time()
            kfold = model_selection.KFold(n_splits=10)
            y_CV = np.concatenate((y_train, y_test))
            if dense:
                X_CV = np.concatenate((X_tr, X_te))
            else:
                X_CV = vstack((X_tr, X_te), format="csr")
            cv_results = np.array(model_selection.cross_val_score(
                model, X_CV, y_CV, cv=kfold, scoring='accuracy', n_jobs=-3))

            # Folds that produced NaN are left out of the summary.
            cv_results = cv_results[~np.isnan(cv_results)]
            print("Time, CV: {0:.4f} [seconds]".format(tm.time() - start_time))
            print('CV: {0:.4f} {1:.4f}'.format(cv_results.mean(), cv_results.std()))

    for model in estimators:
        print ("---------------------------------------------------------------------------------\n")
        description = str(model)
        print(description)
        if any(tag in description for tag in dense_only):
            evaluate(model, X_train_sparse.toarray(), X_test_sparse.toarray(), dense=True)
        else:
            evaluate(model, X_train_sparse, X_test_sparse, dense=False)
        print()


In [None]:
# Classification report
def CR_viz(x,y):
    """Draw, display and save a Yellowbrick classification report as PDF.

    The function reads these notebook-level names, which must exist before
    it is called: ``model_selected``, ``classes``, ``model_name``,
    ``X_train``, ``y_train``, ``X_test``, ``y_test`` and ``path_figures``.

    Parameters
    ----------
    x, y : number
        Figure width and height, passed to ``plt.figure(figsize=(x, y))``.
    """
    fig = plt.figure(figsize=(x,y))
    visualizer = ClassificationReport(model_selected, classes=classes, support=True,
                                      cmap='Blues', title="Classification Report - "+model_name)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    # show() is the current name of the deprecated poof(); the feature
    # importance cells in this notebook already use show().
    visualizer.show()
    fig.show()
    fig.savefig(path_figures+"/"+model_name+"_CR"+".pdf", bbox_inches = "tight")

# Confusion Matrix
def CM_viz(y_test, y_pred, classes, name,
           path_img_base='./images', nrows=1, ncols=1, size_text_legend=25,
           size_text_title=25, title="",
           size_text_xy_labels=25, size_text_xy_tick=25,
           size_num_inter=25):
    """Plot a confusion matrix as an annotated seaborn heatmap.

    When ``nrows == ncols == 1`` a new 20x20 figure is created and the plot
    is saved as ``<name>_CM.pdf``; otherwise the heatmap is drawn on the
    current axes and nothing is saved.

    Parameters
    ----------
    y_test, y_pred : array-like
        True and predicted labels passed to ``confusion_matrix``.
    classes : array-like
        Class labels; their unique values are used as tick labels.
    name : str
        Prefix of the saved PDF file.
    path_img_base : str
        Directory that is created if missing.
    nrows, ncols : int
        Grid size; only the 1x1 case creates a figure and saves it.
    size_text_legend : int
        Accepted for compatibility; not used by the function.
    size_text_title, size_text_xy_labels, size_text_xy_tick, size_num_inter : int
        Font sizes of the title, axis labels, tick labels and cell numbers.
    title : str
        Axes title.

    Returns
    -------
    matplotlib Axes
        The axes returned by ``sns.heatmap``.
    """
    os.makedirs(path_img_base, exist_ok=True)

    standalone = nrows == 1 and ncols == 1
    if standalone:
        plt.figure(figsize=(20, 20))

    conf = confusion_matrix(y_test, y_pred)
    tick_labels = np.unique(classes)
    annot_kws = {'fontsize': size_num_inter, 'verticalalignment': 'center'}
    ax = sns.heatmap(conf, annot=True, cbar=False, cmap='Blues', fmt='d', annot_kws=annot_kws,
                     xticklabels=tick_labels, yticklabels=tick_labels)
    ax.xaxis.set_tick_params(labelsize=size_text_xy_tick, rotation=90)
    ax.yaxis.set_tick_params(labelsize=size_text_xy_tick, rotation=0)
    ax.set_xlabel('Predicted Values', fontsize=size_text_xy_labels)
    ax.set_ylabel('Actual Values', fontsize=size_text_xy_labels)
    ax.set_title(title, fontsize=size_text_title)

    if standalone:
        ax.figure.subplots_adjust(right=0.8)
        # NOTE(review): the file is written to the notebook-level
        # `path_figures`, not to `path_img_base` created above - confirm
        # which directory is intended before changing it.
        ax.figure.savefig(path_figures+"/"+name+"_CM"+".pdf", bbox_inches = "tight", format='pdf')
    return ax

In [None]:
# Output folder for saved figures; exist_ok avoids the check-then-create race.
path_figures = "../images"
os.makedirs(path_figures, exist_ok=True)


# Loading data

In [None]:
# Folder holding the dataset CSV files (relative path)
path_folder_data = "../DBs"

In [None]:
# Read the IDSAI dataset into a DataFrame and display it
path = f"{path_folder_data}/IDSAI.csv"
df_IDSAI = pd.read_csv(path)
df_IDSAI

In [None]:
# Read the Bot-IoT dataset into a DataFrame and display it
path = f"{path_folder_data}/Bot-IoT.csv"
df_BotIoT = pd.read_csv(path)
df_BotIoT

In [None]:
# Class distribution in IDSAI: number of rows per `tipo_ataque` value
df_IDSAI.groupby("tipo_ataque").size()

In [None]:
# Class distribution in Bot-IoT: number of rows per `tipo_ataque` value
df_BotIoT.groupby("tipo_ataque").size()

In [None]:
# Check that both datasets expose the same columns in the same order.
# Index.equals also handles frames with a different number of columns,
# where the elementwise `==` on the value arrays fails.
print("Same columns in DBs: ", df_BotIoT.columns.equals(df_IDSAI.columns))
print("Number of columns: ", len(df_BotIoT.columns))
list(df_BotIoT.columns.values)

# IDSAI dataset

In [None]:
# Build the feature matrix: remove both label columns and the
# address/port/protocol columns (features not recommended in literature)
features = df_IDSAI.drop(
    columns=[
        'label', 'tipo_ataque',
        'ip_src', 'ip_dst', 'port_src', 'port_dst', 'protocols',
    ]
)

In [None]:
# Notebook display of the `features` DataFrame
features

In [None]:
# Extract the label columns as arrays: `label` for the binary task,
# `tipo_ataque` for the multiclass task
labels = df_IDSAI.copy()
labels_binary, labels_multiclass = (
    labels[column].values for column in ('label', 'tipo_ataque')
)

In [None]:
# Notebook display of the binary label array
labels_binary

In [None]:
# Notebook display of the multiclass label array
labels_multiclass

## Scenario 1

### Initial model exploration

In [None]:
# Stratified 80/20 hold-out split on the binary labels
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_binary,
    test_size=0.2,
    random_state=21,
    stratify=labels_binary,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Class counts of each split
for split_labels in (y_train, y_test):
    print(np.unique(split_labels, return_counts=True))

In [None]:
# Baseline estimators evaluated with near-default settings
classifiers = [
    DecisionTreeClassifier(),
    ExtraTreesClassifier(n_jobs=-1),
    RandomForestClassifier(n_jobs=-1),
    GradientBoostingClassifier(),
    XGBClassifier(eval_metric='mlogloss', n_jobs=-1),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    LogisticRegression(solver='liblinear', n_jobs=-1),
]

# Print hold-out and cross-validation metrics for every estimator above
classifier_metrics(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, CV=True)

### Hyperparameter tuning

#### XGB

In [None]:
# Base estimator whose hyperparameters are searched
model = XGBClassifier(eval_metric='mlogloss', n_jobs=-1)

# Candidate values explored by the exhaustive search
param_grid = dict(
    min_child_weight=[1, 5],
    gamma=[0.5, 1.5, 5],
    subsample=[0.75, 1],
    colsample_bytree=[0.75, 1],
    max_depth=[2, 6],
)

# 3-fold grid search fitted on the training split
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

In [None]:
# Fit XGBoost with the hyperparameters printed by the grid search and
# report its hold-out accuracy
model = XGBClassifier(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.75,
    enable_categorical=False,
    eval_metric='mlogloss',
    gamma=1.5,
    gpu_id=-1,
    importance_type=None,
    interaction_constraints='',
    learning_rate=0.300000012,
    max_delta_step=0,
    max_depth=6,
    min_child_weight=1,
    missing=np.nan,
    monotone_constraints='()',
    n_estimators=100,
    n_jobs=-1,
    num_parallel_tree=1,
    predictor='auto',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=1,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

#### ET

In [None]:
# List the hyperparameter names accepted by ExtraTreesClassifier
ExtraTreesClassifier(n_jobs=-1).get_params().keys()

In [None]:
# Base estimator whose hyperparameters are searched
model = ExtraTreesClassifier(n_jobs=-1)

# Candidate hyperparameter values. This must be a plain dict: a trailing
# comma after the closing brace would turn it into a 1-tuple, which recent
# scikit-learn versions reject as `param_grid`.
param_grid = {
    'n_estimators': [100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 50, 80, None],
}

# 3-fold grid search fitted on the training split
grid = GridSearchCV(model, param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))



In [None]:
# Fit Extra Trees with the hyperparameters chosen by the grid search and
# report its hold-out accuracy
model = ExtraTreesClassifier(criterion='entropy', max_depth=50, n_jobs=-1).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

#### DT

In [None]:
# List the hyperparameter names accepted by DecisionTreeClassifier
DecisionTreeClassifier().get_params().keys()

In [None]:
# Base estimator whose hyperparameters are searched
model = DecisionTreeClassifier()

# Candidate hyperparameter values. This must be a plain dict: a trailing
# comma after the closing brace would turn it into a 1-tuple, which recent
# scikit-learn versions reject as `param_grid`.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 30, 40, 50, 100, 150, 200, None],
    'random_state': [32, 64],
}

# 3-fold grid search fitted on the training split
grid = GridSearchCV(model, param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))



In [None]:
# Fit the decision tree with the hyperparameters chosen by the grid search
# and report its hold-out accuracy
model = DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=32).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

#### RF

In [None]:
# List the hyperparameter names accepted by RandomForestClassifier
RandomForestClassifier(n_jobs=-1).get_params().keys()

In [None]:
# Base estimator whose hyperparameters are searched
model = RandomForestClassifier(n_jobs=-1)

# Candidate hyperparameter values. This must be a plain dict: a trailing
# comma after the closing brace would turn it into a 1-tuple, which recent
# scikit-learn versions reject as `param_grid`.
param_grid = {
    'n_estimators': [60, 100, 120, 150],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 20, 40, None],
    'random_state': [32, 64],
}

# 3-fold grid search fitted on the training split
grid = GridSearchCV(model, param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))



In [None]:
# Fit the random forest with the hyperparameters chosen by the grid search
# and report its hold-out accuracy
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=20,
    n_estimators=150,
    n_jobs=-1,
    random_state=32,
).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

#### GB

In [None]:
# List the hyperparameter names accepted by GradientBoostingClassifier
GradientBoostingClassifier().get_params().keys()

In [None]:
# Base estimator whose hyperparameters are searched
model = GradientBoostingClassifier()

# Candidate hyperparameter values. This must be a plain dict: a trailing
# comma after the closing brace would turn it into a 1-tuple, which recent
# scikit-learn versions reject as `param_grid`.
param_grid = {
    "n_estimators": [50, 100, 150],
    "max_depth": [1, 3, 5, 9],
    'random_state': [32, 64],
}

# 3-fold grid search fitted on the training split
grid = GridSearchCV(model, param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))



In [None]:
# Fit gradient boosting with the hyperparameters chosen by the grid search
# and report its hold-out accuracy
model = GradientBoostingClassifier(max_depth=9, random_state=64).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

### Best models for binary classification

In [None]:
# Selected estimator configurations for the binary task
classifiers = [
    XGBClassifier(eval_metric='mlogloss', n_jobs=-1, random_state=32),
    ExtraTreesClassifier(criterion='entropy', max_depth=50, n_jobs=-1, random_state=32),
    DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=32),
    RandomForestClassifier(criterion='entropy', max_depth=20, n_estimators=150, n_jobs=-1, random_state=32),
    GradientBoostingClassifier(max_depth=9, random_state=64),
]

# Print hold-out and cross-validation metrics for every estimator above
classifier_metrics(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, CV=True)

### Feature importance binary classification

In [None]:
# Fit XGBoost and report its hold-out accuracy
model = XGBClassifier(eval_metric='mlogloss', n_jobs=-1, random_state=32).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()

In [None]:
# Fit Extra Trees and report its hold-out accuracy
model = ExtraTreesClassifier(
    criterion='entropy', max_depth=50, n_jobs=-1, random_state=32
).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()

In [None]:
# Fit the decision tree and report its hold-out accuracy
model = DecisionTreeClassifier(
    criterion='entropy', max_depth=20, random_state=32
).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()

In [None]:
# Fit the random forest and report its hold-out accuracy
model = RandomForestClassifier(
    criterion='entropy', max_depth=20, n_estimators=150, n_jobs=-1, random_state=32
).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()

In [None]:
# Fit gradient boosting and report its hold-out accuracy
model = GradientBoostingClassifier(max_depth=9, random_state=64).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()

## Scenario 2

In [None]:
# Stratified 80/20 hold-out split on the multiclass labels
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels_multiclass,
    test_size=0.2,
    random_state=21,
    stratify=labels_multiclass,
)

# Encode the class names as integers; the encoder is fitted on the
# training labels only
le_labels = LabelEncoder().fit(y_train)
y_train = le_labels.transform(y_train)
y_test = le_labels.transform(y_test)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# Class counts of each split
for split_labels in (y_train, y_test):
    print(np.unique(split_labels, return_counts=True))

In [None]:
# Baseline estimators evaluated with near-default settings
classifiers = [
    DecisionTreeClassifier(),
    ExtraTreesClassifier(n_jobs=-1),
    RandomForestClassifier(n_jobs=-1),
    GradientBoostingClassifier(),
    XGBClassifier(eval_metric='mlogloss', n_jobs=-1),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    LogisticRegression(solver='liblinear', n_jobs=-1),
]

# Print hold-out and cross-validation metrics for every estimator above
classifier_metrics(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, CV=True)

### Hyperparameter tuning

#### XGB

In [None]:
# Base estimator whose hyperparameters are searched
model = XGBClassifier(eval_metric='mlogloss', n_jobs=-1)

# Candidate values explored by the exhaustive search
param_grid = dict(
    min_child_weight=[1, 5],
    gamma=[0.5, 1.5, 5],
    colsample_bytree=[0.75, 1],
    max_depth=[2, 6],
)

# 3-fold grid search fitted on the training split
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

In [None]:
# Fit XGBoost on the multiclass split and report its hold-out accuracy
model = XGBClassifier(eval_metric='mlogloss', n_jobs=-1).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

#### ET

In [None]:
# List the hyperparameter names accepted by ExtraTreesClassifier
ExtraTreesClassifier(n_jobs=-1).get_params().keys()

In [None]:
# Base estimator whose hyperparameters are searched
model = ExtraTreesClassifier(n_jobs=-1)

# Candidate values explored by the exhaustive search
param_grid = dict(
    n_estimators=[100, 150, 200],
    criterion=['gini', 'entropy'],
    max_depth=[10, 50, 80, None],
    random_state=[32, 64],
)

# 3-fold grid search fitted on the training split
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

In [None]:
# Fit Extra Trees with the hyperparameters chosen by the grid search and
# report its hold-out accuracy
model = ExtraTreesClassifier(max_depth=50, n_estimators=150, n_jobs=-1).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

#### DT

In [None]:
# List the hyperparameter names accepted by DecisionTreeClassifier
DecisionTreeClassifier().get_params().keys()

In [None]:
# Base estimator whose hyperparameters are searched
model = DecisionTreeClassifier()

# Candidate values explored by the exhaustive search
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[10, 20, 30, 40, 50, 100, 150, 200, None],
    random_state=[32, 64],
)

# 3-fold grid search fitted on the training split
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

In [None]:
# Fit the decision tree with the hyperparameters chosen by the grid search
# and report its hold-out accuracy
model = DecisionTreeClassifier(max_depth=20, random_state=32).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

#### RF

In [None]:
# List the hyperparameter names accepted by RandomForestClassifier
RandomForestClassifier(n_jobs=-1).get_params().keys()

In [None]:
# Base estimator whose hyperparameters are searched
model = RandomForestClassifier(n_jobs=-1)

# Candidate values explored by the exhaustive search
param_grid = dict(
    n_estimators=[60, 100, 120, 150],
    criterion=['gini', 'entropy'],
    max_depth=[10, 20, 40, None],
    random_state=[32, 64],
)

# 3-fold grid search fitted on the training split
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

In [None]:
# Fit the random forest with the hyperparameters chosen by the grid search
# and report its hold-out accuracy
model = RandomForestClassifier(
    max_depth=20, n_estimators=120, n_jobs=-1, random_state=64
).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

#### GB

In [None]:
# List the hyperparameter names accepted by GradientBoostingClassifier
GradientBoostingClassifier().get_params().keys()

In [None]:
# Base estimator whose hyperparameters are searched
model = GradientBoostingClassifier()

# Candidate values explored by the exhaustive search
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[1, 3, 5, 9],
    random_state=[32, 64],
)

# 3-fold grid search fitted on the training split
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)

# Best estimator found by the search
print(grid.best_estimator_)

# Hold-out evaluation of the best estimator
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

In [None]:
# Fit gradient boosting with default settings and report its hold-out accuracy
model = GradientBoostingClassifier().fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

### Best models for multiclass classification

In [None]:
# Selected estimator configurations for the multiclass task
classifiers = [
    XGBClassifier(eval_metric='mlogloss', n_jobs=-1),
    ExtraTreesClassifier(max_depth=50, n_estimators=150, n_jobs=-1),
    DecisionTreeClassifier(max_depth=20, random_state=32),
    RandomForestClassifier(max_depth=20, n_estimators=120, n_jobs=-1, random_state=64),
    GradientBoostingClassifier(),
]

# Print hold-out and cross-validation metrics for every estimator above
classifier_metrics(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, CV=True)

### Feature importance multiclass classification

In [None]:
# Fit XGBoost and report its hold-out accuracy
model = XGBClassifier(eval_metric='mlogloss', n_jobs=-1, random_state=32).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()

In [None]:
# Fit Extra Trees and report its hold-out accuracy
model = ExtraTreesClassifier(max_depth=50, n_estimators=150, n_jobs=-1).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()


In [None]:
# Fit the decision tree and report its hold-out accuracy
model = DecisionTreeClassifier(max_depth=20, random_state=32).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()


In [None]:
# Fit the random forest and report its hold-out accuracy
model = RandomForestClassifier(
    max_depth=20, n_estimators=120, n_jobs=-1, random_state=64
).fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()


In [None]:
# Fit gradient boosting and report its hold-out accuracy
model = GradientBoostingClassifier().fit(X_train, y_train)
print("Accuracy: ", model.score(X_test, y_test))

# Show the feature importances (top 19 features)
viz = FeatureImportances(model, topn=19)
viz.fit(X_train, y_train)
viz.show()


# External validation using Bot-IoT dataset

In [None]:
# Read the IDSAI CSV again for the external-validation experiment
IDSAIpath_folder_data = "../DBs"
IDSAIpath = f"{IDSAIpath_folder_data}/IDSAI.csv"
IDSAIdf = pd.read_csv(IDSAIpath)

In [None]:
# Feature matrix: everything except the label columns and the
# address/port/protocol columns
IDSAIfeatures = IDSAIdf.drop(
    columns=['label', 'tipo_ataque', 'ip_src', 'ip_dst', 'port_src', 'port_dst', 'protocols']
)
# Binary labels as an array
IDSAIlabels = IDSAIdf['label'].values

In [None]:
# Load Bot-IoT for the external-validation experiment.
# The file name matches the 'Bot-IoT.csv' read earlier in the notebook;
# the previous 'Bot-Iot.csv' spelling fails on case-sensitive filesystems.
BotIoTpath_folder_data = "../DBs"
BotIoTpath = BotIoTpath_folder_data+'/Bot-IoT.csv'
BotIoTdf=pd.read_csv(BotIoTpath)

In [None]:
# Feature matrix: everything except the label columns and the
# address/port/protocol columns
BotIoTfeatures = BotIoTdf.drop(
    columns=['label', 'tipo_ataque', 'ip_src', 'ip_dst', 'port_src', 'port_dst', 'protocols']
)
# Binary labels as an array
BotIoTlabels = BotIoTdf['label'].values

## Scenario 3

In [None]:
# Train on IDSAI, predict on Bot-IoT

# Estimators used for the cross-dataset evaluation
classifiers = [
    XGBClassifier(eval_metric='mlogloss', n_jobs=-1, random_state=32),
    ExtraTreesClassifier(criterion='entropy', max_depth=50, n_jobs=-1, random_state=32),
    DecisionTreeClassifier(random_state=32),
    RandomForestClassifier(criterion='entropy', max_depth=20, n_estimators=150, n_jobs=-1, random_state=32),
    GradientBoostingClassifier(),
]

# Print the metrics: IDSAI is the training set, Bot-IoT the test set.
# NOTE(review): with CV=True, classifier_metrics cross-validates on the
# IDSAI and Bot-IoT rows pooled together - confirm that is intended here.
classifier_metrics(
    X_train=IDSAIfeatures,
    X_test=BotIoTfeatures,
    y_train=IDSAIlabels,
    y_test=BotIoTlabels,
    CV=True,
)