# Libraries

In [None]:
#Classification Methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score

#Tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import yellowbrick
from yellowbrick.style.palettes import LINE_COLOR
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from scipy.sparse import csr_matrix 
import string 
import time as tm
import spacy 
import os
from scipy.sparse import csr_matrix 
from yellowbrick.model_selection import FeatureImportances

import warnings
warnings.filterwarnings('ignore')

# Functions

In [None]:
# Output directory for every figure saved by this notebook.
path_figures = "../images"
# exist_ok=True creates the directory only when needed, without a separate
# existence check that can race with another process.
os.makedirs(path_figures, exist_ok=True)


In [None]:
def classifier_metrics(X_train,X_test,y_train,y_test,CV=True):
    """Fit, score and optionally cross-validate every model in ``classifiers``.

    The estimators are read from the module-level list ``classifiers``. For
    each one the function prints training/prediction times, accuracy, weighted
    F1/recall/precision, MSE, ROC AUC and Cohen's kappa on the hold-out data,
    and, when ``CV`` is true, the mean and standard deviation of a 10-fold
    cross-validated accuracy computed on train and test data stacked together.

    Parameters
    ----------
    X_train, X_test :
        Feature matrices; converted to CSR (or to dense arrays for the
        estimators listed in ``dense_only`` below) before fitting.
    y_train, y_test :
        Target vectors. String labels are integer-encoded before the MSE.
    CV : bool
        Whether to run the cross-validation step.

    Returns
    -------
    None
        All results are printed.
    """
    # Local import so the block does not depend on a new file-level import.
    from scipy.sparse import vstack as sparse_vstack

    # Estimators whose repr contains one of these names are fed dense arrays.
    dense_only = ("GaussianNB", "LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis")

    def needs_dense(model):
        model_repr = str(model)
        return any(tag in model_repr for tag in dense_only)

    label_values = np.unique(np.array(y_train))
    # isinstance also matches np.str_ (elements of NumPy unicode arrays),
    # which a comparison against the type name 'str' would miss.
    labels_are_strings = isinstance(label_values[0], str)

    def metrics(model):
        print("\nHold-Out in process...")
        start_time = tm.time()
        model.fit(X_train, y_train)
        TIME = tm.time() - start_time
        print("Time, Training: {0:.4f} [seconds]".format(TIME))
        start_time = tm.time()
        y_pred = model.predict(X_test)
        TIME = tm.time() - start_time
        print("Time, Prediction: {0:.4f} [seconds]".format(TIME))

        accuracy_s = accuracy_score(y_test, y_pred)
        print('accuracy_score: {0:.4f}'.format(accuracy_s))
        f1_s = f1_score(y_test, y_pred, average='weighted')
        print('f1_score: {0:.4f}'.format(f1_s))
        recall_s = recall_score(y_test, y_pred, average='weighted')
        print('recall_score: {0:.4f}'.format(recall_s))
        precision_s = precision_score(y_test, y_pred, average='weighted')
        print('precision_score: {0:.4f}'.format(precision_s))

        if labels_are_strings:
            # MSE needs numbers: encode the class names as integer codes.
            le = LabelEncoder()
            le.fit(list(label_values))
            mse_s = MSE(le.transform(y_test), le.transform(y_pred))
        else:
            mse_s = MSE(y_test, y_pred)
        print('MSE: {0:.4f}'.format(mse_s))

        y_pred_proba = model.predict_proba(X_test)
        if len(label_values) <= 2:
            # Binary problem: score with the second probability column only.
            y_pred_proba = y_pred_proba[:, 1]
        roc_s = roc_auc_score(y_test, y_pred_proba, multi_class='ovo', average='weighted')
        print('ROC_AUC: {0:.4f}'.format(roc_s))

        ck_s = cohen_kappa_score(y_test, y_pred)
        print('CK: {0:.4f}'.format(ck_s))

        if CV:
            print('\nCross-Validation in process...')
            start_time = tm.time()
            kfold = model_selection.KFold(n_splits=10)
            y_CV = np.concatenate((y_train, y_test))
            if needs_dense(model):
                X_CV = np.concatenate((X_train, X_test))
            else:
                # Stack the CSR partitions directly instead of densifying
                # both and converting the result back to sparse.
                X_CV = sparse_vstack([X_train, X_test], format="csr")
            cv_results = np.array(model_selection.cross_val_score(
                model, X_CV, y_CV, cv=kfold, scoring='accuracy', n_jobs=-3))

            # Discard folds whose score came back as NaN.
            cv_results = cv_results[~np.isnan(cv_results)]
            TIME = tm.time() - start_time
            print("Time, CV: {0:.4f} [seconds]".format(TIME))
            print('CV: {0:.4f} {1:.4f}'.format(cv_results.mean(), cv_results.std()))

    for name in classifiers:
        print ("---------------------------------------------------------------------------------\n")
        print(str(name))
        # Normalise to CSR on every iteration (a previous estimator may have
        # left dense arrays behind), then densify only when required.
        X_train = csr_matrix(X_train)
        X_test = csr_matrix(X_test)
        if needs_dense(name):
            X_train = X_train.toarray()
            X_test = X_test.toarray()

        metrics(name)
        print()


In [None]:
# Classification report
def CR_viz(x,y):
    """Render and save the yellowbrick classification report.

    Works entirely on module-level state: ``model_selected``, ``classes``,
    ``model_name``, ``X_train``/``y_train``, ``X_test``/``y_test`` and
    ``path_figures``. ``x`` and ``y`` are the figure width and height.
    The report is saved as ``<path_figures>/<model_name>_CR.pdf``.
    """
    figure = plt.figure(figsize=(x, y))
    report_title = "Classification Report - " + model_name
    visualizer = ClassificationReport(
        model_selected,
        classes=classes,
        support=True,
        cmap='Blues',
        title=report_title,
    )
    # fit() trains the wrapped estimator; score() evaluates it on the hold-out set.
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof()
    figure.show()
    output_file = path_figures + "/" + model_name + "_CR" + ".pdf"
    figure.savefig(output_file, bbox_inches="tight")

# Confusion Matrix
def CM_viz(y_test, y_pred, classes, name,
               path_img_base = './images',nrows=1,ncols=1,size_text_legend=25,size_text_title=25,title="",
           size_text_xy_labels=25,size_text_xy_tick=25,
          size_num_inter=25):
    """Draw a confusion-matrix heatmap and return its axes.

    When ``nrows`` and ``ncols`` are both 1 a new 20x20 figure is created and
    the result is saved as ``<path_figures>/<name>_CM.pdf``; otherwise the
    heatmap is drawn on the current axes and nothing is saved.

    ``path_img_base`` is created if it does not exist.
    NOTE(review): the file is written under the module-level ``path_figures``,
    not under ``path_img_base`` - confirm this is intended.
    ``size_text_legend`` is accepted but not used.
    """
    if not os.path.exists(path_img_base):
        os.makedirs(path_img_base)

    standalone = (nrows == 1 and ncols == 1)
    if standalone:
        # A dedicated figure must exist before seaborn draws on it.
        plt.figure(figsize=(20*ncols, 20*nrows))

    matrix = confusion_matrix(y_test, y_pred)
    tick_names = np.unique(classes)
    cell_text_style = {'fontsize': size_num_inter, 'verticalalignment': 'center'}
    heat_ax = sns.heatmap(matrix, annot=True, cmap='Blues', fmt='d',
                          annot_kws=cell_text_style,
                          xticklabels=tick_names, yticklabels=tick_names)

    # The colorbar is a matplotlib.colorbar.Colorbar; resize its tick labels.
    colorbar = heat_ax.collections[0].colorbar
    colorbar.ax.tick_params(labelsize=size_text_xy_tick)

    heat_ax.xaxis.set_tick_params(labelsize=size_text_xy_tick, rotation=90)
    heat_ax.yaxis.set_tick_params(labelsize=size_text_xy_tick, rotation=0)
    heat_ax.set_xlabel('Predicted Values', fontsize=size_text_xy_labels)
    heat_ax.set_ylabel('Actual Values', fontsize=size_text_xy_labels)
    heat_ax.set_title(title, fontsize=size_text_title)

    if standalone:
        heat_ax.figure.subplots_adjust(right=0.8)
        heat_ax.figure.savefig(path_figures+"/"+name+"_CM"+".pdf", bbox_inches="tight", format='pdf')
    return heat_ax

In [None]:
def score_ci(y_true,y_pred,score_fun,n_bootstraps=1000,confidence_level=0.95,seed=None,reject_one_class_samples=True):
    """Return a score together with its bootstrap confidence interval.

    The point estimate is ``score_fun(y_true, y_pred)`` on the full data; the
    interval bounds and the list of bootstrap scores come from
    ``score_stat_ci`` called with the same settings.

    Returns
    -------
    tuple
        ``(score, ci_lower, ci_upper, scores)``.
    """
    assert len(y_true) == len(y_pred)

    point_estimate = score_fun(y_true, y_pred)
    bootstrap_settings = dict(
        n_bootstraps=n_bootstraps,
        confidence_level=confidence_level,
        seed=seed,
        reject_one_class_samples=reject_one_class_samples,
    )
    # The bootstrap mean (first element) is discarded in favour of the
    # full-sample point estimate computed above.
    _, lower_bound, upper_bound, bootstrap_scores = score_stat_ci(
        y_true=y_true, y_preds=y_pred, score_fun=score_fun, **bootstrap_settings)
    return point_estimate, lower_bound, upper_bound, bootstrap_scores

def score_stat_ci(y_true,y_preds,score_fun,stat_fun=np.mean,n_bootstraps=1000,confidence_level=0.95,
                  seed=None,reject_one_class_samples=True):
    """Bootstrap a confidence interval for a statistic of per-reader scores.

    Each bootstrap round resamples, with replacement, both the rows of
    ``y_preds`` ("readers") and the sample indices, scores every resampled
    reader with ``score_fun`` and reduces those scores with ``stat_fun``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_preds : array-like
        One prediction vector, or a 2-D collection with one row per reader;
        every row must have the same length as ``y_true``.
    score_fun : callable
        Called as ``score_fun(y_true_sample, y_pred_sample)``.
    stat_fun : callable
        Reduces the list of reader scores of one round to a single value.
    n_bootstraps : int
        Number of bootstrap rounds attempted.
    confidence_level : float
        Two-sided confidence level of the interval.
    seed : int or None
        Passed to ``np.random.seed``; note that this reseeds NumPy's global
        random state.
    reject_one_class_samples : bool
        Skip rounds whose resampled ``y_true`` holds fewer than two distinct
        values.

    Returns
    -------
    tuple
        ``(mean_score, ci_lower, ci_upper, scores)`` where ``scores`` is the
        list of per-round statistics.

    Raises
    ------
    ValueError
        If every bootstrap round was rejected, so no interval can be formed.
    """
    y_true = np.array(y_true)
    y_preds = np.atleast_2d(y_preds)
    assert all(len(y_true) == len(y) for y in y_preds)

    np.random.seed(seed)
    scores = []
    for _ in range(n_bootstraps):
        readers = np.random.randint(0, len(y_preds), len(y_preds))
        indices = np.random.randint(0, len(y_true), len(y_true))
        if reject_one_class_samples and len(np.unique(y_true[indices])) < 2:
            continue
        truth_sample = y_true[indices]
        reader_scores = [score_fun(truth_sample, y_preds[r][indices]) for r in readers]
        scores.append(stat_fun(reader_scores))

    if not scores:
        raise ValueError(
            "no bootstrap sample was accepted; cannot compute a confidence interval")

    mean_score = np.mean(scores)
    sorted_scores = np.array(sorted(scores))
    alpha = (1.0 - confidence_level) / 2.0
    # The rounded rank can land one past the end (few accepted samples, or a
    # confidence level close to 1), so clamp it to the last valid position.
    last = len(sorted_scores) - 1
    lower_rank = min(int(round(alpha * len(sorted_scores))), last)
    upper_rank = min(int(round((1.0 - alpha) * len(sorted_scores))), last)
    ci_lower = sorted_scores[lower_rank]
    ci_upper = sorted_scores[upper_rank]
    return mean_score, ci_lower, ci_upper, scores

def ROC_curves(y_test, y_pred_proba, classes, name, size_x, size_y, n_bootstraps=1000, confidence_level=0.95, 
               path_img_base = './images',nrows=1,ncols=1,multi_x=0,multi_y=0,title="",
               size_text_legend=25,size_text_title=25,size_text_xy_labels=25,size_text_xy_tick=25):
    """Plot one ROC curve per class with bootstrap confidence intervals on the AUC.

    The curves are drawn on the module-level ``ax`` object, which the caller
    must create (e.g. with ``plt.subplots``) before calling. Nothing is saved
    or returned.

    Parameters
    ----------
    y_test : array-like
        True labels; label-encoded and one-hot encoded internally.
    y_pred_proba : ndarray
        Class scores, read as ``y_pred_proba[:, i]`` for class ``i``.
        NOTE(review): assumes the columns follow the same class order as the
        encoded ``y_test`` columns - confirm against the estimator.
    classes : array-like
        Class names; the number of distinct entries sets how many curves are
        drawn and ``classes[i]`` labels curve ``i``.
    name, size_x, size_y :
        Accepted for call compatibility; not used.
    n_bootstraps, confidence_level :
        Forwarded to ``score_ci``.
    path_img_base : str
        Directory created if missing (no file is written here).
    nrows, ncols : int
        Layout of the global axes: both >= 2 selects ``ax[multi_x, multi_y]``,
        both <= 1 selects ``ax`` itself, anything else selects ``ax[multi_x]``.
    multi_x, multi_y : int
        Position of the target axes inside the grid or vector.
    title : str
        Axes title.
    size_text_legend, size_text_title, size_text_xy_labels, size_text_xy_tick : int
        Font sizes for the legend, title, axis labels and tick labels.
    """
    os.makedirs(path_img_base, exist_ok=True)

    # One-hot encode the labels with NumPy. This yields the same float matrix
    # as a dense OneHotEncoder over the integer codes, without depending on a
    # keyword whose name differs between scikit-learn releases.
    encoder = LabelEncoder()
    codes = encoder.fit_transform(np.array(y_test))
    y_test = np.eye(len(encoder.classes_))[codes]

    n_classes = len(np.unique(np.array(classes)))

    # ROC curve and area for each class.
    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred_proba[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Micro-average: pool every (sample, class) decision into one curve.
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_pred_proba.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # Macro-average: interpolate every class curve on the union of the
    # false-positive rates, then average the interpolated curves.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Pick the axes to draw on from the global ``ax``.
    if nrows >= 2 and ncols >= 2:
        target_ax = ax[multi_x, multi_y]      # matrix of figures
    elif nrows <= 1 and ncols <= 1:
        target_ax = ax                        # single figure
    else:
        target_ax = ax[multi_x]               # vector of figures

    target_ax.set_facecolor('white')
    target_ax.grid(color='lightgray', linestyle='-')

    for i in range(n_classes):
        score, ci_lower, ci_upper, _ = score_ci(y_test[:, i], y_pred_proba[:, i],
                                                score_fun=roc_auc_score,
                                                n_bootstraps=n_bootstraps,
                                                confidence_level=confidence_level,
                                                seed=42)
        curve_label = "AUC of {:.3f} ({:.1f}% CI: {:.3f}, {:.3f}), ROC curve of {}".format(
            score, confidence_level*100, ci_lower, ci_upper, classes[i])
        target_ax.plot(fpr[i], tpr[i], label=curve_label)

    # The averaged curves are only drawn for multiclass problems.
    if n_classes > 2:
        target_ax.plot(fpr["micro"], tpr["micro"],
                       label='micro-average ROC curve (AUC = {0:0.3f})'.format(roc_auc["micro"]),
                       color='aqua', linestyle=':', linewidth=4)
        target_ax.plot(fpr["macro"], tpr["macro"],
                       label='macro-average ROC curve (AUC = {0:0.3f})'.format(roc_auc["macro"]),
                       color='darkorange', linestyle=':', linewidth=4)

    # Diagonal reference line.
    target_ax.plot([0, 1], [0, 1], linestyle=":", c=LINE_COLOR, linewidth=2)
    target_ax.set_xlim([-0.005, 1.005])
    target_ax.set_ylim([-0.005, 1.005])
    target_ax.xaxis.set_tick_params(labelsize=size_text_xy_tick)
    target_ax.yaxis.set_tick_params(labelsize=size_text_xy_tick)
    target_ax.set_xlabel('False Positive Rate', fontsize=size_text_xy_labels)
    target_ax.set_ylabel('True Positive Rate', fontsize=size_text_xy_labels)
    target_ax.set_title(title, fontsize=size_text_title)
    target_ax.legend(loc="lower right", frameon=True, facecolor='white', fontsize=size_text_legend)


# Loading data

In [None]:
# Relative path of the folder holding the CSV datasets read by the loading cells.
path_folder_data = "../Datasets"

In [None]:
# Read the first dataset and display it as the cell output.
path = "{}/1NID_final_Set1.csv".format(path_folder_data)
df_set1 = pd.read_csv(path)
df_set1

In [None]:
# Read the second dataset and display it as the cell output.
path = "{}/2NID_final_Set2.csv".format(path_folder_data)
df_set2 = pd.read_csv(path)
df_set2

In [None]:
# Read the third dataset file and display it as the cell output.
path = "{}/3NID_final_Set1_2.csv".format(path_folder_data)
df_set1_2 = pd.read_csv(path)
df_set1_2

# Scenario 1

In [None]:
# Feature matrix: df_set1 without its two target columns.
features = df_set1.drop(columns=['label', 'tipo_ataque'])

In [None]:
# Extract the target values from the 'label' column.
labels = df_set1.copy()
labels_binary = labels.loc[:, 'label'].values

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, followed by a report of
# the partition shapes and the per-class counts of each target vector.
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, stratify=labels, test_size=0.2, random_state=21,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

In [None]:
model_name = "Scenario 1 - Decision Tree"
classes = np.unique(["Intrusion", "Normal"])

# Train on the training partition and score the hold-out predictions.
model_selected = DecisionTreeClassifier(random_state=179)
y_pred = model_selected.fit(X_train, y_train).predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Classification report first, then the confusion-matrix heatmap.
title = "Confusion Matrix for {}".format(model_name)
visualization = [
    CR_viz(15, 15),
    CM_viz(y_test, y_pred, classes, name=model_name, title=title,
           path_img_base='./images', nrows=1, ncols=1,
           size_text_legend=25, size_text_title=25,
           size_text_xy_labels=25, size_text_xy_tick=25, size_num_inter=25),
]


In [None]:
model_name = "Scenario 1 - Decision Tree"
classes = np.unique(["Intrusion", "Normal"])

# Train, report hold-out accuracy and keep the class probabilities.
model_selected = DecisionTreeClassifier(random_state=179)
y_pred = model_selected.fit(X_train, y_train).predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))
y_pred_proba = model_selected.predict_proba(X_test)

# ROC_curves draws on the module-level ``ax``, so the figure is created first.
nrows = ncols = 1
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20*ncols, 20*nrows))

title = "ROC Curves for {}".format(model_name)
ROC_curves(y_test, y_pred_proba, classes, name=model_name,
           n_bootstraps=100, confidence_level=0.95,
           size_x=15, size_y=15, path_img_base=path_figures,
           nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
           size_text_legend=30, size_text_title=35,
           size_text_xy_labels=35, size_text_xy_tick=35)

fig.savefig(path_figures+"/"+model_name+"_ROC_AUC"+".pdf", bbox_inches="tight", format='pdf')


# Scenario 2

In [None]:
# Feature matrix: df_set2 without its two target columns.
features = df_set2.drop(columns=['label', 'tipo_ataque'])

In [None]:
# Extract the target values from the 'label' column.
labels = df_set2.copy()
labels_binary = labels.loc[:, 'label'].values

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, followed by a report of
# the partition shapes and the per-class counts of each target vector.
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, stratify=labels, test_size=0.2, random_state=21,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

In [None]:
# Models to evaluate (read by classifier_metrics from this module-level list).
classifiers = [DecisionTreeClassifier()]

# Print the aggregate metrics, including cross-validation.
classifier_metrics(X_train, X_test, y_train, y_test, CV=True)

In [None]:
model_name = "Scenario 2 - Decision Tree"
classes = np.unique(["Intrusion", "Normal"])

# Train on the training partition and score the hold-out predictions.
model_selected = DecisionTreeClassifier()
y_pred = model_selected.fit(X_train, y_train).predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Classification report first, then the confusion-matrix heatmap.
title = "Confusion Matrix for {}".format(model_name)
visualization = [
    CR_viz(15, 15),
    CM_viz(y_test, y_pred, classes, name=model_name, title=title,
           path_img_base='./images', nrows=1, ncols=1,
           size_text_legend=25, size_text_title=25,
           size_text_xy_labels=25, size_text_xy_tick=25, size_num_inter=25),
]

In [None]:
model_name = "Scenario 2 - Decision Tree"
classes = np.unique(["Intrusion", "Normal"])

# Train, report hold-out accuracy and keep the class probabilities.
model_selected = DecisionTreeClassifier()
y_pred = model_selected.fit(X_train, y_train).predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))
y_pred_proba = model_selected.predict_proba(X_test)

# ROC_curves draws on the module-level ``ax``, so the figure is created first.
nrows = ncols = 1
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20*ncols, 20*nrows))

title = "ROC Curves for {}".format(model_name)
ROC_curves(y_test, y_pred_proba, classes, name=model_name,
           n_bootstraps=100, confidence_level=0.95,
           size_x=15, size_y=15, path_img_base=path_figures,
           nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
           size_text_legend=30, size_text_title=35,
           size_text_xy_labels=35, size_text_xy_tick=35)

fig.savefig(path_figures+"/"+model_name+"_ROC_AUC"+".pdf", bbox_inches="tight", format='pdf')

# Scenario 3

In [None]:
# Feature matrix: df_set1_2 without its two target columns.
features = df_set1_2.drop(columns=['label', 'tipo_ataque'])

In [None]:
# Extract the target values from the 'label' column.
labels = df_set1_2.copy()
labels_binary = labels.loc[:, 'label'].values

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, followed by a report of
# the partition shapes and the per-class counts of each target vector.
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, stratify=labels, test_size=0.2, random_state=21,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

In [None]:
# Models to evaluate (read by classifier_metrics from this module-level list).
classifiers = [DecisionTreeClassifier()]

# Print the aggregate metrics, including cross-validation.
classifier_metrics(X_train, X_test, y_train, y_test, CV=True)

In [None]:
# The name carries the scenario number so that the plot titles and the saved
# "<model_name>_CR.pdf" / "<model_name>_CM.pdf" files match the ROC cell of
# this scenario instead of using an unnumbered "Scenario - ..." name.
model_name = "Scenario 3 - Extra Trees"
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
classes = np.unique(["Intrusion","Normal"])

# Train on the training partition and score the hold-out predictions.
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Classification report first, then the confusion-matrix heatmap.
title = "Confusion Matrix for {}".format(model_name)
visualization = [
    CR_viz(15, 15),
    CM_viz(y_test, y_pred, classes, name=model_name, path_img_base='./images',
           nrows=1, ncols=1, size_text_legend=25, size_text_title=25, title=title,
           size_text_xy_labels=25, size_text_xy_tick=25, size_num_inter=25),
]

In [None]:
model_name = "Scenario 3 - Extra Trees"
classes = np.unique(["Intrusion", "Normal"])

# Train, report hold-out accuracy and keep the class probabilities.
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
y_pred = model_selected.fit(X_train, y_train).predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))
y_pred_proba = model_selected.predict_proba(X_test)

# ROC_curves draws on the module-level ``ax``, so the figure is created first.
nrows = ncols = 1
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20*ncols, 20*nrows))

title = "ROC Curves for {}".format(model_name)
ROC_curves(y_test, y_pred_proba, classes, name=model_name,
           n_bootstraps=100, confidence_level=0.95,
           size_x=15, size_y=15, path_img_base=path_figures,
           nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
           size_text_legend=30, size_text_title=35,
           size_text_xy_labels=35, size_text_xy_tick=35)

fig.savefig(path_figures+"/"+model_name+"_ROC_AUC"+".pdf", bbox_inches="tight", format='pdf')


# Scenario 4

In [None]:
# SET 1

# Feature matrix: df_set1 without its two target columns.
features = df_set1.drop(columns=['label', 'tipo_ataque'])

# Extract the target values from the 'label' column.
labels = df_set1.copy()
labels_binary = labels.loc[:, 'label'].values

In [None]:
# SET 2

# Feature matrix: df_set2 without its two target columns.
features_ = df_set2.copy()
features_ = features_.drop(['label', 'tipo_ataque'], axis=1)

# Extract the targets from the same frame as features_ (df_set2), so every
# row of features_ is paired with its own label rather than with df_set1's.
labels_ = df_set2.copy()
labels_binary_ = labels_['label'].values

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, followed by a report of
# the partition shapes and the per-class counts of each target vector.
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, stratify=labels, test_size=0.2, random_state=21,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

In [None]:
# Evaluate on the features_/labels_binary_ pair instead of the hold-out split.
X_test, y_test = features_, labels_binary_

# Models to evaluate (read by classifier_metrics from this module-level list).
classifiers = [ExtraTreesClassifier(random_state=179, n_jobs=-1)]

# Print the aggregate metrics; cross-validation is disabled.
classifier_metrics(X_train, X_test, y_train, y_test, CV=False)

In [None]:
# Evaluate on the features_/labels_binary_ pair instead of the hold-out split.
X_test, y_test = features_, labels_binary_

model_name = "Scenario 4 - Extra Trees"
classes = np.unique(["Intrusion", "Normal"])

# Train on the training partition and score the predictions on X_test.
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
y_pred = model_selected.fit(X_train, y_train).predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Classification report first, then the confusion-matrix heatmap.
title = "Confusion Matrix for {}".format(model_name)
visualization = [
    CR_viz(15, 15),
    CM_viz(y_test, y_pred, classes, name=model_name, title=title,
           path_img_base='./images', nrows=1, ncols=1,
           size_text_legend=25, size_text_title=25,
           size_text_xy_labels=25, size_text_xy_tick=25, size_num_inter=25),
]

In [None]:
model_name = "Scenario 4 - Extra Trees"
classes = np.unique(["Intrusion", "Normal"])

# Train, report accuracy on X_test and keep the class probabilities.
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
y_pred = model_selected.fit(X_train, y_train).predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))
y_pred_proba = model_selected.predict_proba(X_test)

# ROC_curves draws on the module-level ``ax``, so the figure is created first.
nrows = ncols = 1
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20*ncols, 20*nrows))

title = "ROC Curves for {}".format(model_name)
ROC_curves(y_test, y_pred_proba, classes, name=model_name,
           n_bootstraps=100, confidence_level=0.95,
           size_x=15, size_y=15, path_img_base=path_figures,
           nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
           size_text_legend=30, size_text_title=35,
           size_text_xy_labels=35, size_text_xy_tick=35)

fig.savefig(path_figures+"/"+model_name+"_ROC_AUC"+".pdf", bbox_inches="tight", format='pdf')


# Scenario 5

In [None]:
# SET 1

# Feature matrix: df_set1 without its two target columns.
features_ = df_set1.drop(columns=['label', 'tipo_ataque'])

# Extract the target values from the 'label' column.
labels_ = df_set1.copy()
labels_binary_ = labels_.loc[:, 'label'].values

In [None]:
# SET 2

# Feature matrix: df_set2 without its two target columns.
features = df_set2.copy()
features = features.drop(['label', 'tipo_ataque'], axis=1)

# Extract the targets from the same frame as features (df_set2), so every
# row of features is paired with its own label rather than with df_set1's.
labels = df_set2.copy()
labels_binary = labels['label'].values

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, followed by a report of
# the partition shapes and the per-class counts of each target vector.
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, stratify=labels, test_size=0.2, random_state=21,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

In [None]:
# Evaluate on the features_/labels_binary_ pair instead of the hold-out split.
X_test, y_test = features_, labels_binary_

# Models to evaluate (read by classifier_metrics from this module-level list).
classifiers = [
    ExtraTreesClassifier(n_estimators=50, min_samples_leaf=2, random_state=179, n_jobs=-1),
]

# Print the aggregate metrics; cross-validation is disabled.
classifier_metrics(X_train, X_test, y_train, y_test, CV=False)

In [None]:
# Evaluate on the features_/labels_binary_ pair instead of the hold-out split.
X_test, y_test = features_, labels_binary_

model_name = "Scenario 5 - Extra Trees"
classes = np.unique(["Intrusion", "Normal"])

# Train on the training partition and score the predictions on X_test.
model_selected = ExtraTreesClassifier(n_estimators=50, min_samples_leaf=2,
                                      random_state=179, n_jobs=-1)
y_pred = model_selected.fit(X_train, y_train).predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Classification report first, then the confusion-matrix heatmap.
title = "Confusion Matrix for {}".format(model_name)
visualization = [
    CR_viz(15, 15),
    CM_viz(y_test, y_pred, classes, name=model_name, title=title,
           path_img_base='./images', nrows=1, ncols=1,
           size_text_legend=25, size_text_title=25,
           size_text_xy_labels=25, size_text_xy_tick=25, size_num_inter=25),
]

In [None]:
model_name = "Scenario 5 - Extra Trees"
classes = np.unique(["Intrusion", "Normal"])

# Train, report accuracy on X_test and keep the class probabilities.
model_selected = ExtraTreesClassifier(n_estimators=50, min_samples_leaf=2,
                                      random_state=179, n_jobs=-1)
y_pred = model_selected.fit(X_train, y_train).predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))
y_pred_proba = model_selected.predict_proba(X_test)

# ROC_curves draws on the module-level ``ax``, so the figure is created first.
nrows = ncols = 1
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20*ncols, 20*nrows))

title = "ROC Curves for {}".format(model_name)
ROC_curves(y_test, y_pred_proba, classes, name=model_name,
           n_bootstraps=100, confidence_level=0.95,
           size_x=15, size_y=15, path_img_base=path_figures,
           nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
           size_text_legend=30, size_text_title=35,
           size_text_xy_labels=35, size_text_xy_tick=35)

fig.savefig(path_figures+"/"+model_name+"_ROC_AUC"+".pdf", bbox_inches="tight", format='pdf')

# Scenario 6

In [None]:
# Working copy of df_set1 without the rows labelled by its first 200000 index
# entries (original author's note: "Only 50.000 samples for Normal").
df1 = df_set1.drop(index=df_set1.index[:200000])

In [None]:
# Feature matrix: df1 without its two target columns.
features = df1.drop(columns=['label', 'tipo_ataque'])

In [None]:
# Multiclass target: the attack-type column extracted as an array.
labels = df1.copy()
labels_multiclass = labels.loc[:, 'tipo_ataque'].values

In [None]:
# Stratified 80/20 hold-out split on the multiclass target (fixed seed).
labels = labels_multiclass
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)

# Sanity output: partition shapes, then per-class counts of each partition.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

In [None]:
# Candidate models for this scenario.
# NOTE(review): the list is not passed to classifier_metrics, so that helper
# presumably reads the module-level `classifiers` - confirm.
classifiers = [DecisionTreeClassifier()]

# Report the aggregate metrics with the CV option enabled.
classifier_metrics(X_train, X_test, y_train, y_test, CV=True)

In [None]:
# Scenario 6, confusion-matrix view of the selected Decision Tree.
model_name = "Scenario 6 - Decision Tree"
# Seeded (same seed as the other seeded models in this notebook) so this cell
# and the ROC cell fit the same tree and the reported numbers are reproducible.
model_selected = DecisionTreeClassifier(random_state=179)
classes = np.unique(y_test)

model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

title = "Confusion Matrix for {}".format(model_name)
# Use the shared output directory (`path_figures`, created at the top of the
# notebook) instead of a hard-coded './images', matching the ROC_curves calls.
visualization = [
    CR_viz(15, 15),
    CM_viz(
        y_test, y_pred, classes,
        name=model_name, path_img_base=path_figures, nrows=1, ncols=1,
        size_text_legend=25, size_text_title=25, title=title,
        size_text_xy_labels=25, size_text_xy_tick=25, size_num_inter=25,
    ),
]

In [None]:
# Scenario 6, ROC view: refit the selected Decision Tree and draw its
# per-class ROC curves on a single-panel figure.
model_name = "Scenario 6 - Decision Tree"
# Seeded so the refit is reproducible; an unseeded tree can differ between
# fits on the same data.
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# `fig`/`ax` are module-level names in a notebook cell; the former `global`
# statements were no-ops at this scope and have been dropped.
# NOTE(review): ROC_curves presumably draws on these module-level objects - confirm.
nrows = 1
ncols = 1
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20 * ncols, 20 * nrows))

title = "ROC Curves for {}".format(model_name)
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
    size_text_legend=25, size_text_title=35,
    size_text_xy_labels=35, size_text_xy_tick=35,
)

fig.savefig(path_figures + "/" + model_name + "_ROC_AUC" + ".pdf",
            bbox_inches="tight", format='pdf')

# Scenario 7

In [None]:
# Scenario 7 working frame: set 2 without its first 200,000 rows.
# Per the original note this keeps only 50,000 "Normal" samples - assumes the
# Normal rows come first in df_set2; TODO confirm.
df2 = df_set2.drop(index=df_set2.index[:200000])

In [None]:
# Feature matrix: every column except the two target columns.
features = df2.drop(columns=['label', 'tipo_ataque'])

In [None]:
# Multiclass target: the attack-type column extracted as an array.
labels = df2.copy()
labels_multiclass = labels.loc[:, 'tipo_ataque'].values

In [None]:
# Stratified 80/20 hold-out split on the multiclass target (fixed seed).
labels = labels_multiclass
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)

# Sanity output: partition shapes, then per-class counts of each partition.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

In [None]:
# Candidate models for this scenario.
# NOTE(review): the list is not passed to classifier_metrics, so that helper
# presumably reads the module-level `classifiers` - confirm.
classifiers = [ExtraTreesClassifier(n_jobs=-1)]

# Report the aggregate metrics with the CV option enabled.
classifier_metrics(X_train, X_test, y_train, y_test, CV=True)

In [None]:
# Scenario 7, confusion-matrix view of the selected Extra Trees model.
model_name = "Scenario 7 - Extra Trees"
# Seeded (same seed as the other seeded models in this notebook) so this cell
# and the ROC cell fit the same forest and the reported numbers are reproducible.
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
classes = np.unique(y_test)

model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

title = "Confusion Matrix for {}".format(model_name)
# Use the shared output directory (`path_figures`, created at the top of the
# notebook) instead of a hard-coded './images', matching the ROC_curves calls.
visualization = [
    CR_viz(15, 15),
    CM_viz(
        y_test, y_pred, classes,
        name=model_name, path_img_base=path_figures, nrows=1, ncols=1,
        size_text_legend=25, size_text_title=25, title=title,
        size_text_xy_labels=25, size_text_xy_tick=25, size_num_inter=25,
    ),
]

In [None]:
# Scenario 7, ROC view: refit the selected Extra Trees model and draw its
# per-class ROC curves on a single-panel figure.
model_name = "Scenario 7 - Extra Trees"
# Seeded so the refit is reproducible; an unseeded forest differs between fits.
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# `fig`/`ax` are module-level names in a notebook cell; the former `global`
# statements were no-ops at this scope and have been dropped.
# NOTE(review): ROC_curves presumably draws on these module-level objects - confirm.
nrows = 1
ncols = 1
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20 * ncols, 20 * nrows))

title = "ROC Curves for {}".format(model_name)
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
    size_text_legend=25, size_text_title=35,
    size_text_xy_labels=35, size_text_xy_tick=35,
)

fig.savefig(path_figures + "/" + model_name + "_ROC_AUC" + ".pdf",
            bbox_inches="tight", format='pdf')

# Scenario 8

In [None]:
# Keep only the first 50,000 rows of the combined set.
# Per the original note these are the Normal-class samples - assumes Normal
# rows come first in df_set1_2; TODO confirm.
# NOTE(review): the drop is label-based, so it relies on unique index labels.
df_normal = df_set1_2.drop(index=df_set1_2.index[50000:])

In [None]:
# Intrusion rows only: drop (by index label) every row whose attack type is
# "normal". The original note says each intrusion class has 50,000 samples.
normal_row_labels = df_set1_2[df_set1_2.tipo_ataque == "normal"].index
df_equal = df_set1_2.drop(normal_row_labels)

In [None]:
# Stack the Normal sample on top of the intrusion rows and renumber the index
# from zero. Per the original note all classes now have 50,000 rows each.
df1 = pd.concat([df_normal, df_equal], ignore_index=True)

In [None]:
# Feature matrix: every column except the two target columns.
features = df1.drop(columns=['label', 'tipo_ataque'])

In [None]:
# Multiclass target: the attack-type column extracted as an array.
labels = df1.copy()
labels_multiclass = labels.loc[:, 'tipo_ataque'].values

In [None]:
# Stratified 80/20 hold-out split on the multiclass target (fixed seed).
labels = labels_multiclass
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)

# Sanity output: partition shapes, then per-class counts of each partition.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

In [None]:
# Candidate models for this scenario.
# NOTE(review): the list is not passed to classifier_metrics, so that helper
# presumably reads the module-level `classifiers` - confirm.
classifiers = [DecisionTreeClassifier()]

# Report the aggregate metrics with the CV option enabled.
classifier_metrics(X_train, X_test, y_train, y_test, CV=True)

In [None]:
# Scenario 8, confusion-matrix view of the selected Extra Trees model.
model_name = "Scenario 8 - Extra Trees"
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
classes = np.unique(y_test)

model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

title = "Confusion Matrix for {}".format(model_name)
# Use the shared output directory (`path_figures`, created at the top of the
# notebook) instead of a hard-coded './images', matching the ROC_curves calls.
visualization = [
    CR_viz(15, 15),
    CM_viz(
        y_test, y_pred, classes,
        name=model_name, path_img_base=path_figures, nrows=1, ncols=1,
        size_text_legend=25, size_text_title=25, title=title,
        size_text_xy_labels=25, size_text_xy_tick=25, size_num_inter=25,
    ),
]

In [None]:
# Scenario 8, ROC view: refit the selected Extra Trees model and draw its
# per-class ROC curves on a single-panel figure.
model_name = "Scenario 8 - Extra Trees"
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# `fig` and `ax` are ordinary module-level names in a notebook cell, so no
# `global` declaration is required for the helper to see them.
# NOTE(review): ROC_curves presumably draws on these module-level objects - confirm.
nrows = ncols = 1
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20 * ncols, 20 * nrows))

title = "ROC Curves for {}".format(model_name)
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
    size_text_legend=25, size_text_title=35,
    size_text_xy_labels=35, size_text_xy_tick=35,
)

fig.savefig("{}/{}_ROC_AUC.pdf".format(path_figures, model_name),
            bbox_inches="tight", format='pdf')

# ROC Figures for All Scenarios

In [None]:
# Combined ROC figure, 4 rows x 2 columns: one panel per scenario (A-H).
# `fig` and `ax` are ordinary module-level names in a notebook cell, so no
# `global` declaration is required.
nrows, ncols = 4, 2

fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20 * ncols, 20 * nrows))
fig.subplots_adjust(hspace=0.25, wspace=0.2)

# Font sizes shared by the panels below.
size_text_legend = 25.5
size_text_title = 85
size_text_xy_labels = 35
size_text_xy_tick = 35

############################################### SCENARIO 1
# Binary task on set 1 with a stratified 80/20 hold-out split.
features = df_set1.drop(columns=['label', 'tipo_ataque'])
labels = df_set1.copy()
labels_binary = labels.loc[:, 'label'].values
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 1 - Decision Tree"
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "A"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 2
# Binary task on set 2 with a stratified 80/20 hold-out split.
features = df_set2.drop(columns=['label', 'tipo_ataque'])
labels = df_set2.copy()
labels_binary = labels['label'].values
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 2 - Decision Tree"
# Seeded (same seed as the Scenario 1 tree) so the panel is reproducible.
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "B"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=0, multi_y=1, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 3
# Binary task on the combined set with a stratified 80/20 hold-out split.
features = df_set1_2.drop(columns=['label', 'tipo_ataque'])
labels = df_set1_2.copy()
labels_binary = labels['label'].values
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 3 - Decision Tree"
# Seeded (same seed as the Scenario 1 tree) so the panel is reproducible.
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "C"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=1, multi_y=0, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 4
# Cross-dataset evaluation: fit on a stratified 80% of set 1, test on all of set 2.

# SET 1 (training source): features and binary labels.
features = df_set1.drop(columns=['label', 'tipo_ataque'])
labels = df_set1.copy()
labels_binary = labels['label'].values

# SET 2 (external test set): features and binary labels.
features_ = df_set2.drop(columns=['label', 'tipo_ataque'])
# Fix: the test labels must come from the same frame as the test features
# (df_set2); they were previously read from df_set1.
labels_ = df_set2.copy()
labels_binary_ = labels_['label'].values

labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
# These counts describe the set-1 split, printed before the test set is swapped.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Replace the held-out part of set 1 with the whole of set 2.
X_test = features_
y_test = labels_binary_

model_name = "Scenario 4 - Extra Trees"
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "D"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=1, multi_y=1, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 5
# Cross-dataset evaluation: fit on a stratified 80% of set 2, test on all of set 1.

# SET 1 (external test set): features and binary labels.
features_ = df_set1.drop(columns=['label', 'tipo_ataque'])
labels_ = df_set1.copy()
labels_binary_ = labels_['label'].values

# SET 2 (training source): features and binary labels.
features = df_set2.drop(columns=['label', 'tipo_ataque'])
# Fix: the training labels must come from the same frame as the training
# features (df_set2); they were previously read from df_set1.
labels = df_set2.copy()
labels_binary = labels['label'].values

labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
# These counts describe the set-2 split, printed before the test set is swapped.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Replace the held-out part of set 2 with the whole of set 1.
X_test = features_
y_test = labels_binary_

model_name = "Scenario 5 - Extra Trees"
model_selected = ExtraTreesClassifier(
    min_samples_leaf=2, n_estimators=50, random_state=179, n_jobs=-1
)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "E"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=2, multi_y=0, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 6
# Multiclass task on set 1 after dropping its first 200,000 rows.
# Per the original note this keeps only 50,000 "Normal" samples - assumes the
# Normal rows come first in df_set1; TODO confirm.
df1 = df_set1.drop(index=df_set1.index[:200000])
features = df1.drop(columns=['label', 'tipo_ataque'])
labels = df1.copy()
labels_multiclass = labels['tipo_ataque'].values
labels = labels_multiclass
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 6 - Decision Tree"
# Seeded (same seed as the Scenario 1 tree) so the panel is reproducible.
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# The panel letter is used as the title; this panel uses its own legend size.
title = "F"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=2, multi_y=1, title=title,
    size_text_legend=25, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 7
# Multiclass task on set 2 after dropping its first 200,000 rows.
# Per the original note this keeps only 50,000 "Normal" samples - assumes the
# Normal rows come first in df_set2; TODO confirm.
df2 = df_set2.drop(index=df_set2.index[:200000])
features = df2.drop(columns=['label', 'tipo_ataque'])
labels = df2.copy()
labels_multiclass = labels['tipo_ataque'].values
labels = labels_multiclass
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 7 - Extra Trees"
# Seeded (same seed as the other seeded models) so the panel is reproducible.
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# The panel letter is used as the title; this panel uses its own legend size.
title = "G"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=3, multi_y=0, title=title,
    size_text_legend=25, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 8
# Multiclass task on a class-balanced subset of the combined set.
# First 50,000 rows (Normal class per the original note - TODO confirm order).
df_normal = df_set1_2.drop(index=df_set1_2.index[50000:])
# Intrusion rows only; the original note says each class has 50,000 samples.
df_equal = df_set1_2.drop(df_set1_2[df_set1_2.tipo_ataque == "normal"].index)
# Stack both parts and renumber the index from zero.
df1 = pd.concat([df_normal, df_equal]).reset_index(drop=True)

features = df1.drop(columns=['label', 'tipo_ataque'])
labels = df1.copy()
labels_multiclass = labels['tipo_ataque'].values
labels = labels_multiclass
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# NOTE(review): the stand-alone Scenario 8 cells plot an Extra Trees model,
# while this panel uses a Decision Tree - confirm which one is intended.
model_name = "Scenario 8 - Decision Tree"
# Seeded (same seed as the Scenario 1 tree) so the panel is reproducible.
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# The panel letter is used as the title; this panel uses its own legend size.
title = "H"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=3, multi_y=1, title=title,
    size_text_legend=25, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

# Write the combined figure to a PDF, cropped tightly around the panels.
# NOTE(review): the 3x3 cell further down saves to this same file name, so
# whichever cell runs last overwrites the other - confirm that is intended.
model_name = "FigX_ROC_Curves_With_AUC_CI"
fig.savefig("{}/{}_ROC_AUC.pdf".format(path_figures, model_name),
            bbox_inches="tight", format='pdf')


In [None]:
# Combined ROC figure, 3 rows x 3 columns, holding the eight scenario panels (A-H).
# NOTE(review): the panel calls below never use the position
# (multi_x=0, multi_y=2), so one grid cell stays empty - confirm that is intended.
# `fig` and `ax` are ordinary module-level names in a notebook cell, so no
# `global` declaration is required.
nrows, ncols = 3, 3

fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20 * ncols, 20 * nrows))
fig.subplots_adjust(hspace=0.25, wspace=0.2)

# Font sizes shared by the panels below.
size_text_legend = 25.5
size_text_title = 85
size_text_xy_labels = 35
size_text_xy_tick = 35

############################################### SCENARIO 1
# Binary task on set 1 with a stratified 80/20 hold-out split.
features = df_set1.drop(columns=['label', 'tipo_ataque'])
labels = df_set1.copy()
labels_binary = labels.loc[:, 'label'].values
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 1 - Decision Tree"
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "A"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=0, multi_y=0, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 2
# Binary task on set 2 with a stratified 80/20 hold-out split.
features = df_set2.drop(columns=['label', 'tipo_ataque'])
labels = df_set2.copy()
labels_binary = labels['label'].values
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 2 - Decision Tree"
# Seeded (same seed as the Scenario 1 tree) so the panel is reproducible.
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "B"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=0, multi_y=1, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 3
# Binary task on the combined set with a stratified 80/20 hold-out split.
features = df_set1_2.drop(columns=['label', 'tipo_ataque'])
labels = df_set1_2.copy()
labels_binary = labels['label'].values
labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 3 - Decision Tree"
# Seeded (same seed as the Scenario 1 tree) so the panel is reproducible.
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "C"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=1, multi_y=0, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 4
# Cross-dataset evaluation: fit on a stratified 80% of set 1, test on all of set 2.

# SET 1 (training source): features and binary labels.
features = df_set1.drop(columns=['label', 'tipo_ataque'])
labels = df_set1.copy()
labels_binary = labels['label'].values

# SET 2 (external test set): features and binary labels.
features_ = df_set2.drop(columns=['label', 'tipo_ataque'])
# Fix: the test labels must come from the same frame as the test features
# (df_set2); they were previously read from df_set1.
labels_ = df_set2.copy()
labels_binary_ = labels_['label'].values

labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
# These counts describe the set-1 split, printed before the test set is swapped.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Replace the held-out part of set 1 with the whole of set 2.
X_test = features_
y_test = labels_binary_

model_name = "Scenario 4 - Extra Trees"
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "D"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=1, multi_y=1, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 5
# Cross-dataset evaluation: fit on a stratified 80% of set 2, test on all of set 1.

# SET 1 (external test set): features and binary labels.
features_ = df_set1.drop(columns=['label', 'tipo_ataque'])
labels_ = df_set1.copy()
labels_binary_ = labels_['label'].values

# SET 2 (training source): features and binary labels.
features = df_set2.drop(columns=['label', 'tipo_ataque'])
# Fix: the training labels must come from the same frame as the training
# features (df_set2); they were previously read from df_set1.
labels = df_set2.copy()
labels_binary = labels['label'].values

labels = labels_binary
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
# These counts describe the set-2 split, printed before the test set is swapped.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Replace the held-out part of set 2 with the whole of set 1.
X_test = features_
y_test = labels_binary_

model_name = "Scenario 5 - Extra Trees"
model_selected = ExtraTreesClassifier(
    min_samples_leaf=2, n_estimators=50, random_state=179, n_jobs=-1
)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# The panel letter is used as the title in the combined figure.
title = "E"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=1, multi_y=2, title=title,
    size_text_legend=size_text_legend, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 6
# Multiclass task on set 1 after dropping its first 200,000 rows.
# Per the original note this keeps only 50,000 "Normal" samples - assumes the
# Normal rows come first in df_set1; TODO confirm.
df1 = df_set1.drop(index=df_set1.index[:200000])
features = df1.drop(columns=['label', 'tipo_ataque'])
labels = df1.copy()
labels_multiclass = labels['tipo_ataque'].values
labels = labels_multiclass
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 6 - Decision Tree"
# Seeded (same seed as the Scenario 1 tree) so the panel is reproducible.
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# The panel letter is used as the title; this panel uses its own legend size.
title = "F"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=2, multi_y=0, title=title,
    size_text_legend=22, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 7
# Multiclass task on set 2 after dropping its first 200,000 rows.
# Per the original note this keeps only 50,000 "Normal" samples - assumes the
# Normal rows come first in df_set2; TODO confirm.
df2 = df_set2.drop(index=df_set2.index[:200000])
features = df2.drop(columns=['label', 'tipo_ataque'])
labels = df2.copy()
labels_multiclass = labels['tipo_ataque'].values
labels = labels_multiclass
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 7 - Extra Trees"
# Seeded (same seed as the other seeded models) so the panel is reproducible.
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# The panel letter is used as the title; this panel uses its own legend size.
title = "G"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=2, multi_y=1, title=title,
    size_text_legend=25, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

############################################### SCENARIO 8
# Multiclass task on a class-balanced subset of the combined set.
# First 50,000 rows (Normal class per the original note - TODO confirm order).
df_normal = df_set1_2.drop(index=df_set1_2.index[50000:])
# Intrusion rows only; the original note says each class has 50,000 samples.
df_equal = df_set1_2.drop(df_set1_2[df_set1_2.tipo_ataque == "normal"].index)
# Stack both parts and renumber the index from zero.
df1 = pd.concat([df_normal, df_equal]).reset_index(drop=True)

features = df1.drop(columns=['label', 'tipo_ataque'])
labels = df1.copy()
labels_multiclass = labels['tipo_ataque'].values
labels = labels_multiclass
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=21, stratify=labels
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# NOTE(review): the stand-alone Scenario 8 cells plot an Extra Trees model,
# while this panel uses a Decision Tree - confirm which one is intended.
model_name = "Scenario 8 - Decision Tree"
# Seeded (same seed as the Scenario 1 tree) so the panel is reproducible.
model_selected = DecisionTreeClassifier(random_state=179)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# The panel letter is used as the title; this panel uses its own legend size.
title = "H"
ROC_curves(
    y_test, y_pred_proba, classes,
    name=model_name, n_bootstraps=100, confidence_level=0.95,
    size_x=15, size_y=15, path_img_base=path_figures,
    nrows=nrows, ncols=ncols, multi_x=2, multi_y=2, title=title,
    size_text_legend=22, size_text_title=size_text_title,
    size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
)
###############################################

# Write the shared ROC figure to <path_figures>/<model_name>_ROC_AUC.pdf with a tight bounding box
model_name = "FigX_ROC_Curves_With_AUC_CI"
fig.savefig(
    path_figures + "/" + model_name + "_ROC_AUC" + ".pdf",
    bbox_inches="tight",
    format='pdf',
)


# CM Figures for Selected Scenarios

In [None]:
# Text sizes handed to CM_viz for every panel of the confusion-matrix figure
size_text_title = 85
size_text_legend = size_text_xy_labels = size_text_xy_tick = size_num_inter = 35

# 2 x 2 grid of panels, each allotted 20 x 20 inches
nrows, ncols = 2, 2

fig = plt.figure(figsize=(20 * ncols, 20 * nrows))
fig.subplots_adjust(wspace=0.6, hspace=0.35)
# Subplot layout reference:
# https://jakevdp.github.io/PythonDataScienceHandbook/04.08-multiple-subplots.html

i=1
ax = fig.add_subplot(nrows, ncols, i)
############################################### SCENARIO 3
# Panel "A": binary classification ('label' column) on the combined data set
# df_set1_2, evaluated on a stratified 20% hold-out part and drawn by CM_viz.

# Inputs: every column except the two label columns
features = df_set1_2.drop(['label', 'tipo_ataque'], axis=1)
# Target: binary label
labels_binary = df_set1_2['label'].values
labels = labels_binary

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size=0.2, random_state=21, stratify=labels)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# NOTE(review): no random_state is set, so the tree (and this panel) may change between runs
model_name = "Scenario 3 - Decision Tree"
model_selected = DecisionTreeClassifier()
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Not consumed by CM_viz below; kept so the variable stays available in the notebook
y_pred_proba = model_selected.predict_proba(X_test)
# Display names for the two classes.
# NOTE(review): assumes their sorted order matches the order of the values in 'label' -- confirm
classes = np.unique(["Intrusion", "Normal"])

#title="Confusion Matrix for {}".format(model_name)
title = "A"
# path_img_base now uses path_figures (the directory this notebook creates and that
# the ROC_curves calls and fig.savefig use) instead of a hard-coded './images'.
CM_viz(y_test, y_pred, classes, name=model_name,
       path_img_base=path_figures, nrows=nrows, ncols=ncols,
       size_text_legend=size_text_legend, size_text_title=size_text_title, title=title,
       size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
       size_num_inter=size_num_inter)
###############################################

i=2
ax = fig.add_subplot(nrows, ncols, i)
############################################### SCENARIO 4
# Panel "B": train on SET 1 (df_set1), evaluate on the whole of SET 2 (df_set2).

# SET 1 -- training data: features without the label columns, binary target
features = df_set1.drop(['label', 'tipo_ataque'], axis=1)
labels_binary = df_set1['label'].values

# SET 2 -- external test data.
# Fix: the test labels were previously read from df_set1, so predictions made on
# df_set2 rows were scored against SET 1 ground truth. They now come from df_set2.
features_ = df_set2.drop(['label', 'tipo_ataque'], axis=1)
labels_binary_ = df_set2['label'].values

labels = labels_binary
# Stratified 80/20 split of SET 1; only the training part is used for fitting
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size=0.2, random_state=21, stratify=labels)
# These shapes/counts describe the SET 1 split, i.e. before the test part is swapped out below
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Replace the SET 1 hold-out part with all of SET 2 as the evaluation data
X_test = features_
y_test = labels_binary_

model_name = "Scenario 4 - Extra Trees"
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Not consumed by CM_viz below; kept so the variable stays available in the notebook
y_pred_proba = model_selected.predict_proba(X_test)
# Display names for the two classes.
# NOTE(review): assumes their sorted order matches the order of the values in 'label' -- confirm
classes = np.unique(["Intrusion", "Normal"])

#title="Confusion Matrix for {}".format(model_name)
title = "B"
# path_img_base now uses path_figures (the directory this notebook creates and that
# the ROC_curves calls and fig.savefig use) instead of a hard-coded './images'.
CM_viz(y_test, y_pred, classes, name=model_name,
       path_img_base=path_figures, nrows=nrows, ncols=ncols,
       size_text_legend=size_text_legend, size_text_title=size_text_title, title=title,
       size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
       size_num_inter=size_num_inter)
###############################################

i=3
ax = fig.add_subplot(nrows, ncols, i)
############################################### SCENARIO 5
# Panel "C": train on SET 2 (df_set2), evaluate on the whole of SET 1 (df_set1).

# SET 1 -- external test data: features without the label columns, binary target
features_ = df_set1.drop(['label', 'tipo_ataque'], axis=1)
labels_binary_ = df_set1['label'].values

# SET 2 -- training data.
# Fix: the training labels were previously read from df_set1 while the training
# features came from df_set2, pairing each SET 2 row with a SET 1 label.
# They now come from df_set2.
features = df_set2.drop(['label', 'tipo_ataque'], axis=1)
labels_binary = df_set2['label'].values

labels = labels_binary
# Stratified 80/20 split of SET 2; only the training part is used for fitting
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size=0.2, random_state=21, stratify=labels)
# These shapes/counts describe the SET 2 split, i.e. before the test part is swapped out below
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# Replace the SET 2 hold-out part with all of SET 1 as the evaluation data
X_test = features_
y_test = labels_binary_

model_name = "Scenario 5 - Extra Trees"
model_selected = ExtraTreesClassifier(min_samples_leaf = 2, n_estimators = 50, random_state=179, n_jobs=-1)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Not consumed by CM_viz below; kept so the variable stays available in the notebook
y_pred_proba = model_selected.predict_proba(X_test)
# Display names for the two classes.
# NOTE(review): assumes their sorted order matches the order of the values in 'label' -- confirm
classes = np.unique(["Intrusion", "Normal"])

#title="Confusion Matrix for {}".format(model_name)
title = "C"
# path_img_base now uses path_figures (the directory this notebook creates and that
# the ROC_curves calls and fig.savefig use) instead of a hard-coded './images'.
CM_viz(y_test, y_pred, classes, name=model_name,
       path_img_base=path_figures, nrows=nrows, ncols=ncols,
       size_text_legend=size_text_legend, size_text_title=size_text_title, title=title,
       size_text_xy_labels=size_text_xy_labels, size_text_xy_tick=size_text_xy_tick,
       size_num_inter=size_num_inter)
###############################################

i=4
ax = fig.add_subplot(nrows, ncols, i)
############################################### SCENARIO 8
# Panel "D": multiclass classification ('tipo_ataque') on a data set built from
# df_set1_2, evaluated on a stratified 20% hold-out part and drawn by CM_viz.

# First 50,000 rows of df_set1_2 (per the original note, the Normal samples)
df_normal = df_set1_2.drop(index=df_set1_2.index[50000:])
# Every row whose tipo_ataque is not "normal" (per the original note, 50,000 per intrusion class)
df_equal = df_set1_2.drop(df_set1_2[df_set1_2.tipo_ataque == "normal"].index)
# Join both parts under a fresh 0..n-1 index
df1 = pd.concat([df_normal, df_equal]).reset_index(drop=True)

# Inputs: every column except the two label columns; target: attack type
features = df1.drop(['label', 'tipo_ataque'], axis=1)
labels_multiclass = df1['tipo_ataque'].values
labels = labels_multiclass

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                    test_size=0.2, random_state=21, stratify=labels)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

# NOTE(review): no random_state is set, so the tree (and this panel) may change between runs
model_name = "Scenario 8 - Decision Tree"
model_selected = DecisionTreeClassifier()
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Not consumed by CM_viz below; kept so the variable stays available in the notebook
y_pred_proba = model_selected.predict_proba(X_test)
# Class labels present in the test part, used as the matrix labels
classes = np.unique(y_test)

#title="Confusion Matrix for {}".format(model_name)
title = "D"
# path_img_base now uses path_figures (the directory this notebook creates and that
# the ROC_curves calls and fig.savefig use) instead of a hard-coded './images'.
# This panel passes smaller label/tick/cell text sizes than the binary panels.
CM_viz(y_test, y_pred, classes, name=model_name,
       path_img_base=path_figures, nrows=nrows, ncols=ncols,
       size_text_legend=size_text_legend, size_text_title=size_text_title, title=title,
       size_text_xy_labels=size_text_xy_labels-5, size_text_xy_tick=size_text_xy_tick-5,
       size_num_inter=18)
###############################################

# Write the shared confusion-matrix figure to <path_figures>/<model_name>_CM.pdf with a tight bounding box
model_name = "FigX_CM"
fig.savefig(
    path_figures + "/" + model_name + "_CM" + ".pdf",
    bbox_inches="tight",
    format='pdf',
)

# Feature Importances

In [None]:
# Text sizes for the feature-importance figure
size_text_title = 85
size_text_legend = size_text_xy_labels = size_text_xy_tick = size_num_inter = 35

# 1 x 2 grid of panels, each allotted 20 x 20 inches
nrows, ncols = 1, 2
import matplotlib.pyplot as plt

SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 35, 45, 65

# Apply the sizes through matplotlib rc settings
plt.rc('font', size=SMALL_SIZE)                     # default text size
plt.rc('axes', titlesize=size_text_title,           # axes title
       labelsize=size_text_xy_labels)               # x and y axis labels
for tick_axis in ('xtick', 'ytick'):
    plt.rc(tick_axis, labelsize=size_text_xy_tick)  # tick labels
plt.rc('legend', fontsize=size_text_legend)         # legend text
plt.rc('figure', titlesize=BIGGER_SIZE)             # figure-level title

fig = plt.figure(figsize=(20 * ncols, 20 * nrows))
fig.subplots_adjust(wspace=0.6, hspace=0.35)

# Subplot layout reference:
# https://jakevdp.github.io/PythonDataScienceHandbook/04.08-multiple-subplots.html

i = 1
ax = fig.add_subplot(nrows, ncols, i)
############################################### SCENARIO 3
# Panel "A": feature importances of a decision tree trained on the binary
# 'label' target of the combined data set df_set1_2.

# Inputs: every column except the two label columns; target: binary label
features = df_set1_2.drop(columns=['label', 'tipo_ataque'])
labels_binary = df_set1_2['label'].values
labels = labels_binary

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=21,
    stratify=labels,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 3 - Decision Tree"
model_selected = DecisionTreeClassifier()
model_selected.fit(X_train, y_train)

# Table of per-feature importances, printed from most to least important
feature_importances = pd.DataFrame({
    'features': features.columns,
    'feature_importance': model_selected.feature_importances_,
})
print(feature_importances.sort_values(by='feature_importance', ascending=False))

# Hold-out accuracy
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Not used by the plot below; left available in the notebook
y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion", "Normal"])

# Feature-importance plot (no ax is passed explicitly), also written to its own PDF
viz = FeatureImportances(model_selected)
viz.fit(X_train, y_train)
model_name = "FigX_3"
viz.show(outpath=path_figures + "/" + model_name + "_FI" + ".pdf")
# NOTE(review): the panel letter is applied after show(outpath=...), so the file
# written by show() presumably does not carry it -- confirm this order is intended.
title = "A"
viz.set_title(title)
###############################################

i = 2
ax = fig.add_subplot(nrows, ncols, i)
############################################### SCENARIO 8
# Panel "B": feature importances of a decision tree trained on the multiclass
# 'tipo_ataque' target of a data set built from df_set1_2.

# First 50,000 rows of df_set1_2 (per the original note, the Normal samples) ...
df_normal = df_set1_2.drop(index=df_set1_2.index[50000:])
# ... plus every row whose tipo_ataque is not "normal" (per the original note, 50,000 per intrusion class)
df_equal = df_set1_2.drop(df_set1_2[df_set1_2.tipo_ataque == "normal"].index)
df1 = pd.concat([df_normal, df_equal], ignore_index=True)

# Inputs: every column except the two label columns; target: attack type
features = df1.drop(columns=['label', 'tipo_ataque'])
labels_multiclass = df1['tipo_ataque'].values
labels = labels_multiclass

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=21,
    stratify=labels,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True))

model_name = "Scenario 8 - Decision Tree"
model_selected = DecisionTreeClassifier()
model_selected.fit(X_train, y_train)

# Table of per-feature importances, printed from most to least important
feature_importances = pd.DataFrame({
    'features': features.columns,
    'feature_importance': model_selected.feature_importances_,
})
print(feature_importances.sort_values(by='feature_importance', ascending=False))

# Hold-out accuracy
y_pred = model_selected.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print('accuracy_score: {0:.4f}'.format(acc_score))

# Not used by the plot below; left available in the notebook
y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

# Feature-importance plot (no ax is passed explicitly), also written to its own PDF
viz = FeatureImportances(model_selected)
viz.fit(X_train, y_train)
model_name = "FigX_8"
viz.show(outpath=path_figures + "/" + model_name + "_FI" + ".pdf")
# NOTE(review): the panel letter is applied after show(outpath=...), so the file
# written by show() presumably does not carry it -- confirm this order is intended.
title = "B"
viz.set_title(title)

###############################################

# Write the shared feature-importance figure to <path_figures>/<model_name>_CM.pdf.
# NOTE(review): the "_CM" suffix looks like a leftover from the confusion-matrix
# cell; it is kept so the output file name does not change.
model_name = "FigX_3y8"
fig.savefig(
    path_figures + "/" + model_name + "_CM" + ".pdf",
    bbox_inches="tight",
    format='pdf',
)