# Libraries

In [None]:
#Classification Methods
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score

#Tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from scipy.sparse import csr_matrix 
import string 
import time as tm
import spacy 
import os

import warnings
warnings.filterwarnings('ignore')

# Functions

In [None]:
def classifier_metrics(X_train,X_test,y_train,y_test,CV=True):
    """Fit every model in the global ``classifiers`` list and print its metrics.

    For each estimator the function prints the training and prediction wall
    time, then the hold-out accuracy, weighted F1 / recall / precision, MSE,
    ROC AUC and Cohen's kappa. When ``CV`` is truthy it additionally runs a
    10-fold cross-validation (accuracy) over the train and test samples
    joined together.

    Parameters
    ----------
    X_train, X_test : array-like or sparse matrix
        Feature matrices; anything ``scipy.sparse.csr_matrix`` accepts.
    y_train, y_test : array-like
        Class labels, either numeric or strings.
    CV : bool, default True
        Whether to run the cross-validation step.

    Notes
    -----
    The models are read from the module-level name ``classifiers``, which must
    be defined before this function is called. Nothing is returned; every
    result is printed.
    """
    # Local import: the file's top-level imports only bring in csr_matrix.
    from scipy.sparse import vstack

    # Estimators whose name matches one of these tags are given dense arrays;
    # every other model receives CSR matrices.
    dense_only = ("GaussianNB", "LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis")

    class_values = np.unique(np.asarray(y_train))
    # isinstance() also recognises numpy.str_ (a str subclass), which the
    # previous ``type(...).__name__ == 'str'`` comparison missed.
    string_labels = isinstance(class_values[0], str)
    multiclass = len(class_values) > 2

    def metrics(model, X_fit, X_eval, dense):
        """Train ``model`` on ``X_fit`` and print its scores on ``X_eval``."""
        print("\nHold-Out in process...")
        start_time = tm.time()
        model.fit(X_fit, y_train)
        TIME = tm.time() - start_time
        print("Time, Training: {0:.4f} [seconds]".format(TIME))
        start_time = tm.time()
        y_pred = model.predict(X_eval)
        TIME = tm.time() - start_time
        print("Time, Prediction: {0:.4f} [seconds]".format(TIME))

        accuracy_s  = accuracy_score(y_test,y_pred)
        print('accuracy_score: {0:.4f}'.format(accuracy_s))
        f1_s        = f1_score(y_test,y_pred,average='weighted')
        print('f1_score: {0:.4f}'.format(f1_s))
        recall_s    = recall_score(y_test,y_pred,average='weighted')
        print('recall_score: {0:.4f}'.format(recall_s))
        precision_s = precision_score(y_test,y_pred,average='weighted')
        print('precision_score: {0:.4f}'.format(precision_s))

        if string_labels:
            # String classes are mapped to integer codes before computing MSE.
            le = LabelEncoder()
            le.fit(class_values)
            mse_s = MSE(le.transform(y_test), le.transform(y_pred))
        else:
            mse_s = MSE(y_test,y_pred)
        print('MSE: {0:.4f}'.format(mse_s))

        y_pred_proba = model.predict_proba(X_eval)
        if not multiclass:
            # Binary case: only the probability of the second class is scored.
            y_pred_proba = y_pred_proba[:,1]
        roc_s = roc_auc_score(y_test, y_pred_proba, multi_class='ovo', average='weighted')
        print('ROC_AUC: {0:.4f}'.format(roc_s))

        ck_s = cohen_kappa_score(y_test,y_pred)
        print('CK: {0:.4f}'.format(ck_s))

        if CV:
            print('\nCross-Validation in process...')
            start_time = tm.time()
            kfold = model_selection.KFold(n_splits=10)
            y_CV = np.concatenate((y_train,y_test))
            if dense:
                X_CV = np.concatenate((X_fit,X_eval))
            else:
                # Stack the sparse blocks directly instead of densifying them
                # only to convert the result back to CSR.
                X_CV = vstack([X_fit, X_eval], format='csr')
            cv_results = np.array(model_selection.cross_val_score(model, X_CV, y_CV, cv=kfold, scoring='accuracy', n_jobs=-3))

            # Folds that produced NaN are left out of the summary.
            cv_results = cv_results[np.logical_not(np.isnan(cv_results))]
            TIME = tm.time() - start_time
            print("Time, CV: {0:.4f} [seconds]".format(TIME))
            print('CV: {0:.4f} {1:.4f}'.format(cv_results.mean(),cv_results.std()))

    # Convert once; the caller's objects are never rebound or modified.
    X_train_sparse = csr_matrix(X_train)
    X_test_sparse = csr_matrix(X_test)

    for model in classifiers:
        print ("---------------------------------------------------------------------------------\n")
        model_label = str(model)
        print(model_label)
        needs_dense = any(tag in model_label for tag in dense_only)
        if needs_dense:
            # Dense copies are built on demand so they do not stay in memory
            # while the sparse-capable models run.
            metrics(model, X_train_sparse.toarray(), X_test_sparse.toarray(), needs_dense)
        else:
            metrics(model, X_train_sparse, X_test_sparse, needs_dense)
        print()


In [None]:
# Classification report
def CR_viz(x,y):
    """Draw the yellowbrick classification report and save it as a PDF.

    ``x`` and ``y`` are forwarded to ``plt.figure(figsize=(x, y))``.

    Reads the module-level names ``model_selected``, ``classes``,
    ``model_name``, ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    ``path_figures``. The model is fitted on the training split and scored
    on the test split.
    """
    fig = plt.figure(figsize=(x, y))
    report_title = "Classification Report - "+model_name
    visualizer = ClassificationReport(
        model_selected,
        classes=classes,
        support=True,
        cmap='Blues',
        title=report_title,
    )
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof()
    fig.show()
    # The output file is named after the selected model.
    output_file = path_figures+"/"+model_name+"_CR"+".pdf"
    fig.savefig(output_file, bbox_inches = "tight")

# Confusion matrix
def CM_viz(x,y):
    """Plot the confusion matrix of ``model_selected`` and save it as a PDF.

    ``x`` and ``y`` are forwarded to ``plt.figure(figsize=(x, y))``.

    Reads the module-level names ``model_selected``, ``classes``,
    ``model_name``, ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    ``path_figures``. The model is (re)fitted on the training split before
    predicting the test split.
    """
    model_selected.fit(X_train, y_train)
    conf = confusion_matrix(y_test, model_selected.predict(X_test))

    plt.figure(figsize=(x, y))
    tick_labels = np.unique(classes)
    cell_text_style = {'fontsize':20, 'verticalalignment':'center' }
    ax = sns.heatmap(
        conf,
        annot=True,
        cmap='Blues',
        fmt = 'd',
        annot_kws=cell_text_style,
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    ax.set(title="Confusion Matrix", xlabel="Predicted Values", ylabel="Actual Values")
    # Called after the heatmap is drawn, exactly as before.
    sns.set(font_scale=2)

    # Final title and axis labels (these replace the ones set through ax.set).
    plt.title("Confusion Matrix - "+model_name, fontsize = 25)
    plt.xlabel("Predicted Values", fontsize = 25)
    plt.ylabel("Actual Values", fontsize = 25)

    output_file = path_figures+"/"+model_name+"_CM"+".pdf"
    plt.savefig(output_file, bbox_inches = "tight")

In [None]:
# Folder where CR_viz / CM_viz write their PDF figures.
path_figures = "../images"
# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path_figures, exist_ok=True)


# Loading data

In [None]:
path_folder_data = "../Datasets"

In [None]:
path = path_folder_data+'/1NID_final_Set1.csv'
df_set1=pd.read_csv(path)
df_set1

In [None]:
path = path_folder_data+'/2NID_final_Set2.csv'
df_set2=pd.read_csv(path)
df_set2

In [None]:
path = path_folder_data+'/3NID_final_Set1_2.csv'
df_set1_2=pd.read_csv(path)
df_set1_2

# Scenario 1

In [None]:
# Eliminamos los labels
features = df_set1.copy()
features = features.drop(['label', 'tipo_ataque'], axis=1) 

In [None]:
#Extraemos los labels
labels = df_set1.copy()
labels_binary = labels['label'].values 

In [None]:
labels=labels_binary
X_train,X_test,y_train,y_test=train_test_split(features, labels,
                                               test_size=0.2,random_state=21, stratify=labels)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

In [None]:
#ML Models
classifiers=[
    ExtraTreesClassifier(n_jobs=-3), 
    RandomForestClassifier(n_jobs=-3),
    LogisticRegression(solver='liblinear',n_jobs=-3),
    KNeighborsClassifier(n_jobs=-3),
    XGBClassifier(eval_metric='mlogloss',n_jobs=-3),
    DecisionTreeClassifier(),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    svm.SVC(probability=True),
    GradientBoostingClassifier(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(),
    MLPClassifier()
    ] 

#Deploy aggregate metrics 
classifier_metrics(X_train,X_test,y_train,y_test,CV=True) 

In [None]:
model_name = "Scenario 1 - Extra Trees"
model_selected = ExtraTreesClassifier(n_jobs=-3)
classes = np.unique(["Normal","Intrusion"])

visualization =[CR_viz(15,15), CM_viz(15,15)] 


# Scenario 2

In [None]:
# Eliminamos los labels
features = df_set2.copy()
features = features.drop(['label', 'tipo_ataque'], axis=1) 

In [None]:
#Extraemos los labels
labels = df_set2.copy()
labels_binary = labels['label'].values 

In [None]:
labels=labels_binary
X_train,X_test,y_train,y_test=train_test_split(features, labels,
                                               test_size=0.2,random_state=21, stratify=labels)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

In [None]:
#ML Models
classifiers=[
    ExtraTreesClassifier(n_jobs=-3), 
    RandomForestClassifier(n_jobs=-3),
    LogisticRegression(solver='liblinear',n_jobs=-3),
    KNeighborsClassifier(n_jobs=-3),
    XGBClassifier(eval_metric='mlogloss',n_jobs=-3),
    DecisionTreeClassifier(),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    svm.SVC(probability=True),
    GradientBoostingClassifier(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(),
    MLPClassifier()
    ] 

#Deploy aggregate metrics 
classifier_metrics(X_train,X_test,y_train,y_test,CV=True) 

In [None]:
model_name = "Scenario 2 - Extra Trees"
model_selected = ExtraTreesClassifier(n_jobs=-3)
classes = np.unique(["Normal","Intrusion"])

visualization =[CR_viz(15,15), CM_viz(15,15)] 


# Scenario 3

In [None]:
# Eliminamos los labels
features = df_set1_2.copy()
features = features.drop(['label', 'tipo_ataque'], axis=1) 

In [None]:
#Extraemos los labels
labels = df_set1_2.copy()
labels_binary = labels['label'].values 

In [None]:
labels=labels_binary
X_train,X_test,y_train,y_test=train_test_split(features, labels,
                                               test_size=0.2,random_state=21, stratify=labels)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

In [None]:
#ML Models
classifiers=[
    ExtraTreesClassifier(n_jobs=-3), 
    RandomForestClassifier(n_jobs=-3),
    LogisticRegression(solver='liblinear',n_jobs=-3),
    KNeighborsClassifier(n_jobs=-3),
    XGBClassifier(eval_metric='mlogloss',n_jobs=-3),
    DecisionTreeClassifier(),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    svm.SVC(probability=True),
    GradientBoostingClassifier(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(),
    MLPClassifier()
    ] 

#Deploy aggregate metrics 
classifier_metrics(X_train,X_test,y_train,y_test,CV=True) 

In [None]:
model_name = "Scenario 3 - Extra Trees"
model_selected = ExtraTreesClassifier(n_jobs=-3)
classes = np.unique(y_test)

visualization =[CR_viz(15,15), CM_viz(15,15)] 


# Scenario 4

In [None]:
#SET 1

# Eliminamos los labels
features = df_set1.copy()
features = features.drop(['label', 'tipo_ataque'], axis=1) 

#Extraemos los labels
labels = df_set1.copy()
labels_binary = labels['label'].values 

In [None]:
#SET 2

# Eliminamos los labels
features_ = df_set2.copy()
features_ = features_.drop(['label', 'tipo_ataque'], axis=1) 

#Extraemos los labels
labels_ = df_set1.copy()
labels_binary_ = labels_['label'].values 

In [None]:
labels=labels_binary
X_train,X_test,y_train,y_test=train_test_split(features, labels,
                                               test_size=0.2,random_state=21, stratify=labels)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

In [None]:
X_test = features_
y_test = labels_binary_

classifiers=[
    ExtraTreesClassifier(random_state=179, n_jobs=-1), 
    RandomForestClassifier(random_state=179,n_jobs=-1),
    LogisticRegression(random_state=179,solver='liblinear',n_jobs=-3),
    KNeighborsClassifier(n_jobs=-3),
    XGBClassifier(random_state=179,eval_metric='mlogloss',n_jobs=-3),
    DecisionTreeClassifier(random_state=179),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    svm.SVC(random_state=179,probability=True),
    GradientBoostingClassifier(random_state=179),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(random_state=179),
    MLPClassifier(random_state=179)
    ] 

#Deploy aggregate metrics 
classifier_metrics(X_train,X_test,y_train,y_test,CV=False) 

In [None]:
model_name = "Scenario 4 - Extra Trees"
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
classes = np.unique(["Normal","Intrusion"])

visualization =[CR_viz(10,10), CM_viz(10,10)] 


# Scenario 5

In [None]:
#SET 1

# Eliminamos los labels
features_ = df_set1.copy()
features_ = features_.drop(['label', 'tipo_ataque'], axis=1) 

#Extraemos los labels
labels_ = df_set1.copy()
labels_binary_ = labels_['label'].values 

In [None]:
#SET 2

# Eliminamos los labels
features = df_set2.copy()
features = features.drop(['label', 'tipo_ataque'], axis=1) 

#Extraemos los labels
labels = df_set1.copy()
labels_binary = labels['label'].values 

In [None]:
labels=labels_binary
X_train,X_test,y_train,y_test=train_test_split(features, labels,
                                               test_size=0.2,random_state=21, stratify=labels)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

In [None]:
X_test = features_
y_test = labels_binary_

classifiers=[
    ExtraTreesClassifier(random_state=179, n_jobs=-1), 
    RandomForestClassifier(random_state=179,n_jobs=-1),
    LogisticRegression(random_state=179,solver='liblinear',n_jobs=-3),
    KNeighborsClassifier(n_jobs=-3),
    XGBClassifier(random_state=179,eval_metric='mlogloss',n_jobs=-3),
    DecisionTreeClassifier(random_state=179),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    svm.SVC(random_state=179,probability=True),
    GradientBoostingClassifier(random_state=179),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(random_state=179),
    MLPClassifier(random_state=179)
    ] 

#Deploy aggregate metrics 
classifier_metrics(X_train,X_test,y_train,y_test,CV=False) 

In [None]:
model_name = "Scenario 5 - Extra Trees"
model_selected = ExtraTreesClassifier(random_state=179, n_jobs=-1)
classes = np.unique(["Normal","Intrusion"])

visualization =[CR_viz(10,10), CM_viz(10,10)] 


# Scenario 6

In [None]:
df1 = df_set1.copy() 
df1.drop(index=df1.index[:200000], axis=0, inplace=True) #Only 50.000 samples for Normal

In [None]:
# Eliminamos los labels
features = df1.copy()
features = features.drop(['label', 'tipo_ataque'], axis=1) 

In [None]:
#Extraemos los labels
labels = df1.copy()
labels_multiclass = labels['tipo_ataque'].values

In [None]:
labels=labels_multiclass
X_train,X_test,y_train,y_test=train_test_split(features, labels,
                                               test_size=0.2,random_state=21, stratify=labels)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

In [None]:
#ML Models
classifiers=[
    ExtraTreesClassifier(n_jobs=-3), 
    RandomForestClassifier(n_jobs=-3),
    LogisticRegression(solver='liblinear',n_jobs=-3),
    KNeighborsClassifier(n_jobs=-3),
    XGBClassifier(eval_metric='mlogloss',n_jobs=-3),
    DecisionTreeClassifier(),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    svm.SVC(probability=True),
    GradientBoostingClassifier(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(),
    MLPClassifier()
    ] 

#Deploy aggregate metrics 
classifier_metrics(X_train,X_test,y_train,y_test,CV=True) 

In [None]:
model_name = "Scenario 6 - Extra Trees"
model_selected = ExtraTreesClassifier(n_jobs=-3)
classes = np.unique(y_test)

visualization =[CR_viz(15,15), CM_viz(15,15)] 


# Scenario 7

In [None]:
df2 = df_set2.copy() 
df2.drop(index=df2.index[:200000], axis=0, inplace=True) #Only 50.000 samples for Normal

In [None]:
# Eliminamos los labels
features = df2.copy()
features = features.drop(['label', 'tipo_ataque'], axis=1) 

In [None]:
#Extraemos los labels
labels = df2.copy()
labels_multiclass = labels['tipo_ataque'].values

In [None]:
labels=labels_multiclass
X_train,X_test,y_train,y_test=train_test_split(features, labels,
                                               test_size=0.2,random_state=21, stratify=labels)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

In [None]:
#ML Models
classifiers=[
    ExtraTreesClassifier(n_jobs=-3), 
    RandomForestClassifier(n_jobs=-3),
    LogisticRegression(solver='liblinear',n_jobs=-3),
    KNeighborsClassifier(n_jobs=-3),
    XGBClassifier(eval_metric='mlogloss',n_jobs=-3),
    DecisionTreeClassifier(),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    svm.SVC(probability=True),
    GradientBoostingClassifier(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(),
    MLPClassifier()
    ] 

#Deploy aggregate metrics 
classifier_metrics(X_train,X_test,y_train,y_test,CV=True) 

In [None]:
model_name = "Scenario 7 - Extra Trees"
model_selected = ExtraTreesClassifier(n_jobs=-3)
classes = np.unique(y_test)

visualization =[CR_viz(15,15), CM_viz(15,15)] 


# Scenario 8

In [None]:
# Extraction of 50.000 samples of Normal class
df_normal = df_set1_2.copy() 
df_normal.drop(index=df_normal.index[50000:], axis=0, inplace=True) 

In [None]:
# Extracting the intrusions, each one has 50.000 samples
df_equal = df_set1_2.copy() 
df_equal.drop(df_equal[df_equal.tipo_ataque == "normal"].index, inplace=True)  

In [None]:
# Uniendo normal con los ataques, ahora todas las clases de a 50.000
df1 = pd.concat([df_normal, df_equal]).reset_index(drop=True)  # Concat all to a single df 

In [None]:
# Eliminamos los labels
features = df1.copy()
features = features.drop(['label', 'tipo_ataque'], axis=1) 

In [None]:
#Extraemos los labels
labels = df1.copy()
labels_multiclass = labels['tipo_ataque'].values

In [None]:
labels=labels_multiclass
X_train,X_test,y_train,y_test=train_test_split(features, labels,
                                               test_size=0.2,random_state=21, stratify=labels)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

In [None]:
#ML Models
classifiers=[
    ExtraTreesClassifier(n_jobs=-3), 
    RandomForestClassifier(n_jobs=-3),
    LogisticRegression(solver='liblinear',n_jobs=-3),
    KNeighborsClassifier(n_jobs=-3),
    XGBClassifier(eval_metric='mlogloss',n_jobs=-3),
    DecisionTreeClassifier(),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    svm.SVC(probability=True),
    GradientBoostingClassifier(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(),
    MLPClassifier()
    ] 

#Deploy aggregate metrics 
classifier_metrics(X_train,X_test,y_train,y_test,CV=True) 

In [None]:
model_name = "Scenario 8 - Extra Trees"
model_selected = ExtraTreesClassifier(n_jobs=-3)
classes = np.unique(y_test)

visualization =[CR_viz(15,15), CM_viz(15,15)] 
