# Libraries

In [None]:
#Classification Methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score

#Tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from scipy.sparse import csr_matrix 
import string 
import time as tm
import spacy 
import os
sns.set(style='whitegrid',
        rc={'lines.linewidth': 2.5,
        'figure.figsize': (10, 8),
        'text.usetex': False,
        })
%matplotlib inline

# Hyperparameter tune
from sklearn.model_selection import GridSearchCV

# Feature importance
from yellowbrick.model_selection import FeatureImportances
from yellowbrick.features import PCADecomposition
from yellowbrick.features import RadViz 
from yellowbrick.features import Manifold

#mpl.rcParams["figure.figsize"] = (9,6)

import warnings
warnings.filterwarnings('ignore')

# Functions

In [None]:
def classifier_metrics(X_train,X_test,y_train,y_test,CV=True):
    """Fit, evaluate and optionally cross-validate every model in ``classifiers``.

    For each estimator the function prints the training and prediction time,
    accuracy, weighted F1 / recall / precision, MSE, ROC AUC and Cohen's
    kappa measured on the hold-out split. When ``CV`` is truthy it also
    prints the mean and standard deviation of a 10-fold cross-validated
    accuracy computed on the train and test data stacked together.

    Parameters
    ----------
    X_train, X_test : feature matrices accepted by ``scipy.sparse.csr_matrix``.
    y_train, y_test : class labels, numeric or string.
    CV : bool, default True
        Whether to run the 10-fold cross-validation after the hold-out
        evaluation.

    Notes
    -----
    The models are read from the module-level iterable ``classifiers``, which
    must be defined before this function is called. Nothing is returned; all
    results are printed.
    """
    # Local import: the file only imports csr_matrix at module level.
    from scipy.sparse import vstack as sparse_vstack

    # Estimators whose printed name contains one of these tags are fed dense
    # arrays; every other estimator receives CSR matrices.
    dense_only = ("GaussianNB", "LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis")
    class_values = np.unique(np.array(y_train))

    def needs_dense(model):
        """Return True when ``model`` has to be trained on dense arrays."""
        return any(tag in str(model) for tag in dense_only)

    def metrics(model, X_fit, X_eval):
        """Print hold-out (and optionally CV) metrics for a single model."""
        print("\nHold-Out in process...")
        start_time = tm.time()
        model.fit(X_fit, y_train)
        TIME = tm.time() - start_time
        print("Time, Training: {0:.4f} [seconds]".format(TIME))
        start_time = tm.time()
        y_pred = model.predict(X_eval)
        TIME = tm.time() - start_time
        print("Time, Prediction: {0:.4f} [seconds]".format(TIME))

        accuracy_s  = accuracy_score(y_test,y_pred)
        print('accuracy_score: {0:.4f}'.format(accuracy_s))
        f1_s        = f1_score(y_test,y_pred,average='weighted')
        print('f1_score: {0:.4f}'.format(f1_s))
        recall_s    = recall_score(y_test,y_pred,average='weighted')
        print('recall_score: {0:.4f}'.format(recall_s))
        precision_s = precision_score(y_test,y_pred,average='weighted')
        print('precision_score: {0:.4f}'.format(precision_s))

        # String class names must be integer-coded before computing MSE.
        # isinstance() also recognises numpy.str_ values, whose type name is
        # 'str_' and was therefore missed by a comparison against 'str'.
        if isinstance(class_values[0], str):
            le = LabelEncoder()
            le.fit(class_values)
            mse_s = MSE(le.transform(y_test), le.transform(y_pred))
        else:
            mse_s = MSE(y_test,y_pred)
        print('MSE: {0:.4f}'.format(mse_s))

        # Multiclass ROC AUC needs the full probability matrix; with two
        # classes only the second probability column is passed.
        y_pred_proba = model.predict_proba(X_eval)
        if len(class_values) <= 2:
            y_pred_proba = y_pred_proba[:,1]
        roc_s = roc_auc_score(y_test, y_pred_proba, multi_class='ovo', average='weighted')
        print('ROC_AUC: {0:.4f}'.format(roc_s))

        ck_s = cohen_kappa_score(y_test,y_pred)
        print('CK: {0:.4f}'.format(ck_s))

        if CV:
            print('\nCross-Validation in process...')
            start_time = tm.time()
            kfold = model_selection.KFold(n_splits=10)
            y_CV = np.concatenate((y_train,y_test))
            if needs_dense(model):
                X_CV = np.concatenate((X_fit,X_eval))
            else:
                # Stack the sparse matrices directly instead of densifying
                # both and converting the result back to CSR.
                X_CV = sparse_vstack((X_fit,X_eval), format="csr")
            cv_results = np.array(model_selection.cross_val_score(model, X_CV, y_CV, cv=kfold, scoring='accuracy', n_jobs=-3))

            # Discard folds whose score came back as NaN.
            cv_results = cv_results[np.logical_not(np.isnan(cv_results))]
            TIME = tm.time() - start_time
            print("Time, CV: {0:.4f} [seconds]".format(TIME))
            print('CV: {0:.4f} {1:.4f}'.format(cv_results.mean(),cv_results.std()))

    # Convert once up front; the dense view is materialised only for the
    # estimators that require it.
    sparse_train = csr_matrix(X_train)
    sparse_test = csr_matrix(X_test)

    for name in classifiers:
        print ("---------------------------------------------------------------------------------\n")
        print(str(name))
        if needs_dense(name):
            metrics(name, sparse_train.toarray(), sparse_test.toarray())
        else:
            metrics(name, sparse_train, sparse_test)
        print()


In [None]:
# Classification report
def CR_viz(x,y):
    """Draw, display and save a yellowbrick classification report.

    Parameters
    ----------
    x, y : figure width and height, passed to ``plt.figure(figsize=(x, y))``.

    Notes
    -----
    The function reads the module-level names ``model_selected``, ``classes``,
    ``model_name``, ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    ``path_figures``; they must be defined before it is called. The report is
    written to ``<path_figures>/<model_name>_CR.pdf``.
    """
    fig = plt.figure(figsize=(x,y))
    visualizer = ClassificationReport(model_selected, classes=classes, support=True,
                                      cmap='Blues', title="Classification Report - "+model_name)
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    # show() is the current name of the deprecated poof() and is what the
    # rest of this file already uses; it finalizes and displays the plot, so
    # no separate Figure.show() call is needed afterwards.
    visualizer.show()
    fig.savefig(path_figures+"/"+model_name+"_CR"+".pdf", bbox_inches = "tight")

# Confusion Matrix
def CM_viz(y_test, y_pred, classes, name,
               path_img_base = './images',nrows=1,ncols=1,size_text_legend=25,size_text_title=25,title="",
           size_text_xy_labels=25,size_text_xy_tick=25,
          size_num_inter=25):
    """Draw a confusion-matrix heatmap on the current axes and return it.

    Parameters
    ----------
    y_test, y_pred : true and predicted labels passed to ``confusion_matrix``.
    classes : labels whose unique values are used as tick labels on both axes.
    name : accepted for interface compatibility; not used for drawing.
    path_img_base : directory that is created if it does not exist yet.
    nrows, ncols : accepted for interface compatibility; the heatmap is drawn
        the same way for any grid layout.
    size_text_legend : accepted for interface compatibility; not used.
    size_text_title : font size of the title.
    title : title text of the axes.
    size_text_xy_labels : font size of the axis labels.
    size_text_xy_tick : font size of the tick labels.
    size_num_inter : font size of the numbers written inside the cells.

    Returns
    -------
    The matplotlib axes returned by ``sns.heatmap``.
    """
    os.makedirs(path_img_base, exist_ok=True)

    conf = confusion_matrix(y_test, y_pred)
    tick_names = np.unique(classes)
    annot_kws = {'fontsize':size_num_inter, 'verticalalignment':'center' }

    # The single-panel and multi-panel cases used to be two identical
    # branches; one code path is enough.
    ax = sns.heatmap(conf, annot=True, cbar=False, cmap='Blues',fmt = 'd',annot_kws= annot_kws,
                     xticklabels=tick_names, yticklabels=tick_names)
    ax.xaxis.set_tick_params(labelsize=size_text_xy_tick,rotation=90)
    ax.yaxis.set_tick_params(labelsize=size_text_xy_tick,rotation=0)
    ax.set_xlabel('Predicted Values',fontsize=size_text_xy_labels)
    ax.set_ylabel('Actual Values',fontsize=size_text_xy_labels)
    ax.set_title(title,fontsize=size_text_title)

    return ax

In [None]:
# Output directory for every figure saved by this notebook.
path_figures = "../images"
# exist_ok avoids the separate check-then-create step.
os.makedirs(path_figures, exist_ok=True)


# Loading data

In [None]:
# Directory that holds the dataset CSV files read by the loading cells below.
path_folder_data = "../DBs"

In [None]:
# IDSAI: read the dataset from the data folder.
path = '{}/IDSAI.csv'.format(path_folder_data)
df_IDSAI = pd.read_csv(path)

# One-off relabelling of the attack names, kept disabled for reference
# (it rewrote the CSV in place and only had to run once).
#df_IDSAI["tipo_ataque"].replace({
#                        "dos-icmp_flood": "ICMP echo request Flood", 
#                        "dos-syn_rstflooding": "SYN/ACK and RST Flooding",
#                        "dos-synflooding": "SYN/ACK Flooding",
#                        "dos-synfloodingfaster": "SYN Flooding faster",
#                        "mitm-arp_spoofing": "ARP spoofing",                                                
#                        "ddos_mac": "DDoS MAC Flood", 
#                        "framentation_ip": "IP Fragmentation",
#                        "fuerzabrutassh": "Brute Force SSH",                        
#                        "scan_puerto_udp": "UDP port scan",                        
#                        "tcpnull": "TCP Null",
#                        "normal": "Normal"
#                        }, inplace=True)
#df_IDSAI.to_csv('../DBs'+'/IDSAI.csv', index = False, header=True)
#df_IDSAI=pd.read_csv('../DBs'+'/IDSAI.csv')

# Display the loaded frame.
df_IDSAI

In [None]:
# Bot-IoT: read the dataset from the data folder and display it.
path = '{}/Bot-IoT.csv'.format(path_folder_data)
df_BotIoT = pd.read_csv(path)
df_BotIoT

In [None]:
# Class distribution in IDSAI: number of rows for each value of the
# `tipo_ataque` column.
df_IDSAI.groupby("tipo_ataque").size()

In [None]:
# Class distribution in Bot-IoT: number of rows for each value of the
# `tipo_ataque` column.
df_BotIoT.groupby("tipo_ataque").size()

In [None]:
# Column names: check that both datasets expose the same columns in the same
# order. Index.equals also copes with a differing number of columns, where an
# element-wise `==` on the underlying arrays cannot be evaluated.
print("Same columns in DBs: ",df_BotIoT.columns.equals(df_IDSAI.columns))
print("Number of columns: ",len(df_BotIoT.columns))
df_BotIoT.columns.tolist()

In [None]:
# Columns removed from both datasets before training: the two label columns
# plus the address/port/protocol fields.
_NON_FEATURE_COLUMNS = ['label', 'tipo_ataque','ip_src', 'ip_dst', 'port_src', 'port_dst', 'protocols']

# Load IDSAI
IDSAIpath_folder_data = "../DBs"
IDSAIpath = IDSAIpath_folder_data+'/IDSAI.csv'
IDSAIdf=pd.read_csv(IDSAIpath)
# Delete labels and redundant features (drop returns a new frame, so the
# source frame is left untouched without an explicit copy)
IDSAIfeatures = IDSAIdf.drop(_NON_FEATURE_COLUMNS, axis=1)
# Obtain labels
IDSAIlabels = IDSAIdf['label'].values

# Load BotIoT
BotIoTpath_folder_data = "../DBs"
# The file name must match the one used by the earlier loading cell
# ('Bot-IoT.csv'); a different letter case fails on case-sensitive
# filesystems.
BotIoTpath = BotIoTpath_folder_data+'/Bot-IoT.csv'
BotIoTdf=pd.read_csv(BotIoTpath)
# Delete labels and redundant features
BotIoTfeatures = BotIoTdf.drop(_NON_FEATURE_COLUMNS, axis=1)
# Obtain labels
BotIoTlabels = BotIoTdf['label'].values

# IDSAI dataset

In [None]:
# Build the feature matrix: remove the two label columns and the
# address/port/protocol fields (features not recommended in the literature).
features = df_IDSAI.drop(
    columns=['label', 'tipo_ataque', 'ip_src', 'ip_dst', 'port_src', 'port_dst', 'protocols'],
)

In [None]:
# Display the feature frame as the cell output.
features

In [None]:
# Extract the labels: `label` for the binary task, `tipo_ataque` for the
# multiclass task.
labels = df_IDSAI.copy()
labels_binary, labels_multiclass = (
    labels[column].values for column in ('label', 'tipo_ataque')
)

In [None]:
# Display the values taken from the `label` column.
labels_binary

In [None]:
# Display the values taken from the `tipo_ataque` column.
labels_multiclass

# Data distribution

In [None]:
# Square canvas so the pie is drawn as a circle.
fig, ax = plt.subplots(figsize=(10, 10), subplot_kw={"aspect": "equal"})

# Row count for each attack type, computed once and split into the wedge
# sizes and their legend names.
attack_counts = labels['tipo_ataque'].value_counts()
data = list(attack_counts.values)
names = attack_counts.index

# Colours handed to the pie wedges.
color_palette_list = [
    "#0EBFE9", "#60D394", "#FFD97D", "#C1F0F6", "#007ACD", "#EE6055",
    "#6DC36D", "#BBA9BB", "#E7D40A", "#E36B2C", "#C82A54",
]

def func(pct, allvals):
    """Build the label of a pie wedge: the percentage and the absolute count.

    Parameters
    ----------
    pct : percentage of the total represented by the wedge.
    allvals : all the values plotted in the pie; their sum is the total.

    Returns
    -------
    str
        The percentage without decimals followed by ``%``, a line break and
        the absolute count in parentheses.
    """
    # Round instead of truncating: floating-point error can leave the product
    # slightly below the true integer count, and int() alone would then
    # report one item too few.
    absolute = int(round(pct / 100. * np.sum(allvals)))
    return "{:.0f}%\n({:d})".format(pct, absolute)

# Draw the wedges; every wedge is annotated through func() with its
# percentage and absolute count, placed just outside the pie.
wedges, texts, autotexts = ax.pie(
    data,
    colors=list(color_palette_list),
    autopct=lambda pct: func(pct, data),
    pctdistance=1.17,
    textprops={"color": "gray"},
)

# Legend to the right of the pie, one entry per attack type.
ax.legend(
    wedges,
    names,
    fontsize=15,
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1),
)

plt.setp(autotexts, size=15, weight="bold", color="gray")

# Leave room on the right for the legend, then save and display.
plt.subplots_adjust(right=0.7)
plt.savefig(path_figures+"/data_distribution.pdf", bbox_inches="tight")
plt.show()

# Feature importance

In [None]:
import matplotlib.pyplot as plt

# Text sizes used by the figures of this section.
size_text_legend = size_text_xy_labels = size_text_xy_tick = size_num_inter = 35
size_text_title = 85

# Grid layout: one row, two panels.
nrows, ncols = 1, 2

SMALL_SIZE, MEDIUM_SIZE, BIGGER_SIZE = 35, 45, 65

# Global matplotlib defaults: base font, axes title and label sizes,
# tick-label sizes, legend font and figure title size.
plt.rc('font', size=SMALL_SIZE)
plt.rc('axes', titlesize=size_text_title, labelsize=size_text_xy_labels)
plt.rc('xtick', labelsize=size_text_xy_tick)
plt.rc('ytick', labelsize=size_text_xy_tick)
plt.rc('legend', fontsize=size_text_legend)
plt.rc('figure', titlesize=BIGGER_SIZE)

# A 20x20 area per panel, with spacing between the panels.
fig = plt.figure(figsize=(ncols * 20, nrows * 20))
fig.subplots_adjust(wspace=0.6, hspace=0.35)

#https://jakevdp.github.io/PythonDataScienceHandbook/04.08-multiple-subplots.html

# Left panel of the shared figure.
i=1
ax = fig.add_subplot(nrows, ncols, i)
###############################################SCENARIO 1
# Binary task: stratified 80/20 split of the IDSAI features against `label`.
X_train,X_test,y_train,y_test=train_test_split(features, labels_binary,
                                               test_size=0.2,random_state=21, stratify=labels_binary)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

model_name = "Binary classification - Decision Tree"
model_selected = DecisionTreeClassifier(criterion='entropy', max_depth=20, random_state=32)
model_selected.fit(X_train, y_train)

# Print the importance of every feature, highest first.
feature_importances=pd.DataFrame({'features':features.columns,'feature_importance':model_selected.feature_importances_})
print(feature_importances.sort_values('feature_importance',ascending=False))

y_pred = model_selected.predict(X_test)
acc_score=accuracy_score(y_test,y_pred) 
print('accuracy_score: {0:.4f}'.format(acc_score))

# y_pred_proba and classes are assigned here but not used again in this cell.
y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion","Normal"])

title="A"

#feature_names=list(X_train.columns)
# No axes object is passed to the visualizer.
# NOTE(review): it presumably draws on the current axes, i.e. the subplot
# added above - confirm.
viz = FeatureImportances(model_selected)#,topn=14)
viz.fit(X_train, y_train)
model_name = "FI_S1"
viz.show(outpath=path_figures+"/"+model_name+"_FI"+".pdf")
# The panel title is set only after show() has written the file above.
# NOTE(review): the per-scenario PDF presumably does not carry this title,
# while the combined figure saved at the end does - confirm.
title="A"
viz.set_title(title)
###############################################

# Right panel of the shared figure.
i=2
ax = fig.add_subplot(nrows, ncols, i)
###############################################SCENARIO 2
# Multiclass task: same split settings, stratified on `tipo_ataque`.
X_train,X_test,y_train,y_test=train_test_split(features, labels_multiclass,
                                               test_size=0.2,random_state=21, stratify=labels_multiclass)

# Optional integer encoding of the class names (disabled).
#le_labels = LabelEncoder()
#y_train = le_labels.fit_transform(y_train) 
#y_test = le_labels.transform(y_test) 

print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 


model_name = "Multiclass classification - Decision Tree"
# Unlike scenario 1, no criterion is passed here, so the estimator default
# is used.
model_selected = DecisionTreeClassifier(max_depth=20, random_state=32)
model_selected.fit(X_train, y_train)

# Print the importance of every feature, highest first.
feature_importances=pd.DataFrame({'features':features.columns,'feature_importance':model_selected.feature_importances_})
print(feature_importances.sort_values('feature_importance',ascending=False))

y_pred = model_selected.predict(X_test)
acc_score=accuracy_score(y_test,y_pred) 
print('accuracy_score: {0:.4f}'.format(acc_score))

# y_pred_proba and classes are assigned here but not used again in this cell.
y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(y_test)

#title="Confusion Matrix for {}".format(model_name)
# This value is overwritten with "B" below before it is used.
title="D"
viz = FeatureImportances(model_selected)#,topn=14)
viz.fit(X_train, y_train)
model_name = "FI_S2"
viz.show(outpath=path_figures+"/"+model_name+"_FI"+".pdf")
title="B"
viz.set_title(title)

###############################################

# Save the whole two-panel figure.
model_name = "Feature_importance"
fig.savefig(path_figures+"/"+model_name+"_FI"+".pdf", bbox_inches = "tight", format='pdf') 
#fig.savefig(path_figures+"/"+model_name+"_CM"+".pdf", format='pdf') 

# Confusion Matrices

## Scenarios 1 and 3

In [None]:
# Text sizes for the two-panel confusion-matrix figure.
size_text_legend=80
size_text_title=150
size_text_xy_labels=80
size_text_xy_tick=80
size_num_inter=80

# Grid layout: one row, two panels.
nrows=1
ncols=2

fig = plt.figure(figsize=(20*ncols,20*nrows))
fig.subplots_adjust(hspace=0.25, wspace=0.5)
#https://jakevdp.github.io/PythonDataScienceHandbook/04.08-multiple-subplots.html

# Left panel.
i=1
ax = fig.add_subplot(nrows, ncols, i)
###############################################SCENARIO 1
# Binary task on IDSAI: stratified 80/20 split against `label`.
X_train,X_test,y_train,y_test=train_test_split(features, labels_binary,
                                               test_size=0.2,random_state=21, stratify=labels_binary)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

model_name = "Binary classification - XGB - IDSAI dataset"
# NOTE(review): 'mlogloss' is the multiclass log-loss metric; for this
# two-class target 'logloss' may be what is intended - confirm.
model_selected = XGBClassifier(eval_metric='mlogloss',n_jobs=-1, random_state=32)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score=accuracy_score(y_test,y_pred) 
print('accuracy_score: {0:.4f}'.format(acc_score))

# y_pred_proba is computed but not used again in this cell.
y_pred_proba = model_selected.predict_proba(X_test)
# NOTE(review): CM_viz attaches these names to the matrix rows and columns by
# position - verify that the sorted values of `label` correspond to
# "Intrusion" first and "Normal" second.
classes = np.unique(["Intrusion","Normal"])

#title="Confusion Matrix for {}".format(model_name)
title="A"
# CM_viz creates the path_img_base directory if it is missing; the figure
# itself is saved under path_figures at the end of this cell.
CM_viz(y_test, y_pred, classes, name=model_name, 
                            path_img_base = './images',nrows=nrows,ncols=ncols, 
                            size_text_legend=size_text_legend,size_text_title=size_text_title,title=title,
       size_text_xy_labels=size_text_xy_labels,size_text_xy_tick=size_text_xy_tick,size_num_inter=size_num_inter)
###############################################

# Right panel.
i=2
ax = fig.add_subplot(nrows, ncols, i)
###############################################SCENARIO 3
# Cross-dataset evaluation: train on all of IDSAI, test on all of Bot-IoT.
X_train = IDSAIfeatures
X_test = BotIoTfeatures
y_train = IDSAIlabels
y_test = BotIoTlabels

model_name = "Binary classification - XGB - Bot-IoT dataset"
model_selected = XGBClassifier(eval_metric='mlogloss',n_jobs=-1, random_state=32)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score=accuracy_score(y_test,y_pred) 
print('accuracy_score: {0:.4f}'.format(acc_score))

# y_pred_proba is computed but not used again in this cell.
y_pred_proba = model_selected.predict_proba(X_test)
classes = np.unique(["Intrusion","Normal"])

#title="Confusion Matrix for {}".format(model_name)
title="B"
CM_viz(y_test, y_pred, classes, name=model_name, 
                            path_img_base = './images',nrows=nrows,ncols=ncols, 
                            size_text_legend=size_text_legend,size_text_title=size_text_title,title=title,
       size_text_xy_labels=size_text_xy_labels,size_text_xy_tick=size_text_xy_tick,size_num_inter=size_num_inter)

###############################################

# Save the whole two-panel figure.
model_name = "Fig_CM_1_3"
fig.savefig(path_figures+"/"+model_name+".pdf", bbox_inches = "tight", format='pdf') 
#fig.savefig(path_figures+"/"+model_name+"_CM"+".pdf", format='pdf') 

## Scenario 2

In [None]:
# Text sizes for the single-panel confusion-matrix figure.
size_text_legend=35
size_text_title=85
size_text_xy_labels=35
size_text_xy_tick=35
size_num_inter=33

# Grid layout: a single panel.
nrows=1
ncols=1

fig = plt.figure(figsize=(20*ncols,20*nrows))
fig.subplots_adjust(hspace=0.2, wspace=0.3)
#https://jakevdp.github.io/PythonDataScienceHandbook/04.08-multiple-subplots.html

i=1
ax = fig.add_subplot(nrows, ncols, i)
###############################################SCENARIO 2
# Multiclass task on IDSAI: stratified 80/20 split against `tipo_ataque`.
X_train,X_test,y_train,y_test=train_test_split(features, labels_multiclass,
                                               test_size=0.2,random_state=21, stratify=labels_multiclass)

# Optional integer encoding of the class names (disabled).
# NOTE(review): the labels are passed to XGBClassifier as they appear in
# `tipo_ataque`; if those are strings, some XGBoost releases may require the
# integer encoding below to be enabled - confirm against the installed version.
#le_labels = LabelEncoder()
#y_train = le_labels.fit_transform(y_train) 
#y_test = le_labels.transform(y_test) 

print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
print(np.unique(y_train, return_counts=True))
print(np.unique(y_test, return_counts=True)) 

# Earlier variant of the split, kept disabled.
#X_train,X_test,y_train,y_test=train_test_split(features, labels,
#                                               test_size=0.2,random_state=21, stratify=labels)
#print(X_train.shape,X_test.shape,y_train.shape,y_test.shape) 
#print(np.unique(y_train, return_counts=True))
#print(np.unique(y_test, return_counts=True)) 

model_name = "Multiclass classification - XGB - IDSAI dataset"
model_selected = XGBClassifier(eval_metric='mlogloss',n_jobs=-1, random_state=32)
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)
acc_score=accuracy_score(y_test,y_pred) 
print('accuracy_score: {0:.4f}'.format(acc_score))

# y_pred_proba is computed but not used again in this cell.
y_pred_proba = model_selected.predict_proba(X_test)
# Tick labels: the distinct class names present in the test split.
classes = np.unique(y_test)

#title="Confusion Matrix for {}".format(model_name)
# Empty title: the single panel is left untitled.
title=""
# CM_viz creates the path_img_base directory if it is missing; the figure
# itself is saved under path_figures at the end of this cell.
CM_viz(y_test, y_pred, classes, name=model_name, 
                            path_img_base = './images',nrows=nrows,ncols=ncols, 
                            size_text_legend=size_text_legend,size_text_title=size_text_title,title=title,
       size_text_xy_labels=size_text_xy_labels,size_text_xy_tick=size_text_xy_tick,size_num_inter=size_num_inter)

###############################################

# Save the figure.
model_name = "Fig_CM_2"
fig.savefig(path_figures+"/"+model_name+".pdf", bbox_inches = "tight", format='pdf') 
#fig.savefig(path_figures+"/"+model_name+"_CM"+".pdf", format='pdf') 