# Importing the tools

In [60]:
# import required libraries 
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import seaborn
import re
import tensorflow as tf
import datetime
import json

from sklearn.preprocessing import RobustScaler

from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below - confirm that suppressing everything is intended.
warnings.filterwarnings("ignore")

# Enable inline matplotlib output (get_ipython only exists inside IPython/Jupyter).
get_ipython().run_line_magic('matplotlib', 'inline')

# Combining the CSV files

In [61]:
# Directory in which the CSV files of the data set are stored.
path = '/kaggle/input/cicids2017/'

# glob returns matches in an order that depends on the file system; sorting
# keeps the row order of the combined frame (and the positional slicing done
# on it later) reproducible between runs.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # Fail with an explicit message instead of the generic
    # "No objects to concatenate" error raised by pd.concat.
    raise FileNotFoundError("No CSV files found in " + path)

# Read every CSV file and stack the tables into a single DataFrame.
# NOTE(review): this assumes all files share the same columns - verify,
# because pd.concat fills columns missing from a file with NaN.
dataset = pd.concat([pd.read_csv(f) for f in all_files])

# preprocessing  

## dropping duplicates and resetting index

In [62]:
# Discard duplicated rows, then renumber the index from 0.
# NOTE(review): keep=False removes *every* copy of a duplicated row, not only
# the repeats - confirm that dropping the first occurrence too is intended.
dataset = dataset.drop_duplicates(keep=False).reset_index(drop=True)

## column names

In [63]:
# Strip every space character from the column headers
# (leading, trailing and inner spaces alike).
col_names = list(map(lambda header: header.replace(' ', ''), dataset.columns))
dataset.columns = col_names

# Preview the first rows with the cleaned headers.
dataset.head()

In [64]:
# Remove the "FwdHeaderLength.1" column.
# NOTE(review): presumably a duplicate of "FwdHeaderLength" - confirm.
dataset = dataset.drop(columns=["FwdHeaderLength.1"])

## target column 

In [65]:
# Rename the target labels so they are easier to read.


def _clean_label(raw_label):
    """Return *raw_label* reduced to letters, with words joined by '_'.

    Every run of characters that are not ASCII letters or spaces is removed,
    each remaining whitespace character becomes an underscore, and doubled
    underscores are collapsed in a single pass.
    """
    cleaned = re.sub(r"[^a-zA-Z ]+", "", raw_label)
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    cleaned = re.sub(r"\s", "_", cleaned)
    return cleaned.replace("__", "_")


# Distinct raw label values and their cleaned counterparts (same order).
labels = dataset['Label'].unique()
label_names = [_clean_label(raw) for raw in labels]

# Translate the whole column in one pass through a single lookup table, so a
# cleaned name can never be picked up and renamed again by a later entry.
dataset['Label'] = dataset['Label'].map(dict(zip(labels, label_names)))

dataset['Label'].unique()


## fixing data types and values

In [66]:
# Cast every feature column (everything except 'Label') to float64.
# astype with a per-column mapping returns a frame that really carries the new
# dtypes; assigning through .loc[:, mask] writes into the existing columns and
# can leave their original dtypes in place on recent pandas versions.
feature_columns = dataset.columns[dataset.columns != 'Label']
dataset = dataset.astype({column: 'float64' for column in feature_columns})

# Treat +/- infinity as missing so the affected rows are removed below.
dataset = dataset.replace([np.inf, -np.inf], np.nan)

# Drop every row that contains at least one missing value.
dataset.dropna(inplace=True)

## balancing the target classes

In [67]:
# Number of rows per class.
dataset["Label"].value_counts()

In [68]:
# Pick the BENIGN rows at positions 1 .. 1863996 of the BENIGN subset and
# remove them from the data set by index label.
# NOTE(review): the bounds are hard-coded and the slice starts at 1, so the
# first BENIGN row is always kept - confirm both are intended.
is_benign = dataset["Label"] == "BENIGN"
dr = dataset[is_benign].iloc[1:1863997]
dataset = dataset.drop(index=dr.index)

In [69]:
# Number of rows per class after the BENIGN rows were removed.
dataset["Label"].value_counts()

## features and target label splitting

In [70]:
# Target vector: an independent copy of the 'Label' column.
labels = dataset['Label'].copy()

# Feature table: every column except 'Label'.
features = dataset.drop(columns='Label')

## scaling

In [71]:
# Fit a RobustScaler on the feature table and replace the table with its
# scaled values.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later, so held-out rows influence the scaling statistics.
scaler = RobustScaler()
features = scaler.fit_transform(features)

In [72]:
from sklearn.preprocessing import LabelEncoder

# Encode the class names as integer ids. LE is kept so the ids can be turned
# back into names after prediction.
LE = LabelEncoder()
labels = LE.fit_transform(labels)

In [73]:
# Display the encoded labels.
labels

In [74]:
# Release the objects that are no longer needed to lower memory usage.
del dr, label_names, dataset, path, col_names, all_files

## solving imbalanced classes problem

In [75]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE (default settings) and replace features/labels with
# the resampled data.
# NOTE(review): resampling happens before the train/test split below, so
# synthetic points derived from rows that end up in the test set can land in
# the training set. No random_state is given, so results vary between runs.
smt = SMOTE()
features, labels = smt.fit_resample(features, labels)

# train test split

In [76]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for testing; the rest is used for training.
# NOTE(review): no random_state is passed, so the partition differs on every run.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
)

# Shapes of the four resulting arrays.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [77]:
# Release the full arrays now that the train/test parts exist.
del features, labels

# Deep learning model

In [78]:
# Derive the layer sizes from the data instead of hard-coding them, so the
# network keeps matching its inputs if the feature set or label set changes.
n_features = x_train.shape[1]
n_classes = len(LE.classes_)

# Feed-forward classifier: two tanh hidden layers of 67 units, each followed by
# 20 % dropout, and a softmax output with one unit per class.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(n_features,)),
    tf.keras.layers.Dense(67, activation='tanh'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(67, activation='tanh'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(n_classes, activation='softmax'),
])

In [79]:
# Adam optimiser with sparse categorical cross-entropy (the targets are
# integer class ids); accuracy is reported during training.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [80]:
# Model checkpointing
from tensorflow.keras.callbacks import ModelCheckpoint

# Write a checkpoint to `filepath` whenever the monitored accuracy reaches a
# new maximum.
# NOTE(review): 'accuracy' is the training metric; 'val_accuracy' would track
# the validation data passed to fit - confirm which one is wanted.
filepath = "weights.mapt.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [81]:
# Train for 25 epochs; the object returned by fit is kept in metrics_plot.
# NOTE(review): the test split doubles as the validation data here.
metrics_plot = model.fit(
    x_train,
    y_train,
    epochs=25,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=(x_test, y_test),
)

# evaluating the model on the test set

In [82]:
# Model output for every test row (one value per output unit of the softmax layer).
p = model.predict(x_test)

In [83]:
# Predicted class id for each test row: the position of the largest model
# output. argmax over axis 1 handles all rows in one vectorised call instead of
# writing one DataFrame cell per row in a Python loop. The column now holds
# integers directly (the loop version stored them as floats).
preds = pd.DataFrame({"preds": np.argmax(p, axis=1)})

In [84]:
# Turn the predicted class ids back into label names.
predicted_ids = preds.astype("int")
p_1 = pd.Series(LE.inverse_transform(predicted_ids))
p_1

In [86]:
# Label names of the true test targets.
p_2 = pd.Series(LE.inverse_transform(y_test))
p_2

In [88]:
# Per-class difference between the number of predicted and true labels
# (predicted - true).
# fill_value=0 counts a class that is missing on one side as zero instead of
# yielding NaN, which would also turn any sum over the result into NaN.
# NOTE(review): this compares class totals only; wrong predictions in opposite
# directions cancel out, so it is not a per-sample error count.
predictions = p_1.value_counts().sub(p_2.value_counts(), fill_value=0)
predictions

In [93]:
# Sum of the absolute per-class count differences.
sum(predictions.abs())

In [92]:
# Number of samples in the test set.
len(x_test)