In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense
from keras.layers import Activation
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score, accuracy_score

In [2]:
df = pd.read_csv("compiled.csv")
df.head()

Unnamed: 0,Src IP,Src Port,Dst IP,Dst Port,Protocol,Flow Duration,Flow Byts/s,Flow Pkts/s,Flow IAT Mean,Flow IAT Std,...,Bwd IAT Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,131.202.240.87,1258,64.12.104.73,443,6,119999581,1.850006,0.041667,29999900.0,34580350.0,...,52837.0,52431.5,573.4636,52837.0,52026.0,59947360.0,487.9037,59947704.0,59947014.0,Chat
1,131.202.240.87,1254,64.12.104.85,443,6,119999899,19.375016,0.2,5217387.0,10372690.0,...,9508.0,1132177.0,1665046.0,3378621.0,20232.0,18867810.0,12937380.0,40602165.0,8405794.0,Chat
2,131.202.240.87,13389,64.12.104.73,443,6,60054835,2.464414,0.066606,20018280.0,34581070.0,...,52968.0,52968.0,0.0,52968.0,52968.0,59949060.0,0.0,59949058.0,59949058.0,Chat
3,131.202.240.87,13385,64.12.24.167,443,6,87805460,28.847864,0.296109,3512218.0,9550137.0,...,1876.0,3712991.0,3555993.0,7114375.0,20308.0,24384510.0,18202190.0,45086942.0,10890558.0,Chat
4,131.202.240.87,13407,178.237.19.228,443,6,119998971,0.250002,0.075001,14999870.0,15855890.0,...,168275.0,168042.0,189.1613,168275.0,167828.0,29831700.0,652.5541,29832657.0,29831207.0,Chat


# Data Pre-processing

In [3]:
# Remove 5-tuple
df = df.iloc[:, 5:].copy()
print(df.head())
print(df.shape)

   Flow Duration  Flow Byts/s  Flow Pkts/s  Flow IAT Mean  Flow IAT Std   
0      119999581     1.850006     0.041667   2.999990e+07  3.458035e+07  \
1      119999899    19.375016     0.200000   5.217387e+06  1.037269e+07   
2       60054835     2.464414     0.066606   2.001828e+07  3.458107e+07   
3       87805460    28.847864     0.296109   3.512218e+06  9.550137e+06   
4      119998971     0.250002     0.075001   1.499987e+07  1.585589e+07   

   Flow IAT Max  Flow IAT Min  Fwd IAT Mean   Fwd IAT Std  Fwd IAT Max  ...   
0    59947704.0       52026.0  5.999973e+07  0.000000e+00   59999730.0  ...  \
1    40602165.0          58.0  1.115004e+07  1.369498e+07   40655533.0  ...   
2    59949058.0       52809.0  0.000000e+00  0.000000e+00          0.0  ...   
3    45086942.0          36.0  7.626982e+06  1.358550e+07   45086942.0  ...   
4    29832657.0      167828.0  2.999972e+07  9.096027e+02   30000749.0  ...   

   Bwd IAT Min   Active Mean    Active Std  Active Max  Active Min   
0   

In [4]:
# Dataset Info
print(df.info())
print()
print(df['Label'].describe())
print(df['Label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195207 entries, 0 to 195206
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Flow Duration  195207 non-null  int64  
 1   Flow Byts/s    195202 non-null  float64
 2   Flow Pkts/s    195207 non-null  float64
 3   Flow IAT Mean  195207 non-null  float64
 4   Flow IAT Std   195207 non-null  float64
 5   Flow IAT Max   195207 non-null  float64
 6   Flow IAT Min   195207 non-null  float64
 7   Fwd IAT Mean   195207 non-null  float64
 8   Fwd IAT Std    195207 non-null  float64
 9   Fwd IAT Max    195207 non-null  float64
 10  Fwd IAT Min    195207 non-null  float64
 11  Bwd IAT Mean   195207 non-null  float64
 12  Bwd IAT Std    195207 non-null  float64
 13  Bwd IAT Max    195207 non-null  float64
 14  Bwd IAT Min    195207 non-null  float64
 15  Active Mean    195207 non-null  float64
 16  Active Std     195207 non-null  float64
 17  Active Max     195207 non-nul

In [5]:
# Replace infinite values with nan
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove NA values
df = df.dropna()
    
df.shape

(67568, 24)

In [6]:
print(df['Label'].value_counts())

Label
File Transfer    44859
Chat              7585
VoIP              6718
Email             5071
Streaming         3335
Name: count, dtype: int64


In [7]:
keys = df.keys()
# Normalise values
x = df.iloc[:,:-1].copy()
min_max_scaler = preprocessing.MinMaxScaler()
x = min_max_scaler.fit_transform(x)

le = preprocessing.LabelEncoder()
y = le.fit_transform(df["Label"]).copy()

In [8]:
print(len(y))
print(len(x))
len(le.classes_)

67568
67568


5

In [9]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=32)

In [10]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(47297, 23)
(47297,)
(20271, 23)
(20271,)


In [11]:
# Reshape
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
X_train_cnn.shape

(47297, 23, 1)

In [12]:
# Create the CNN Model
model = Sequential()

# 1st Convolutional Layer
model.add(Conv1D(filters=64, input_shape=(23, 1), kernel_size=1, strides=1, activation = 'relu'))
model.add(Dropout(0.3))

# 2nd Convolution Layer
model.add(Conv1D(filters=128, kernel_size=1, strides=1, activation = 'relu'))
model.add(Dropout(0.3))

# Fully Connected layer
model.add(Flatten())

# 1st Fully Connected Layer
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))

# 2nd Fully Connected Layer
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

# 3rd Fully Connected Layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))

# Output Layer
model.add(Dense(len(le.classes_), activation='softmax'))
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model.summary()

model.fit(X_train_cnn, y_train, epochs = 20, validation_data = (X_test_cnn, y_test))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 23, 64)            128       
                                                                 
 dropout (Dropout)           (None, 23, 64)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 23, 128)           8320      
                                                                 
 dropout_1 (Dropout)         (None, 23, 128)           0         
                                                                 
 flatten (Flatten)           (None, 2944)              0         
                                                                 
 dense (Dense)               (None, 256)               753920    
                                                                 
 dropout_2 (Dropout)         (None, 256)               0

<keras.callbacks.History at 0x1ee7630f990>

In [13]:
y_pred = model.predict(X_test_cnn)
y_pred = np.argmax(y_pred, axis=1)

print(f"Accuracy of CNN on test set : {accuracy_score(y_pred, y_test)}")



ValueError: Classification metrics can't handle a mix of multiclass and multilabel-indicator targets

In [None]:
# Testing
# Normalise values
test_data = np.array([5157723,1052.7901556559,3.6837961248,286540.166666667,878838.525555463,3743359,135,644715.375,1272066.0582144,3743562,509,568901.666666667,1209110.28710422,3743573,451,0,0,0,0,0,0,0,0])
test_y_pred = model.predict(X_test_cnn[0:5])
test_y_pred = np.argmax(test_y_pred, axis=1)

print(test_y_pred)
print(y_test[:5])

In [14]:
# Load the data
url = 'https://raw.githubusercontent.com/micaelCZ/Paper_Repositorio/main/dataset/datasetPreprocesado/Escenario2.csv'
df_test = pd.read_csv(url)

x = df_test.iloc[4100: 4105,5:-1].copy()
min_max_scaler = preprocessing.MinMaxScaler()
x = min_max_scaler.fit_transform(x)
x = x.reshape((x.shape[0], x.shape[1], 1))

test_y_pred = model.predict(x)
test_y_pred = np.argmax(test_y_pred, axis=1)
le.inverse_transform(test_y_pred)



array(['File Transfer', 'File Transfer', 'File Transfer', 'File Transfer',
       'Chat'], dtype=object)

In [15]:
df_test["label"].values[4100: 4105]


array(['VIDEO', 'VIDEO', 'VIDEO', 'VIDEO', 'VIDEO'], dtype=object)

In [None]:
df_test["label"].value_counts()