# Proyecto 1
## Detección de ataques

In [1]:
import os.path

import pandas as pd
from pandas_profiling import ProfileReport

import utils

In [2]:
generate_reports: bool = True

In [3]:
part_1 = pd.read_csv("dataset/dataset-part1.csv")
part_2 = pd.read_csv("dataset/dataset-part2.csv")

In [4]:
df = pd.concat([part_1, part_2], ignore_index=True)

## Análisis Exploratorio

In [5]:
df.columns.values

array(['BIFLOW_DIRECTION', 'DIRECTION', 'DST_TO_SRC_SECOND_BYTES',
       'FIREWALL_EVENT', 'FIRST_SWITCHED', 'FLOW_ACTIVE_TIMEOUT',
       'FLOW_DURATION_MICROSECONDS', 'FLOW_DURATION_MILLISECONDS',
       'FLOW_END_MILLISECONDS', 'FLOW_END_SEC', 'FLOW_ID',
       'FLOW_INACTIVE_TIMEOUT', 'FLOW_START_MILLISECONDS',
       'FLOW_START_SEC', 'FRAME_LENGTH', 'IN_BYTES', 'IN_PKTS',
       'IPV4_DST_ADDR', 'IPV4_SRC_ADDR', 'L4_DST_PORT', 'L4_SRC_PORT',
       'LAST_SWITCHED', 'MAX_IP_PKT_LEN', 'MIN_IP_PKT_LEN',
       'OOORDER_IN_PKTS', 'OOORDER_OUT_PKTS', 'OUT_BYTES', 'OUT_PKTS',
       'PROTOCOL', 'PROTOCOL_MAP', 'RETRANSMITTED_IN_BYTES',
       'RETRANSMITTED_IN_PKTS', 'RETRANSMITTED_OUT_BYTES',
       'RETRANSMITTED_OUT_PKTS', 'SRC_TO_DST_SECOND_BYTES', 'TCP_FLAGS',
       'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT', 'TCP_WIN_MIN_IN',
       'TCP_WIN_MIN_OUT', 'TCP_WIN_MSS_IN', 'TCP_WIN_MSS_OUT',
       'TCP_WIN_SCALE_IN', 'TCP_WIN_SCALE_OUT', 'SRC_TOS', 'DST_TOS',
       'L7_PROTO_NAME', 'SAM

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12207873 entries, 0 to 12207872
Data columns (total 50 columns):
 #   Column                      Dtype 
---  ------                      ----- 
 0   BIFLOW_DIRECTION            int64 
 1   DIRECTION                   int64 
 2   DST_TO_SRC_SECOND_BYTES     object
 3   FIREWALL_EVENT              int64 
 4   FIRST_SWITCHED              int64 
 5   FLOW_ACTIVE_TIMEOUT         int64 
 6   FLOW_DURATION_MICROSECONDS  int64 
 7   FLOW_DURATION_MILLISECONDS  int64 
 8   FLOW_END_MILLISECONDS       int64 
 9   FLOW_END_SEC                int64 
 10  FLOW_ID                     int64 
 11  FLOW_INACTIVE_TIMEOUT       int64 
 12  FLOW_START_MILLISECONDS     int64 
 13  FLOW_START_SEC              int64 
 14  FRAME_LENGTH                int64 
 15  IN_BYTES                    int64 
 16  IN_PKTS                     int64 
 17  IPV4_DST_ADDR               object
 18  IPV4_SRC_ADDR               object
 19  L4_DST_PORT                 int64 
 20  

In [7]:
df.head()

Unnamed: 0,BIFLOW_DIRECTION,DIRECTION,DST_TO_SRC_SECOND_BYTES,FIREWALL_EVENT,FIRST_SWITCHED,FLOW_ACTIVE_TIMEOUT,FLOW_DURATION_MICROSECONDS,FLOW_DURATION_MILLISECONDS,FLOW_END_MILLISECONDS,FLOW_END_SEC,...,TCP_WIN_MSS_IN,TCP_WIN_MSS_OUT,TCP_WIN_SCALE_IN,TCP_WIN_SCALE_OUT,SRC_TOS,DST_TOS,L7_PROTO_NAME,SAMPLING_INTERVAL,TOTAL_FLOWS_EXP,LABEL
0,1,0,40,0,1616660040,120,339,0,1616660040010,1616660040,...,1460,0,0,0,0,0,Unknown,1,2293398,Normal flow
1,1,0,",",0,1616660040,120,0,0,1616660040068,1616660040,...,0,0,0,0,40,0,ICMP,1,2293400,Normal flow
2,1,0,104,0,1616660040,120,44725,44,1616660040114,1616660040,...,0,0,0,0,0,0,TLS,1,2293404,Normal flow
3,1,0,",",0,1616660040,120,0,0,1616660040122,1616660040,...,1440,0,8,0,40,0,Unknown,1,2293407,Normal flow
4,1,0,40,0,1616660040,120,1114,1,1616660040184,1616660040,...,0,0,0,0,0,0,TLS,1,2293409,Normal flow


In [8]:
df.describe(include=['object'])

Unnamed: 0,DST_TO_SRC_SECOND_BYTES,IPV4_DST_ADDR,IPV4_SRC_ADDR,PROTOCOL_MAP,SRC_TO_DST_SECOND_BYTES,L7_PROTO_NAME,LABEL
count,12207873,12207873,12207873,12207873,12207873,12207873,12207873
unique,569243,440887,88463,5,3160289,457,4
top,40,10.114.224.65,10.114.241.166,tcp,44,Unknown,Normal flow
freq,3053652,3604686,5671376,8950058,2534718,4090802,6570058


In [9]:
object_columns = df.select_dtypes(include=['object']).columns
df[object_columns].head(5)

Unnamed: 0,DST_TO_SRC_SECOND_BYTES,IPV4_DST_ADDR,IPV4_SRC_ADDR,PROTOCOL_MAP,SRC_TO_DST_SECOND_BYTES,L7_PROTO_NAME,LABEL
0,40,10.114.225.212,162.142.125.173,tcp,44,Unknown,Normal flow
1,",",10.114.225.215,45.79.106.170,icmp,68,ICMP,Normal flow
2,104,10.114.241.165,10.114.224.65,tcp,189,TLS,Normal flow
3,",",10.114.226.23,202.179.91.28,tcp,52,Unknown,Normal flow
4,40,10.114.224.65,10.114.241.165,tcp,189,TLS,Normal flow


In [10]:
df["LABEL"].unique()

array(['Normal flow', 'SYN Scan - aggressive',
       'Denial of Service R-U-Dead-Yet', 'Denial of Service Slowloris'],
      dtype=object)

In [11]:
# Check for nan columns
df.columns[df.isna().any()].tolist()
# No nan

[]

In [12]:
if not os.path.isfile("cleanEDA.html") and generate_reports:
    profile = ProfileReport(df, title="Exploratory Data Analysis - Clean DF", minimal=True)
    profile.to_file("cleanEDA.html")

## Balancing and sampling

### Dataset balancing

In [13]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

ros = RandomOverSampler(sampling_strategy='minority')

In [14]:
labels_dic = {
    0: "Normal flow",
    1: "SYN Scan - aggressive",
    2: "Denial of Service R-U-Dead-Yet",
    3: "Denial of Service Slowloris"
}

In [15]:
normal_flow = df.loc[df["LABEL"] == labels_dic[0]]
syn_attacks = df.loc[df["LABEL"] == labels_dic[1]]
r_u_dead_attacks = df.loc[df["LABEL"] == labels_dic[2]]
dos_attacks = df.loc[df["LABEL"] == labels_dic[3]]

In [16]:
print(len(normal_flow))
print(len(syn_attacks))
print(len(r_u_dead_attacks))
print(len(dos_attacks))

6570058
2496814
2276947
864054


#### Oversampling DDoS attack

In [17]:
attacks_oversampling = pd.concat([syn_attacks, dos_attacks])
y_oversampling = attacks_oversampling["LABEL"]
x_oversampling = attacks_oversampling.drop(["LABEL"], axis= 1)

In [18]:
X_oversampling_res, y_oversampling_res = ros.fit_resample(x_oversampling, y_oversampling)

In [19]:
print('Resampled dataset shape %s' % Counter(y_oversampling_res))

Resampled dataset shape Counter({'SYN Scan - aggressive': 2496814, 'Denial of Service Slowloris': 2496814})


In [20]:
oversampling_result = X_oversampling_res.copy()
oversampling_result["LABEL"] = y_oversampling_res.copy()

In [21]:
df = pd.concat([normal_flow, r_u_dead_attacks, oversampling_result])

#### Undersampling benign

In [22]:
from imblearn.under_sampling import RandomUnderSampler

In [23]:
normal_flow = df.loc[df["LABEL"] == labels_dic[0]]
syn_attacks = df.loc[df["LABEL"] == labels_dic[1]]
r_u_dead_attacks = df.loc[df["LABEL"] == labels_dic[2]]
dos_attacks = df.loc[df["LABEL"] == labels_dic[3]]

In [24]:
benign_us = pd.concat([syn_attacks, normal_flow])

In [25]:
y = benign_us["LABEL"]
X = benign_us.drop(["LABEL"], axis=1)

In [26]:
rus = RandomUnderSampler(random_state=0)

In [27]:
X_undersampled, y_undersampled = rus.fit_resample(X, y)

print('Resampled dataset shape %s' % Counter(y_undersampled))

Resampled dataset shape Counter({'Normal flow': 2496814, 'SYN Scan - aggressive': 2496814})


In [28]:
X_undersampled["LABEL"] = y_undersampled
undersample_res = X_undersampled

In [29]:
df = pd.concat([dos_attacks, r_u_dead_attacks, undersample_res])

In [43]:
# create csv if it doesn't exist
if not os.path.isfile("dataset/undersampled_df.csv"):
    df.to_csv("dataset/undersampled_df.csv", index_label=False)

df = pd.read_csv("dataset/undersampled_df.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


### Sampling

In [63]:
if not os.path.isfile("dataset/sampled_dataset.csv"):
    df = df.groupby('LABEL', group_keys=False).apply(lambda x: x.sample(frac=0.01))
    df.to_csv("dataset/sampled_dataset.csv", index_label=False)
else:
    df = pd.read_csv("dataset/sampled_dataset.csv")

## Preprocessing

### Remove unnecesary variables

#### Drop constants

In [64]:
# CONSTANTS 
constants = [
    'BIFLOW_DIRECTION',
    'FIREWALL_EVENT',
    'FLOW_ACTIVE_TIMEOUT',
    'FLOW_INACTIVE_TIMEOUT',
    'FRAME_LENGTH',
    'MAX_IP_PKT_LEN',
    'MIN_IP_PKT_LEN',
    'SAMPLING_INTERVAL',
    'DIRECTION',
    'OOORDER_IN_PKTS',
    'OOORDER_OUT_PKTS',
]

# DIRECTION, OOORDER_IN_PKTS and OOORDER_OUT_PKTS has 99%> zero values, so it will be considered a constant.

In [65]:
# Drop constants from dataframe
df = df.drop(constants, axis=1)
len(df.columns.values)

39

#### Drop unique values

In [66]:
# UNIQUE
unique_vars = ['FLOW_ID', 'TOTAL_FLOWS_EXP']

In [67]:
df = df.drop(unique_vars, axis=1)
len(df.columns.values)

37

#### DROP IPS

In [68]:
df = df.drop(["IPV4_DST_ADDR", "IPV4_SRC_ADDR"], axis = 1)

#### DROP PROTOCOL COLUMN

In [69]:
df = df.drop(["PROTOCOL"], axis = 1)

#### Clean data from report

In [70]:
# Drop columns that almost everything is 0
df = df.drop(columns=["RETRANSMITTED_IN_BYTES", "RETRANSMITTED_IN_PKTS"], axis = 1)

In [71]:
df = df.drop(columns=["RETRANSMITTED_OUT_BYTES", "RETRANSMITTED_OUT_PKTS"], axis=1)

In [72]:
df = df.drop(columns=["TCP_WIN_MSS_OUT", "TCP_WIN_SCALE_IN", "TCP_WIN_SCALE_IN", "TCP_WIN_SCALE_OUT", "SRC_TOS", "DST_TOS"], axis=1)

### Numerical preprocessing

#### Fix DST_TO_SRC_SECOND_BYTES column

In [73]:
# df["DST_TO_SRC_SECOND_BYTES"].apply(np.isreal)
DST_TO_SRC_SECOND_BYTES_INT_VALUES = list(filter(utils.is_int, df["DST_TO_SRC_SECOND_BYTES"]))
DST_TO_SRC_SECOND_BYTES_NON_INT_VALUES = list(filter(lambda x: not utils.is_int(x), df["DST_TO_SRC_SECOND_BYTES"]))

In [74]:
print("Correct int values: ", len(DST_TO_SRC_SECOND_BYTES_INT_VALUES))
print("Incorrect int values: ", len(DST_TO_SRC_SECOND_BYTES_NON_INT_VALUES))
print("Total values count: ", len(df["DST_TO_SRC_SECOND_BYTES"]))

Correct int values:  36902
Incorrect int values:  60771
Total values count:  97673


In [75]:
df_safe_clean = df.copy()

In [76]:
df["DST_TO_SRC_SECOND_BYTES"] = df["DST_TO_SRC_SECOND_BYTES"].apply(lambda x: int(x) if utils.is_int(x) else utils.normalize_dst_to_src_column(x))
df["DST_TO_SRC_SECOND_BYTES"].head()

0    216
1    164
2    624
3    164
4    676
Name: DST_TO_SRC_SECOND_BYTES, dtype: int64

In [77]:
# Replace the 0 with 1
df["DST_TO_SRC_SECOND_BYTES"].replace({0: 1}, inplace=True)
df["DST_TO_SRC_SECOND_BYTES"].head()

0    216
1    164
2    624
3    164
4    676
Name: DST_TO_SRC_SECOND_BYTES, dtype: int64

#### Fix SRC_TO_DST_SECOND_BYTES column

In [78]:
df["SRC_TO_DST_SECOND_BYTES"] = df["SRC_TO_DST_SECOND_BYTES"].apply(lambda x: int(x) if utils.is_int(x) else utils.normalize_dst_to_src_column(x))
df["SRC_TO_DST_SECOND_BYTES"].head()

0     748
1     216
2    1315
3     216
4    1273
Name: SRC_TO_DST_SECOND_BYTES, dtype: int64

### Second profile report

In [79]:
if not os.path.isfile("reports/profile-numerical.html") and generate_reports:
    ProfileReport(df, title="Profile after numerical preprocessing", minimal=True).to_file("reports/profile-numerical.html")

#### Reduce skew data

In [80]:
import numpy as np
from scipy.stats import skew
def reduce_skew(column, fn = np.log):
    print("Skew actual value for column ", column,": ",skew(df["SRC_TO_DST_SECOND_BYTES"]))
    df[column].replace({0: 1}, inplace=True)
    df[column] = df[column].apply(fn)
    print("New value: ", skew(df[column]))

In [81]:
safe_skew_copy = df.copy()

In [82]:
reduce_skew("SRC_TO_DST_SECOND_BYTES")
reduce_skew("DST_TO_SRC_SECOND_BYTES")
reduce_skew("OUT_BYTES")

Skew actual value for column  SRC_TO_DST_SECOND_BYTES :  113.30416504382764
New value:  0.1872506042276819
Skew actual value for column  DST_TO_SRC_SECOND_BYTES :  0.1872506042276819
New value:  -0.6001182906757618
Skew actual value for column  OUT_BYTES :  0.1872506042276819
New value:  -0.5996819020646983


### Correlation check

In [83]:
import seaborn as sn
import matplotlib.pyplot as plt
corrMatrix = df.corr()

In [84]:
fig, ax = plt.subplots(figsize=(60,20)) 
sn.heatmap(corrMatrix, annot=True, linewidths=.5, ax=ax)
plt.show()

  This is separate from the ipykernel package so we can avoid doing imports until


In [85]:
for column in corrMatrix:
        correl = []
        for index in corrMatrix[column].index:
            if column != index and abs(corrMatrix[column][index]) >= 0.75:
                correl.append((index, corrMatrix[column][index]))
        if len(correl) > 0:
            print("La columna ", column, " tuvo correlación con las columnas")
            print(correl)
            print("\n")

La columna  DST_TO_SRC_SECOND_BYTES  tuvo correlación con las columnas
[('OUT_BYTES', 0.9999994855511837), ('SRC_TO_DST_SECOND_BYTES', 0.7728654824382651)]


La columna  FIRST_SWITCHED  tuvo correlación con las columnas
[('FLOW_END_MILLISECONDS', 0.9999999987135727), ('FLOW_END_SEC', 0.9999999987144633), ('FLOW_START_MILLISECONDS', 0.999999999999915), ('FLOW_START_SEC', 1.0), ('LAST_SWITCHED', 0.9999999987144633)]


La columna  FLOW_DURATION_MICROSECONDS  tuvo correlación con las columnas
[('FLOW_DURATION_MILLISECONDS', 0.9999999999721548), ('SRC_TO_DST_SECOND_BYTES', 0.7678971150821113)]


La columna  FLOW_DURATION_MILLISECONDS  tuvo correlación con las columnas
[('FLOW_DURATION_MICROSECONDS', 0.9999999999721548), ('SRC_TO_DST_SECOND_BYTES', 0.7678967683146068)]


La columna  FLOW_END_MILLISECONDS  tuvo correlación con las columnas
[('FIRST_SWITCHED', 0.9999999987135727), ('FLOW_END_SEC', 0.9999999999999164), ('FLOW_START_MILLISECONDS', 0.9999999987132961), ('FLOW_START_SEC', 0.999999

In [86]:
# Remove useless columns
columns_to_drop = [
    "DST_TO_SRC_SECOND_BYTES", "FLOW_END_MILLISECONDS", "FLOW_END_SEC", "FLOW_START_MILLISECONDS", "FLOW_START_SEC",
    "FLOW_DURATION_MICROSECONDS",
    "IN_BYTES",
    "TCP_WIN_MIN_OUT",
    "TCP_WIN_MIN_IN",
    "TCP_WIN_MAX_IN"
]
df = df.drop(columns=columns_to_drop, axis = 1)

### Categorical variables preprocessing

#### Create PROTOCOL_MAP dummies

In [87]:
# Create the dummies
df = pd.get_dummies(df, columns = ["PROTOCOL_MAP"])
df.head(3)

Unnamed: 0,FIRST_SWITCHED,FLOW_DURATION_MILLISECONDS,IN_PKTS,L4_DST_PORT,L4_SRC_PORT,LAST_SWITCHED,OUT_BYTES,OUT_PKTS,SRC_TO_DST_SECOND_BYTES,TCP_FLAGS,TCP_WIN_MAX_OUT,TCP_WIN_MSS_IN,L7_PROTO_NAME,LABEL,PROTOCOL_MAP_icmp,PROTOCOL_MAP_tcp,PROTOCOL_MAP_udp
0,1618224176,8962,6,80,30258,1618224185,5.375278,4,6.617403,27,27960,1410,HTTP,Denial of Service R-U-Dead-Yet,0,1,0
1,1618228480,17,4,80,60087,1618228480,5.099866,3,5.375278,19,27960,1410,HTTP,Denial of Service R-U-Dead-Yet,0,1,0
2,1618230693,109775,15,80,48415,1618230803,6.43615,12,7.181592,24,114,0,HTTP,Denial of Service R-U-Dead-Yet,0,1,0


#### L7_PROTO_NAME dummies

In [88]:
df["L7_PROTO_NAME"].unique()

array(['HTTP', 'HTTP.TargusDataspeed', 'TLS.YouTube', 'HTTP.Microsoft',
       'TLS.Skype', 'TLS.Google', 'DNS.Google', 'Unknown', 'ICMP', 'DNS',
       'TLS.Microsoft', 'BitTorrent', 'TLS.Microsoft365', 'ICMP.Amazon',
       'TLS', 'SSH', 'Redis', 'DNS.Amazon', 'DNS.MS_OneDrive', 'Google',
       'QUIC.Google', 'TLS.GoogleServices', 'STUN.Messenger',
       'TLS.Amazon', 'DNS.Teams', 'TLS.Facebook', 'STUN.Skype', 'Amazon',
       'DNS.GoogleDocs', 'TLS.Cloudflare', 'HTTP.UbuntuONE',
       'DNS.Microsoft', 'DNS.Microsoft365', 'DNS.Apple', 'IMAP',
       'DNS.OpenDNS', 'TLS.Dropbox', 'NTP.UbuntuONE', 'DNS.UbuntuONE',
       'DNS.YouTube', 'TeamViewer', 'MsSQL-TDS', 'DNS.Yahoo',
       'NTP.Cloudflare', 'TargusDataspeed', 'TLS.WhatsApp', 'NTP', 'SNMP',
       'TLS.Teams', 'DNS.GoogleDrive', 'QUIC.YouTube', 'STUN.SkypeCall',
       'TLS.WhatsAppFiles', 'NFS', 'STUN', 'DNS.GoogleServices',
       'QUIC.Facebook', 'DNS.Skype', 'Telnet', 'DNS.Telegram',
       'DNS.Cloudflare', 'Memcached',

In [89]:
df["L7_PROTO_NAME"] = df["L7_PROTO_NAME"].map(lambda protocol: utils.lematize_protocol(protocol).upper())

In [90]:
print(len(df["L7_PROTO_NAME"].unique()))

df["L7_PROTO_NAME"].unique()

80


array(['HTTP', 'TLS', 'DNS', 'UNKNOWN', 'ICMP', 'BITTORRENT', 'SSH',
       'REDIS', 'GOOGLE', 'QUIC', 'STUN', 'AMAZON', 'IMAP', 'NTP',
       'TEAMVIEWER', 'MSSQL-TDS', 'TARGUSDATASPEED', 'SNMP', 'NFS',
       'TELNET', 'MEMCACHED', 'PLAYSTATION', 'SSDP', 'FTP_DATA', 'SKYPE',
       'RTP', 'HTTP_PROXY', 'SMBV23', 'OPENVPN', 'SIP', 'VNC', 'IMAPS',
       'STEAM', 'LDAP', 'WSD', 'RDP', 'COAP', 'APPLE', 'MICROSOFT',
       'MQTT', 'MDNS', 'SMTPS', 'RTSP', 'CISCOVPN', 'RSYNC', 'MODBUS',
       'MINING', 'IPSEC', 'KERBEROS', 'RX', 'CLOUDFLARE', 'H323', 'SOCKS',
       'NETBIOS', 'FTP_CONTROL', 'XBOX', 'POSTGRESQL', 'STARCRAFT',
       'IEC60870', 'BGP', 'VIBER', 'WHATSAPPFILES', 'UBNTAC2',
       'REMOTESCAN', 'SMTP', 'FACEBOOK', 'AJP', 'XDMCP', 'GIT', 'DCE_RPC',
       'S7COMM', 'TINC', 'IAX', 'ZABBIX', 'CITRIX', 'OSPF', 'DNP3',
       'NESTLOGSINK', 'MONGODB', 'CORBA'], dtype=object)

In [91]:
df = pd.get_dummies(df, columns = ["L7_PROTO_NAME"])
df.head(3)

Unnamed: 0,FIRST_SWITCHED,FLOW_DURATION_MILLISECONDS,IN_PKTS,L4_DST_PORT,L4_SRC_PORT,LAST_SWITCHED,OUT_BYTES,OUT_PKTS,SRC_TO_DST_SECOND_BYTES,TCP_FLAGS,...,L7_PROTO_NAME_TLS,L7_PROTO_NAME_UBNTAC2,L7_PROTO_NAME_UNKNOWN,L7_PROTO_NAME_VIBER,L7_PROTO_NAME_VNC,L7_PROTO_NAME_WHATSAPPFILES,L7_PROTO_NAME_WSD,L7_PROTO_NAME_XBOX,L7_PROTO_NAME_XDMCP,L7_PROTO_NAME_ZABBIX
0,1618224176,8962,6,80,30258,1618224185,5.375278,4,6.617403,27,...,0,0,0,0,0,0,0,0,0,0
1,1618228480,17,4,80,60087,1618228480,5.099866,3,5.375278,19,...,0,0,0,0,0,0,0,0,0,0
2,1618230693,109775,15,80,48415,1618230803,6.43615,12,7.181592,24,...,0,0,0,0,0,0,0,0,0,0


#### Treat LABEL column

In [92]:
df["LABEL"].unique()

array(['Denial of Service R-U-Dead-Yet', 'Denial of Service Slowloris',
       'Normal flow', 'SYN Scan - aggressive'], dtype=object)

In [93]:
labels_dic = {
    "Normal flow": 0,
    "SYN Scan - aggressive": 1,
    "Denial of Service R-U-Dead-Yet": 2,
    "Denial of Service Slowloris": 3
}
df["LABEL"] = df["LABEL"].apply(lambda x: labels_dic[x])
df["LABEL"].unique()

array([2, 3, 0, 1])

### Normalize column names

In [94]:
# Normalize columns
df.columns = [column.replace("-", "_") for column in df.columns]
df.columns

Index(['FIRST_SWITCHED', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS',
       'L4_DST_PORT', 'L4_SRC_PORT', 'LAST_SWITCHED', 'OUT_BYTES', 'OUT_PKTS',
       'SRC_TO_DST_SECOND_BYTES', 'TCP_FLAGS', 'TCP_WIN_MAX_OUT',
       'TCP_WIN_MSS_IN', 'LABEL', 'PROTOCOL_MAP_icmp', 'PROTOCOL_MAP_tcp',
       'PROTOCOL_MAP_udp', 'L7_PROTO_NAME_AJP', 'L7_PROTO_NAME_AMAZON',
       'L7_PROTO_NAME_APPLE', 'L7_PROTO_NAME_BGP', 'L7_PROTO_NAME_BITTORRENT',
       'L7_PROTO_NAME_CISCOVPN', 'L7_PROTO_NAME_CITRIX',
       'L7_PROTO_NAME_CLOUDFLARE', 'L7_PROTO_NAME_COAP', 'L7_PROTO_NAME_CORBA',
       'L7_PROTO_NAME_DCE_RPC', 'L7_PROTO_NAME_DNP3', 'L7_PROTO_NAME_DNS',
       'L7_PROTO_NAME_FACEBOOK', 'L7_PROTO_NAME_FTP_CONTROL',
       'L7_PROTO_NAME_FTP_DATA', 'L7_PROTO_NAME_GIT', 'L7_PROTO_NAME_GOOGLE',
       'L7_PROTO_NAME_H323', 'L7_PROTO_NAME_HTTP', 'L7_PROTO_NAME_HTTP_PROXY',
       'L7_PROTO_NAME_IAX', 'L7_PROTO_NAME_ICMP', 'L7_PROTO_NAME_IEC60870',
       'L7_PROTO_NAME_IMAP', 'L7_PROTO_NAME_IMAPS', 'L7_

### Third profile report

In [95]:
df.shape

(97673, 96)

In [96]:
if not os.path.isfile("reports/profile-categorical.html") and generate_reports:
    ProfileReport(df, title="Exploratory Data Analysis 2").to_file("reports/profile-categorical.html")

### Create clean dataset

In [97]:
df.to_csv("dataset/clean_df.csv", index_label=False)

In [98]:
df.columns

Index(['FIRST_SWITCHED', 'FLOW_DURATION_MILLISECONDS', 'IN_PKTS',
       'L4_DST_PORT', 'L4_SRC_PORT', 'LAST_SWITCHED', 'OUT_BYTES', 'OUT_PKTS',
       'SRC_TO_DST_SECOND_BYTES', 'TCP_FLAGS', 'TCP_WIN_MAX_OUT',
       'TCP_WIN_MSS_IN', 'LABEL', 'PROTOCOL_MAP_icmp', 'PROTOCOL_MAP_tcp',
       'PROTOCOL_MAP_udp', 'L7_PROTO_NAME_AJP', 'L7_PROTO_NAME_AMAZON',
       'L7_PROTO_NAME_APPLE', 'L7_PROTO_NAME_BGP', 'L7_PROTO_NAME_BITTORRENT',
       'L7_PROTO_NAME_CISCOVPN', 'L7_PROTO_NAME_CITRIX',
       'L7_PROTO_NAME_CLOUDFLARE', 'L7_PROTO_NAME_COAP', 'L7_PROTO_NAME_CORBA',
       'L7_PROTO_NAME_DCE_RPC', 'L7_PROTO_NAME_DNP3', 'L7_PROTO_NAME_DNS',
       'L7_PROTO_NAME_FACEBOOK', 'L7_PROTO_NAME_FTP_CONTROL',
       'L7_PROTO_NAME_FTP_DATA', 'L7_PROTO_NAME_GIT', 'L7_PROTO_NAME_GOOGLE',
       'L7_PROTO_NAME_H323', 'L7_PROTO_NAME_HTTP', 'L7_PROTO_NAME_HTTP_PROXY',
       'L7_PROTO_NAME_IAX', 'L7_PROTO_NAME_ICMP', 'L7_PROTO_NAME_IEC60870',
       'L7_PROTO_NAME_IMAP', 'L7_PROTO_NAME_IMAPS', 'L7_

In [99]:
for column in df.columns:
    print(column)

FIRST_SWITCHED
FLOW_DURATION_MILLISECONDS
IN_PKTS
L4_DST_PORT
L4_SRC_PORT
LAST_SWITCHED
OUT_BYTES
OUT_PKTS
SRC_TO_DST_SECOND_BYTES
TCP_FLAGS
TCP_WIN_MAX_OUT
TCP_WIN_MSS_IN
LABEL
PROTOCOL_MAP_icmp
PROTOCOL_MAP_tcp
PROTOCOL_MAP_udp
L7_PROTO_NAME_AJP
L7_PROTO_NAME_AMAZON
L7_PROTO_NAME_APPLE
L7_PROTO_NAME_BGP
L7_PROTO_NAME_BITTORRENT
L7_PROTO_NAME_CISCOVPN
L7_PROTO_NAME_CITRIX
L7_PROTO_NAME_CLOUDFLARE
L7_PROTO_NAME_COAP
L7_PROTO_NAME_CORBA
L7_PROTO_NAME_DCE_RPC
L7_PROTO_NAME_DNP3
L7_PROTO_NAME_DNS
L7_PROTO_NAME_FACEBOOK
L7_PROTO_NAME_FTP_CONTROL
L7_PROTO_NAME_FTP_DATA
L7_PROTO_NAME_GIT
L7_PROTO_NAME_GOOGLE
L7_PROTO_NAME_H323
L7_PROTO_NAME_HTTP
L7_PROTO_NAME_HTTP_PROXY
L7_PROTO_NAME_IAX
L7_PROTO_NAME_ICMP
L7_PROTO_NAME_IEC60870
L7_PROTO_NAME_IMAP
L7_PROTO_NAME_IMAPS
L7_PROTO_NAME_IPSEC
L7_PROTO_NAME_KERBEROS
L7_PROTO_NAME_LDAP
L7_PROTO_NAME_MDNS
L7_PROTO_NAME_MEMCACHED
L7_PROTO_NAME_MICROSOFT
L7_PROTO_NAME_MINING
L7_PROTO_NAME_MODBUS
L7_PROTO_NAME_MONGODB
L7_PROTO_NAME_MQTT
L7_PROTO_NAME_MS

In [100]:
len(df.columns)

96