# Generate The Dataset

In [None]:
import numpy as np  # array operations
import pandas as pd  # CSV loading and DataFrame handling
import matplotlib.pyplot as plt  # figure creation and display
import seaborn as sns  # count plots used throughout the notebook

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings - confirm intended.
warnings.filterwarnings("ignore")

# Apply seaborn's default theme to subsequent figures
sns.set()

import tensorflow as tf
# Print the GPU devices TensorFlow reports
print(tf.config.list_physical_devices('GPU'))

## 1. IoT Botnet Dataset

In [None]:
# Load the four UNSW_2018_IoT_Botnet_Full5pc_<n>.csv parts (n = 1..4) and
# stack them into one DataFrame with a fresh 0..N-1 row index.
dfs = [
    pd.read_csv(
        '~/04_Semester_4/revisi/dataset/UNSW_2018_IoT_Botnet_Full5pc_{}.csv'.format(part),
        low_memory=False,
    )
    for part in range(1, 5)
]
all_data = pd.concat(dfs, ignore_index=True)

In [None]:
# Preview the first rows of the combined IoT Botnet frame
all_data.head()

In [None]:
# Print the column summary (dtypes and non-null counts) of the combined frame
all_data.info()

In [None]:
# Count missing values in each column
all_data.isnull().sum()

In [None]:
# Number of rows for each value of the 'attack' column
all_data.value_counts('attack')

In [None]:
# Narrow the frame to the flow attributes of interest plus the
# 'attack' and 'category' columns that are plotted as targets.
selected_columns = [
    'proto', 'saddr', 'sport', 'daddr', 'dport',
    'spkts', 'dpkts', 'sbytes', 'dbytes',
    'state', 'stime', 'ltime', 'dur',
    'attack', 'category',
]
data = all_data[selected_columns]

In [None]:
# Bar chart: number of rows per value of the 'attack' label (English title)
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
ax = sns.countplot(x=data['attack'], palette='CMRmap')
ax.set_title(
    'The distribution of the target label in IoT Bot 5% dataset',
    weight='bold',
    fontsize='18',
)
plt.show()

In [None]:
# Same 'attack' label bar chart as the previous cell, with an Indonesian title
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
ax = sns.countplot(x=data['attack'], palette='CMRmap')
ax.set_title(
    'Distribusi target label di dataset IoT Bot 5%',
    weight='bold',
    fontsize='18',
)
plt.show()

In [None]:
# Number of rows for each value of the 'category' column
data.value_counts('category')

In [None]:
# Bar chart: number of rows per value of the 'category' column
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
ax = sns.countplot(x=data['category'], palette='CMRmap')
ax.set_title(
    'Distribusi target Kategori di IoT Bot 5% dataset',
    weight='bold',
    fontsize='18',
)
plt.show()

In [None]:
# Load the combined IoT Botnet CSV from csv_files/ into `data`.
# NOTE(review): no visible cell writes IoT_Botnet_Full5pc.csv (the column
# selection made earlier is never saved) - confirm the file is produced elsewhere.
data = pd.read_csv('~/04_Semester_4/revisi/csv_files/IoT_Botnet_Full5pc.csv', low_memory=False)

In [None]:
# Randomly reorder every row; the seed is fixed at 99 so the split is repeatable
data = data.sample(frac=1, random_state=99)

# Total row count and the position at which the training portion ends
n = data.shape[0]
split_at = round(0.8 * n)

data_train = data.iloc[:split_at]  # first 80% of the shuffled rows
data_test = data.iloc[split_at:]  # remaining 20% of the shuffled rows

In [None]:
# Write the train and test partitions to CSV, leaving out the row index
for frame, target in (
    (data_train, '~/04_Semester_4/revisi/csv_files/IoT_Botnet_Full5pc_train.csv'),
    (data_test, '~/04_Semester_4/revisi/csv_files/IoT_Botnet_Full5pc_test.csv'),
):
    frame.to_csv(target, index=False)

## 2. UNSW-NB15 Dataset

In [None]:
# Load the four header-less UNSW-NB15_<n>.csv parts (n = 1..4) and stack
# them into one DataFrame with a fresh 0..N-1 row index.
dfs = [
    pd.read_csv(
        '~/04_Semester_4/revisi/dataset/UNSW-NB15_{}.csv'.format(part),
        header=None,
    )
    for part in range(1, 5)
]
all_data = pd.concat(dfs, ignore_index=True)

In [None]:
# The features CSV carries the feature names in its 'Name' column
df_col = pd.read_csv(
    '~/04_Semester_4/revisi/dataset/NUSW-NB15_features.csv',
    encoding='ISO-8859-1',
)

# Normalise each name: trim, drop inner spaces, lower-case
df_col['Name'] = df_col['Name'].map(
    lambda name: name.strip().replace(' ', '').lower()
)

# Use the cleaned names as the column labels of the data frame,
# then discard the helper table
all_data.columns = df_col['Name']
del df_col

In [None]:
# Rows without an attack category become 'normal'; every category is
# then trimmed and lower-cased.
all_data['attack_cat'] = (
    all_data['attack_cat']
    .fillna('normal')
    .map(lambda raw: raw.strip().lower())
)

In [None]:
# Fold the 'backdoors' spelling into 'backdoor', then trim and lower-case again
all_data['attack_cat'] = (
    all_data['attack_cat']
    .replace('backdoors', 'backdoor', regex=True)
    .map(lambda raw: raw.strip().lower())
)

In [None]:
# Narrow the frame to the flow attributes of interest plus the
# 'label' and 'attack_cat' columns.
selected_columns = [
    'proto', 'srcip', 'sport', 'dstip', 'dsport',
    'spkts', 'dpkts', 'sbytes', 'dbytes',
    'state', 'stime', 'ltime', 'dur',
    'label', 'attack_cat',
]
all_data = all_data[selected_columns]

In [None]:
# Preview the first rows of the reduced UNSW-NB15 frame
all_data.head()

In [None]:
# Write the reduced UNSW-NB15 frame to csv_files/ without the row index
all_data.to_csv('~/04_Semester_4/revisi/csv_files/UNSW-NB15.csv', index=False)

### A. UNSW-NB15 Base 4

In [None]:
# Reload the reduced UNSW-NB15 CSV from csv_files/ into `data`
data = pd.read_csv('~/04_Semester_4/revisi/csv_files/UNSW-NB15.csv', low_memory=False)

In [None]:
# Preview the first rows of the reloaded frame
data.head()

In [None]:
# One DataFrame per value of 'attack_cat', each holding the matching rows
cat_col = data['attack_cat']

normal = data[cat_col == 'normal']
generic = data[cat_col == 'generic']
exploits = data[cat_col == 'exploits']
fuzzers = data[cat_col == 'fuzzers']
dos = data[cat_col == 'dos']
reconnaissance = data[cat_col == 'reconnaissance']
analysis = data[cat_col == 'analysis']
backdoor = data[cat_col == 'backdoor']
shellcode = data[cat_col == 'shellcode']
worms = data[cat_col == 'worms']

In [None]:
def cut_off(data=None):
    """Return a balanced frame with equally many attack and normal rows.

    Rows are split on the ``label`` column (1 = attack, 0 = normal traffic).
    The larger of the two groups is randomly down-sampled (fixed seed 42) to
    the size of the smaller one. The result lists the attack rows first,
    followed by the normal rows. The input frame is not modified.

    Args:
        data: DataFrame with a ``label`` column. When omitted, the
            notebook-level ``data`` frame is looked up at call time.

    Returns:
        A new DataFrame containing the same number of ``label == 1`` and
        ``label == 0`` rows.

    Raises:
        ValueError: if no frame is passed and no notebook-level ``data``
            frame exists.
    """
    if data is None:
        # Resolve the default at call time instead of freezing whatever
        # ``data`` happened to reference when this cell was executed.
        data = globals().get('data')
        if data is None:
            raise ValueError(
                "cut_off() needs a DataFrame: pass one explicitly "
                "or define a global 'data'"
            )

    attacks = data.loc[data['label'] == 1, :]
    normals = data.loc[data['label'] == 0, :]

    if len(normals) >= len(attacks):
        # Usual case: keep every attack row, sample as many normal rows.
        normals = normals.sample(n=len(attacks), random_state=42)
    else:
        # Fewer normal rows than attacks: sampling normals would raise,
        # so shrink the attack group instead.
        attacks = attacks.sample(n=len(normals), random_state=42)

    return pd.concat([attacks, normals])

In [None]:
# Number of rows for each value of the 'attack_cat' column
data.value_counts('attack_cat')

In [None]:
# Build the base4 dataset: normal traffic plus three attack categories
# (generic, dos, reconnaissance). The other categories (exploits, fuzzers,
# analysis, backdoor, shellcode, worms) are left out of base4.
# NOTE(review): the original comment mentions zero-day detection - presumably
# the left-out categories serve as unseen attacks; confirm.
base4 = pd.concat([normal, generic, dos, reconnaissance])

In [None]:
# Percentage of rows carrying label 0 and label 1 in base4
label_counts = base4['label'].value_counts()
n_rows = len(base4.index)
all_0 = label_counts[0] * 100 / n_rows
all_1 = label_counts[1] * 100 / n_rows

print("In base4 dataset: there are {} % of normal traffic and {} % of attacks".format(all_0, all_1))

In [None]:
# Balance base4 with cut_off so attack rows and normal rows are 50/50
base4=cut_off(base4)

In [None]:
# Percentage of rows carrying label 0 and label 1 in base4 after balancing
label_counts = base4['label'].value_counts()
n_rows = len(base4.index)
all_0 = label_counts[0] * 100 / n_rows
all_1 = label_counts[1] * 100 / n_rows

print("In base4 dataset: there are {} % of normal traffic and {} % of attacks".format(all_0, all_1))

In [None]:
# Number of rows for each value of the 'label' column in base4
base4.value_counts('label')

In [None]:
# Bar chart: number of base4 rows per 'label' value
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
ax = sns.countplot(x=base4['label'], palette='CMRmap')
ax.set_title(
    'Distribusi target label di Base 4 dataset',
    weight='bold',
    fontsize='18',
)
plt.show()

In [None]:
# Bar chart: base4 rows per attack category, bars ordered by descending count
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
categories = base4['attack_cat']
ax = sns.countplot(
    x=categories,
    order=categories.value_counts().index,
    palette='CMRmap',
)
ax.set_title(
    'Distribusi kategori serangan di Base 4 dataset',
    weight='bold',
    fontsize='18',
)
plt.yticks(weight='bold', fontsize=12)
plt.show()

In [None]:
# Randomly reorder every row; the seed is fixed at 42 so the split is repeatable
base4 = base4.sample(frac=1, random_state=42)

# Total row count and the position at which the training portion ends
n = base4.shape[0]
split_at = round(0.75 * n)

base4_train = base4.iloc[:split_at]  # first 75% of the shuffled rows
base4_test = base4.iloc[split_at:]  # remaining 25% of the shuffled rows

### B. Base4 Train

In [None]:
# Percentage of rows carrying label 0 and label 1 in base4_train
label_counts = base4_train['label'].value_counts()
n_rows = len(base4_train.index)
all_0 = label_counts[0] * 100 / n_rows
all_1 = label_counts[1] * 100 / n_rows

print("In Base 4 train dataset: there are {} % of normal traffic and {} % of attacks".format(all_0, all_1))

In [None]:
# Bar chart: number of base4_train rows per 'label' value
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
ax = sns.countplot(x=base4_train['label'], palette='CMRmap')
ax.set_title(
    'Distribusi target label di Base 4 train dataset',
    weight='bold',
    fontsize='18',
)
plt.show()

In [None]:
# Bar chart: base4_train rows per attack category, bars ordered by descending count
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
categories = base4_train['attack_cat']
ax = sns.countplot(
    x=categories,
    order=categories.value_counts().index,
    palette='CMRmap',
)
ax.set_title(
    'Distribusi kategori serangan di Base 4 train dataset',
    weight='bold',
    fontsize='18',
)
plt.yticks(weight='bold', fontsize=12)
plt.show()

In [None]:
# Number of rows for each value of the 'label' column in base4_train
base4_train.value_counts('label')

### C. Base4 Test

In [None]:
# Percentage of rows carrying label 0 and label 1 in base4_test
label_counts = base4_test['label'].value_counts()
n_rows = len(base4_test.index)
all_0 = label_counts[0] * 100 / n_rows
all_1 = label_counts[1] * 100 / n_rows

print("In Base 4 test dataset: there are {} % of normal traffic and {} % of attacks".format(all_0, all_1))

In [None]:
# Bar chart: number of base4_test rows per 'label' value.
# The title names "Base 4" to match the frame being plotted and the
# titles of the other Base 4 figures.
plt.figure(figsize=(15,4))
sns.set_theme(style="darkgrid")
sns.countplot(x=base4_test['label'],  palette='CMRmap')
plt.title('Distribusi label target di Base 4 test dataset', weight='bold', fontsize='18')
plt.show()

In [None]:
# Bar chart: base4_test rows per attack category, bars ordered by descending count
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
categories = base4_test['attack_cat']
ax = sns.countplot(
    x=categories,
    order=categories.value_counts().index,
    palette='CMRmap',
)
ax.set_title(
    'Distribusi kategori serangan di Base 4 test dataset',
    weight='bold',
    fontsize='18',
)
plt.yticks(weight='bold', fontsize=12)
plt.show()

In [None]:
# Number of base4_test rows per attack category
base4_test['attack_cat'].value_counts()

### D. first_test

In [None]:
# Build first_test: normal traffic plus the attack categories that are not
# part of base4 (exploits, fuzzers, analysis, backdoor, shellcode, worms)
first_test = pd.concat([normal, exploits, fuzzers, analysis, backdoor, shellcode, worms])

In [None]:
# Balance first_test with cut_off so attack rows and normal rows are 50/50
first_test=cut_off(first_test)

In [None]:
# Percentage of rows carrying label 0 and label 1 in first_test
label_counts = first_test['label'].value_counts()
n_rows = len(first_test.index)
all_0 = label_counts[0] * 100 / n_rows
all_1 = label_counts[1] * 100 / n_rows

print("In First test : there are {} % of normal traffic and {} % of attacks".format(all_0, all_1))

In [None]:
# Bar chart: number of first_test rows per 'label' value
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
ax = sns.countplot(x=first_test['label'], palette='CMRmap')
ax.set_title(
    'Distribusi target label di First_test dataset',
    weight='bold',
    fontsize='18',
)
plt.show()

In [None]:
# Bar chart: first_test rows per attack category, bars ordered by descending count
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
categories = first_test['attack_cat']
ax = sns.countplot(
    x=categories,
    order=categories.value_counts().index,
    palette='CMRmap',
)
ax.set_title(
    'Distribusi kategori serangan di first_test dataset',
    weight='bold',
    fontsize='18',
)
plt.yticks(weight='bold', fontsize=12)
plt.show()

In [None]:
# Number of first_test rows per attack category
first_test['attack_cat'].value_counts()

### E. Base Test

In [None]:
# Build full_test: the whole reloaded UNSW-NB15 frame (every attack
# category), balanced with cut_off
full_test = cut_off(data)

In [None]:
# Bar chart: number of full_test rows per 'label' value
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
ax = sns.countplot(x=full_test['label'], palette='CMRmap')
ax.set_title(
    'Distribusi label target di full_test dataset',
    weight='bold',
    fontsize='18',
)
plt.show()

In [None]:
# Bar chart: full_test rows per attack category, bars ordered by descending count
plt.figure(figsize=(15, 4))
sns.set_theme(style="darkgrid")
categories = full_test['attack_cat']
ax = sns.countplot(
    x=categories,
    order=categories.value_counts().index,
    palette='CMRmap',
)
ax.set_title(
    'Distribusi serangan di full_test dataset',
    weight='bold',
    fontsize='18',
)
plt.yticks(weight='bold', fontsize=12)
plt.show()

In [None]:
# Number of full_test rows per attack category
full_test.value_counts('attack_cat')

In [None]:
# Write every derived UNSW-NB15 dataset to CSV, leaving out the row index
for frame, target in (
    (base4, '~/04_Semester_4/revisi/csv_files/UNSW_NB15_base4.csv'),
    (base4_train, '~/04_Semester_4/revisi/csv_files/UNSW_NB15_base4_train.csv'),
    (base4_test, '~/04_Semester_4/revisi/csv_files/UNSW_NB15_base4_test.csv'),
    (first_test, '~/04_Semester_4/revisi/csv_files/UNSW_NB15_first_test.csv'),
    (full_test, '~/04_Semester_4/revisi/csv_files/UNSW_NB15_full_test.csv'),
):
    frame.to_csv(target, index=False)