In [2]:
# Read original data
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import warnings
from fastprogress import fastprogress
warnings.filterwarnings("ignore")

# Split the original UGR dataset into a training and a test CSV.
#
# Reads  ../dataset/UGR/original.csv
# Writes ../dataset/UGR/train_ori.csv and ../dataset/UGR/test.csv
#
# Fixed seed so the same rows land in train/test on every Restart & Run All.
SPLIT_SEED = 42
# Probability that a given row is assigned to the training set.
TRAIN_FRACTION = 0.7

df = pd.read_csv("../dataset/UGR/original.csv")
print('Original dataset:')
print(df['Label'].value_counts())
print("----------------------------------")

# Draw one uniform number per row from a locally seeded generator (the global
# NumPy RNG state is left untouched). Rows whose draw is <= TRAIN_FRACTION go
# to train, so the realised sizes are only approximately 70/30.
# NOTE(review): the split is not stratified by Label, so the rarest classes
# may be unevenly represented in the two subsets.
split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.rand(len(df)) <= TRAIN_FRACTION

train = df[msk]
test = df[~msk]

# Every row must end up in exactly one of the two subsets.
assert len(train) + len(test) == len(df)

train.to_csv("../dataset/UGR/train_ori.csv", index=False, header = True)
test.to_csv("../dataset/UGR/test.csv", index=False, header = True)
print(train['Label'].value_counts())
print("----------------------------------")
print(test['Label'].value_counts())

Original dataset:
scan11             20000
background         20000
blacklist          20000
scan44             20000
dos                 4500
nerisbotnet         2000
anomaly-spam         400
anomaly-sshscan      109
Name: Label, dtype: int64
----------------------------------
scan44             14093
background         14029
scan11             14016
blacklist          14013
dos                 3165
nerisbotnet         1430
anomaly-spam         266
anomaly-sshscan       75
Name: Label, dtype: int64
----------------------------------
blacklist          5987
scan11             5984
background         5971
scan44             5907
dos                1335
nerisbotnet         570
anomaly-spam        134
anomaly-sshscan      34
Name: Label, dtype: int64


In [3]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import warnings
from fastprogress import fastprogress
warnings.filterwarnings("ignore")

# Build one binary training set per attack family (attack = 1, background = 0).
#
# Reads  ../dataset/UGR/train_ori.csv
# Writes ../dataset/UGR/traindos.csv, trainbot.csv, trainspam.csv, trainscan.csv
#
# Fixed seed so the shuffled row order of every output file is reproducible.
SHUFFLE_SEED = 42

dt = pd.read_csv("../dataset/UGR/train_ori.csv")
print("Original label values: \n")
print(dt['Label'].value_counts())
print("-----------------------------------------")

# Select each attack family and the background traffic.
# .copy() yields independent frames, so the relabelling below never writes
# into a slice of dt (the pattern behind pandas' SettingWithCopyWarning).
attack1 = dt[dt['Label']=="dos"].copy()
attack2 = dt[dt['Label']=="nerisbotnet"].copy()
attack3 = dt[dt['Label']=="anomaly-spam"].copy()
attack4 = dt[dt['Label']=="anomaly-sshscan"].copy()
normal = dt[dt['Label']=="background"].copy()

for subset in (attack1, attack2, attack3, attack4, normal):
    print(subset['Label'].value_counts())

print("Label converting...")
# Convert label: each subset holds exactly one original label value, so the
# whole column can be overwritten at once (attack -> 1, background -> 0).
for subset in (attack1, attack2, attack3, attack4):
    subset['Label'] = 1
normal['Label'] = 0

# Merge data: pair every attack family with the same background rows, then
# shuffle so the two classes are interleaved in the saved file.
#DoS
data1 = pd.concat([attack1,normal],ignore_index = True,axis=0)
data_train1 = shuffle(data1, random_state=SHUFFLE_SEED).reset_index(drop=True)

#Botnet
data2 = pd.concat([attack2,normal],ignore_index = True,axis=0)
data_train2 = shuffle(data2, random_state=SHUFFLE_SEED).reset_index(drop=True)

#Spam
data3 = pd.concat([attack3,normal],ignore_index = True,axis=0)
data_train3 = shuffle(data3, random_state=SHUFFLE_SEED).reset_index(drop=True)

#Ssh
data4 = pd.concat([attack4,normal],ignore_index = True,axis=0)
data_train4 = shuffle(data4, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Save file: (report title, frame to write, destination CSV)
outputs = [
    ("Dos Dataset:", data_train1, "../dataset/UGR/traindos.csv"),
    ("Botnet Dataset:", data_train2, "../dataset/UGR/trainbot.csv"),
    ("Spam Dataset:", data_train3, "../dataset/UGR/trainspam.csv"),
    ("Scan Dataset:", data_train4, "../dataset/UGR/trainscan.csv"),
]
for title, frame, path in outputs:
    print("-----------------------------------------")
    print(title)
    print(frame['Label'].value_counts())
    print("-----------------------------------------")
    frame.to_csv(path, index=False, header = True)
    # Report the path that was actually written.
    print("Saved to file", path)

Original label values: 

scan44             14093
background         14029
scan11             14016
blacklist          14013
dos                 3165
nerisbotnet         1430
anomaly-spam         266
anomaly-sshscan       75
Name: Label, dtype: int64
-----------------------------------------
dos    3165
Name: Label, dtype: int64
nerisbotnet    1430
Name: Label, dtype: int64
anomaly-spam    266
Name: Label, dtype: int64
anomaly-sshscan    75
Name: Label, dtype: int64
background    14029
Name: Label, dtype: int64
Label converting...
-----------------------------------------
Dos Dataset:
0    14029
1     3165
Name: Label, dtype: int64
-----------------------------------------
Saved to file train 1.csv
-----------------------------------------
Botnet Dataset:
0    14029
1     1430
Name: Label, dtype: int64
-----------------------------------------
Saved to file train 2.csv
-----------------------------------------
Spam Dataset:
0    14029
1      266
Name: Label, dtype: int64
--------------

In [4]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import warnings
from fastprogress import fastprogress
warnings.filterwarnings("ignore")

# Build one binary test set per attack family (attack = 1, background = 0).
#
# Reads  ../dataset/UGR/test.csv
# Writes ../dataset/UGR/testdos.csv, testbot.csv, testspam.csv, testscan.csv
#
# Fixed seed so the shuffled row order of every output file is reproducible.
SHUFFLE_SEED = 42

dt1 = pd.read_csv("../dataset/UGR/test.csv")
print("Original label values: \n")
print(dt1['Label'].value_counts())
print("-----------------------------------------")

# Select each attack family and the background traffic.
# .copy() yields independent frames, so the relabelling below never writes
# into a slice of dt1 (the pattern behind pandas' SettingWithCopyWarning).
attack1 = dt1[dt1['Label']=="dos"].copy()
attack2 = dt1[dt1['Label']=="nerisbotnet"].copy()
attack3 = dt1[dt1['Label']=="anomaly-spam"].copy()
attack4 = dt1[dt1['Label']=="anomaly-sshscan"].copy()
normal = dt1[dt1['Label']=="background"].copy()

for subset in (attack1, attack2, attack3, attack4, normal):
    print(subset['Label'].value_counts())

print("Label converting...")
# Convert label: each subset holds exactly one original label value, so the
# whole column can be overwritten at once (attack -> 1, background -> 0).
for subset in (attack1, attack2, attack3, attack4):
    subset['Label'] = 1
normal['Label'] = 0

# Merge data: pair every attack family with the same background rows, then
# shuffle so the two classes are interleaved in the saved file.
# NOTE: the data_train* names are kept unchanged, but in this cell they hold
# the *test* sets.
#DoS
data1 = pd.concat([attack1,normal],ignore_index = True,axis=0)
data_train1 = shuffle(data1, random_state=SHUFFLE_SEED).reset_index(drop=True)

#Botnet
data2 = pd.concat([attack2,normal],ignore_index = True,axis=0)
data_train2 = shuffle(data2, random_state=SHUFFLE_SEED).reset_index(drop=True)

#Spam
data3 = pd.concat([attack3,normal],ignore_index = True,axis=0)
data_train3 = shuffle(data3, random_state=SHUFFLE_SEED).reset_index(drop=True)

#Ssh
data4 = pd.concat([attack4,normal],ignore_index = True,axis=0)
data_train4 = shuffle(data4, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Save file: (report title, frame to write, destination CSV)
outputs = [
    ("Dos Dataset:", data_train1, "../dataset/UGR/testdos.csv"),
    ("Botnet Dataset:", data_train2, "../dataset/UGR/testbot.csv"),
    ("Spam Dataset:", data_train3, "../dataset/UGR/testspam.csv"),
    ("Scan Dataset:", data_train4, "../dataset/UGR/testscan.csv"),
]
for title, frame, path in outputs:
    print("-----------------------------------------")
    print(title)
    print(frame['Label'].value_counts())
    print("-----------------------------------------")
    frame.to_csv(path, index=False, header = True)
    # Report the path that was actually written (the previous messages
    # labelled the spam and scan test files as "train 3.csv"/"train 4.csv").
    print("Saved to file", path)

Original label values: 

blacklist          5987
scan11             5984
background         5971
scan44             5907
dos                1335
nerisbotnet         570
anomaly-spam        134
anomaly-sshscan      34
Name: Label, dtype: int64
-----------------------------------------
dos    1335
Name: Label, dtype: int64
nerisbotnet    570
Name: Label, dtype: int64
anomaly-spam    134
Name: Label, dtype: int64
anomaly-sshscan    34
Name: Label, dtype: int64
background    5971
Name: Label, dtype: int64
Label converting...
-----------------------------------------
Dos Dataset:
0    5971
1    1335
Name: Label, dtype: int64
-----------------------------------------
Saved to file test 1.csv
-----------------------------------------
Botnet Dataset:
0    5971
1     570
Name: Label, dtype: int64
-----------------------------------------
Saved to file test 2.csv
-----------------------------------------
Spam Dataset:
0    5971
1     134
Name: Label, dtype: int64
--------------------------------