In [1]:
# Read original data
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import warnings
from fastprogress import fastprogress
warnings.filterwarnings("ignore")

df = pd.read_csv("../dataset/KDD-2018/original.csv")
print('Original dataset:')
print(df['Label'].value_counts())
print("----------------------------------")
df['split'] = np.random.randn(df.shape[0], 1)

msk = np.random.rand(len(df)) <= 0.7

train = df[msk]
test = df[~msk]
train = train.drop(['split'],axis=1)
test = test.drop(['split'],axis=1)

train.to_csv("../dataset/KDD-2018/train_ori.csv", index=False, header = True)
test.to_csv("../dataset/KDD-2018/test.csv", index=False, header = True)
print(train['Label'].value_counts())
print("----------------------------------")
print(test['Label'].value_counts())

Original dataset:
normal    20000
Dos       20000
Probe      8153
R2L         697
U2R          36
Name: Label, dtype: int64
----------------------------------
Dos       14054
normal    14019
Probe      5653
R2L         506
U2R          24
Name: Label, dtype: int64
----------------------------------
normal    5981
Dos       5946
Probe     2500
R2L        191
U2R         12
Name: Label, dtype: int64


In [8]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import warnings
from fastprogress import fastprogress
warnings.filterwarnings("ignore")

dt = pd.read_csv("../dataset/KDD-2018/train_ori.csv")
print("Original label values: \n")
print(dt['Label'].value_counts())
print("-----------------------------------------")
# Random data attack and background 
attack1 = dt[dt['Label']=="Probe"]
attack2 = dt[dt['Label']=="R2L"]
attack3 = dt[dt['Label']=="U2R"]
normal = dt[dt['Label']=="normal"]

print(attack1['Label'].value_counts())
print(attack2['Label'].value_counts())
print(attack3['Label'].value_counts())
print(normal['Label'].value_counts())

print("Label converting...")
# Convert label
label = {'Probe': 1}
attack1['Label'] = attack1['Label'].map(label).fillna(attack1['Label'])

label2 = {'R2L': 1}
attack2['Label'] = attack2['Label'].map(label2).fillna(attack2['Label'])

label3 = {'U2R': 1}
attack3['Label'] = attack3['Label'].map(label3).fillna(attack3['Label'])

label4 = {'normal': 0}
normal['Label'] = normal['Label'].map(label4).fillna(normal['Label'])

# Merge data
#Probe
data1 = pd.concat([attack1,normal],ignore_index = True,axis=0)
data_train1 = shuffle(data1).reset_index(drop=True)

#R2L
data2 = pd.concat([attack2,normal],ignore_index = True,axis=0)
data_train2 = shuffle(data2).reset_index(drop=True)

#U2R
data3 = pd.concat([attack3,normal],ignore_index = True,axis=0)
data_train3 = shuffle(data3).reset_index(drop=True)

# Save file
print("-----------------------------------------")
print("Probe Dataset:")
print(data_train1['Label'].value_counts())
print("-----------------------------------------")
#data_train1.to_csv("../dataset/KDD-2018/trainprobe.csv", index=False, header = True)
print("Saved to file train 1.csv")

print("-----------------------------------------")
print("R2L Dataset:")
print(data_train2['Label'].value_counts())
print("-----------------------------------------")
#data_train2.to_csv("../dataset/KDD-2018/trainr2l.csv", index=False, header = True)
print("Saved to file train 2.csv")

print("-----------------------------------------")
print("U2R Dataset:")
print(data_train3['Label'].value_counts())
print("-----------------------------------------")
data_train3.to_csv("../dataset/KDD-2018/trainu2r.csv", index=False, header = True)
print("Saved to file train 3.csv")

Original label values: 

Dos       14054
normal    14019
Probe      5653
R2L         506
U2R          24
Name: Label, dtype: int64
-----------------------------------------
Probe    5653
Name: Label, dtype: int64
R2L    506
Name: Label, dtype: int64
U2R    24
Name: Label, dtype: int64
normal    14019
Name: Label, dtype: int64
Label converting...
-----------------------------------------
Probe Dataset:
0    14019
1     5653
Name: Label, dtype: int64
-----------------------------------------
Saved to file train 1.csv
-----------------------------------------
R2L Dataset:
0    14019
1      506
Name: Label, dtype: int64
-----------------------------------------
Saved to file train 2.csv
-----------------------------------------
U2R Dataset:
0    14019
1       24
Name: Label, dtype: int64
-----------------------------------------
Saved to file train 3.csv


In [9]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
import warnings
from fastprogress import fastprogress
warnings.filterwarnings("ignore")

dt1 = pd.read_csv("../dataset/KDD-2018/test.csv")
print("Original label values: \n")
print(dt1['Label'].value_counts())
print("-----------------------------------------")
# Random data attack and background 
attack1 = dt1[dt1['Label']=="Probe"]
attack2 = dt1[dt1['Label']=="R2L"]
attack3 = dt1[dt1['Label']=="U2R"]
normal = dt1[dt1['Label']=="normal"]

print(attack1['Label'].value_counts())
print(attack2['Label'].value_counts())
print(attack3['Label'].value_counts())
print(normal['Label'].value_counts())

print("Label converting...")
# Convert label
label = {'Probe': 1}
attack1['Label'] = attack1['Label'].map(label).fillna(attack1['Label'])

label2 = {'R2L': 1}
attack2['Label'] = attack2['Label'].map(label2).fillna(attack2['Label'])

label3 = {'U2R': 1}
attack3['Label'] = attack3['Label'].map(label3).fillna(attack3['Label'])

label4 = {'normal': 0}
normal['Label'] = normal['Label'].map(label4).fillna(normal['Label'])


# Merge data
#Probe
data1 = pd.concat([attack1,normal],ignore_index = True,axis=0)
data_train1 = shuffle(data1).reset_index(drop=True)

#R2L
data2 = pd.concat([attack2,normal],ignore_index = True,axis=0)
data_train2 = shuffle(data2).reset_index(drop=True)

#U2R
data3 = pd.concat([attack3,normal],ignore_index = True,axis=0)
data_train3 = shuffle(data3).reset_index(drop=True)

# Save file
print("-----------------------------------------")
print("Probe Dataset:")
print(data_train1['Label'].value_counts())
print("-----------------------------------------")
#data_train1.to_csv("../dataset/KDD-2018/testprobe.csv", index=False, header = True)
print("Saved to file test 1.csv")

print("-----------------------------------------")
print("R2L Dataset:")
print(data_train2['Label'].value_counts())
print("-----------------------------------------")
#data_train2.to_csv("../dataset/KDD-2018/testr2l.csv", index=False, header = True)
print("Saved to file test 2.csv")

print("-----------------------------------------")
print("U2R Dataset:")
print(data_train3['Label'].value_counts())
print("-----------------------------------------")
data_train3.to_csv("../dataset/KDD-2018/testu2r.csv", index=False, header = True)
print("Saved to file train 3.csv")

Original label values: 

normal    5981
Dos       5946
Probe     2500
R2L        191
U2R         12
Name: Label, dtype: int64
-----------------------------------------
Probe    2500
Name: Label, dtype: int64
R2L    191
Name: Label, dtype: int64
U2R    12
Name: Label, dtype: int64
normal    5981
Name: Label, dtype: int64
Label converting...
-----------------------------------------
Probe Dataset:
0    5981
1    2500
Name: Label, dtype: int64
-----------------------------------------
Saved to file test 1.csv
-----------------------------------------
R2L Dataset:
0    5981
1     191
Name: Label, dtype: int64
-----------------------------------------
Saved to file test 2.csv
-----------------------------------------
U2R Dataset:
0    5981
1      12
Name: Label, dtype: int64
-----------------------------------------
Saved to file train 3.csv
