In [1]:
import pandas as pd

def _load_nsl_kdd(path):
    """Read one raw NSL-KDD text file and tidy it up.

    The file is parsed as header-less CSV, surrounding whitespace is stripped
    from every object-dtype (text) column, and column 41 is renamed to
    'Label'. A new DataFrame is returned.
    """
    frame = pd.read_csv(path, header=None)

    # Only text columns have a .str accessor; numeric columns are left as-is.
    for column in frame.select_dtypes([object]).columns:
        frame[column] = frame[column].str.strip()

    return frame.rename(columns={41: 'Label'})


# Both raw files go through the same clean-up and are saved separately.
df_test = _load_nsl_kdd('./NSL-KDD/KDDTest+.txt')
df_test.to_csv('./NSL-KDD/test.csv', index=False)

df_train = _load_nsl_kdd('./NSL-KDD/KDDTrain+.txt')
df_train.to_csv('./NSL-KDD/train.csv', index=False)

# Merge train and test (in that order) into a single pool with a fresh index.
df_dataset = pd.concat([df_train, df_test], ignore_index=True)
df_dataset.to_csv('./NSL-KDD/dataset.csv', index=False)


In [2]:
import pandas as pd

# Load the merged dataset and report how many rows carry each label.
dataset_df = pd.read_csv('./NSL-KDD/dataset.csv')

label_series = dataset_df['Label']
label_counts = label_series.value_counts()

print(label_counts)


Label
normal             77054
neptune            45871
satan               4368
ipsweep             3740
smurf               3311
portsweep           3088
nmap                1566
back                1315
guess_passwd        1284
mscan                996
warezmaster          964
teardrop             904
warezclient          890
apache2              737
processtable         685
snmpguess            331
saint                319
mailbomb             293
pod                  242
snmpgetattack        178
httptunnel           133
buffer_overflow       50
land                  25
multihop              25
rootkit               23
named                 17
ps                    15
sendmail              14
xterm                 13
imap                  12
loadmodule            11
ftp_write             11
xlock                  9
phf                    6
perl                   5
xsnoop                 4
spy                    2
worm                   2
sqlattack              2
udpstorm           

In [3]:
import pandas as pd

# Oversample the rarest labels by plain row duplication, then shuffle.
RARE_LABEL_THRESHOLD = 6  # labels with fewer rows than this get duplicated
RARE_LABEL_REPEATS = 3    # each rare row appears this many times afterwards

df_dataset = pd.read_csv('./NSL-KDD/dataset.csv')

label_counts = df_dataset['Label'].value_counts()
small_labels = label_counts[label_counts < RARE_LABEL_THRESHOLD].index

# Split the frame into rare-label rows and everything else.
is_small = df_dataset['Label'].isin(small_labels)
small_label_samples = df_dataset[is_small]
tripled_samples = pd.concat([small_label_samples] * RARE_LABEL_REPEATS, ignore_index=True)

# NOTE(review): the duplicates are exact copies, so once this file is split
# into train/val/test the same row can appear in more than one split.
df_dataset = pd.concat([df_dataset[~is_small], tripled_samples], ignore_index=True)

# Fixed seed keeps the shuffled row order reproducible.
df_dataset = df_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Write next to the other NSL-KDD files: the following cells read this
# relative path, so an absolute, machine-specific path would break them.
df_dataset.to_csv('./NSL-KDD/dataset_balanced.csv', index=False)


In [4]:
import pandas as pd

# Load the oversampled dataset and report how many rows carry each label.
dataset_df = pd.read_csv('./NSL-KDD/dataset_balanced.csv')

label_series = dataset_df['Label']
label_counts = label_series.value_counts()

print(label_counts)


Label
normal             77054
neptune            45871
satan               4368
ipsweep             3740
smurf               3311
portsweep           3088
nmap                1566
back                1315
guess_passwd        1284
mscan                996
warezmaster          964
teardrop             904
warezclient          890
apache2              737
processtable         685
snmpguess            331
saint                319
mailbomb             293
pod                  242
snmpgetattack        178
httptunnel           133
buffer_overflow       50
land                  25
multihop              25
rootkit               23
named                 17
perl                  15
ps                    15
sendmail              14
xterm                 13
xsnoop                12
imap                  12
ftp_write             11
loadmodule            11
xlock                  9
sqlattack              6
phf                    6
worm                   6
spy                    6
udpstorm           

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Stratified split of the balanced dataset: 70% train, then the remaining
# 30% is halved into validation and test.
df_dataset = pd.read_csv('./NSL-KDD/dataset_balanced.csv')

train_df, temp_df = train_test_split(
    df_dataset,
    test_size=0.3,
    stratify=df_dataset['Label'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['Label'],
    random_state=42,
)

# Persist each split; the index is not written.
for split_df, csv_path in (
    (train_df, './NSL-KDD/train.csv'),
    (val_df, './NSL-KDD/val.csv'),
    (test_df, './NSL-KDD/test.csv'),
):
    split_df.to_csv(csv_path, index=False)


In [6]:
import pandas as pd

# Reload the three splits from disk and print the label distribution of each.
train_df = pd.read_csv('./NSL-KDD/train.csv')
val_df = pd.read_csv('./NSL-KDD/val.csv')
test_df = pd.read_csv('./NSL-KDD/test.csv')

for heading, split_df in (
    ("Train Set Label Distribution:", train_df),
    ("Validation Set Label Distribution:", val_df),
    ("Test Set Label Distribution:", test_df),
):
    print(heading)
    print(split_df['Label'].value_counts())


Train Set Label Distribution:
Label
normal             53937
neptune            32109
satan               3058
ipsweep             2618
smurf               2318
portsweep           2162
nmap                1096
back                 920
guess_passwd         899
mscan                697
warezmaster          675
teardrop             633
warezclient          623
apache2              516
processtable         479
snmpguess            232
saint                223
mailbomb             205
pod                  169
snmpgetattack        125
httptunnel            93
buffer_overflow       35
land                  18
multihop              18
rootkit               16
named                 12
ps                    11
perl                  11
sendmail              10
xterm                  9
ftp_write              8
xsnoop                 8
imap                   8
loadmodule             8
xlock                  6
spy                    4
sqlattack              4
worm                   4
phf           

In [7]:
# Inspect the 'worm' rows that landed in each split.
print(train_df[train_df['Label'] == 'worm'])
print(val_df[val_df['Label'] == 'worm'])
print(test_df[test_df['Label'] == 'worm'])

# All 'worm' rows of the full (balanced) dataset, kept under the original name.
missing_worm_in_train = df_dataset[df_dataset['Label'] == 'worm']

# Top up the training split only when it really has no 'worm' rows.
# Appending unconditionally would duplicate rows train already holds, copy
# validation/test rows into training, and grow train_df on every re-run.
if not (train_df['Label'] == 'worm').any():
    train_df = pd.concat([train_df, missing_worm_in_train], ignore_index=True)

train_df.to_csv('./NSL-KDD/train.csv', index=False)
val_df.to_csv('./NSL-KDD/val.csv', index=False)
test_df.to_csv('./NSL-KDD/test.csv', index=False)



       0    1       2   3     4     5  6  7  8  9  ...    33    34    35   36  \
24223  9  tcp  telnet  SF  4209  7872  0  0  0  0  ...  0.01  0.07  0.01  0.0   
52156  9  tcp  telnet  SF  4209  7919  0  0  0  0  ...  0.02  0.04  0.01  0.0   
78024  9  tcp  telnet  SF  4209  7919  0  0  0  0  ...  0.02  0.04  0.01  0.0   
80983  9  tcp  telnet  SF  4209  7872  0  0  0  0  ...  0.01  0.07  0.01  0.0   

        37   38   39   40  Label  42  
24223  0.0  0.0  0.0  0.0   worm   0  
52156  0.0  0.0  0.0  0.0   worm   0  
78024  0.0  0.0  0.0  0.0   worm   0  
80983  0.0  0.0  0.0  0.0   worm   0  

[4 rows x 43 columns]
      0    1       2   3     4     5  6  7  8  9  ...    33    34    35   36  \
7921  9  tcp  telnet  SF  4209  7919  0  0  0  0  ...  0.02  0.04  0.01  0.0   

       37   38   39   40  Label  42  
7921  0.0  0.0  0.0  0.0   worm   0  

[1 rows x 43 columns]
       0    1       2   3     4     5  6  7  8  9  ...    33    34    35   36  \
17193  9  tcp  telnet  SF  4209  78

In [8]:
# Report how many distinct labels each split contains.
for caption, split_df in (
    ("Train Set Label Categories Count:", train_df),
    ("Validation Set Label Categories Count:", val_df),
    ("Test Set Label Categories Count:", test_df),
):
    distinct_labels = split_df['Label'].nunique()
    print(caption, distinct_labels)

Train Set Label Categories Count: 40
Validation Set Label Categories Count: 40
Test Set Label Categories Count: 40
