In [1]:
# !pip install smote_variants

In [1]:
import pandas as pd
import numpy as np
import smote_variants as sv

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [4]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [3]:
# csv_file = "/content/drive/MyDrive/ColabNotebooks/IDS/small_first_preprocess.csv"
csv_file = "./raw/first_preprocess.csv"

In [4]:
df = pd.read_csv(csv_file)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2522299 entries, 0 to 2522298
Data columns (total 69 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   destination_port             int64  
 1   flow_duration                int64  
 2   total_fwd_packets            int64  
 3   total_backward_packets       int64  
 4   total_length_of_fwd_packets  int64  
 5   total_length_of_bwd_packets  int64  
 6   fwd_packet_length_max        int64  
 7   fwd_packet_length_min        int64  
 8   fwd_packet_length_mean       float64
 9   fwd_packet_length_std        float64
 10  bwd_packet_length_max        int64  
 11  bwd_packet_length_min        int64  
 12  bwd_packet_length_mean       float64
 13  bwd_packet_length_std        float64
 14  flow_bytes_s                 float64
 15  flow_packets_s               float64
 16  flow_iat_mean                float64
 17  flow_iat_std                 float64
 18  flow_iat_max                 int64  
 19  

In [5]:
df.count()

destination_port               2522299
flow_duration                  2522299
total_fwd_packets              2522299
total_backward_packets         2522299
total_length_of_fwd_packets    2522299
                                ...   
idle_mean                      2522299
idle_std                       2522299
idle_max                       2522299
idle_min                       2522299
label                          2522299
Length: 69, dtype: int64

In [7]:
df[df.isin([np.inf, -np.inf]).any(axis = 1)]

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label


In [8]:
nan_info = df.isnull().sum()
nan_info[nan_info > 0]

Series([], dtype: int64)

#### Change int64 to int32 or int8, float64 to float32 or float16. This change will save memory

In [9]:
int8_vals = np.iinfo(np.int8)
int16_vals = np.iinfo(np.int16)
int32_vals = np.iinfo(np.int32)

float16_vals = np.finfo(np.float16)
float32_vals = np.finfo(np.float32)

In [10]:
for col in df.columns:
    max_val = df[col].max()
    min_val = df[col].min()

    print(f"{col}: max {max_val} -- min {min_val}")
    
    if df[col].dtype == np.int64:
        if max_val <= int8_vals.max and min_val >= int8_vals.min:
            df[col] = df[col].astype(np.int8)
        elif max_val <= int16_vals.max and min_val >= int16_vals.min:
            df[col] = df[col].astype(np.int16)
        elif max_val <= int32_vals.max and min_val >= int32_vals.min:
            df[col] = df[col].astype(np.int32)

    elif df[col].dtype == np.float64:
        if max_val <= float16_vals.max and min_val >= float16_vals.min:
            df[col] = df[col].astype(np.float16)
        elif max_val <= float32_vals.max and min_val >= float32_vals.min:
            df[col] = df[col].astype(np.float32)

destination_port: max 65535 -- min 0
flow_duration: max 119999998 -- min -13
total_fwd_packets: max 219759 -- min 1
total_backward_packets: max 291922 -- min 0
total_length_of_fwd_packets: max 12900000 -- min 0
total_length_of_bwd_packets: max 655453030 -- min 0
fwd_packet_length_max: max 24820 -- min 0
fwd_packet_length_min: max 2325 -- min 0
fwd_packet_length_mean: max 5940.857143 -- min 0.0
fwd_packet_length_std: max 7125.5968458437 -- min 0.0
bwd_packet_length_max: max 19530 -- min 0
bwd_packet_length_min: max 2896 -- min 0
bwd_packet_length_mean: max 5800.5 -- min 0.0
bwd_packet_length_std: max 8194.660487 -- min 0.0
flow_bytes_s: max 2071000000.0 -- min -261000000.0
flow_packets_s: max 4000000.0 -- min -2000000.0
flow_iat_mean: max 120000000.0 -- min -13.0
flow_iat_std: max 84800261.5664079 -- min 0.0
flow_iat_max: max 120000000 -- min -13
flow_iat_min: max 120000000 -- min -14
fwd_iat_total: max 120000000 -- min 0
fwd_iat_mean: max 120000000.0 -- min 0.0
fwd_iat_std: max 8460292

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2522299 entries, 0 to 2522298
Data columns (total 69 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   destination_port             int32  
 1   flow_duration                int32  
 2   total_fwd_packets            int32  
 3   total_backward_packets       int32  
 4   total_length_of_fwd_packets  int32  
 5   total_length_of_bwd_packets  int32  
 6   fwd_packet_length_max        int16  
 7   fwd_packet_length_min        int16  
 8   fwd_packet_length_mean       float16
 9   fwd_packet_length_std        float16
 10  bwd_packet_length_max        int16  
 11  bwd_packet_length_min        int16  
 12  bwd_packet_length_mean       float16
 13  bwd_packet_length_std        float16
 14  flow_bytes_s                 float32
 15  flow_packets_s               float32
 16  flow_iat_mean                float32
 17  flow_iat_std                 float32
 18  flow_iat_max                 int32  
 19  

In [12]:
df.count()

destination_port               2522299
flow_duration                  2522299
total_fwd_packets              2522299
total_backward_packets         2522299
total_length_of_fwd_packets    2522299
                                ...   
idle_mean                      2522299
idle_std                       2522299
idle_max                       2522299
idle_min                       2522299
label                          2522299
Length: 69, dtype: int64

In [13]:
train_df, test_df = train_test_split(df, test_size = 0.3, random_state = 42)

In [16]:
test_df.to_csv("raw_testset.csv", index = False)

In [15]:
train_df.to_csv("raw_trainset.csv", index = False)

In [17]:
X_train = train_df.drop('label', axis=1)
y_train = train_df['label']

In [18]:
del df

In [19]:
attack_df = train_df[train_df['label'] != 0]

In [20]:
attack_df

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
61498,80,1307410,3,4,26,11607,20,0,8.664062,10.265625,...,20,0.0,0.0,0,0,0.0,0.0,0,0,1
357533,1,57,1,1,0,6,0,0,0.000000,0.000000,...,40,0.0,0.0,0,0,0.0,0.0,0,0,7
2008166,80,85588084,9,5,346,11595,346,0,38.437500,115.312500,...,32,974.0,0.0,974,974,84400000.0,0.0,84400000,84400000,2
1961741,80,103638512,15,3,2541,6,231,0,169.375000,105.750000,...,32,3708097.0,5243242.0,7415629,565,19200000.0,18800000.0,51300000,5830022,2
2160983,80,85774440,5,8,362,11595,350,0,72.375000,155.250000,...,20,2101.0,0.0,2101,2101,85600000.0,0.0,85600000,85600000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327069,2968,60,1,1,2,6,2,2,2.000000,0.000000,...,24,0.0,0.0,0,0,0.0,0.0,0,0,7
329365,32785,12,1,1,2,6,2,2,2.000000,0.000000,...,24,0.0,0.0,0,0,0.0,0.0,0,0,7
2138242,80,84787082,7,6,378,11595,354,0,54.000000,132.375000,...,20,22964.0,0.0,22964,22964,84600000.0,0.0,84600000,84600000,2
2003274,80,85008160,8,6,335,11595,335,0,41.875000,118.437500,...,32,992.0,0.0,992,992,84900000.0,0.0,84900000,84900000,2


In [21]:
benign_df = train_df[train_df['label'] == 0]

In [22]:
sampled_benign_df = benign_df.sample(frac = 0.65)

In [23]:
sampled_benign_df

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
605389,53006,38,1,1,6,6,6,6,6.00000,0.000,...,20,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
651875,443,115470947,20,17,1032,1179,517,0,51.59375,141.500,...,32,44635.546875,27571.160156,127763,36149,10012181.0,7639.007812,10031700,9998081,0
2435604,50618,64,1,1,0,0,0,0,0.00000,0.000,...,32,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
259962,53,48252,4,4,116,456,29,29,29.00000,0.000,...,20,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
2258153,443,3028486,9,7,2660,2068,1084,0,295.50000,457.500,...,32,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736966,443,117840363,18,22,1873,5287,815,0,104.06250,202.375,...,32,266017.000000,266495.812500,454458,77576,58635956.0,245308.781250,58809416,58462497,0
1721811,53,81844,2,2,90,196,45,45,45.00000,0.000,...,32,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
824974,80,60042,3,4,432,2280,426,0,144.00000,244.250,...,20,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
803461,443,6,1,1,0,0,0,0,0.00000,0.000,...,32,0.000000,0.000000,0,0,0.0,0.000000,0,0,0


In [24]:
new_df = pd.concat([sampled_benign_df, attack_df], ignore_index = True)

In [25]:
new_df 

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,53006,38,1,1,6,6,6,6,6.00000,0.000000,...,20,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
1,443,115470947,20,17,1032,1179,517,0,51.59375,141.500000,...,32,44635.546875,27571.160156,127763,36149,10012181.0,7639.007812,10031700,9998081,0
2,50618,64,1,1,0,0,0,0,0.00000,0.000000,...,32,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
3,53,48252,4,4,116,456,29,29,29.00000,0.000000,...,20,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
4,443,3028486,9,7,2660,2068,1084,0,295.50000,457.500000,...,32,0.000000,0.000000,0,0,0.0,0.000000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1251914,2968,60,1,1,2,6,2,2,2.00000,0.000000,...,24,0.000000,0.000000,0,0,0.0,0.000000,0,0,7
1251915,32785,12,1,1,2,6,2,2,2.00000,0.000000,...,24,0.000000,0.000000,0,0,0.0,0.000000,0,0,7
1251916,80,84787082,7,6,378,11595,354,0,54.00000,132.375000,...,20,22964.000000,0.000000,22964,22964,84600000.0,0.000000,84600000,84600000,2
1251917,80,85008160,8,6,335,11595,335,0,41.87500,118.437500,...,32,992.000000,0.000000,992,992,84900000.0,0.000000,84900000,84900000,2


In [26]:
new_df.count()

destination_port               1251919
flow_duration                  1251919
total_fwd_packets              1251919
total_backward_packets         1251919
total_length_of_fwd_packets    1251919
                                ...   
idle_mean                      1251919
idle_std                       1251919
idle_max                       1251919
idle_min                       1251919
label                          1251919
Length: 69, dtype: int64

In [27]:
new_df.to_csv("raw_cutoff_trainset.csv")

In [28]:
X_train = new_df.drop('label', axis=1)
y_train = new_df['label']

In [29]:
features = new_df.columns.to_list()
len(features)

69

In [30]:
del benign_df, sampled_benign_df, attack_df, new_df, test_df, train_df

In [31]:
import gc
gc.collect()

0

In [None]:
oversampler = sv.MulticlassOversampling('SMOTE')
X_samp, y_samp = oversampler.sample(X_train, y_train)

In [28]:
new_df = pd.DataFrame(X_samp, columns = features[:-1])
new_df['label'] = y_samp

In [33]:
new_df.count()

destination_port               9539966
flow_duration                  9539966
total_fwd_packets              9539966
total_backward_packets         9539966
total_length_of_fwd_packets    9539966
                                ...   
idle_mean                      9539966
idle_std                       9539966
idle_max                       9539966
idle_min                       9539966
label                          9539966
Length: 69, dtype: int64

In [37]:
new_df.to_csv("smote_trainset.csv")

In [38]:
del new_df, X_samp, y_samp
gc.collect()

4523