In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# from google.colab import drive
# drive.mount('/content/drive/')

Mounted at /content/drive/


In [6]:
def change_datatype(df):
    int8_vals = np.iinfo(np.int8)
    int16_vals = np.iinfo(np.int16)
    int32_vals = np.iinfo(np.int32)

    float16_vals = np.finfo(np.float16)
    float32_vals = np.finfo(np.float32)
    
    for col in df.columns:
        max_val = df[col].max()
        min_val = df[col].min()

        # print(f"{col}: max {max_val} -- min {min_val}")
        
        if df[col].dtype == np.int64:
            if max_val <= int8_vals.max and min_val >= int8_vals.min:
                df[col] = df[col].astype(np.int8)
            elif max_val <= int16_vals.max and min_val >= int16_vals.min:
                df[col] = df[col].astype(np.int16)
            elif max_val <= int32_vals.max and min_val >= int32_vals.min:
                df[col] = df[col].astype(np.int32)

        elif df[col].dtype == np.float64:
            if max_val <= float16_vals.max and min_val >= float16_vals.min:
                df[col] = df[col].astype(np.float16)
            elif max_val <= float32_vals.max and min_val >= float32_vals.min:
                df[col] = df[col].astype(np.float32)
    return df

In [8]:
# csv_file = "/content/drive/MyDrive/ColabNotebooks/IDS/small_first_preprocess.csv"
csv_train = "./oversampling/NRAS_trainset.csv"
csv_test = "./raw/raw_testset.csv"

In [9]:
train_df = pd.read_csv(csv_train)
train_df = change_datatype(train_df)

In [17]:
train_df = train_df.drop('Unnamed: 0', axis = 1)

In [13]:
test_df = pd.read_csv(csv_test)
test_df = change_datatype(test_df)

In [18]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9539966 entries, 0 to 9539965
Data columns (total 69 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   destination_port             float32
 1   flow_duration                float32
 2   total_fwd_packets            float32
 3   total_backward_packets       float32
 4   total_length_of_fwd_packets  float32
 5   total_length_of_bwd_packets  float32
 6   fwd_packet_length_max        float16
 7   fwd_packet_length_min        float16
 8   fwd_packet_length_mean       float16
 9   fwd_packet_length_std        float16
 10  bwd_packet_length_max        float16
 11  bwd_packet_length_min        float16
 12  bwd_packet_length_mean       float16
 13  bwd_packet_length_std        float16
 14  flow_bytes_s                 float32
 15  flow_packets_s               float32
 16  flow_iat_mean                float32
 17  flow_iat_std                 float32
 18  flow_iat_max                 float32
 19  

In [14]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756690 entries, 0 to 756689
Data columns (total 69 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   destination_port             756690 non-null  int32  
 1   flow_duration                756690 non-null  int32  
 2   total_fwd_packets            756690 non-null  int32  
 3   total_backward_packets       756690 non-null  int32  
 4   total_length_of_fwd_packets  756690 non-null  int32  
 5   total_length_of_bwd_packets  756690 non-null  int32  
 6   fwd_packet_length_max        756690 non-null  int16  
 7   fwd_packet_length_min        756690 non-null  int16  
 8   fwd_packet_length_mean       756690 non-null  float16
 9   fwd_packet_length_std        756690 non-null  float16
 10  bwd_packet_length_max        756690 non-null  int16  
 11  bwd_packet_length_min        756690 non-null  int16  
 12  bwd_packet_length_mean       756690 non-null  float16
 13 

In [19]:
X_train = train_df.drop('label', axis = 1)
y_train = train_df['label']

X_test = test_df.drop('label', axis = 1)
y_test = test_df['label']

In [20]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(9539966, 68) (9539966,)
(756690, 68) (756690,)


#### Scaler

In [21]:
scaler = QuantileTransformer(
       n_quantiles = 10000,
       random_state = 6969,
       output_distribution = "uniform"
)

In [22]:
X_train_scaler = scaler.fit_transform(X_train)

In [23]:
X_test_scaler = scaler.transform(X_test)

In [28]:
scaling_df = pd.DataFrame(X_train_scaler, columns = train_df.columns[:-1])

#### Feature Selection

In [25]:
fs = SelectKBest(score_func = chi2, k = 40)
X_selected = fs.fit_transform(X_train_scaler, y_train)

In [26]:
best_features_chi2 = fs.transform(X_train_scaler)

In [29]:
mask = fs.get_support()
best_features = scaling_df.columns[mask]

In [30]:
vif = pd.DataFrame()
vif["Feature"] = best_features

In [31]:
vif["VIF"] = [variance_inflation_factor(X_selected, i) for i in range(X_selected.shape[1])]

In [32]:
vif

Unnamed: 0,Feature,VIF
0,destination_port,17.38357
1,flow_duration,28.06028
2,total_fwd_packets,86289.24
3,total_length_of_fwd_packets,174766.6
4,fwd_packet_length_max,118.7234
5,fwd_packet_length_min,85.97692
6,fwd_packet_length_std,99.04196
7,bwd_packet_length_min,2.709571
8,bwd_packet_length_std,40.3731
9,flow_iat_std,35.90811


In [33]:
vif_threshold = 40

In [34]:
selected_features = vif[vif['VIF'] < vif_threshold]['Feature']

In [35]:
selected_features

0          destination_port
1             flow_duration
7     bwd_packet_length_min
9              flow_iat_std
13            bwd_iat_total
20        max_packet_length
22           fin_flag_count
24           psh_flag_count
25           ack_flag_count
26           urg_flag_count
27            down_up_ratio
28      average_packet_size
31         act_data_pkt_fwd
33               active_std
38                 idle_max
Name: Feature, dtype: object