# UNSW-NB15: Feature Engineering

In [1]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # plotting
from scipy import stats

import pickle  # To load data int disk
from prettytable import PrettyTable  # To print in tabular format

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

%matplotlib inline

In [2]:
import pandas as pd
# Loading data from disk
train = pd.read_csv('./train_alldata_EDA.csv')
test = pd.read_csv('./test_alldata_EDA.csv')

In [3]:
train['attack_cat'].value_counts(dropna=False)

attack_cat
normal            1552862
generic            150906
exploits            31507
fuzzers             16914
dos                 11433
reconnaissance       9764
analysis             1855
backdoor             1616
shellcode            1055
worms                 120
Name: count, dtype: int64

In [4]:
# Utility function
def multi_corr(col1, col2="label", df=train):
    '''
    Print two correlation values for a pair of columns of `df`.

    The first is the correlation between `col1` and `col2` as they are.
    The second is the correlation between log1p(`col1`) and the untouched `col2`.
    The values are only printed; nothing is returned.
    '''
    pair_matrix = df[[col1, col2]].corr()
    # Off-diagonal entry of the 2x2 matrix: row `col1`, column `col2`
    raw_value = pair_matrix.iloc[0, 1]

    logged_feature = df[col1].apply(np.log1p)
    logged_value = logged_feature.corr(df[col2])

    print("Correlation : {}\nlog_Correlation: {}".format(raw_value, logged_value))

In [5]:
def corr(col1, col2="label", df=train):
    """
    Return the correlation coefficient between columns `col1` and `col2` of `df`.
    """
    pair_matrix = df[[col1, col2]].corr()
    # Off-diagonal entry of the 2x2 matrix: row `col1`, column `col2`
    return pair_matrix.iloc[0, 1]

## Removing highly correlated features

In [6]:
# Selecting all the features with high correlation values with other features
# Refer: https://chrisalbon.com/machine_learning/feature_selection/drop_highly_correlated_features/

# Only non-object (numeric) columns take part in the correlation matrix
train_prep = train.select_dtypes(exclude=['object'])

# Absolute values: a strong negative correlation is as redundant as a positive one
corr_matrix = train_prep.corr().abs()

# Select upper triangle of correlation matrix (k=1 skips the diagonal), so every
# pair is looked at once and no column is compared with itself.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find feature columns whose correlation with an earlier column is greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [7]:
# We don't want to use these features for plotting because these are having high corr
# And most likely have same kind of plots with already plotted feature
print(to_drop)

['sloss', 'dloss', 'dpkts', 'dwin', 'ltime', 'ct_srv_dst', 'ct_src_dport_ltm', 'ct_dst_src_ltm']


In [8]:
import pickle

# Load the parameter dictionary pickled at './final_ipynb'
# (pickle.load can execute arbitrary code -- only load files you trust)
with open('./final_ipynb', 'rb') as f:
    saved_dict = pickle.load(f)

# Show what was loaded, then record the highly correlated columns found above
# so the same columns can be dropped from the test data later
print(saved_dict)
saved_dict['corr_col'] = to_drop

{'columns': ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'label'], 'binary_col': ['is_sm_ips_ports', 'is_ftp_login']}


In [9]:
train.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
       'label'],
      dtype='object')

In [10]:
to_drop

['sloss',
 'dloss',
 'dpkts',
 'dwin',
 'ltime',
 'ct_srv_dst',
 'ct_src_dport_ltm',
 'ct_dst_src_ltm']

In [11]:
# Removing the highly correlated features from the train data only
# (`test` is not modified by this cell)
train.drop(columns=to_drop, inplace=True)

In [12]:
train.shape, test.shape

((1778032, 41), (762015, 49))

## Adding New Features

Refer: https://www.elastic.co/guide/en/ecs/master/ecs-network.html
* Network bytes: Total bytes transferred by the network. It is the sum of 'sbytes' (source-to-destination bytes) and 'dbytes' (destination-to-source bytes).

In [13]:
# creating new features
train['network_bytes'] = train['sbytes'] + train['dbytes']

In [14]:
train.shape, test.shape

((1778032, 42), (762015, 49))

In [15]:
# Dropping columns which are not useful for the classification
# 'label' is dropped here; 'attack_cat' is kept because it is the target
# for multiclass classification
# all the other columns are address related and not present in sample train data

# !!changed drop attack_cat into drop label
train.drop(['srcip', 'sport', 'dstip', 'dsport', 'label'], axis=1, inplace=True)

In [16]:
# To use during test data transformation
saved_dict['to_drop'] = ['srcip', 'sport', 'dstip', 'dsport', 'label']

In [17]:
train.shape, test.shape

((1778032, 37), (762015, 49))

## Label Encoding attack category

In [18]:
train['attack_cat'].head()

0    normal
1    normal
2    normal
3    normal
4    normal
Name: attack_cat, dtype: object

In [19]:
from sklearn import preprocessing 
  
# label_encoder maps each distinct string label
# to an integer code.
label_encoder = preprocessing.LabelEncoder() 
# Fit on the train 'attack_cat' values and overwrite the column
# with the resulting integer codes.
train['attack_cat']= label_encoder.fit_transform(train['attack_cat']) 

In [20]:
train['attack_cat'].head()

0    6
1    6
2    6
3    6
4    6
Name: attack_cat, dtype: int64

In [21]:
#save the label encoder
with open('datasets/final_UNSW-NB15/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

In [22]:
#train['attack_cat']=label_encoder.inverse_transform(train['attack_cat'])
#train['attack_cat'].head()

In [23]:
#load the saved LabelEncoder
with open('datasets/final_UNSW-NB15/label_encoder.pkl', 'rb') as f:
    loaded_label_encoder = pickle.load(f)
    
test_label_encoder_df = pd.DataFrame()

test_label_encoder_df['attack_cat'] = loaded_label_encoder.inverse_transform(train['attack_cat'])
test_label_encoder_df['attack_cat'].head()

0    normal
1    normal
2    normal
3    normal
4    normal
Name: attack_cat, dtype: object

## Applying log1p on Numerical columns

During EDA we found that a few numerical columns give better-looking PDF curves when log1p is applied to them.

So we try log1p on the columns and compare the correlation of the original column and of its log1p version with the target column, i.e. "attack_cat" (the "label" column was dropped above).

In [24]:
# Getting number of unique values of all the columns
# If the unique values are high that means it has continuous set of values
col_unique_values = train.nunique()

In [25]:
# Keep only the columns with more than 200 unique values; their correlation is checked next
col = col_unique_values[col_unique_values>200].index

In [26]:
# Checking corr value of original col and log1p applied col against 'attack_cat'
# Only for the columns whose number of unique values is greater than the threshold
for column in col:
    print("{:-^30}".format(column))
    multi_corr(column,'attack_cat')

-------------dur--------------
Correlation : -0.013519441246284594
log_Correlation: -0.04098854103441191
------------sbytes------------
Correlation : -0.03845640607305405
log_Correlation: 0.19859453649804734
------------dbytes------------
Correlation : 0.05098117341142439
log_Correlation: 0.33854562205324307
------------sload-------------
Correlation : -0.1452123094012568
log_Correlation: -0.1868853429287026
------------dload-------------
Correlation : 0.16581141304636643
log_Correlation: 0.41108281642575945
------------spkts-------------
Correlation : 0.06814334845385042
log_Correlation: 0.19606709148920856
------------stcpb-------------
Correlation : 0.1394512038375975
log_Correlation: 0.18731095974617937
------------dtcpb-------------
Correlation : 0.1392885842189552
log_Correlation: 0.18711348220769292
-----------smeansz------------
Correlation : -0.006363283150753063
log_Correlation: 0.037845847375549066
-----------dmeansz------------
Correlation : 0.18151041243797844
log_Correlat

In [27]:
# Will apply log1p on this columns and remove original columns
log1p_col = ['dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'sjit', 'djit', 'network_bytes']

In [28]:
saved_dict['log1p_col'] = log1p_col

In [29]:
# mode values of every features, will use to fill Null values of test
mode_dict = train.mode().iloc[0].to_dict()

In [30]:
def log1p_transform(col, df=train):
    '''
    Replace column `col` of `df` with its log1p version, in place.

    The transformed values are stored under the name `col + '_log1p'`
    and the original column is then removed. Nothing is returned.
    '''
    transformed = df[col].apply(np.log1p)
    df[col + '_log1p'] = transformed
    df.drop(columns=[col], inplace=True)

In [31]:
# Transforming columns with log1p
for col in log1p_col:
    log1p_transform(col, df=train)

In [32]:
train.shape

(1778032, 37)

In [33]:
train.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'attack_cat', 'dur_log1p', 'sbytes_log1p',
       'dbytes_log1p', 'sload_log1p', 'dload_log1p', 'spkts_log1p',
       'stcpb_log1p', 'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p',
       'sjit_log1p', 'djit_log1p', 'network_bytes_log1p'],
      dtype='object')

In [34]:
train.shape, test.shape

((1778032, 37), (762015, 49))

In [35]:
# creating x and y set from the dataset
x_train, y_train = train.drop(columns=['attack_cat']), train['attack_cat']
x_test, y_test = test.drop(columns=['attack_cat']), test['attack_cat']

In [36]:
print(x_train.shape, y_train.shape)
print()
print(x_test.shape, y_test.shape)

(1778032, 36) (1778032,)

(762015, 48) (762015,)


In [37]:
x_test.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'label'],
      dtype='object')

In [38]:
test

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,label
0,59.166.0.3,42878,149.171.126.5,53,udp,CON,0.001037,146,178,31,...,0,2,2,2,3,1,1,1,,0
1,59.166.0.6,26948,149.171.126.7,53,udp,CON,0.001008,146,178,31,...,0,5,2,6,3,1,1,1,,0
2,149.171.126.18,47439,175.45.176.3,53,udp,INT,0.000009,264,0,60,...,,31,31,25,25,25,25,31,,0
3,59.166.0.5,10707,149.171.126.7,39181,tcp,FIN,0.005121,3920,2456,31,...,0,7,9,6,6,1,1,4,,0
4,59.166.0.8,16847,149.171.126.5,53,udp,CON,0.001134,146,178,31,...,,3,3,1,2,1,1,2,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762010,149.171.126.18,47439,175.45.176.1,53,udp,INT,0.000002,264,0,60,...,,24,24,6,6,6,6,24,,0
762011,59.166.0.3,39778,149.171.126.1,63666,tcp,FIN,0.222460,8928,320,31,...,,1,1,1,3,1,1,1,,0
762012,59.166.0.8,14032,149.171.126.8,21,tcp,FIN,1.060533,2934,3742,31,...,1,1,1,4,2,1,1,5,,0
762013,175.45.176.1,47439,149.171.126.18,53,udp,INT,0.000004,114,0,254,...,,44,44,44,44,44,20,44,Generic,1


In [39]:
# Saving the current (x, y) splits to disk to use later.
# NOTE(review): at this point x_test has not been through the cleaning / log1p /
# scaling / one-hot steps yet -- confirm that a raw snapshot is what is wanted here.
# `with` guarantees each file is flushed and closed as soon as the dump finishes.
with open('datasets/final_UNSW-NB15/final_train.pkl', 'wb') as train_file:
    pickle.dump((x_train, y_train), train_file)
with open('datasets/final_UNSW-NB15/final_test.pkl', 'wb') as test_file:
    pickle.dump((x_test, y_test), test_file)

In [40]:
# getting categorical and numerical columns in 2 diff lists
cat_col = ['proto', 'service', 'state']
# Keep the DataFrame's own column order instead of going through a set difference:
# iteration order of a set of strings can change between interpreter runs, which
# would make the saved `num_col` list come out in a different order each time.
num_col = [column for column in x_train.columns if column not in cat_col]

In [41]:
# To use later, during test data cleaning
saved_dict['cat_col'] = cat_col
saved_dict['num_col'] = num_col

In [42]:
x_train.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,INT,254,0,none,0,0,0,1421930643,33.479,...,9.2756,0.0,1.609438,0.0,0.0,3.828641,0.0,3.878042,0.0,5.187386
1,udp,INT,60,0,dns,0,0,0,1424246229,0.008,...,18.698312,0.0,1.098612,0.0,0.0,4.890349,0.0,0.0,0.0,5.57973
2,tcp,FIN,31,29,none,255,0,0,1421948071,0.372205,...,14.105347,16.314201,3.713572,20.196135,21.733479,4.174387,6.313548,0.0,3.01207,10.152883
3,tcp,FIN,31,29,ftp,255,0,0,1421971944,16.14474,...,10.258074,10.501435,3.970292,21.803017,20.49442,4.043051,4.248495,7.264606,3.984562,8.806124
4,tcp,FIN,31,29,none,255,0,0,1421963050,1.2188,...,13.339317,13.412088,2.833213,20.673269,21.855078,4.574711,4.521789,4.309533,1.138118,8.066208


## Standardizing
    
As we have seen, the range of a few features in this dataset is very large. So we will bring everything onto a comparable scale by applying StandardScaler. After this, all the numerical features will have mean 0 and standard deviation 1 on the train data.

In [43]:
# Standardizing the data
scaler = StandardScaler()
scaler = scaler.fit(x_train[num_col])

In [44]:
x_train[num_col] = scaler.transform(x_train[num_col])

In [45]:
y_train.head()

0    6
1    6
2    6
3    6
4    6
Name: attack_cat, dtype: int64

## Onehot Encoding

In our dataset we have a few categorical columns with text data.
But ML models can't process text data; they can only process numbers.

So we have to convert the categorical columns to numerical columns in some way.
We will use OneHotEncoder: each category gets its own column, which is 1 for the rows that have that category and 0 otherwise.

In [46]:
# Onehot Encoding: one independent encoder per categorical column,
# each fitted on the train values only.
# reshape(-1,1) turns the 1-D column into a single-column 2-D array.
# NOTE(review): the encoders are built with default options -- confirm how
# categories that appear only in test data should be handled.
service_ = OneHotEncoder()
proto_ = OneHotEncoder()
state_ = OneHotEncoder()
ohe_service = service_.fit(x_train.service.values.reshape(-1,1))
ohe_proto = proto_.fit(x_train.proto.values.reshape(-1,1))
ohe_state = state_.fit(x_train.state.values.reshape(-1,1))

In [47]:
x_train.service.unique()

array(['none', 'dns', 'ftp', 'smtp', 'http', 'ftp-data', 'ssh', 'ssl',
       'pop3', 'dhcp', 'irc', 'snmp', 'radius'], dtype=object)

In [48]:
# We are onehot encoding the given column with its fitted encoder
# Remove the original categorical column and append the encoded columns
for col, ohe in zip(['proto', 'service', 'state'], [ohe_proto, ohe_service, ohe_state]):
    x = ohe.transform(x_train[col].values.reshape(-1,1))
    # toarray() gives a plain ndarray (todense() returns the legacy np.matrix type).
    # Reusing x_train's index keeps the rows aligned in the concat below even when
    # the index is not the default 0..n-1 range.
    tmp_df = pd.DataFrame(x.toarray(),
                          columns=[col+'_'+str(i) for i in ohe.categories_[0]],
                          index=x_train.index)
    x_train = pd.concat([x_train.drop(col, axis=1), tmp_df], axis=1)

In [49]:
x_train.head()

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,...,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,2.561444,-0.71776,-1.196045,-0.225343,-0.089113,-1.172764,-0.057567,-0.055099,-0.136439,-0.128893,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.037542,-0.71776,-1.196045,-0.225343,-0.089113,0.868469,-0.069616,-0.055099,-0.136439,-0.128893,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.157401,-0.069485,-0.054857,-0.121383,-0.107394,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.136357,-0.063807,-0.044512,-0.12083,-0.107159,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.144197,-0.06918,-0.054358,-0.123351,-0.110421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Saving all the important parameters and objects to disk so that we can apply same process on test data

In [50]:
file_path = 'datasets/final_UNSW-NB15/'

In [51]:
# `with` closes (and flushes) each file as soon as its dump is done
with open(file_path+'scaler.pkl', 'wb') as f:  # Standard scaler
    pickle.dump(scaler, f)
with open(file_path+'saved_dict.pkl', 'wb') as f:  # Dictionary with important parameters
    pickle.dump(saved_dict, f)
with open(file_path+'mode_dict.pkl', 'wb') as f:  # Dictionary with most frequent values of columns
    pickle.dump(mode_dict, f)

In [52]:
# Onehot encoder for categorical columns
# `with` closes (and flushes) each file as soon as its dump is done
with open(file_path+'ohe_proto.pkl', 'wb') as f:
    pickle.dump(ohe_proto, f)
with open(file_path+'ohe_service.pkl', 'wb') as f:
    pickle.dump(ohe_service, f)
with open(file_path+'ohe_state.pkl', 'wb') as f:
    pickle.dump(ohe_state, f)

In [53]:
# Cleaned and processed train data
# `with` closes (and flushes) the file as soon as the dump is done
with open(file_path+'final_train.pkl', 'wb') as f:
    pickle.dump((x_train, y_train), f)

## Pipeline functions

We have to prepare a pipeline, where we can send raw data and get the output.

We will use test data to implement the pipeline. Here we will use all the parameters we have saved using train data.

Also standardize and onehot encode test data using train data objects for standardscaler and onehotencoder.

In [54]:
def clean_data(data):
    '''
    Cleans given raw data in place and returns it.

    For every column:
      * Null values and single-space strings are replaced by that column's
        entry in `mode_dict` (mode value of the column in train data),
      * the "-" placeholder is replaced by "none",
      * for columns listed in saved_dict['binary_col'], values greater than 1
        are replaced by the mode value.
    Finally, every column that was non-numeric on entry and is not listed in
    saved_dict['cat_col'] is cast to float.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, cleaned.

    Raises
    ------
    KeyError
        If a column of `data` has no entry in `mode_dict`.
    ValueError
        If a column that must be cast to float still holds non-numeric text.
    '''
    # Recorded before cleaning: the columns that arrived with a non-numeric dtype
    categorical_col = data.select_dtypes(exclude=np.number).columns

    # Cleaning the data
    for col in data.columns:
        val = mode_dict[col]  # Mode value of the column in train data
        data[col] = data[col].fillna(value=val)
        data[col] = data[col].replace(' ', value=val)
        data[col] = data[col].apply(lambda x: "none" if x == "-" else x)

        # Fixing binary columns
        if col in saved_dict['binary_col']:
            data[col] = np.where(data[col] > 1, val, data[col])

    # Fixing datatype of columns: non-numeric columns that are not meant to be categorical
    bad_dtypes = [name for name in categorical_col if name not in saved_dict['cat_col']]
    for bad_col in bad_dtypes:
        # Cast the column being iterated (the loop variable), not the leftover
        # `col` from the cleaning loop above
        data[bad_col] = data[bad_col].astype(float)

    return data

In [55]:
def apply_log1p(data):
    '''
    Feature engineering step: swap each column named in saved_dict['log1p_col']
    for its log1p version.

    The transformed column is appended under the name `<col>_log1p` and the
    original column is removed. `data` is modified in place and also returned.
    '''
    for source_name in saved_dict['log1p_col']:
        # pop() removes the original column from `data` and hands back its values
        original_values = data.pop(source_name)
        data[source_name + '_log1p'] = original_values.apply(np.log1p)
    return data

In [56]:
def standardize(data):
    '''
    Scale the numerical columns of `data` with the already-fitted `scaler`.

    Only the columns listed in saved_dict['num_col'] are passed to
    scaler.transform and overwritten with the result; other columns are left
    as they are. `data` is modified in place and also returned.
    '''
    numeric_features = saved_dict['num_col']
    scaled_values = scaler.transform(data[numeric_features])
    data[numeric_features] = scaled_values
    return data

In [57]:
def ohencoding(data):
    '''
    Onehot encode the categorical columns 'proto', 'service' and 'state'.

    Each column is transformed with its already-fitted encoder object
    (`ohe_proto`, `ohe_service`, `ohe_state`). The encoded columns, named
    `<column>_<category>`, are appended in the order proto, service, state and
    the three original columns are removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'proto', 'service' and 'state'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as `data`.
    '''
    encoders = (('proto', ohe_proto), ('service', ohe_service), ('state', ohe_state))

    encoded_frames = []
    for col, ohe in encoders:
        encoded = ohe.transform(data[col].values.reshape(-1, 1))
        encoded_frames.append(
            pd.DataFrame(
                encoded.toarray(),
                # str() so that non-string category values do not break the name building
                columns=[col + '_' + str(i) for i in ohe.categories_[0]],
                # Same index as `data`: concat(axis=1) aligns on index labels, so a
                # default 0..n-1 index here would misalign rows and introduce NaNs
                # whenever `data` carries any other index
                index=data.index,
            )
        )

    # Adding encoded data to original data
    data = pd.concat([data] + encoded_frames, axis=1)

    # Removing cat columns
    data.drop(['proto', 'service', 'state'], axis=1, inplace=True)

    return data

Loading all the objects from disk, that we have trained on train data.

In [58]:
# NOTE: pickle.load can execute arbitrary code; these files are the ones written
# earlier under `file_path` -- do not point this at untrusted files.
# `with` closes each file as soon as it has been read.

# Parameters
with open(file_path+'saved_dict.pkl', 'rb') as f:
    saved_dict = pickle.load(f)
# Mode value of all the columns
with open(file_path+'mode_dict.pkl', 'rb') as f:
    mode_dict = pickle.load(f)
# Standardscaler object
with open(file_path+'scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

In [59]:
# One hot encoder objects
# `with` closes each file as soon as it has been read
with open(file_path+'ohe_proto.pkl', 'rb') as f:
    ohe_proto = pickle.load(f)
with open(file_path+'ohe_service.pkl', 'rb') as f:
    ohe_service = pickle.load(f)
with open(file_path+'ohe_state.pkl', 'rb') as f:
    ohe_state = pickle.load(f)

In [60]:
x_test.shape

(762015, 48)

In [61]:
# Resetting index of test data
x_test.reset_index(drop=True, inplace=True)

In [62]:
x_test.shape

(762015, 48)

In [63]:
x_test.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'label'],
      dtype='object')

In [64]:
# Adding column names
x_test.columns = saved_dict['columns']

In [65]:
# Creating new Feature
x_test['network_bytes'] = x_test['dbytes'] + x_test['sbytes']

In [66]:
saved_dict['to_drop'] 

['srcip', 'sport', 'dstip', 'dsport', 'label']

In [67]:
saved_dict['corr_col']

['sloss',
 'dloss',
 'dpkts',
 'dwin',
 'ltime',
 'ct_srv_dst',
 'ct_src_dport_ltm',
 'ct_dst_src_ltm']

In [68]:

dropable_col = saved_dict['to_drop'] + saved_dict['corr_col']
dropable_col

['srcip',
 'sport',
 'dstip',
 'dsport',
 'label',
 'sloss',
 'dloss',
 'dpkts',
 'dwin',
 'ltime',
 'ct_srv_dst',
 'ct_src_dport_ltm',
 'ct_dst_src_ltm']

In [69]:
# Droping all the unwanted columns
dropable_col = saved_dict['to_drop'] + saved_dict['corr_col']
x_test.drop(columns=dropable_col, inplace=True)

In [70]:
x_test.shape

(762015, 36)

In [71]:
# Cleaning data using clean_data()
x_test = clean_data(x_test)

In [72]:
x_test.shape

(762015, 36)

In [73]:
# FE: applying log1p using apply_log1p()
x_test = apply_log1p(x_test)

In [74]:
x_test.shape

(762015, 36)

In [75]:
x_test.columns

Index(['proto', 'state', 'sttl', 'dttl', 'service', 'swin', 'trans_depth',
       'res_bdy_len', 'stime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack',
       'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_dst_sport_ltm', 'dur_log1p', 'sbytes_log1p', 'dbytes_log1p',
       'sload_log1p', 'dload_log1p', 'spkts_log1p', 'stcpb_log1p',
       'dtcpb_log1p', 'smeansz_log1p', 'dmeansz_log1p', 'sjit_log1p',
       'djit_log1p', 'network_bytes_log1p'],
      dtype='object')

In [76]:
# Standardscaling using stanardize()
x_test = standardize(x_test)

In [77]:
x_test.head()

Unnamed: 0,proto,state,sttl,dttl,service,swin,trans_depth,res_bdy_len,stime,sintpkt,...,sload_log1p,dload_log1p,spkts_log1p,stcpb_log1p,dtcpb_log1p,smeansz_log1p,dmeansz_log1p,sjit_log1p,djit_log1p,network_bytes_log1p
0,udp,CON,-0.426051,-0.041365,dns,-1.196045,-0.225343,-0.089113,-1.148407,-0.069615,...,-0.121716,0.482924,-1.080734,-1.190007,-1.189697,-0.374605,0.096502,-0.995343,-0.899657,-0.974789
1,udp,CON,-0.426051,-0.041365,dns,-1.196045,-0.225343,-0.089113,-1.163715,-0.069617,...,-0.112689,0.487911,-1.080734,-1.190007,-1.189697,-0.374605,0.096502,-0.995343,-0.899657,-0.974789
2,udp,INT,-0.037542,-0.71776,dns,-1.196045,-0.225343,-0.089113,0.855736,-0.069616,...,1.577442,-1.879995,-1.080734,-1.190007,-1.189697,0.565988,-1.850553,-0.995343,-0.899657,-1.062092
3,tcp,FIN,-0.426051,-0.041365,none,0.836096,-0.225343,-0.089113,-1.157963,-0.069518,...,0.619603,0.775428,0.277671,0.898135,0.898496,1.366104,0.278308,-0.163891,-0.794201,0.298473
4,udp,CON,-0.426051,-0.041365,dns,-1.196045,-0.225343,-0.089113,0.864749,-0.069616,...,-0.150173,0.467203,-1.080734,-1.190007,-1.189697,-0.374605,0.096502,-0.995343,-0.899657,-0.974789


In [78]:
# Onehot encoding categorical columns using ohencoding()
x_test = ohencoding(x_test)

In [79]:
x_test.shape

(762015, 197)

In [80]:
# Final test data
x_test.head()

Unnamed: 0,sttl,dttl,swin,trans_depth,res_bdy_len,stime,sintpkt,dintpkt,tcprtt,synack,...,state_INT,state_MAS,state_PAR,state_REQ,state_RST,state_TST,state_TXD,state_URH,state_URN,state_no
0,-0.426051,-0.041365,-1.196045,-0.225343,-0.089113,-1.148407,-0.069615,-0.055095,-0.136439,-0.128893,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.426051,-0.041365,-1.196045,-0.225343,-0.089113,-1.163715,-0.069617,-0.055093,-0.136439,-0.128893,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.037542,-0.71776,-1.196045,-0.225343,-0.089113,0.855736,-0.069616,-0.055099,-0.136439,-0.128893,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.426051,-0.041365,0.836096,-0.225343,-0.089113,-1.157963,-0.069518,-0.054912,-0.121029,-0.107473,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.426051,-0.041365,-1.196045,-0.225343,-0.089113,0.864749,-0.069616,-0.055095,-0.136439,-0.128893,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
# Matching test data columns with train data columns
# Index.equals compares elements and order and simply returns False on a length
# mismatch, whereas an elementwise `==` raises when the lengths differ
x_train.columns.equals(x_test.columns)

True