### Description

This notebook shows the creation of transformers and custom Pipelines

### Imports

In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler



### Auxiliary functions

In [2]:
def load_kdd_dataset(data_path):
    """Read an ARFF source and return its records as a pandas DataFrame.

    `data_path` is handed unchanged to `scipy.io.arff.loadarff`. Only the
    record array (the first element of its result) is kept; the ARFF
    metadata is discarded.
    """
    records, _meta = arff.loadarff(data_path)
    return pd.DataFrame(records)

In [3]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    """Split `df` into train, validation and test subsets (60% / 20% / 20%).

    Parameters
    ----------
    df : DataFrame to split.
    rstate : random seed forwarded to both `train_test_split` calls.
    shuffle : whether the rows are shuffled before each split.
    stratify : optional column name; when given, both splits are
        stratified on the values of that column.

    Returns
    -------
    tuple of (train_set, val_set, test_set).
    """
    split_opts = {"random_state": rstate, "shuffle": shuffle}

    # First cut: keep 60% for training and hold out the remaining 40%.
    train_set, holdout = train_test_split(
        df,
        test_size=0.4,
        stratify=df[stratify] if stratify else None,
        **split_opts)

    # Second cut: halve the hold-out into validation and test subsets.
    val_set, test_set = train_test_split(
        holdout,
        test_size=0.5,
        stratify=holdout[stratify] if stratify else None,
        **split_opts)

    return (train_set, val_set, test_set)

### Reading the data set

In [4]:
df = load_kdd_dataset("../datasets/NSL-KDD/KDDTrain+.arff")

In [5]:
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,b'normal'
1,0.0,b'udp',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,b'normal'
2,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
3,0.0,b'tcp',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,b'normal'
4,0.0,b'tcp',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,b'normal'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
125969,8.0,b'udp',b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,b'normal'
125970,0.0,b'tcp',b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,b'normal'
125971,0.0,b'tcp',b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'


### Splitting of the data set

In [6]:
train_set, val_set, test_set = train_val_test_split(df, stratify='protocol_type')

In [7]:
print("Training Set Length:", len(train_set))
print("Validation Set Length:", len(val_set))
print("Test Set Length:", len(test_set))

Training Set Length: 75583
Validation Set Length: 25195
Test Set Length: 25195


## 1. Building custom transformers

Creating your own transformers allows you to keep your code much cleaner and more structured when preparing data for ML algorithms. In addition, they facilitate the reuse of code for other projects.

In [8]:
# Separate the target column ("class") from the predictor columns
y_train = train_set["class"].copy()
X_train = train_set.drop(columns=["class"])

In [9]:
# To illustrate this section we are going to add some null values to some features of the dataset
# Rows whose byte counts fall strictly inside the chosen ranges are blanked out.
src_bytes_mask = (X_train["src_bytes"] > 400) & (X_train["src_bytes"] < 800)
dst_bytes_mask = (X_train["dst_bytes"] > 500) & (X_train["dst_bytes"] < 2000)

X_train.loc[src_bytes_mask, "src_bytes"] = np.nan
X_train.loc[dst_bytes_mask, "dst_bytes"] = np.nan

In [10]:
X_train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,b'tcp',b'http',b'SF',,53508.0,b'0',0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,b'tcp',b'http',b'SF',304.0,,b'0',0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',8.0,0.0,b'0',0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,b'tcp',b'systat',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,b'tcp',b'http',b'SF',210.0,,b'0',0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,b'tcp',b'smtp',b'SF',889.0,328.0,b'0',0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,b'tcp',b'http',b'SF',284.0,444.0,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


### Transformers for numeric attributes

In [11]:
# Transformer created to remove rows with null values
from sklearn.base import BaseEstimator, TransformerMixin

class DeleteNanRows(BaseEstimator, TransformerMixin):
    """Stateless transformer that discards every row containing a null value.

    Only X is filtered: the `y` argument is accepted for API compatibility
    but is never read or modified here.
    """

    def __init__(self):
        """No hyperparameters to configure."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X without the rows that have at least one missing value."""
        return X.dropna(axis=0, how="any")

In [12]:
delete_nan = DeleteNanRows()
X_train_prep = delete_nan.fit_transform(X_train)

In [13]:
X_train_prep

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
31899,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.0,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.0,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',8.0,0.0,b'0',0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.0,0.0,0.0
98007,0.0,b'udp',b'domain_u',b'SF',46.0,139.0,b'0',0.0,0.0,0.0,...,255.0,254.0,1.00,0.01,0.00,0.00,0.00,0.0,0.0,0.0
16447,0.0,b'tcp',b'smtp',b'SF',1790.0,363.0,b'0',0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90665,0.0,b'tcp',b'ftp_data',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,63.0,0.25,0.02,0.02,0.00,1.00,1.0,0.0,0.0
64559,0.0,b'tcp',b'systat',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.0,0.0,0.0
32452,3.0,b'tcp',b'smtp',b'SF',889.0,328.0,b'0',0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.0,0.0,0.0
112657,0.0,b'tcp',b'http',b'SF',284.0,444.0,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0


In [14]:
# Transformer designed to easily scale only selected columns
class CustomScaler(BaseEstimator, TransformerMixin):
    """Apply `RobustScaler` to a chosen subset of DataFrame columns.

    The scaling statistics are learned in `fit` and reused by `transform`,
    so data passed to `transform` later is scaled with the statistics of
    the data given to `fit` instead of its own.

    Parameters
    ----------
    attributes : list of column names to scale; every other column is
        returned untouched.
    """

    def __init__(self, attributes):
        self.attributes = attributes

    def fit(self, X, y=None):
        """Fit a `RobustScaler` on the selected columns of X and return self."""
        self.scaler_ = RobustScaler().fit(X[self.attributes])
        return self

    def transform(self, X, y=None):
        """Return a copy of X whose selected columns have been scaled.

        X itself is never modified.
        """
        X_copy = X.copy()
        scale_attrs = X_copy[self.attributes]

        scaler = getattr(self, "scaler_", None)
        if scaler is None:
            # Backward-compatible fallback: earlier versions fitted the scaler
            # inside transform(), so calling transform() without a prior fit()
            # still scales X with its own statistics instead of failing.
            scaler = RobustScaler().fit(scale_attrs)

        # Rebuild a DataFrame so the scaled values keep X's row index.
        X_scaled = pd.DataFrame(scaler.transform(scale_attrs),
                                columns=self.attributes,
                                index=X_copy.index)
        for attr in self.attributes:
            X_copy[attr] = X_scaled[attr]
        return X_copy

In [15]:
custom_scaler = CustomScaler(["src_bytes", "dst_bytes"])
X_train_prep = custom_scaler.fit_transform(X_train_prep)

In [16]:
X_train.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,b'tcp',b'http',b'SF',,53508.0,b'0',0.0,0.0,0.0,...,9.0,255.0,1.0,0.0,0.11,0.03,0.0,0.0,0.0,0.0
31899,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.0,0.0,1.0,1.0,0.0,0.0
108116,0.0,b'tcp',b'http',b'SF',304.0,,b'0',0.0,0.0,0.0,...,39.0,255.0,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',8.0,0.0,b'0',0.0,0.0,0.0,...,2.0,7.0,1.0,0.0,1.0,0.57,0.0,0.0,0.0,0.0
98007,0.0,b'udp',b'domain_u',b'SF',46.0,139.0,b'0',0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
16447,0.0,b'tcp',b'smtp',b'SF',1790.0,363.0,b'0',0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.0,0.0,0.0,0.0
64957,1.0,b'tcp',b'smtp',b'SF',,329.0,b'0',0.0,0.0,0.0,...,198.0,181.0,0.65,0.03,0.01,0.01,0.02,0.02,0.0,0.0
100052,0.0,b'tcp',b'http',b'SF',206.0,,b'0',0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28800,0.0,b'tcp',b'ftp_data',b'SF',334.0,0.0,b'0',0.0,0.0,0.0,...,8.0,28.0,1.0,0.0,1.0,0.11,0.0,0.0,0.0,0.0


In [17]:
# Preview 10 rows, matching the X_train.head(10) preview of the unscaled data
X_train_prep.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
31899,0.0,b'tcp',b'private',b'S0',-0.034632,0.0,b'0',0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.0,0.0,1.0,1.0,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',-0.034632,0.0,b'0',0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',0.0,0.0,b'0',0.0,0.0,0.0,...,2.0,7.0,1.0,0.0,1.0,0.57,0.0,0.0,0.0,0.0
98007,0.0,b'udp',b'domain_u',b'SF',0.164502,0.448387,b'0',0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
16447,0.0,b'tcp',b'smtp',b'SF',7.714286,1.170968,b'0',0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.0,0.0,0.0,0.0
28800,0.0,b'tcp',b'ftp_data',b'SF',1.411255,0.0,b'0',0.0,0.0,0.0,...,8.0,28.0,1.0,0.0,1.0,0.11,0.0,0.0,0.0,0.0
78082,0.0,b'tcp',b'ftp_data',b'SF',44.939394,0.0,b'0',0.0,0.0,0.0,...,93.0,51.0,0.23,0.03,0.23,0.04,0.0,0.0,0.0,0.0
69315,0.0,b'tcp',b'systat',b'S0',-0.034632,0.0,b'0',0.0,0.0,0.0,...,255.0,5.0,0.02,0.07,0.0,0.0,1.0,1.0,0.0,0.0
100360,0.0,b'tcp',b'private',b'S0',-0.034632,0.0,b'0',0.0,0.0,0.0,...,255.0,13.0,0.05,0.07,0.0,0.0,1.0,1.0,0.0,0.0
29208,0.0,b'tcp',b'http',b'SF',1.419913,13.816129,b'0',0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Transformer to modify only the categorical columns and return a DataFrame
class CustomOneHotEncoding(BaseEstimator, TransformerMixin):
    """One-hot encode the object-dtype columns of a DataFrame.

    Columns of any other dtype are passed through unchanged, and the result
    is returned as a DataFrame that keeps the row index of the input.
    """

    def __init__(self):
        # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and
        # removed in 1.4; try the new keyword first and fall back to the old
        # one so the class works on both sides of the rename.
        try:
            self._oh = OneHotEncoder(sparse_output=False)
        except TypeError:
            self._oh = OneHotEncoder(sparse=False)
        self._columns = None

    def fit(self, X, y=None):
        """Fit the encoder on the object-dtype columns of X and return self."""
        X_cat = X.select_dtypes(include=['object'])
        self._oh.fit(X_cat)
        # Build the "<column>_<category>" names from the encoder's own
        # categories so they always match the order and number of columns
        # that transform() produces.
        self._columns = [
            f"{col}_{cat}"
            for col, cats in zip(X_cat.columns, self._oh.categories_)
            for cat in cats
        ]
        return self

    def transform(self, X, y=None):
        """Return X with its object-dtype columns replaced by one-hot columns.

        X itself is never modified.
        """
        X_cat = X.select_dtypes(include=['object'])
        X_cat_oh = pd.DataFrame(self._oh.transform(X_cat),
                                columns=self._columns,
                                index=X.index)
        # drop() returns a new frame, so the caller's X is left untouched.
        return X.drop(columns=list(X_cat)).join(X_cat_oh)

## 2. Building Custom Pipelines

Pipelines allow us to group all the transformation operations that we need to perform on the data set into a single execution flow. This makes it much easier to apply the same transformations to different data sets. Things to note about these structures are:  

* Receive a set of pairs (name, estimator).
* All but the last must be transformers.
* The pipeline exposes the same methods as the last estimator.
* When the fit() method of the pipeline is called, the fit_transform() method of each transformer is called in sequence, and the output of each one is passed as the input to the next. For the last estimator, only the fit() method is called.

In [19]:
# Construction of a pipeline for numeric attributes
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('rbst_scaler', RobustScaler()),
    ])

In [20]:
# The imputer class does not support categorical values, we remove the categorical attributes
X_train_num = X_train.select_dtypes(exclude=['object'])

X_train_prep = num_pipeline.fit_transform(X_train_num)
X_train_prep = pd.DataFrame(X_train_prep, columns=X_train_num.columns, index=X_train_num.index)

In [21]:
X_train_num.head(10)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,,53508.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,255.0,1.0,0.0,0.11,0.03,0.0,0.0,0.0,0.0
31899,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.0,0.0,1.0,1.0,0.0,0.0
108116,0.0,304.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.0,255.0,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0
89913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0
106319,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,7.0,1.0,0.0,1.0,0.57,0.0,0.0,0.0,0.0
98007,0.0,46.0,139.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
16447,0.0,1790.0,363.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.0,0.0,0.0,0.0
64957,1.0,,329.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,198.0,181.0,0.65,0.03,0.01,0.01,0.02,0.02,0.0,0.0
100052,0.0,206.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28800,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,28.0,1.0,0.0,1.0,0.11,0.0,0.0,0.0,0.0


In [22]:
X_train_prep.head(10)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,0.0,235.718062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.421965,0.787755,0.515789,-0.428571,1.833333,1.5,0.0,0.0,0.0,0.0
31899,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.236735,-0.515789,0.285714,0.0,0.0,1.0,1.0,0.0,0.0
108116,0.0,1.05668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.248555,0.787755,0.515789,-0.428571,0.5,3.0,0.0,0.0,0.0,0.0
89913,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.191837,-0.473684,0.571429,0.0,0.0,1.0,1.0,0.0,0.0
106319,0.0,-0.1417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.462428,-0.22449,0.515789,-0.428571,16.666667,28.5,0.0,0.0,0.0,0.0
98007,0.0,0.012146,0.612335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.783673,0.515789,-0.285714,0.0,0.0,0.0,0.0,0.0,0.0
16447,0.0,7.072874,1.599119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.65896,0.306122,0.042105,0.142857,0.166667,0.5,0.0,0.0,0.0,0.0
64957,1.0,0.0,1.449339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.32948,0.485714,0.147368,0.0,0.166667,0.5,0.02,0.02,0.0,0.0
100052,0.0,0.659919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.787755,0.515789,-0.428571,0.0,0.0,0.0,0.0,0.0,0.0
28800,0.0,1.178138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.427746,-0.138776,0.515789,-0.428571,16.666667,5.5,0.0,0.0,0.0,0.0


Next, the ColumnTransformer class is presented. It runs each pipeline side by side on its own subset of columns and concatenates the results. For this to work, the output of every pipeline must be numeric.

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column names split by dtype: non-object columns go through the numeric
# pipeline, object columns are one-hot encoded.
num_attribs = list(X_train.select_dtypes(exclude=['object']))
cat_attribs = list(X_train.select_dtypes(include=['object']))

# OneHotEncoder() emits a sparse matrix by default, and ColumnTransformer
# stacks everything as sparse whenever the overall density drops below
# `sparse_threshold` (default 0.3). The result is later wrapped in a
# pd.DataFrame, which needs a dense array, so dense output is forced with
# sparse_threshold=0.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ], sparse_threshold=0)

In [24]:
X_train_prep = full_pipeline.fit_transform(X_train)

In [25]:
X_train_prep = pd.DataFrame(X_train_prep, columns=list(pd.get_dummies(X_train)), index=X_train.index)

In [26]:
X_train_prep.head(10)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag_b'S3',flag_b'SF',flag_b'SH',land_b'0',land_b'1',logged_in_b'0',logged_in_b'1',is_host_login_b'0',is_guest_login_b'0',is_guest_login_b'1'
113467,0.0,0.0,235.718062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
31899,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
108116,0.0,1.05668,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
89913,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
106319,0.0,-0.1417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
98007,0.0,0.012146,0.612335,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
16447,0.0,7.072874,1.599119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
64957,1.0,0.0,1.449339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
100052,0.0,0.659919,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
28800,0.0,1.178138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0


In [27]:
X_train.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,b'tcp',b'http',b'SF',,53508.0,b'0',0.0,0.0,0.0,...,9.0,255.0,1.0,0.0,0.11,0.03,0.0,0.0,0.0,0.0
31899,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.0,0.0,1.0,1.0,0.0,0.0
108116,0.0,b'tcp',b'http',b'SF',304.0,,b'0',0.0,0.0,0.0,...,39.0,255.0,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',8.0,0.0,b'0',0.0,0.0,0.0,...,2.0,7.0,1.0,0.0,1.0,0.57,0.0,0.0,0.0,0.0
98007,0.0,b'udp',b'domain_u',b'SF',46.0,139.0,b'0',0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
16447,0.0,b'tcp',b'smtp',b'SF',1790.0,363.0,b'0',0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.0,0.0,0.0,0.0
64957,1.0,b'tcp',b'smtp',b'SF',,329.0,b'0',0.0,0.0,0.0,...,198.0,181.0,0.65,0.03,0.01,0.01,0.02,0.02,0.0,0.0
100052,0.0,b'tcp',b'http',b'SF',206.0,,b'0',0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28800,0.0,b'tcp',b'ftp_data',b'SF',334.0,0.0,b'0',0.0,0.0,0.0,...,8.0,28.0,1.0,0.0,1.0,0.11,0.0,0.0,0.0,0.0
