### Description

This notebook shows techniques for evaluating the results of a prediction with a Machine Learning algorithm.

### Imports

In [7]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin



### Auxiliary functions

In [12]:
def load_kdd_dataset2(data_path):
    data = arff.loadarff(data_path)
    df = pd.DataFrame(data[0])
    return df

In [12]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

In [13]:
# Create a pipeline for numeric attributes
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('rbst_scaler', RobustScaler()),
    ])

In [14]:
# Transformer to modify only the categorical columns and return a df
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._oh = OneHotEncoder(sparse=False)
        self._columns = None
    def fit(self, X, y=None):
        X_cat = X.select_dtypes(include=['object'])
        self._columns = pd.get_dummies(X_cat).columns
        self._oh.fit(X_cat)
        return self
    def transform(self, X, y=None):
        X_copy = X.copy()
        X_cat = X_copy.select_dtypes(include=['object'])
        X_num = X_copy.select_dtypes(exclude=['object'])
        X_cat_oh = self._oh.transform(X_cat)
        X_cat_oh = pd.DataFrame(X_cat_oh, 
                                columns=self._columns, 
                                index=X_copy.index)
        X_copy.drop(list(X_cat), axis=1, inplace=True)
        return X_copy.join(X_cat_oh)

In [15]:
# Transformer that prepares the entire dataset by calling pipelines and custom transformers
class DataFramePreparer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self._full_pipeline = None
        self._columns = None
    def fit(self, X, y=None):
        num_attribs = list(X.select_dtypes(exclude=['object']))
        cat_attribs = list(X.select_dtypes(include=['object']))
        self._full_pipeline = ColumnTransformer([
                ("num", num_pipeline, num_attribs),
                ("cat", CustomOneHotEncoder(), cat_attribs),
        ])
        self._full_pipeline.fit(X)
        self._columns = pd.get_dummies(X).columns
        return self
    def transform(self, X, y=None):
        X_copy = X.copy()
        X_prep = self._full_pipeline.transform(X_copy)
        return pd.DataFrame(X_prep, 
                            columns=self._columns, 
                            index=X_copy.index)

### Reading the data set

In [45]:
import arff
def load_kdd_dataset(data_path):
    """Lectura del conjunto de datos NSL-KDD."""
    with open(data_path, 'r') as train_set:
        dataset = arff.load(train_set)
    attributes = [attr[0] for attr in dataset["attributes"]]
    return pd.DataFrame(dataset["data"], columns=attributes)

In [2]:
data, meta = load_kdd_dataset("../datasets/NSL-KDD/KDDTrain+.arff")

In [3]:
from scipy.io import arff
def load_kdd_dataset(data_path):
    data, meta = arff.loadarff(data_path)
    
    return data, meta

In [None]:
data, meta = load_kdd_dataset("../datasets/NSL-KDD/KDDTrain+.arff")

In [5]:
type(meta)

scipy.io.arff.arffread.MetaData

In [8]:
df = pd.DataFrame(data, columns = meta)

In [9]:
df

'KDDTrain',duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,b'normal'
1,0.0,b'udp',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,b'normal'
2,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
3,0.0,b'tcp',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,b'normal'
4,0.0,b'tcp',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,b'normal'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
125969,8.0,b'udp',b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,b'normal'
125970,0.0,b'tcp',b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,b'normal'
125971,0.0,b'tcp',b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'


In [10]:
columns_array = []
for i in meta:
    columns_array.append(i)

df2 = pd.DataFrame(data, columns = columns_array)

In [11]:
df2

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,b'normal'
1,0.0,b'udp',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,b'normal'
2,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
3,0.0,b'tcp',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,b'normal'
4,0.0,b'tcp',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,b'normal'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
125969,8.0,b'udp',b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,b'normal'
125970,0.0,b'tcp',b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,b'normal'
125971,0.0,b'tcp',b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'


In [13]:
df3 = load_kdd_dataset2("../datasets/NSL-KDD/KDDTrain+.arff")

In [14]:
df3

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,b'normal'
1,0.0,b'udp',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,b'normal'
2,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
3,0.0,b'tcp',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,b'normal'
4,0.0,b'tcp',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,b'normal'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
125969,8.0,b'udp',b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,b'normal'
125970,0.0,b'tcp',b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,b'normal'
125971,0.0,b'tcp',b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'


In [23]:
meta

Dataset: 'KDDTrain'
	duration's type is numeric
	protocol_type's type is nominal, range is ('tcp', 'udp', 'icmp')
	service's type is nominal, range is ('aol', 'auth', 'bgp', 'courier', 'csnet_ns', 'ctf', 'daytime', 'discard', 'domain', 'domain_u', 'echo', 'eco_i', 'ecr_i', 'efs', 'exec', 'finger', 'ftp', 'ftp_data', 'gopher', 'harvest', 'hostnames', 'http', 'http_2784', 'http_443', 'http_8001', 'imap4', 'IRC', 'iso_tsap', 'klogin', 'kshell', 'ldap', 'link', 'login', 'mtp', 'name', 'netbios_dgm', 'netbios_ns', 'netbios_ssn', 'netstat', 'nnsp', 'nntp', 'ntp_u', 'other', 'pm_dump', 'pop_2', 'pop_3', 'printer', 'private', 'red_i', 'remote_job', 'rje', 'shell', 'smtp', 'sql_net', 'ssh', 'sunrpc', 'supdup', 'systat', 'telnet', 'tftp_u', 'tim_i', 'time', 'urh_i', 'urp_i', 'uucp', 'uucp_path', 'vmnet', 'whois', 'X11', 'Z39_50')
	flag's type is nominal, range is ('OTH', 'REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF', 'SH')
	src_bytes's type is numeric
	dst_bytes's type is numeric

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  float64
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  float64
 5   dst_bytes                    125973 non-null  float64
 6   land                         125973 non-null  object 
 7   wrong_fragment               125973 non-null  float64
 8   urgent                       125973 non-null  float64
 9   hot                          125973 non-null  float64
 10  num_failed_logins            125973 non-null  float64
 11  logged_in                    125973 non-null  object 
 12  num_compromised              125973 non-null  float64
 13 

In [28]:
X_cat = df.select_dtypes(include=['object'])
X_cat

'KDDTrain',protocol_type,service,flag,land,logged_in,is_host_login,is_guest_login,class
0,b'tcp',b'ftp_data',b'SF',b'0',b'0',b'0',b'0',b'normal'
1,b'udp',b'other',b'SF',b'0',b'0',b'0',b'0',b'normal'
2,b'tcp',b'private',b'S0',b'0',b'0',b'0',b'0',b'anomaly'
3,b'tcp',b'http',b'SF',b'0',b'1',b'0',b'0',b'normal'
4,b'tcp',b'http',b'SF',b'0',b'1',b'0',b'0',b'normal'
...,...,...,...,...,...,...,...,...
125968,b'tcp',b'private',b'S0',b'0',b'0',b'0',b'0',b'anomaly'
125969,b'udp',b'private',b'SF',b'0',b'0',b'0',b'0',b'normal'
125970,b'tcp',b'smtp',b'SF',b'0',b'1',b'0',b'0',b'normal'
125971,b'tcp',b'klogin',b'S0',b'0',b'0',b'0',b'0',b'anomaly'


In [29]:
X_cat2 = df2.select_dtypes(include=['object'])
X_cat2

Unnamed: 0,protocol_type,service,flag,land,logged_in,is_host_login,is_guest_login,class
0,b'tcp',b'ftp_data',b'SF',b'0',b'0',b'0',b'0',b'normal'
1,b'udp',b'other',b'SF',b'0',b'0',b'0',b'0',b'normal'
2,b'tcp',b'private',b'S0',b'0',b'0',b'0',b'0',b'anomaly'
3,b'tcp',b'http',b'SF',b'0',b'1',b'0',b'0',b'normal'
4,b'tcp',b'http',b'SF',b'0',b'1',b'0',b'0',b'normal'
...,...,...,...,...,...,...,...,...
125968,b'tcp',b'private',b'S0',b'0',b'0',b'0',b'0',b'anomaly'
125969,b'udp',b'private',b'SF',b'0',b'0',b'0',b'0',b'normal'
125970,b'tcp',b'smtp',b'SF',b'0',b'1',b'0',b'0',b'normal'
125971,b'tcp',b'klogin',b'S0',b'0',b'0',b'0',b'0',b'anomaly'


In [40]:
x_columns = df2.select_dtypes(include=['object']).columns

In [41]:
x_columns

Index(['protocol_type', 'service', 'flag', 'land', 'logged_in',
       'is_host_login', 'is_guest_login', 'class'],
      dtype='object')

In [44]:
for i in x_columns:
    df3[i] = df3[i].str.decode('utf-8') 

In [38]:
df2['protocol_type'] = df2['protocol_type'].str.decode('utf-8') 
df2['protocol_type'] = df2['protocol_type'].str.decode('utf-8') 

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,tcp,b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,b'normal'
1,0.0,udp,b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,b'normal'
2,0.0,tcp,b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
3,0.0,tcp,b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,b'normal'
4,0.0,tcp,b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,b'normal'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,tcp,b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
125969,8.0,udp,b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,b'normal'
125970,0.0,tcp,b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,b'normal'
125971,0.0,tcp,b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'


In [43]:
df3

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,b'normal'
1,0.0,b'udp',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,b'normal'
2,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
3,0.0,b'tcp',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,b'normal'
4,0.0,b'tcp',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,b'normal'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
125969,8.0,b'udp',b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,b'normal'
125970,0.0,b'tcp',b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,b'normal'
125971,0.0,b'tcp',b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'


In [45]:
df3

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,tcp,ftp_data,SF,491.0,0.0,0,0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0.0,udp,other,SF,146.0,0.0,0,0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,0.0,tcp,http,SF,232.0,8153.0,0,0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0.0,tcp,http,SF,199.0,420.0,0,0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
125969,8.0,udp,private,SF,105.0,145.0,0,0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal
125970,0.0,tcp,smtp,SF,2231.0,384.0,0,0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal
125971,0.0,tcp,klogin,S0,0.0,0.0,0,0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


In [27]:
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,b'normal'
1,0.0,b'udp',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,b'normal'
2,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
3,0.0,b'tcp',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,b'normal'
4,0.0,b'tcp',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,b'normal'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
125969,8.0,b'udp',b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,b'normal'
125970,0.0,b'tcp',b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,b'normal'
125971,0.0,b'tcp',b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'


In [1]:
from scipy.io import arff
def load_kdd_dataset(data_path):
    data, meta = arff.loadarff(data_path)
    
    return data, meta

In [68]:
data['protocol_type']

array([b'tcp', b'udp', b'tcp', ..., b'tcp', b'tcp', b'tcp'], dtype='|S4')

In [69]:
data

array([(0., b'tcp', b'ftp_data', b'SF',  491.,   0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0',   2., 2., 0., 0., 0., 0., 1.  , 0.  , 0., 150., 25., 0.17, 0.03, 0.17, 0., 0.  , 0., 0.05, 0., b'normal'),
       (0., b'udp', b'other', b'SF',  146.,   0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0',  13., 1., 0., 0., 0., 0., 0.08, 0.15, 0., 255.,  1., 0.  , 0.6 , 0.88, 0., 0.  , 0., 0.  , 0., b'normal'),
       (0., b'tcp', b'private', b'S0',    0.,   0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0', 123., 6., 1., 1., 0., 0., 0.05, 0.07, 0., 255., 26., 0.1 , 0.05, 0.  , 0., 1.  , 1., 0.  , 0., b'anomaly'),
       ...,
       (0., b'tcp', b'smtp', b'SF', 2231., 384., b'0', 0., 0., 0., 0., b'1', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0',   1., 1., 0., 0., 0., 0., 1.  , 0.  , 0., 255., 30., 0.12, 0.06, 0.  , 0., 0.72, 0., 0.01, 0., b'normal'),
       (0., b'tcp', b'klogin', b'S0',    0.,   0., b'0', 0., 0., 0.

In [60]:
type(meta)

scipy.io.arff.arffread.MetaData

In [70]:
type(data)

numpy.ndarray

In [114]:
j = 0
for i in data:    
    data[j][1] = data[j][1].decode('UTF-8')
    print(data[j][1])
    print(type(data[j][1]))    
    j+=1
    if j > 10:
        break

b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>
b'0'
<class 'numpy.bytes_'>


In [108]:
j = 0
for i in data:
    print(i)
    j+=1
    if j > 5:
        break

(0., b'tcp', b'ftp_data', b'SF', 491., 0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0', 2., 2., 0., 0., 0., 0., 1., 0., 0., 150., 25., 0.17, 0.03, 0.17, 0., 0., 0., 0.05, 0., b'normal')
(0., b'udp', b'other', b'SF', 146., 0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0', 13., 1., 0., 0., 0., 0., 0.08, 0.15, 0., 255., 1., 0., 0.6, 0.88, 0., 0., 0., 0., 0., b'normal')
(0., b'tcp', b'private', b'S0', 0., 0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0', 123., 6., 1., 1., 0., 0., 0.05, 0.07, 0., 255., 26., 0.1, 0.05, 0., 0., 1., 1., 0., 0., b'anomaly')
(0., b'tcp', b'http', b'SF', 232., 8153., b'0', 0., 0., 0., 0., b'1', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0', 5., 5., 0.2, 0.2, 0., 0., 1., 0., 0., 30., 255., 1., 0., 0.03, 0.04, 0.03, 0.01, 0., 0.01, b'normal')
(0., b'tcp', b'http', b'SF', 199., 420., b'0', 0., 0., 0., 0., b'1', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0', 30., 32., 0., 0., 0., 0., 1., 0., 0.0

In [76]:
if "b" in data:
    print(':V')

  """Entry point for launching an IPython kernel.


In [99]:
import numpy as np

my_array = np.array([(1,2,3,4, "b'paco'"),
                    (5,6,7,8, 'juan')])

print(my_array)
print(type(my_array))
print(type(my_array[0][4]))
print(type(my_array[1][4]))
print(my_array[0][4])

[['1' '2' '3' '4' "b'paco'"]
 ['5' '6' '7' '8' 'juan']]
<class 'numpy.ndarray'>
<class 'numpy.str_'>
<class 'numpy.str_'>
b'paco'


In [126]:
data[0][1].decode('UTF-8')
data[0][1] = data[0][1].decode('UTF-8')
data[0][1]

b'0'

In [164]:
my_array = np.array([(1,2,3,4, "b'paco'"),
                    (5,6,7,8, 'juan')])

my_array[0][2] = int(my_array[0][2]) + 2

my_array

array([['1', '2', '5', '4', "b'paco'"],
       ['5', '6', '7', '8', 'juan']], dtype='<U21')

In [138]:
data

array([(0., b'0', b'ftp_data', b'SF',  491.,   0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0',   2., 2., 0., 0., 0., 0., 1.  , 0.  , 0., 150., 25., 0.17, 0.03, 0.17, 0., 0.  , 0., 0.05, 0., b'normal'),
       (0., b'0', b'other', b'SF',  146.,   0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0',  13., 1., 0., 0., 0., 0., 0.08, 0.15, 0., 255.,  1., 0.  , 0.6 , 0.88, 0., 0.  , 0., 0.  , 0., b'normal'),
       (0., b'0', b'private', b'S0',    0.,   0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0', 123., 6., 1., 1., 0., 0., 0.05, 0.07, 0., 255., 26., 0.1 , 0.05, 0.  , 0., 1.  , 1., 0.  , 0., b'anomaly'),
       ...,
       (0., b'tcp', b'smtp', b'SF', 2231., 384., b'0', 0., 0., 0., 0., b'1', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0',   1., 1., 0., 0., 0., 0., 1.  , 0.  , 0., 255., 30., 0.12, 0.06, 0.  , 0., 0.72, 0., 0.01, 0., b'normal'),
       (0., b'tcp', b'klogin', b'S0',    0.,   0., b'0', 0., 0., 0., 0., 

In [153]:
print(data.dtype)



[('duration', '<f8'), ('protocol_type', 'S4'), ('service', 'S11'), ('flag', 'S6'), ('src_bytes', '<f8'), ('dst_bytes', '<f8'), ('land', 'S1'), ('wrong_fragment', '<f8'), ('urgent', '<f8'), ('hot', '<f8'), ('num_failed_logins', '<f8'), ('logged_in', 'S1'), ('num_compromised', '<f8'), ('root_shell', '<f8'), ('su_attempted', '<f8'), ('num_root', '<f8'), ('num_file_creations', '<f8'), ('num_shells', '<f8'), ('num_access_files', '<f8'), ('num_outbound_cmds', '<f8'), ('is_host_login', 'S1'), ('is_guest_login', 'S1'), ('count', '<f8'), ('srv_count', '<f8'), ('serror_rate', '<f8'), ('srv_serror_rate', '<f8'), ('rerror_rate', '<f8'), ('srv_rerror_rate', '<f8'), ('same_srv_rate', '<f8'), ('diff_srv_rate', '<f8'), ('srv_diff_host_rate', '<f8'), ('dst_host_count', '<f8'), ('dst_host_srv_count', '<f8'), ('dst_host_same_srv_rate', '<f8'), ('dst_host_diff_srv_rate', '<f8'), ('dst_host_same_src_port_rate', '<f8'), ('dst_host_srv_diff_host_rate', '<f8'), ('dst_host_serror_rate', '<f8'), ('dst_host_srv_

In [144]:
type(data)

numpy.ndarray

In [142]:
data.tolist()

[(0.0,
  b'0',
  b'ftp_data',
  b'SF',
  491.0,
  0.0,
  b'0',
  0.0,
  0.0,
  0.0,
  0.0,
  b'0',
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  b'0',
  b'0',
  2.0,
  2.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  150.0,
  25.0,
  0.17,
  0.03,
  0.17,
  0.0,
  0.0,
  0.0,
  0.05,
  0.0,
  b'normal'),
 (0.0,
  b'0',
  b'other',
  b'SF',
  146.0,
  0.0,
  b'0',
  0.0,
  0.0,
  0.0,
  0.0,
  b'0',
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  b'0',
  b'0',
  13.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.08,
  0.15,
  0.0,
  255.0,
  1.0,
  0.0,
  0.6,
  0.88,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  b'normal'),
 (0.0,
  b'0',
  b'private',
  b'S0',
  0.0,
  0.0,
  b'0',
  0.0,
  0.0,
  0.0,
  0.0,
  b'0',
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  b'0',
  b'0',
  123.0,
  6.0,
  1.0,
  1.0,
  0.0,
  0.0,
  0.05,
  0.07,
  0.0,
  255.0,
  26.0,
  0.1,
  0.05,
  0.0,
  0.0,
  1.0,
  1.0,
  0.0,
  0.0,
  b'anomaly'),
 (0.0,
  b'0',
  b'http',
  b'

In [159]:
my_array2 = []
for i in meta:
    my_array2.append(i)
    

In [165]:
my_array3 = [attri[0] for attri in meta]

In [166]:
my_array3

['d',
 'p',
 's',
 'f',
 's',
 'd',
 'l',
 'w',
 'u',
 'h',
 'n',
 'l',
 'n',
 'r',
 's',
 'n',
 'n',
 'n',
 'n',
 'n',
 'i',
 'i',
 'c',
 's',
 's',
 's',
 'r',
 's',
 's',
 'd',
 's',
 'd',
 'd',
 'd',
 'd',
 'd',
 'd',
 'd',
 'd',
 'd',
 'd',
 'c']

In [157]:
type(meta)

scipy.io.arff.arffread.MetaData

In [160]:
my_array2

['duration',
 'protocol_type',
 'service',
 'flag',
 'src_bytes',
 'dst_bytes',
 'land',
 'wrong_fragment',
 'urgent',
 'hot',
 'num_failed_logins',
 'logged_in',
 'num_compromised',
 'root_shell',
 'su_attempted',
 'num_root',
 'num_file_creations',
 'num_shells',
 'num_access_files',
 'num_outbound_cmds',
 'is_host_login',
 'is_guest_login',
 'count',
 'srv_count',
 'serror_rate',
 'srv_serror_rate',
 'rerror_rate',
 'srv_rerror_rate',
 'same_srv_rate',
 'diff_srv_rate',
 'srv_diff_host_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_srv_rate',
 'dst_host_diff_srv_rate',
 'dst_host_same_src_port_rate',
 'dst_host_srv_diff_host_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'dst_host_rerror_rate',
 'dst_host_srv_rerror_rate',
 'class']

In [161]:
df = pd.DataFrame(data, columns = my_array2)

In [162]:
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'0',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,b'normal'
1,0.0,b'0',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,b'normal'
2,0.0,b'0',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
3,0.0,b'0',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,b'normal'
4,0.0,b'0',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,b'normal'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
125969,8.0,b'udp',b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,b'normal'
125970,0.0,b'tcp',b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,b'normal'
125971,0.0,b'tcp',b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'


### Splitting of the data set

In [13]:
train_set, val_set, test_set = train_val_test_split(df)

In [14]:
print("Training Set Length:", len(train_set))
print("Validation Set Length:", len(val_set))
print("Test Set Length:", len(test_set))

Training Set Length: 75583
Validation Set Length: 25195
Test Set Length: 25195


For each of the subsets, we separate the labels from the input features.

In [38]:
# Overall data set
X_df = df.drop("class", axis=1)
y_df = df["class"].copy()

In [39]:
# Training data set
X_train = train_set.drop("class", axis=1)
y_train = train_set["class"].copy()

In [40]:
# Validation dataset
X_val = val_set.drop("class", axis=1)
y_val = val_set["class"].copy()

In [41]:
# test data set
X_test = test_set.drop("class", axis=1)
y_test = test_set["class"].copy()

### Data set preparation

In [42]:
# Instantiate our custom transformer
data_preparer = DataFramePreparer()

In [43]:
# We apply the fit with the general data set so that it acquires all possible values
data_preparer.fit(X_df)

DataFramePreparer()

In [44]:
# We transform the training data subset
X_train_prep = data_preparer.transform(X_train)

In [22]:
X_train.head(5)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
98320,0.0,b'icmp',b'ecr_i',b'SF',1032.0,0.0,b'0',0.0,0.0,0.0,...,210.0,65.0,0.31,0.01,0.31,0.0,0.0,0.0,0.0,0.0
8590,0.0,b'tcp',b'smtp',b'SF',1762.0,331.0,b'0',0.0,0.0,0.0,...,30.0,122.0,0.73,0.07,0.03,0.02,0.0,0.0,0.0,0.0
91385,0.0,b'icmp',b'eco_i',b'SF',8.0,0.0,b'0',0.0,0.0,0.0,...,2.0,126.0,1.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0
54349,0.0,b'tcp',b'csnet_ns',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,18.0,0.07,0.07,0.0,0.0,1.0,1.0,0.0,0.0
69568,0.0,b'tcp',b'smtp',b'SF',1518.0,342.0,b'0',0.0,0.0,0.0,...,83.0,125.0,0.66,0.05,0.01,0.02,0.0,0.0,0.0,0.0


In [23]:
X_train_prep.head(5)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag_b'SF',flag_b'SH',land_b'0',land_b'1',logged_in_b'0',logged_in_b'1',is_host_login_b'0',is_host_login_b'1',is_guest_login_b'0',is_guest_login_b'1'
98320,0.0,3.57971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
8590,0.0,6.224638,0.641473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
91385,0.0,-0.130435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
54349,0.0,-0.15942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
69568,0.0,5.34058,0.662791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [45]:
# We transform the validation data subset
X_val_prep = data_preparer.transform(X_val)

## Training of a Logistic Regression algorithm

Instantiating a Machine Learning algorithm using Sklearn is done using the methods exposed by the sklearn API as presented in previous notebooks.

In [47]:
# We train an algorithm based on logistic regression
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=5000)
clf.fit(X_train, y_train)

ValueError: could not convert string to float: b'icmp'