## Description

This notebook shows some of the most commonly used techniques for transforming a data set.

### Imports

In [2]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split



### Auxiliary functions

In [3]:
def load_kdd_dataset(data_path):
    """Read an ARFF file and return its records as a pandas DataFrame.

    Args:
        data_path: Location of the ARFF file; handed directly to
            ``arff.loadarff``.

    Returns:
        A DataFrame built from the first element (the record data) of the
        pair returned by ``arff.loadarff``; the second element is discarded.
    """
    records, _metadata = arff.loadarff(data_path)
    return pd.DataFrame(records)

In [4]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    """Split ``df`` into training, validation and test subsets (60/20/20).

    The split is done in two steps with ``train_test_split``: 40% of the rows
    are held out first, and that hold-out is then divided in half.

    Args:
        df: DataFrame to split.
        rstate: Value forwarded as ``random_state`` to both splits.
        shuffle: Value forwarded as ``shuffle`` to both splits.
        stratify: Optional column selector; when truthy, ``df[stratify]`` (and
            later the hold-out's ``[stratify]``) is passed as the ``stratify``
            argument of each split. When falsy, no stratification is used.

    Returns:
        A tuple ``(train_set, val_set, test_set)``.
    """
    split_options = {"random_state": rstate, "shuffle": shuffle}

    # First cut: keep 60% for training, hold out the remaining 40%.
    labels = None
    if stratify:
        labels = df[stratify]
    train_set, holdout = train_test_split(
        df, test_size=0.4, stratify=labels, **split_options)

    # Second cut: divide the hold-out evenly into validation and test.
    labels = None
    if stratify:
        labels = holdout[stratify]
    val_set, test_set = train_test_split(
        holdout, test_size=0.5, stratify=labels, **split_options)

    return train_set, val_set, test_set

### 1. Reading the data set

In [5]:
# Load the NSL-KDD KDDTrain+ ARFF file into a DataFrame
df = load_kdd_dataset("../datasets/NSL-KDD/KDDTrain+.arff")

In [6]:
# Display the loaded DataFrame (the notebook renders a truncated view of it)
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,b'tcp',b'ftp_data',b'SF',491.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,b'normal'
1,0.0,b'udp',b'other',b'SF',146.0,0.0,b'0',0.0,0.0,0.0,...,1.0,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,b'normal'
2,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,26.0,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
3,0.0,b'tcp',b'http',b'SF',232.0,8153.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,b'normal'
4,0.0,b'tcp',b'http',b'SF',199.0,420.0,b'0',0.0,0.0,0.0,...,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,b'normal'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,25.0,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'
125969,8.0,b'udp',b'private',b'SF',105.0,145.0,b'0',0.0,0.0,0.0,...,244.0,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,b'normal'
125970,0.0,b'tcp',b'smtp',b'SF',2231.0,384.0,b'0',0.0,0.0,0.0,...,30.0,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,b'normal'
125971,0.0,b'tcp',b'klogin',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,8.0,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,b'anomaly'


### 2. Splitting of the data set

In [7]:
# Split into training/validation/test subsets, stratifying on the protocol_type column
train_set, val_set, test_set = train_val_test_split(df, stratify='protocol_type')

In [8]:
# Report how many rows ended up in each of the three subsets
for label, subset in (
    ("Training Set Length:", train_set),
    ("Validation Set Length:", val_set),
    ("Test Set Length:", test_set),
):
    print(label, len(subset))

Training Set Length: 75583
Validation Set Length: 25195
Test Set Length: 25195


### 3. Cleaning the data

Before we start, let's take the training set and separate the labels from the rest of the data, since we don't necessarily want to apply the same transformations to both.

In [9]:
# Split the training subset into the target column ("class") and the input features
y_train = train_set["class"].copy()
X_train = train_set.drop(columns=["class"])

In [10]:
# For demonstration purposes, set to NaN the src_bytes values strictly between
# 400 and 800 and the dst_bytes values strictly between 500 and 2000
src_bytes_mask = (X_train["src_bytes"] > 400) & (X_train["src_bytes"] < 800)
dst_bytes_mask = (X_train["dst_bytes"] > 500) & (X_train["dst_bytes"] < 2000)
X_train.loc[src_bytes_mask, "src_bytes"] = np.nan
X_train.loc[dst_bytes_mask, "dst_bytes"] = np.nan
X_train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,b'tcp',b'http',b'SF',,53508.0,b'0',0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,b'tcp',b'http',b'SF',304.0,,b'0',0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',8.0,0.0,b'0',0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,b'tcp',b'systat',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,b'tcp',b'http',b'SF',210.0,,b'0',0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,b'tcp',b'smtp',b'SF',889.0,328.0,b'0',0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,b'tcp',b'http',b'SF',284.0,444.0,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


Most machine learning algorithms cannot work with features that contain null values. There are three common options for dealing with them:

* Delete the corresponding rows
* Delete the corresponding attribute (column)
* Fill them with a given value (zero, mean...)

In [11]:
# For each attribute (column), report whether it contains at least one null value
X_train.isna().any()

duration                       False
protocol_type                  False
service                        False
flag                           False
src_bytes                       True
dst_bytes                       True
land                           False
wrong_fragment                 False
urgent                         False
hot                            False
num_failed_logins              False
logged_in                      False
num_compromised                False
root_shell                     False
su_attempted                   False
num_root                       False
num_file_creations             False
num_shells                     False
num_access_files               False
num_outbound_cmds              False
is_host_login                  False
is_guest_login                 False
count                          False
srv_count                      False
serror_rate                    False
srv_serror_rate                False
rerror_rate                    False
s

In [12]:
# Keep only the rows in which at least one attribute is null
filas_valores_nulos = X_train.loc[X_train.isna().any(axis="columns")]
filas_valores_nulos

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,b'tcp',b'http',b'SF',,53508.0,b'0',0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
108116,0.0,b'tcp',b'http',b'SF',304.0,,b'0',0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
64957,1.0,b'tcp',b'smtp',b'SF',,329.0,b'0',0.0,0.0,0.0,...,198.0,181.0,0.65,0.03,0.01,0.01,0.02,0.02,0.0,0.0
100052,0.0,b'tcp',b'http',b'SF',206.0,,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
99158,0.0,b'tcp',b'http',b'SF',291.0,,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117260,0.0,b'tcp',b'http',b'SF',321.0,,b'0',0.0,0.0,0.0,...,2.0,255.0,1.00,0.00,0.50,0.02,0.00,0.00,0.0,0.0
110723,0.0,b'tcp',b'http',b'SF',361.0,,b'0',0.0,0.0,0.0,...,40.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
58053,0.0,b'tcp',b'http',b'SF',202.0,,b'0',0.0,0.0,0.0,...,83.0,255.0,1.00,0.00,0.01,0.01,0.00,0.00,0.0,0.0
70184,0.0,b'tcp',b'http',b'SF',315.0,,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


#### Option 1: We delete the rows with null values

In [13]:
# Work on a copy so that X_train itself is left unmodified by the row removal below
X_train_copy = X_train.copy()

In [14]:
# Drop every row that has a null value in src_bytes or dst_bytes
X_train_copy = X_train_copy.dropna(subset=["src_bytes", "dst_bytes"])
X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
31899,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.0,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.0,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',8.0,0.0,b'0',0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.0,0.0,0.0
98007,0.0,b'udp',b'domain_u',b'SF',46.0,139.0,b'0',0.0,0.0,0.0,...,255.0,254.0,1.00,0.01,0.00,0.00,0.00,0.0,0.0,0.0
16447,0.0,b'tcp',b'smtp',b'SF',1790.0,363.0,b'0',0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.00,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90665,0.0,b'tcp',b'ftp_data',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,63.0,0.25,0.02,0.02,0.00,1.00,1.0,0.0,0.0
64559,0.0,b'tcp',b'systat',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.0,0.0,0.0
32452,3.0,b'tcp',b'smtp',b'SF',889.0,328.0,b'0',0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.0,0.0,0.0
112657,0.0,b'tcp',b'http',b'SF',284.0,444.0,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.0,0.0,0.0


In [15]:
# Compare row counts before and after to see how many rows were dropped
print("The number of rows removed is:", X_train.shape[0] - X_train_copy.shape[0])

The number of rows removed is: 9886


#### Option 2: We remove the attributes with null values

In [16]:
# Work on a copy so that X_train itself is left unmodified by the column removal below
X_train_copy = X_train.copy()

In [17]:
# Drop the two attributes (columns) that contain null values
X_train_copy = X_train_copy.drop(columns=["src_bytes", "dst_bytes"])
X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,b'tcp',b'http',b'SF',b'0',0.0,0.0,0.0,0.0,b'1',...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,b'tcp',b'private',b'S0',b'0',0.0,0.0,0.0,0.0,b'0',...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,b'tcp',b'http',b'SF',b'0',0.0,0.0,0.0,0.0,b'1',...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',b'0',0.0,0.0,0.0,0.0,b'0',...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',b'0',0.0,0.0,0.0,0.0,b'0',...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,b'tcp',b'systat',b'S0',b'0',0.0,0.0,0.0,0.0,b'0',...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,b'tcp',b'http',b'SF',b'0',0.0,0.0,0.0,0.0,b'1',...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,b'tcp',b'smtp',b'SF',b'0',0.0,0.0,0.0,0.0,b'1',...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,b'tcp',b'http',b'SF',b'0',0.0,0.0,0.0,0.0,b'1',...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


In [18]:
# Compare column counts before and after to see how many attributes were dropped
print("The number of attributes removed is:", X_train.shape[1] - X_train_copy.shape[1])

The number of attributes removed is: 2


#### Option 3: We fill the null values with a certain value

In [19]:
# Work on a copy so that X_train itself is left unmodified by the mean fill below
X_train_copy = X_train.copy()

In [20]:
# Fill the null values of each attribute with that attribute's mean
# (Series.mean() skips NaN values by default).
media_srcbytes = X_train_copy["src_bytes"].mean()
media_dstbytes = X_train_copy["dst_bytes"].mean()

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on the selected column: that chained in-place form triggers a warning in
# newer pandas versions and has no effect under Copy-on-Write.
X_train_copy["src_bytes"] = X_train_copy["src_bytes"].fillna(media_srcbytes)
X_train_copy["dst_bytes"] = X_train_copy["dst_bytes"].fillna(media_dstbytes)

X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,b'tcp',b'http',b'SF',66914.530762,53508.000000,b'0',0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,b'tcp',b'private',b'S0',0.000000,0.000000,b'0',0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,b'tcp',b'http',b'SF',304.000000,9181.334754,b'0',0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',0.000000,0.000000,b'0',0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',8.000000,0.000000,b'0',0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,b'tcp',b'systat',b'S0',0.000000,0.000000,b'0',0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,b'tcp',b'http',b'SF',210.000000,9181.334754,b'0',0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,b'tcp',b'smtp',b'SF',889.000000,328.000000,b'0',0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,b'tcp',b'http',b'SF',284.000000,444.000000,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


In [21]:
# Work on a fresh copy of X_train (nulls still present) for the median fill below
X_train_copy = X_train.copy()

In [22]:
# A few very large values in an attribute can skew its mean,
# so fill the null values with the median instead.
mediana_srcbytes = X_train_copy["src_bytes"].median()
mediana_dstbytes = X_train_copy["dst_bytes"].median()

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on the selected column: that chained in-place form triggers a warning in
# newer pandas versions and has no effect under Copy-on-Write.
X_train_copy["src_bytes"] = X_train_copy["src_bytes"].fillna(mediana_srcbytes)
X_train_copy["dst_bytes"] = X_train_copy["dst_bytes"].fillna(mediana_dstbytes)

X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,b'tcp',b'http',b'SF',43.0,53508.0,b'0',0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,b'tcp',b'http',b'SF',304.0,0.0,b'0',0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,b'tcp',b'private',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,b'icmp',b'eco_i',b'SF',8.0,0.0,b'0',0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,b'tcp',b'systat',b'S0',0.0,0.0,b'0',0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,b'tcp',b'http',b'SF',210.0,0.0,b'0',0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,b'tcp',b'smtp',b'SF',889.0,328.0,b'0',0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,b'tcp',b'http',b'SF',284.0,444.0,b'0',0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


#### An alternative way to apply option 3 is to use sklearn's `SimpleImputer` class

In [23]:
# Work on a fresh copy of X_train (nulls still present) for the imputer example below
X_train_copy = X_train.copy()

In [24]:
from sklearn.impute import SimpleImputer

# Create an imputer configured with the "median" strategy
imputer = SimpleImputer(strategy="median")

In [25]:
# The median imputer does not support categorical values, so keep only the
# columns whose dtype is not 'object' and summarize the resulting frame
X_train_copy_num = X_train_copy.select_dtypes(exclude=['object'])
X_train_copy_num.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75583 entries, 113467 to 99030
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     75583 non-null  float64
 1   src_bytes                    73696 non-null  float64
 2   dst_bytes                    67572 non-null  float64
 3   wrong_fragment               75583 non-null  float64
 4   urgent                       75583 non-null  float64
 5   hot                          75583 non-null  float64
 6   num_failed_logins            75583 non-null  float64
 7   num_compromised              75583 non-null  float64
 8   root_shell                   75583 non-null  float64
 9   su_attempted                 75583 non-null  float64
 10  num_root                     75583 non-null  float64
 11  num_file_creations           75583 non-null  float64
 12  num_shells                   75583 non-null  float64
 13  num_access_

In [26]:
# Fit the imputer on the numerical attributes so it can compute the fill values
imputer.fit(X_train_copy_num)

SimpleImputer(strategy='median')

In [27]:
# Fill in the null values using the fitted imputer
X_train_copy_num_nonan = imputer.transform(X_train_copy_num)

In [28]:
# Wrap the imputer's result in a pandas DataFrame, restoring the column names.
# NOTE(review): no index is passed, so the rows get a fresh 0..n-1 index instead
# of X_train_copy_num's original labels; pass index=X_train_copy_num.index if
# alignment with y_train is needed later — confirm intent.
X_train_copy = pd.DataFrame(X_train_copy_num_nonan, columns=X_train_copy_num.columns)

In [29]:
# Show the first 10 rows of the imputed numerical DataFrame
X_train_copy.head(10)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0.0,43.0,53508.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,255.0,1.0,0.0,0.11,0.03,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.0,0.0,1.0,1.0,0.0,0.0
2,0.0,304.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.0,255.0,1.0,0.0,0.03,0.06,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0
4,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,7.0,1.0,0.0,1.0,0.57,0.0,0.0,0.0,0.0
5,0.0,46.0,139.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,1790.0,363.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.0,0.0,0.0,0.0
7,1.0,43.0,329.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,198.0,181.0,0.65,0.03,0.01,0.01,0.02,0.02,0.0,0.0
8,0.0,206.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,334.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,28.0,1.0,0.0,1.0,0.11,0.0,0.0,0.0,0.0


#### sklearn APIs