# EDA for network dataframe: Traffic Data from Kyoto University's Honeypots

## Load libreraries

In [1]:
import ipaddress
import os
import shutil
from zipfile import ZipFile

import pandas as pd
import requests as req
from tqdm import tqdm
from ydata_profiling import ProfileReport
from pycaret.classification import (
    setup,
    compare_models,
    create_model,
    evaluate_model,
    get_config,
    tune_model,
)
from sklearn.decomposition import PCA

### Reading and working with data

In [2]:
# Variables to data
FOLDER_RAW:str = '../data/raw/'
YEAR:int = 2007
MONTH:int = 1
DAYS:list[int] = range(1, 2)

print(list(DAYS))

[1]


In [3]:
# Download file
FILE = f"{YEAR}{MONTH:02d}.zip"
URL_PATH = f"http://www.takakura.com/Kyoto_data/new_data201704/{YEAR}/{FILE}"
PATH = "../data/raw"

# Check if file exists
if not os.path.exists(f"{PATH}/{FILE}"):
    print("Download files:")
    resp=req.get(URL_PATH, stream=True)
    save_path=f"{PATH}/{FILE}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                        desc=FILE,
                        postfix=f"save to {save_path}",
                        total=int(resp.headers["Content-Length"])):
            handle.write(data)

    # Unzip file
    with ZipFile(f"{PATH}/{FILE}", 'r') as zip_ref:
        zip_ref.extractall(f"{PATH}")
        os.remove(f"{PATH}/{FILE}")

Download files:


200701.zip: 100%|██████████| 43382104/43382104 [07:58<00:00, 90709.61it/s, save to ../data/raw/200701.zip]


In [4]:
# Move file to the correct folder
if os.path.exists(f"{PATH}/Kyoto2016"):
    print("Move files:")
    shutil.move(f"{PATH}/Kyoto2016/{YEAR}", f"{PATH}/{YEAR}")
    os.removedirs(f"{PATH}/Kyoto2016/")

Move files:


In [5]:
# Load dataset
headers = [
    "duration",
    "service",
    "source_bytes",
    "destination_bytes",
    "count",
    "same_srv_rate",
    "serror_rate",
    "srv_serror_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_src_port_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "flag",
    "ids_detection",
    "malware_detection",
    "ashula_detection",
    "label",
    "source_ip_address",
    "source_port_number",
    "destination_ip_address",
    "destination_port_number",
    "start_time",
    "protocol",
]

df_list = []

for DAY in DAYS:
    path: str = f"{FOLDER_RAW}{YEAR:04d}/{MONTH:02d}/{YEAR:04d}{MONTH:02d}{DAY:02d}.txt"

    if not os.path.exists(path):
        continue

    df_dirty = pd.read_csv(path, sep="\t", header=None)
    df_list.append(df_dirty)

df = pd.concat(df_list)
print(len(headers), len(df.columns))
df.columns = headers

with pd.option_context(
    'display.max_colwidth', None,
    'display.max_columns', None,
    'display.max_rows', None):
    display(df.head())

# Preclean data
del df["ids_detection"]

24 24


Unnamed: 0,duration,service,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_srv_serror_rate,flag,ids_detection,malware_detection,ashula_detection,label,source_ip_address,source_port_number,destination_ip_address,destination_port_number,start_time,protocol
0,86364.573924,other,240680,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,0,-1,fda2:69aa:1f1a:2d57:7da5:27fc:07e8:2808,32770,fda2:69aa:1f1a:425e:1046:01b0:02d4:2adb,8649,00:00:18,udp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,60(2),-2,fda2:69aa:1f1a:509a:0b19:590a:0528:2375,1050,fda2:69aa:1f1a:f505:7df6:2782:60e4:44d6,1434,00:00:27,udp
2,0.00334,other,48,48,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SF,0,0,0,-1,fda2:69aa:1f1a:232a:7a25:0083:5f86:3cc0,123,fda2:69aa:1f1a:f820:7d99:2701:0ff4:1570,123,00:00:53,udp
3,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,0,-1,fda2:69aa:1f1a:a757:7d73:278f:61f1:0f3f,138,fda2:69aa:1f1a:1499:7d6b:27b7:6172:002c,138,00:00:57,udp
4,0.311797,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,RSTO,0,0,0,-1,fda2:69aa:1f1a:9113:3c52:037e:52b3:2742,11810,fda2:69aa:1f1a:ec01:7d38:2763:0f17:1b37,139,00:01:17,tcp


In [6]:
# Size of the dataset
df.shape

(40476, 23)

In [7]:
# Group of label
df.groupby(['label'])['label'].count()

label
-2     2670
-1    34826
 1     2980
Name: label, dtype: int64

In [8]:
df.dtypes

duration                       float64
service                         object
source_bytes                     int64
destination_bytes                int64
count                            int64
same_srv_rate                  float64
serror_rate                    float64
srv_serror_rate                float64
dst_host_count                   int64
dst_host_srv_count               int64
dst_host_same_src_port_rate    float64
dst_host_serror_rate           float64
dst_host_srv_serror_rate       float64
flag                            object
malware_detection               object
ashula_detection                object
label                            int64
source_ip_address               object
source_port_number               int64
destination_ip_address          object
destination_port_number          int64
start_time                      object
protocol                        object
dtype: object

In [9]:
# Drop duplicates
df.drop_duplicates(inplace=True)
df.shape

(40476, 23)

In [10]:
# Check percentage values to work
data = df.groupby(['label'])['label'].count()
data[1] / data.sum()

0.07362387587706296

### First steps with dataframe

#### Helper funtions

In [11]:
def map_ipv6_to_ipv4(ipv6_addr):
    # Convert the IPv6 address to an integer
    ipv6_int = int(ipaddress.IPv6Address(ipv6_addr))

    # Select a suitable range for IPv4 addresses (e.g., 192.168.0.0/16)
    ipv4_base_int = int(ipaddress.IPv4Address('192.168.0.0'))
    ipv4_range = 2**16  # Number of addresses in the range 192.168.0.0/16

    # Create a simple mapping: take the lower 16 bits of the IPv6 address
    mapped_ipv4_int = ipv4_base_int + (ipv6_int & (ipv4_range - 1))

    # Convert the mapped integer back to an IPv4 address
    mapped_ipv4_addr = ipaddress.IPv4Address(mapped_ipv4_int)

    return mapped_ipv4_addr

def ip_to_int(ip):
    return int(ipaddress.IPv4Address(ip))

In [12]:
# Categorical columns
cat_cols = [
    "service",
    "flag",
    "ashula_detection",
    "label",
    # "source_ip_address",
    # "destination_ip_address",
    "protocol",
]

In [13]:
# Show values of categoricals
for col in cat_cols:
    print(f"{col}: {df[col].unique()}")

service: ['other' 'ssl' 'smtp' 'dns' 'ssh' 'smtp,ssl' 'http' 'ftp']
flag: ['S0' 'SF' 'RSTO' 'REJ' 'OTH' 'S3' 'SHR' 'RSTOS0' 'RSTRH' 'RSTR' 'SH' 'S1'
 'S2']
ashula_detection: ['0' '60(2)' '60(1)' '130(1),131(2)' '129(4)' '151(1)' '119(1)' '80(1)']
label: [-1 -2  1]
protocol: ['udp' 'tcp' 'icmp']


#### Transforming variables

In [14]:
df_transform = df.copy()

# transform source_ip_address and destination_ip_address
df_transform["source_ip_address"] = df_transform["source_ip_address"].apply(map_ipv6_to_ipv4)
df_transform["destination_ip_address"] = df_transform["destination_ip_address"].apply(map_ipv6_to_ipv4)
df_transform["source_ip_address"] = df_transform["source_ip_address"].apply(ip_to_int)
df_transform["destination_ip_address"] = df_transform["destination_ip_address"].apply(ip_to_int)

# transform service
df_transform["service_split"] = df_transform["service"].str.split(" ")
df_transform = df_transform.explode("service_split")
freq_service = df_transform["service_split"].value_counts().to_dict()
df_transform["freq_service"] = df_transform["service_split"].map(freq_service)

# transform flag
freq_flag = df_transform["flag"].value_counts().to_dict()
df_transform["freq_flag"] = df_transform["flag"].map(freq_flag)

# transform ashula_detection
df_transform["ashula_split"] = df_transform["ashula_detection"].str.split(",")
df_transform = df_transform.explode("ashula_split")
df_transform["ashula_code"] = df_transform["ashula_split"].str.extract('(\d+)')
freq_ashula = df_transform["ashula_code"].value_counts().to_dict()
df_transform["freq_ashula"] = df_transform["ashula_code"].map(freq_ashula)

# transform label
df_transform["label"] = df_transform["label"].replace({-1: 1, -2: 1, 1:0})

# transform protocol
freq_protocol = df_transform["protocol"].value_counts().to_dict()
df_transform["freq_protocol"] = df_transform["protocol"].map(freq_protocol)

# Show Results
with pd.option_context(
    'display.max_colwidth', None,
    'display.max_columns', None,
    'display.max_rows', None):
    display(df_transform.head())

Unnamed: 0,duration,service,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_srv_serror_rate,flag,malware_detection,ashula_detection,label,source_ip_address,source_port_number,destination_ip_address,destination_port_number,start_time,protocol,service_split,freq_service,freq_flag,ashula_split,ashula_code,freq_ashula,freq_protocol
0,86364.573924,other,240680,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,1,3232245768,32770,3232246491,8649,00:00:18,udp,other,37331,12040,0,0,37348,5683
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,60(2),1,3232244597,1050,3232253142,1434,00:00:27,udp,other,37331,12040,60(2),60,2941,5683
2,0.00334,other,48,48,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SF,0,0,1,3232251072,123,3232241008,123,00:00:53,udp,other,37331,3342,0,0,37348,5683
3,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,0,0,1,3232239423,138,3232235564,138,00:00:57,udp,other,37331,12040,0,0,37348,5683
4,0.311797,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,RSTO,0,0,1,3232245570,11810,3232242487,139,00:01:17,tcp,other,37331,5643,0,0,37348,19107


In [15]:
# Delete columns
df_transform = df_transform.drop(
    columns=[
        "service",
        "flag",
        "ashula_detection",
        "ashula_split",
        "ashula_code",
        "service_split",
        "protocol",
        "malware_detection",
        'start_time',
    ]
)
# Show Results
with pd.option_context(
    'display.max_colwidth', None,
    'display.max_columns', None,
    'display.max_rows', None):
    display(df_transform.head())

Unnamed: 0,duration,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_srv_serror_rate,label,source_ip_address,source_port_number,destination_ip_address,destination_port_number,freq_service,freq_flag,freq_ashula,freq_protocol
0,86364.573924,240680,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,1,3232245768,32770,3232246491,8649,37331,12040,37348,5683
1,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,1,3232244597,1050,3232253142,1434,37331,12040,2941,5683
2,0.00334,48,48,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,1,3232251072,123,3232241008,123,37331,3342,37348,5683
3,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,1,3232239423,138,3232235564,138,37331,12040,37348,5683
4,0.311797,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,1,3232245570,11810,3232242487,139,37331,5643,37348,19107


In [16]:
df_transform.drop_duplicates(inplace=True)
df_transform.dtypes

duration                       float64
source_bytes                     int64
destination_bytes                int64
count                            int64
same_srv_rate                  float64
serror_rate                    float64
srv_serror_rate                float64
dst_host_count                   int64
dst_host_srv_count               int64
dst_host_same_src_port_rate    float64
dst_host_serror_rate           float64
dst_host_srv_serror_rate       float64
label                            int64
source_ip_address                int64
source_port_number               int64
destination_ip_address           int64
destination_port_number          int64
freq_service                     int64
freq_flag                        int64
freq_ashula                      int64
freq_protocol                    int64
dtype: object

In [17]:
profile = ProfileReport(
    df_transform, title='Dataset Network profiling', explorative=True
)
profile.to_file("../reports/profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
df_transform.corr()

Unnamed: 0,duration,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,...,dst_host_srv_serror_rate,label,source_ip_address,source_port_number,destination_ip_address,destination_port_number,freq_service,freq_flag,freq_ashula,freq_protocol
duration,1.0,0.755727,-2.6e-05,-0.002764,-0.002972,-0.00137,-0.005553,-0.00102,0.00393,-0.001914,...,-0.003081,-0.009288,0.000235,0.008544,-0.000849,0.011509,-0.007757,-0.006984,0.004012,-0.007195
source_bytes,0.755727,1.0,0.013996,0.025319,0.044765,-0.014066,-0.065053,0.025821,0.135904,-0.023466,...,-0.032043,-0.175683,-0.010229,0.007265,-0.067264,-0.007484,-0.191402,-0.155335,0.003512,0.072235
destination_bytes,-2.6e-05,0.013996,1.0,-0.003082,-0.002518,-0.003302,-0.016129,-0.004453,-0.005831,-0.00637,...,-0.007872,-0.002231,-0.007508,0.059991,0.009743,-0.006154,-0.083993,-0.043058,0.007172,0.020353
count,-0.002764,0.025319,-0.003082,1.0,0.739432,0.100467,-0.09898,0.352934,0.289493,0.00877,...,-0.052438,-0.026585,-0.043953,0.16551,-0.129385,-0.032095,-0.022035,-0.330715,0.072029,0.144299
same_srv_rate,-0.002972,0.044765,-0.002518,0.739432,1.0,0.278572,-0.090012,0.438038,0.368914,0.035149,...,-0.044109,-0.031654,-0.07653,0.202306,-0.191337,-0.036226,-0.037708,-0.449136,0.093639,0.16443
serror_rate,-0.00137,-0.014066,-0.003302,0.100467,0.278572,1.0,0.179009,-0.040823,-0.047523,0.113033,...,0.133721,0.028176,-0.048564,0.202352,-0.030104,0.027638,0.030879,0.011486,0.024571,-0.218749
srv_serror_rate,-0.005553,-0.065053,-0.016129,-0.09898,-0.090012,0.179009,1.0,-0.146637,-0.224136,-0.00738,...,0.008206,0.146273,0.066171,0.228136,0.082152,0.041294,0.103358,0.057705,0.087541,0.006958
dst_host_count,-0.00102,0.025821,-0.004453,0.352934,0.438038,-0.040823,-0.146637,1.0,0.748397,0.079511,...,0.033036,0.096309,-0.105196,0.146791,-0.224289,0.006846,0.073499,-0.489754,0.126584,0.248433
dst_host_srv_count,0.00393,0.135904,-0.005831,0.289493,0.368914,-0.047523,-0.224136,0.748397,1.0,0.127138,...,0.074981,-0.40967,-0.081133,0.121071,-0.30612,0.001555,-0.390151,-0.670453,0.154799,0.298109
dst_host_same_src_port_rate,-0.001914,-0.023466,-0.00637,0.00877,0.035149,0.113033,-0.00738,0.079511,0.127138,1.0,...,0.343831,-0.016737,-0.030728,0.041622,-0.074317,-0.014785,-0.04391,-0.1239,0.068693,-0.196839


In [19]:
df_transform["label"].isnull().sum()

0

In [20]:
df_transform["label"].unique().tolist()

[1, 0]

### Apply balance in the dataset with PyCaret

In [21]:
exp_balance = setup(
    data=df_transform,
    target="label",
    normalize=True,
    transformation=True,
    low_variance_threshold=0.1,
    fix_imbalance=True,
    session_id=42,
)

print(get_config("y").value_counts())

Unnamed: 0,Description,Value
0,Session id,42
1,Target,label
2,Target type,Binary
3,Original data shape,"(38688, 21)"
4,Transformed data shape,"(61603, 17)"
5,Transformed train set shape,"(49996, 17)"
6,Transformed test set shape,"(11607, 17)"
7,Numeric features,20
8,Preprocess,True
9,Imputation type,simple


label
1    35712
0     2976
Name: count, dtype: int64


In [22]:
balanced_data = get_config("X_train")
balanced_data.shape

(27081, 20)

In [23]:
balaced_target = get_config("y_train")
balaced_target.value_counts()

label
1    24998
0     2083
Name: count, dtype: int64

In [24]:
transformed_data = get_config("X")
transformed_data.head()

Unnamed: 0,duration,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_srv_serror_rate,source_ip_address,source_port_number,destination_ip_address,destination_port_number,freq_service,freq_flag,freq_ashula,freq_protocol
39772,2.216014,66,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,3232242595,8,3232238497,0,37331,16682,37348,15847
16685,2.217538,66,0,0,0.0,0.0,0.0,0,1,0.0,0.0,0.0,3232242576,8,3232261835,0,37331,16682,37348,15847
1159,0.103468,0,48,0,0.0,0.0,0.0,0,16,0.0,0.0,0.0,3232262739,4254,3232239387,25,37331,3342,37348,19107
15482,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,0.0,0.0,3232236452,1847,3232250278,2967,37331,12040,37348,19107
6105,0.0,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,3232242358,8,3232266476,0,37331,16682,37348,15847


In [25]:
transformed_data_y = get_config("y")
transformed_data_y.head()

39772    1
16685    1
1159     0
15482    1
6105     1
Name: label, dtype: int8

In [26]:
num_component:int = 5
pca = PCA(n_components=num_component)
pca_transformed = pca.fit_transform(transformed_data)
pca_df = pd.DataFrame(pca_transformed, columns=[f"PC{i}" for i in range(1, num_component + 1)])
pca_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,-7317.758711,1163.622192,-3851.023078,-5578.814674,-3437.751619
1,-8887.984254,-6215.070726,-6814.359092,3638.59092,15054.13428
2,-2118.68093,3526.754691,-2694.937756,12399.050915,-13168.751814
3,-5152.051752,-1495.945836,-5441.822223,-6505.245393,7977.793494
4,-9193.176512,-7684.038934,-7403.231492,5279.375032,18824.336639


In [27]:
pca_df["label"] = transformed_data_y
pca_df.dropna(inplace=True)
profile = ProfileReport(
    pca_df, title='Dataset Network profiling (processed)', explorative=True
)
profile.to_file("../reports/profile_processed.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [28]:
# Count None values
pca_df["label"].unique().tolist()

[1.0, 0.0]

### Training models

In [29]:
clf_models = setup(
    data=pca_df,
    target="label",
    session_id=42,
    n_jobs=4,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,label
2,Target type,Binary
3,Original data shape,"(36909, 6)"
4,Transformed data shape,"(36909, 6)"
5,Transformed train set shape,"(25836, 6)"
6,Transformed test set shape,"(11073, 6)"
7,Numeric features,5
8,Preprocess,True
9,Imputation type,simple


In [30]:
best_models = compare_models(n_select=5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9231,0.5102,1.0,0.9231,0.96,0.0,0.0,0.26
nb,Naive Bayes,0.9231,0.4996,1.0,0.9231,0.96,0.0,0.0,0.016
ridge,Ridge Classifier,0.9231,0.5102,1.0,0.9231,0.96,0.0,0.0,0.02
lda,Linear Discriminant Analysis,0.9231,0.5102,1.0,0.9231,0.96,0.0,0.0,0.028
dummy,Dummy Classifier,0.9231,0.5,1.0,0.9231,0.96,0.0,0.0,0.018
qda,Quadratic Discriminant Analysis,0.923,0.506,0.9998,0.9231,0.9599,-0.0003,-0.0016,0.02
ada,Ada Boost Classifier,0.923,0.4973,0.9998,0.9231,0.9599,-0.0003,-0.0023,0.376
lightgbm,Light Gradient Boosting Machine,0.923,0.4945,0.9998,0.9231,0.9599,-0.0003,-0.0019,0.057
gbc,Gradient Boosting Classifier,0.9229,0.493,0.9997,0.9231,0.9599,-0.0005,-0.0033,1.642
knn,K Neighbors Classifier,0.9195,0.4937,0.9956,0.9232,0.958,0.002,0.0046,0.079


In [31]:
best_models

[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 GaussianNB(priors=None, var_smoothing=1e-09),
 RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                 max_iter=None, positive=False, random_state=42, solver='auto',
                 tol=0.0001),
 LinearDiscriminantAnalysis(covariance_estimator=None, n_components=None,
                            priors=None, shrinkage=None, solver='svd',
                            store_covariance=False, tol=0.0001),
 DummyClassifier(constant=None, random_state=42, strategy='prior')]

### fine models

In [32]:
tuned_models = []

for model in best_models:
    tuned_model = tune_model(
        model,
        search_library="scikit-learn",
        search_algorithm="grid",
        n_iter=20,
        optimize="AUC",
    )
    tuned_models.append(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.4872,0.5056,0.4843,0.924,0.6355,0.0019,0.0037
1,0.3115,0.5169,0.2763,0.9256,0.4256,0.0021,0.006
2,0.6358,0.5189,0.6604,0.9232,0.77,0.0008,0.0012
3,0.3607,0.5368,0.3325,0.9297,0.4898,0.0068,0.0176
4,0.493,0.4973,0.4918,0.9229,0.6417,-0.0002,-0.0003
5,0.6072,0.495,0.626,0.9239,0.7463,0.0029,0.0044
6,0.5273,0.5021,0.5325,0.923,0.6754,-0.0009,-0.0015
7,0.5064,0.4871,0.5086,0.9217,0.6555,-0.0033,-0.0062
8,0.5943,0.5241,0.6071,0.9288,0.7343,0.016,0.0253
9,0.4003,0.5196,0.3799,0.9283,0.5391,0.0062,0.0145


Fitting 10 folds for each of 20000 candidates, totalling 200000 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.923,0.4968,1.0,0.923,0.96,0.0,0.0
1,0.923,0.5031,1.0,0.923,0.96,0.0,0.0
2,0.923,0.4715,1.0,0.923,0.96,0.0,0.0
3,0.923,0.5234,1.0,0.923,0.96,0.0,0.0
4,0.923,0.5013,1.0,0.923,0.96,0.0,0.0
5,0.923,0.48,1.0,0.923,0.96,0.0,0.0
6,0.9233,0.5047,1.0,0.9233,0.9601,0.0,0.0
7,0.9233,0.4821,1.0,0.9233,0.9601,0.0,0.0
8,0.9233,0.5195,1.0,0.9233,0.9601,0.0,0.0
9,0.9233,0.524,1.0,0.9233,0.9601,0.0,0.0


Fitting 10 folds for each of 28 candidates, totalling 280 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.923,0.5059,1.0,0.923,0.96,0.0,0.0
1,0.923,0.5168,1.0,0.923,0.96,0.0,0.0
2,0.923,0.5181,1.0,0.923,0.96,0.0,0.0
3,0.923,0.5362,1.0,0.923,0.96,0.0,0.0
4,0.923,0.4971,1.0,0.923,0.96,0.0,0.0
5,0.923,0.4954,1.0,0.923,0.96,0.0,0.0
6,0.9233,0.5025,1.0,0.9233,0.9601,0.0,0.0
7,0.9233,0.4863,1.0,0.9233,0.9601,0.0,0.0
8,0.9233,0.5242,1.0,0.9233,0.9601,0.0,0.0
9,0.9233,0.5194,1.0,0.9233,0.9601,0.0,0.0


Fitting 10 folds for each of 1998 candidates, totalling 19980 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.923,0.5042,1.0,0.923,0.96,0.0,0.0
1,0.923,0.5184,1.0,0.923,0.96,0.0,0.0
2,0.923,0.5252,1.0,0.923,0.96,0.0,0.0
3,0.923,0.517,1.0,0.923,0.96,0.0,0.0
4,0.923,0.5042,1.0,0.923,0.96,0.0,0.0
5,0.923,0.5079,1.0,0.923,0.96,0.0,0.0
6,0.9233,0.5029,1.0,0.9233,0.9601,0.0,0.0
7,0.9233,0.4762,1.0,0.9233,0.9601,0.0,0.0
8,0.9233,0.5251,1.0,0.9233,0.9601,0.0,0.0
9,0.9233,0.5216,1.0,0.9233,0.9601,0.0,0.0


Fitting 10 folds for each of 36 candidates, totalling 360 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8572,0.5012,0.922,0.9232,0.9226,0.0024,0.0024
1,0.858,0.5039,0.9224,0.9236,0.923,0.0078,0.0078
2,0.8572,0.5012,0.922,0.9232,0.9226,0.0024,0.0024
3,0.8557,0.4958,0.9212,0.9223,0.9218,-0.0084,-0.0084
4,0.8603,0.5121,0.9237,0.9249,0.9243,0.024,0.024
5,0.8572,0.5012,0.922,0.9232,0.9226,0.0024,0.0024
6,0.8599,0.5096,0.9233,0.9248,0.924,0.0191,0.0191
7,0.8552,0.4932,0.9208,0.9223,0.9215,-0.0135,-0.0135
8,0.8544,0.4905,0.9203,0.9219,0.9211,-0.0189,-0.0189
9,0.8599,0.5096,0.9233,0.9248,0.924,0.0191,0.0191


Fitting 10 folds for each of 4 candidates, totalling 40 fits
