# ML-based Novelty Detection and Classification of Security Threats in IoT Networks

Disclaimer: this is a reduced (faster) version of the experiments presented in the paper.

#### Importing libraries

In [1]:
import warnings
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)

In [2]:
import json
import random
from datetime import datetime

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import psutil
from featurewiz import FeatureWiz
from imblearn.combine import SMOTETomek
from lightgbm import LGBMClassifier
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDOneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

Imported 0.3.1 version. Select nrows to a small number when running on huge datasets.
output = featurewiz(dataname, target, corr_limit=0.90, verbose=2, sep=',', 
		header=0, test_data='',feature_engg='', category_encoders='',
		dask_xgboost_flag=False, nrows=None, skip_sulov=False)
Create new features via 'feature_engg' flag : ['interactions','groupby','target']



# Constants, RNG setup and utils

In [4]:
CFG_FRACTION = 0.01
CFG_SCENARIO = 'Scan'
CFG_SEED = 42
CFG_N_JOBS = int(np.sqrt(psutil.cpu_count(logical=False)))
CFG_N_TRIALS = 20

In [5]:
np.random.seed(CFG_SEED)
np.random.rand(CFG_SEED)
random.seed(CFG_SEED)

In [6]:
# Filter trials (i.e. remove trials that were pruned)
def filter_completed_trials(trials):
    completed_trials = [x for x in trials if x.state == optuna.trial.TrialState.COMPLETE]
    return completed_trials

In [7]:
# Get total execution time in seconds
def get_time(trial):
    start, end = trial.datetime_start, trial.datetime_complete
    return (end - start).total_seconds()

In [8]:
# Compute statistics for the results
def get_results_statistics(study):
    completed_trials = filter_completed_trials(study.trials)
    return {
        'score_min': np.min([x.values for x in completed_trials]),
        'score_max': np.max([x.values for x in completed_trials]),
        'score_mean': np.mean([x.values for x in completed_trials]),
        'score_std': np.std([x.values for x in completed_trials]),
        'time_min' : np.min([get_time(x) for x in completed_trials]),
        'time_max' : np.min([get_time(x) for x in completed_trials]),
        'time_mean' : np.mean([get_time(x) for x in completed_trials]),
        'time_std' : np.std([get_time(x) for x in completed_trials])
    }

In [9]:
# Print value counts with count and frequency
def pretty_print_value_counts(df, column=None):
    if column:
        c = df[column].value_counts(dropna=False)
        p = df[column].value_counts(dropna=False, normalize=True).mul(100).round(1).astype(str) + '%'
    else:
        c = df.value_counts(dropna=False)
        p = df.value_counts(dropna=False, normalize=True).mul(100).round(1).astype(str) + '%'
    print(pd.concat([c,p], axis=1, keys=['counts', '%']))

# Data Preprocessing

#### Reading dataset

In [10]:
# Read the dataset
df = pd.read_parquet('IoT_Network_Intrusion_Macro.parquet').sample(frac=CFG_FRACTION, random_state=CFG_SEED)

# Irrelevant features for training
df = df.drop(['ip.src', 'ip.dst'], axis=1) 

# Target feature
target = 'label'
df[target].replace('DenialofServiceDoS', 'DoS', inplace=True)
df[target].replace('ManintheMiddleMITM', 'MITM', inplace=True)
df[target].replace('MiraiBotnet', 'Mirai', inplace=True)
df[target].replace('Scanning', 'Scan', inplace=True)

#### Showing all target labels (categories)

In [11]:
pretty_print_value_counts(df, target)

        counts      %
label                
Normal   14464  54.4%
Mirai    10206  38.4%
MITM       996   3.7%
DoS        682   2.6%
Scan       235   0.9%


#### Converting categories into numeric values

In [12]:
df[target], categories = pd.factorize(df[target], sort=False)
reverse_mappings = {i:categories[i] for i in range(len(categories.values))}

In [13]:
reverse_mappings

{0: 'Normal', 1: 'Mirai', 2: 'MITM', 3: 'DoS', 4: 'Scan'}

In [14]:
pretty_print_value_counts(df, target)

       counts      %
label               
0       14464  54.4%
1       10206  38.4%
2         996   3.7%
3         682   2.6%
4         235   0.9%


In [15]:
outlier_labels = [CFG_SCENARIO]
outlier_indices = [k for k,v in reverse_mappings.items() if v in outlier_labels]
outlier_rows = df[target].isin(outlier_indices)
inlier_rows = ~df[target].isin(outlier_indices)

# Feature Selection

In [16]:
X = df.drop([target], axis=1)
y = df[target]

fwiz = FeatureWiz(corr_limit=0.80, verbose=0).fit(X, y)

fwiz.features

wiz = FeatureWiz(verbose=1)
        X_train_selected = wiz.fit_transform(X_train, y_train)
        X_test_selected = wiz.transform(X_test)
        wiz.features  ### provides a list of selected features ###            
        
featurewiz has selected 0.8 as the correlation limit. Change this limit to fit your needs...
Skipping feature engineering since no feature_engg input...
Skipping category encoding since no category encoders specified in input...
#### Single_Label Multi_Classification problem ####
    Loaded train data. Shape = (26583, 28)
    Some column names had special characters which were removed...
#### Single_Label Multi_Classification problem ####
No test data filename given...
Classifying features using a random sample of 10000 rows from dataset...
#### Single_Label Multi_Classification problem ####
    loading a random sample of 10000 rows into pandas for EDA
#######################################################################################
########################

['ip.flags.df',
 'ip.ttl',
 'tcp.flags.syn',
 'ip.len',
 'frame.number',
 'udp.srcport',
 'tcp.flags.ack',
 'udp.length',
 'udp.dstport']

# Novelty detection

In [17]:
detection_results = {}

In [18]:
X_train = df[inlier_rows][fwiz.features]
y_train = df[inlier_rows][target].copy()
y_train.loc[:] = 'Known'

X_test = df[outlier_rows][fwiz.features]
y_test = df[outlier_rows][target].copy()
y_test.loc[:] = 'Unknown'

In [19]:
def accuracy_count_score(y_pred):
    n_outliers = len([y for y in y_pred if y == -1])
    n_inliers = len([y for y in y_pred if y == +1])
    accuracy = n_outliers / (n_outliers + n_inliers)
    return accuracy

#### Elliptic Envelope

In [20]:
def objective(trial):
    try:
        trial.set_user_attr('detector', EllipticEnvelope.__name__)
        assume_centered = trial.suggest_categorical('assume_centered', [True, False])
        support_fraction = trial.suggest_float('support_fraction', 0.0, 1.0)
        clf = EllipticEnvelope(assume_centered=assume_centered, support_fraction=support_fraction, random_state=CFG_SEED)
        clf.fit(X_train)
        y_pred = clf.predict(X_test)
        score = accuracy_count_score(y_pred)
        return score
    except:
        raise optuna.TrialPruned()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=CFG_N_TRIALS, n_jobs=CFG_N_JOBS)
print(f'Best trial: {study.best_trial.params} => {study.best_trial.values[0]:.3f}')

clf_name = study.best_trial.user_attrs['detector']
detection_results[clf_name] = get_results_statistics(study)

[32m[I 2023-10-05 16:40:24,212][0m A new study created in memory with name: no-name-0c5d495a-b46f-4bfb-aebe-1d01f396a69c[0m
[32m[I 2023-10-05 16:40:28,995][0m Trial 1 finished with value: 0.07234042553191489 and parameters: {'assume_centered': True, 'support_fraction': 0.9219721978323695}. Best is trial 1 with value: 0.07234042553191489.[0m
[32m[I 2023-10-05 16:40:29,196][0m Trial 0 finished with value: 0.9063829787234042 and parameters: {'assume_centered': False, 'support_fraction': 0.9454817383636683}. Best is trial 0 with value: 0.9063829787234042.[0m
[32m[I 2023-10-05 16:40:30,790][0m Trial 2 finished with value: 0.04680851063829787 and parameters: {'assume_centered': True, 'support_fraction': 0.05526227755415958}. Best is trial 0 with value: 0.9063829787234042.[0m
[32m[I 2023-10-05 16:40:32,844][0m Trial 4 finished with value: 0.04680851063829787 and parameters: {'assume_centered': True, 'support_fraction': 0.5810456201752249}. Best is trial 0 with value: 0.906382978

Best trial: {'assume_centered': False, 'support_fraction': 0.7465093332900095} => 0.919


#### Isolation Forest

In [21]:
def objective(trial):
    try:
        trial.set_user_attr('detector', IsolationForest.__name__)
        n_estimators = trial.suggest_int('n_estimators', 1, 1024, log=True)
        max_samples = trial.suggest_float('max_samples', 0.1, 1.0, step=0.1)
        max_features = trial.suggest_float('max_features', 0.1, 1.0, step=0.1)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])
        clf = IsolationForest(n_estimators=n_estimators, max_samples=max_samples,
                              max_features=max_features, bootstrap=bootstrap,
                              n_jobs=CFG_N_JOBS, random_state=CFG_SEED)
        clf.fit(X_train)
        y_pred = clf.predict(X_test)
        score = accuracy_count_score(y_pred)
        return score
    except:
        raise optuna.TrialPruned()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=CFG_N_TRIALS, n_jobs=CFG_N_JOBS)
print(f'Best trial: {study.best_trial.params} => {study.best_trial.values[0]:.3f}')

clf_name = study.best_trial.user_attrs['detector']
detection_results[clf_name] = get_results_statistics(study)

[32m[I 2023-10-05 16:40:57,226][0m A new study created in memory with name: no-name-3f126553-a122-4c65-915d-16d03f46e732[0m
[32m[I 2023-10-05 16:40:57,362][0m Trial 1 finished with value: 0.9361702127659575 and parameters: {'n_estimators': 34, 'max_samples': 0.1, 'max_features': 0.8, 'bootstrap': False}. Best is trial 1 with value: 0.9361702127659575.[0m
[32m[I 2023-10-05 16:40:57,431][0m Trial 2 finished with value: 0.9276595744680851 and parameters: {'n_estimators': 15, 'max_samples': 0.4, 'max_features': 0.5, 'bootstrap': True}. Best is trial 1 with value: 0.9361702127659575.[0m
[32m[I 2023-10-05 16:40:57,826][0m Trial 0 finished with value: 0.9276595744680851 and parameters: {'n_estimators': 175, 'max_samples': 0.2, 'max_features': 0.5, 'bootstrap': True}. Best is trial 1 with value: 0.9361702127659575.[0m
[32m[I 2023-10-05 16:40:58,152][0m Trial 3 finished with value: 0.9063829787234042 and parameters: {'n_estimators': 204, 'max_samples': 0.4, 'max_features': 0.30000

Best trial: {'n_estimators': 2, 'max_samples': 0.2, 'max_features': 0.1, 'bootstrap': True} => 1.000


#### Local Outlier Factor

In [22]:
def objective(trial):
    try:
        trial.set_user_attr('detector', LocalOutlierFactor.__name__)
        n_neighbors = trial.suggest_int('n_neighbors', 1, 4096, log=True)
        algorithm = trial.suggest_categorical('algorithm', ['ball_tree', 'kd_tree', 'brute'])
        leaf_size = trial.suggest_int('leaf_size', 1, 4096, log=True)
        clf = LocalOutlierFactor(n_neighbors=n_neighbors, algorithm=algorithm, 
                                 leaf_size=leaf_size, novelty=True, n_jobs=CFG_N_JOBS)
        clf.fit(X_train)
        y_pred = clf.predict(X_test)
        score = accuracy_count_score(y_pred)
        return score
    except:
        raise optuna.TrialPruned()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=CFG_N_TRIALS, n_jobs=CFG_N_JOBS)
print(f'Best trial: {study.best_trial.params} => {study.best_trial.values[0]:.3f}')

clf_name = study.best_trial.user_attrs['detector']
completed_trials = filter_completed_trials(study.trials)
detection_results[clf_name] = get_results_statistics(study)

[32m[I 2023-10-05 16:41:02,486][0m A new study created in memory with name: no-name-4541c50b-d736-443a-8070-11efdd212c91[0m
[32m[I 2023-10-05 16:41:02,862][0m Trial 0 finished with value: 0.6085106382978723 and parameters: {'n_neighbors': 1, 'algorithm': 'kd_tree', 'leaf_size': 5}. Best is trial 0 with value: 0.6085106382978723.[0m
[32m[I 2023-10-05 16:41:03,069][0m Trial 2 finished with value: 0.6085106382978723 and parameters: {'n_neighbors': 1, 'algorithm': 'ball_tree', 'leaf_size': 1}. Best is trial 0 with value: 0.6085106382978723.[0m
[32m[I 2023-10-05 16:41:18,927][0m Trial 3 finished with value: 0.6085106382978723 and parameters: {'n_neighbors': 1, 'algorithm': 'brute', 'leaf_size': 3}. Best is trial 0 with value: 0.6085106382978723.[0m
[32m[I 2023-10-05 16:41:33,148][0m Trial 1 finished with value: 0.0 and parameters: {'n_neighbors': 1823, 'algorithm': 'brute', 'leaf_size': 2}. Best is trial 0 with value: 0.6085106382978723.[0m
[32m[I 2023-10-05 16:41:33,328][0

Best trial: {'n_neighbors': 1, 'algorithm': 'kd_tree', 'leaf_size': 5} => 0.609


#### One Class SVM (SGD)

In [23]:
def objective(trial):
    try:
        trial.set_user_attr('detector', SGDOneClassSVM.__name__)
        gamma = trial.suggest_float('kernel_gamma', 0.05, 1, step=0.05)
        n_components = trial.suggest_int('kernel_n_components', 50, 150, step=5)
        nu = trial.suggest_float('sgd_nu', 0.05, 1, step=0.05)
        clf = make_pipeline(
            Nystroem(gamma=gamma, n_components=n_components, random_state=CFG_SEED),
            SGDOneClassSVM(nu=nu, random_state=CFG_SEED)
        )
        clf.fit(X_train)
        y_pred = clf.predict(X_test)
        score = accuracy_count_score(y_pred)
        return score
    except:
        return optuna.TrialPruned()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=CFG_N_TRIALS, n_jobs=CFG_N_JOBS)
print(f'Best trial: {study.best_trial.params} => {study.best_trial.values[0]:.3f}')

clf_name = study.best_trial.user_attrs['detector']
completed_trials = filter_completed_trials(study.trials)
detection_results[clf_name] = get_results_statistics(study)

[32m[I 2023-10-05 16:41:43,173][0m A new study created in memory with name: no-name-331a87d2-e990-4fcf-a1a0-ab9742b30529[0m
[32m[I 2023-10-05 16:41:43,372][0m Trial 0 finished with value: 0.0 and parameters: {'kernel_gamma': 0.45, 'kernel_n_components': 85, 'sgd_nu': 0.55}. Best is trial 0 with value: 0.0.[0m
[32m[I 2023-10-05 16:41:43,374][0m Trial 1 finished with value: 0.0 and parameters: {'kernel_gamma': 0.9500000000000001, 'kernel_n_components': 90, 'sgd_nu': 0.5}. Best is trial 0 with value: 0.0.[0m
[32m[I 2023-10-05 16:41:43,590][0m Trial 3 finished with value: 1.0 and parameters: {'kernel_gamma': 0.35000000000000003, 'kernel_n_components': 100, 'sgd_nu': 0.3}. Best is trial 3 with value: 1.0.[0m
[32m[I 2023-10-05 16:41:43,612][0m Trial 2 finished with value: 0.0 and parameters: {'kernel_gamma': 0.1, 'kernel_n_components': 105, 'sgd_nu': 0.35000000000000003}. Best is trial 3 with value: 1.0.[0m
[32m[I 2023-10-05 16:41:43,756][0m Trial 5 finished with value: 1.0 

Best trial: {'kernel_gamma': 0.35000000000000003, 'kernel_n_components': 100, 'sgd_nu': 0.3} => 1.000


#### Summarize results

In [24]:
pd.DataFrame.from_dict(detection_results).round(3)

Unnamed: 0,EllipticEnvelope,IsolationForest,LocalOutlierFactor,SGDOneClassSVM
score_min,0.0,0.009,0.0,0.0
score_max,0.919,1.0,0.609,1.0
score_mean,0.344,0.866,0.263,0.5
score_std,0.416,0.239,0.267,0.5
time_min,1.738,0.014,0.075,0.102
time_max,1.738,0.014,0.075,0.102
time_mean,3.242,0.504,4.035,0.191
time_std,1.342,0.978,8.292,0.06


# Classification

In [25]:
classification_results = {}

In [26]:
holdout_class = dict((v, k) for k, v in reverse_mappings.items())[CFG_SCENARIO]
holdout_indices = (y == holdout_class)

X = X.drop(index=y.index[holdout_indices])
y = y.drop(index=y.index[holdout_indices])

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X[fwiz.features], y, test_size=0.2, random_state=CFG_SEED)

print('\nTrain data before resampling:', X_train.shape, y_train.shape, '\n')
pretty_print_value_counts(y_train)

X_train, y_train = SMOTETomek(n_jobs=CFG_N_JOBS, random_state=CFG_SEED).fit_resample(X_train, y_train)

print('\nTrain data after resampling:', X_train.shape, y_train.shape, '\n')
pretty_print_value_counts(y_train)


Train data before resampling: (21078, 9) (21078,) 

       counts      %
label               
0       11592  55.0%
1        8123  38.5%
2         809   3.8%
3         554   2.6%

Train data after resampling: (43776, 9) (43776,) 

       counts      %
label               
3       11197  25.6%
1       11168  25.5%
2       10872  24.8%
0       10539  24.1%


#### Decision Tree

In [28]:
def objective(trial):
    try:
        trial.set_user_attr('classifier', DecisionTreeClassifier.__name__)
        max_depth = trial.suggest_int('max_depth', 2, 10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None])
        criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        clf = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            criterion=criterion,
            random_state=CFG_SEED
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        return score
    except:
        raise optuna.TrialPruned()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=CFG_N_TRIALS, n_jobs=CFG_N_JOBS)
print(f'Best trial: {study.best_trial.params} => {study.best_trial.values[0]:.3f}')

clf_name = study.best_trial.user_attrs['classifier']
classification_results[clf_name] = get_results_statistics(study)

[32m[I 2023-10-05 16:41:45,396][0m A new study created in memory with name: no-name-a76d97a3-d2e9-4ab3-a065-735c26b0ad76[0m
[32m[I 2023-10-05 16:41:45,430][0m Trial 0 finished with value: 0.9461100569259963 and parameters: {'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 10, 'max_features': 'auto', 'criterion': 'gini'}. Best is trial 0 with value: 0.9461100569259963.[0m
[32m[I 2023-10-05 16:41:45,445][0m Trial 1 finished with value: 0.9495256166982923 and parameters: {'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 3, 'max_features': None, 'criterion': 'entropy'}. Best is trial 1 with value: 0.9495256166982923.[0m
[32m[I 2023-10-05 16:41:45,483][0m Trial 2 finished with value: 0.9453510436432637 and parameters: {'max_depth': 9, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': None, 'criterion': 'gini'}. Best is trial 1 with value: 0.9495256166982923.[0m
[32m[I 2023-10-05 16:41:45,503][0m Trial 3 finished with value: 0.946679316888045

Best trial: {'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 6, 'max_features': None, 'criterion': 'entropy'} => 0.957


#### LightGBM

In [29]:
def objective(trial):
    try:
        trial.set_user_attr('classifier', LGBMClassifier.__name__)
        learning_rate = trial.suggest_float('learning_rate', 0.1, 1.0, step=0.05)
        n_estimators = trial.suggest_int('n_estimators', 1, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        num_leaves = trial.suggest_int('num_leaves', 10, 100)
        min_child_samples = trial.suggest_int('min_child_samples', 1, 20)
        subsample = trial.suggest_float('subsample', 0.5, 1.0, step=0.1)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1)
        reg_alpha = trial.suggest_float('reg_alpha', 0.0, 1.0, step=0.1)
        clf = LGBMClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            max_depth=max_depth,
            num_leaves=num_leaves,
            min_child_samples=min_child_samples,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            reg_alpha=reg_alpha,
            random_state=CFG_SEED
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        return score
    except:
        raise optuna.TrialPruned()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=CFG_N_TRIALS, n_jobs=CFG_N_JOBS)
print(f'Best trial: {study.best_trial.params} => {study.best_trial.values[0]:.3f}')

clf_name = study.best_trial.user_attrs['classifier']
classification_results[clf_name] = get_results_statistics(study)

[32m[I 2023-10-05 16:41:45,895][0m A new study created in memory with name: no-name-86919850-14da-4dba-9499-ae50483d1944[0m
[32m[I 2023-10-05 16:41:46,311][0m Trial 0 finished with value: 0.9449715370018975 and parameters: {'learning_rate': 0.1, 'n_estimators': 64, 'max_depth': 7, 'num_leaves': 26, 'min_child_samples': 9, 'subsample': 1.0, 'colsample_bytree': 0.8, 'reg_alpha': 0.9}. Best is trial 0 with value: 0.9449715370018975.[0m
[32m[I 2023-10-05 16:41:46,460][0m Trial 1 finished with value: 0.9521821631878558 and parameters: {'learning_rate': 0.85, 'n_estimators': 143, 'max_depth': 7, 'num_leaves': 34, 'min_child_samples': 16, 'subsample': 0.7, 'colsample_bytree': 0.9, 'reg_alpha': 0.7000000000000001}. Best is trial 1 with value: 0.9521821631878558.[0m
[32m[I 2023-10-05 16:41:46,715][0m Trial 2 finished with value: 0.5351043643263758 and parameters: {'learning_rate': 0.9, 'n_estimators': 109, 'max_depth': 6, 'num_leaves': 79, 'min_child_samples': 6, 'subsample': 0.6, 'c

Best trial: {'learning_rate': 0.15000000000000002, 'n_estimators': 54, 'max_depth': 10, 'num_leaves': 80, 'min_child_samples': 13, 'subsample': 0.6, 'colsample_bytree': 0.9, 'reg_alpha': 0.6000000000000001} => 0.956


#### Random Forest

In [30]:
def objective(trial):
    try:
        trial.set_user_attr('classifier', RandomForestClassifier.__name__)
        n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=CFG_SEED
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        return score
    except:
        raise optuna.TrialPruned()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=CFG_N_TRIALS, n_jobs=CFG_N_JOBS)
print(f'Best trial: {study.best_trial.params} => {study.best_trial.values[0]:.3f}')

clf_name = study.best_trial.user_attrs['classifier']
classification_results[clf_name] = get_results_statistics(study)

[32m[I 2023-10-05 16:41:50,905][0m A new study created in memory with name: no-name-cd434430-42f5-4c24-8dd7-1dc39008884e[0m
[32m[I 2023-10-05 16:41:52,486][0m Trial 0 finished with value: 0.8842504743833017 and parameters: {'n_estimators': 100, 'max_depth': 6, 'min_samples_split': 14, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.8842504743833017.[0m
[32m[I 2023-10-05 16:41:55,407][0m Trial 2 finished with value: 0.7060721062618596 and parameters: {'n_estimators': 300, 'max_depth': 3, 'min_samples_split': 10, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.8842504743833017.[0m
[32m[I 2023-10-05 16:41:59,080][0m Trial 1 finished with value: 0.8747628083491461 and parameters: {'n_estimators': 600, 'max_depth': 5, 'min_samples_split': 15, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8842504743833017.[0m
[32m[I 2023-10-05 16:42:05,673][0m Trial 4 finished with value: 0.9210626185958254 and parameters: {'n_estimators': 400, 'max_depth': 8, 'min_samples_s

Best trial: {'n_estimators': 800, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1} => 0.930


#### XGBoost

In [31]:
def objective(trial):
    try:
        trial.set_user_attr('classifier', XGBClassifier.__name__)
        eta = trial.suggest_float('eta', 0.01, 0.3, step=0.01)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        subsample = trial.suggest_float('subsample', 0.5, 1.0, step=0.1)
        colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0, step=0.1)
        min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
        gamma = trial.suggest_float('gamma', 0.0, 1.0, step=0.1)
        reg_lambda = trial.suggest_float('reg_lambda', 0.0, 1.0, step=0.1)
        reg_alpha = trial.suggest_float('reg_alpha', 0.0, 1.0, step=0.1)
        clf = XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            eta=eta,
            max_depth=max_depth,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            min_child_weight=min_child_weight,
            gamma=gamma,
            reg_lambda=reg_lambda,
            reg_alpha=reg_alpha,
            n_jobs=CFG_N_JOBS,
            random_state=CFG_SEED
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        score = accuracy_score(y_test, y_pred)
        return score
    except:
        raise optuna.TrialPruned()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=CFG_N_TRIALS, n_jobs=CFG_N_JOBS)
print(f'Best trial: {study.best_trial.params} => {study.best_trial.values[0]:.3f}')

clf_name = study.best_trial.user_attrs['classifier']
classification_results[clf_name] = get_results_statistics(study)

[32m[I 2023-10-05 16:43:43,353][0m A new study created in memory with name: no-name-56696989-1d91-4a1f-8627-3ec3007886e0[0m
[32m[I 2023-10-05 16:43:45,916][0m Trial 0 finished with value: 0.9180265654648956 and parameters: {'eta': 0.06999999999999999, 'max_depth': 3, 'subsample': 0.7, 'colsample_bytree': 1.0, 'min_child_weight': 1, 'gamma': 0.9, 'reg_lambda': 0.6000000000000001, 'reg_alpha': 0.6000000000000001}. Best is trial 0 with value: 0.9180265654648956.[0m
[32m[I 2023-10-05 16:43:47,225][0m Trial 1 finished with value: 0.9561669829222011 and parameters: {'eta': 0.27, 'max_depth': 9, 'subsample': 0.9, 'colsample_bytree': 0.7, 'min_child_weight': 4, 'gamma': 0.9, 'reg_lambda': 0.30000000000000004, 'reg_alpha': 0.2}. Best is trial 1 with value: 0.9561669829222011.[0m
[32m[I 2023-10-05 16:43:50,398][0m Trial 3 finished with value: 0.932068311195446 and parameters: {'eta': 0.05, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.7, 'min_child_weight': 3, 'gamma': 0.700

Best trial: {'eta': 0.27, 'max_depth': 9, 'subsample': 0.9, 'colsample_bytree': 0.7, 'min_child_weight': 4, 'gamma': 0.9, 'reg_lambda': 0.30000000000000004, 'reg_alpha': 0.2} => 0.956


#### Summarize results

In [32]:
pd.DataFrame.from_dict(classification_results).round(3)

Unnamed: 0,DecisionTreeClassifier,LGBMClassifier,RandomForestClassifier,XGBClassifier
score_min,0.76,0.504,0.705,0.892
score_max,0.957,0.956,0.93,0.956
score_mean,0.9,0.889,0.895,0.946
score_std,0.058,0.151,0.066,0.016
time_min,0.024,0.108,1.578,2.282
time_max,0.024,0.108,1.578,2.282
time_mean,0.045,0.489,10.853,3.457
time_std,0.014,0.299,5.063,0.63
