## Load the dataset

In [11]:
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import balanced_accuracy_score, f1_score, accuracy_score
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV

save_in_test_folder = True
if save_in_test_folder:
    filepath = "../TestModule"
else:
    filepath = "."

seed = 42
FILENAME = "dataset/train_dataset.csv"

#Prepare train data
df1 = pd.read_csv(FILENAME, sep=",", low_memory=False)

# get features names
features = list(df1.columns)
features_to_remove = ["label", "ts", "src_ip", "dst_ip", "dns_query", "ssl_subject", "ssl_issuer", "http_uri", "type", "http_referrer", "http_user_agent"]
features = [feature for feature in features if feature not in features_to_remove]
df1 = df1[features + ["type"]]

# Converte i valori in numeri, sostituendo quelli non validi con NaN
df1["src_bytes"] = pd.to_numeric(df1["src_bytes"], errors='coerce')
# Filtra le righe con NaN (valori non convertibili)
df1 = df1.dropna(subset=["src_bytes"])
# Converte i valori rimasti in interi
df1.loc[:, "src_bytes"] = df1["src_bytes"].astype(int)

print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))
df1 = df1.dropna()
print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))

X = df1[features]
y = df1["type"]

le = preprocessing.LabelEncoder()
le.fit(y)
with open(f"{filepath}/transformer/target_encoder.save", "wb") as f:
    pickle.dump(le, f)

y = le.transform(y)

indices = np.arange(X.shape[0])
train_idx, val_idx = train_test_split(indices, test_size=0.2, stratify=y, random_state=seed)

# fold = np.zeros(X.shape[0])
# fold[train_idx] = -1

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold = np.full(len(y), -1)  # Inizializza tutto con -1 (default: train)

# Assegna i fold ai campioni
for fold_number, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    fold[val_idx] = fold_number  # Assegna il numero del fold ai campioni di validazione

ps = PredefinedSplit(fold)
ps.get_n_splits()

# for i, (train_index, test_index) in enumerate(ps.split()):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}")
#     print(f"  Test:  index={test_index}")

# take only x with index in val_idx
X_val = X.iloc[val_idx]
y_val = y[val_idx]
X_train = X.iloc[train_idx]
y_train = y[train_idx]

#Righe: 616983 #Colonne: 36
#Righe: 616983 #Colonne: 36


In [3]:
df1

Unnamed: 0,src_port,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,src_pkts,...,http_version,http_request_body_len,http_response_body_len,http_status_code,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice,type
0,53972,10502,tcp,-,0.000000,0.0,0,OTH,0,0,...,-,0,0,0,-,-,-,-,-,normal
1,37513,53,udp,dns,0.163608,47.0,423,SF,0,1,...,-,0,0,0,-,-,-,-,-,normal
2,2077,2077,tcp,-,0.208218,0.0,0,S0,0,120,...,-,0,0,0,-,-,-,-,-,normal
3,53972,10502,tcp,-,0.000000,0.0,0,OTH,0,0,...,-,0,0,0,-,-,-,-,-,normal
4,1880,47979,tcp,-,0.000000,0.0,0,OTH,0,1,...,-,0,0,0,-,-,-,-,-,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616997,53116,53,udp,dns,0.044893,84.0,424,SF,0,2,...,-,0,0,0,-,-,-,-,-,mitm
616998,57669,53,udp,dns,0.002957,84.0,436,SF,0,2,...,-,0,0,0,-,-,-,-,-,mitm
616999,54730,53,udp,dns,0.016624,58.0,178,SF,0,2,...,-,0,0,0,-,-,-,-,-,mitm
617000,59846,443,tcp,ssl,48.271568,3219.0,1212,SF,0,26,...,-,0,0,0,-,-,-,-,-,mitm


## Preprocess the dataset

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.compose import ColumnTransformer

import pickle

categorical_columns = X_train.select_dtypes(include=["object"]).columns.tolist()
numeric_columns = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
# boolean_columns = X_train.select_dtypes(include=["bool"]).columns.tolist()

ct = ColumnTransformer(
    [
        ("cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns),  # Trasforma le colonne categoriche
        # ("ordinal", OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False), categorical_columns),  # Trasforma le colonne categoriche
        ("scale", RobustScaler(), numeric_columns)  # Normalizza le colonne numeriche
    ],
    remainder="passthrough"  # Mantieni le altre colonne invariate
)
ct.set_output(transform="pandas")

ct = ct.fit(X_train)
with open(f"{filepath}/transformer/transformer.save", "wb") as f:
    pickle.dump(ct, f)

# train set
X_train = ct.transform(X_train)

# validation set
X_val = ct.transform(X_val)

# X
# X = ct.transform(X)

## Feature selection

In [13]:
rf = RandomForestClassifier(n_estimators=3, random_state=seed)
sfs = SequentialFeatureSelector(estimator=rf, direction="backward", n_features_to_select="auto", scoring="balanced_accuracy", n_jobs=12)

# sfs.fit(X_train, y_train)
# with open(f"{filepath}/transformer/sfs.save", "wb") as f:
#     pickle.dump(sfs, f)

with open("../TestModule/transformer/sfs.save", "rb") as f:
    sfs: SequentialFeatureSelector = pickle.load(f)

# train set
X_train = sfs.transform(X_train)

# validation set
X_val = sfs.transform(X_val)

# X
columns_to_keep = [column.split("__")[1] for column in sfs.get_feature_names_out()]
X = X[columns_to_keep]

## Apply K-Nearest Neighbour

In [9]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
import os

os.environ['OMP_NUM_THREADS'] = '12'

param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors to consider
    'knn__weights': ['uniform', 'distance'],  # Weight function
    'knn__metric': ['euclidean', 'manhattan', 'minkowski'],  # Distance metrics
    'knn__p': [1, 2],  # Minkowski parameter (1 for Manhattan, 2 for Euclidean)
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm to compute neighbors
    'knn__leaf_size': [30, 50, 70, 90, 100]
}

pipeline = Pipeline(
    [
        ('ct', ColumnTransformer(
            [
                ("cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns),  # Trasforma le colonne categoriche
                ("scale", RobustScaler(), numeric_columns)  # Normalizza le colonne numeriche
            ],
            remainder="passthrough"  # Mantieni le altre colonne invariate
        )),
        ('knn', KNeighborsClassifier())
    ],
    verbose=True
)

grid = RandomizedSearchCV(pipeline, param_grid, cv=ps, n_iter=10, random_state=seed, n_jobs=12, verbose=2, scoring="balanced_accuracy")
grid.fit(X, y)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)
best_knn = grid.best_estimator_
with open(f"{filepath}/models/knn.save", "wb") as file:
    pickle.dump(best_knn, file)

y_pred = best_knn.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
print("F1 score:", f1_score(y_val, y_pred, average="weighted"))
pd.DataFrame(grid.cv_results_)

# Accuracy: 0.9745696781095011
# Balanced
# accuracy: 0.8729493259671391
# F1
# score: 0.9743348139040916
# knn = KNeighborsClassifier(n_jobs=12).fit(X_train, y_train)
# y_pred = knn.predict(X_val)
# print("Accuracy:", accuracy_score(y_val, y_pred))
# print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
# print("F1 score:", f1_score(y_val, y_pred, average="weighted"))
# with open(f"{filepath}/models/knn2.save", "wb") as file:
#     pickle.dump(knn, file)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


ValueError: 
All the 100 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'proto'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\utils\_indexing.py", line 364, in _get_column_indices
    col_idx = all_columns.get_loc(col)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\pandas\core\indexes\base.py", line 3812, in get_loc
    raise KeyError(key) from err
KeyError: 'proto'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\pipeline.py", line 652, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\pipeline.py", line 586, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\pipeline.py", line 1540, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\compose\_column_transformer.py", line 992, in fit_transform
    self._validate_column_callables(X)
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\compose\_column_transformer.py", line 551, in _validate_column_callables
    transformer_to_input_indices[name] = _get_column_indices(X, columns)
                                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\miaob\PycharmProjects\DataAnalyticsProject\Lib\site-packages\sklearn\utils\_indexing.py", line 372, in _get_column_indices
    raise ValueError("A given column is not a column of the dataframe") from e
ValueError: A given column is not a column of the dataframe


- Performance: 0.9187280941672238 con minmax
- Performance: 0.9549626207986386 senza minmax
- Accuracy: 0.9745696781095011 Balanced accuracy: 0.8729493259671391 F1 score: 0.9743348139040916

## Apply Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'rf__n_estimators': [100, 200, 300],  # Number of trees in the forest
    'rf__max_depth': [None, 10, 20, 30, 50],  # Maximum depth of the tree
    'rf__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'rf__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at a leaf node
    'rf__max_features': ['sqrt', 'log2', None],  # Number of features to consider when looking for the best split
    'rf__bootstrap': [True, False],  # Whether bootstrap samples are used when building trees
    'rf__criterion': ['gini', 'entropy', 'log_loss'],  # Split quality measure
    # 'class_weight': ['balanced', 'balanced_subsample', None]  # Weights associated with classes
}

pipeline = Pipeline(
    [
        ('ct', ColumnTransformer(
            [
                ("cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [c for c in categorical_columns if c in columns_to_keep]),  # Trasforma le colonne categoriche
                ("scale", RobustScaler(), [n for n in numeric_columns if n in columns_to_keep])  # Normalizza le colonne numeriche
            ],
            remainder="passthrough"  # Mantieni le altre colonne invariate
        )),
        ('rf', RandomForestClassifier(random_state=seed, class_weight='balanced'))
    ],
    verbose=True
)

grid = BayesSearchCV(pipeline, param_grid, cv=ps, n_iter=10, random_state=seed, verbose=2, n_jobs=12, scoring="balanced_accuracy")
grid.fit(X, y)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)
best_rf = grid.best_estimator_.named_steps['rf']
with open(f"{filepath}/models/rf.save", "wb") as file:
    pickle.dump(best_rf, file)

y_pred = best_rf.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
print("F1 score:", f1_score(y_val, y_pred, average="weighted"))

pd.DataFrame(grid.cv_results_)

# rf = RandomForestClassifier(random_state=seed, class_weight="balanced", n_jobs=12).fit(X_train, y_train)
# y_pred = rf.predict(X_val)
# print("Accuracy:", accuracy_score(y_val, y_pred))
# print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
# print("F1 score:", f1_score(y_val, y_pred, average="weighted"))
#
# with open(f"{filepath}/models/rf.save", "wb") as file:
#     pickle.dump(rf, file)


Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Pipeline] ................ (step 1 of 2) Processing ct, total=   1.3s
[Pipeline] ................ (step 2 of 2) Processing rf, total= 5.4min
Best parameters: OrderedDict({'rf__bootstrap': False, 'rf__criterion': 'log_loss', 'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 4, 'rf__min_samples_split': 5, 'rf__n_estimators': 200})
Best score: 0.9847066315409794


ValueError: Specifying the columns using strings is only supported for dataframes.

0.9989123986553292 senza scaling

0.9990112715048448 con scaling

In [16]:
y_pred = best_rf.named_steps['rf'].predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
print("F1 score:", f1_score(y_val, y_pred, average="weighted"))

pd.DataFrame(grid.cv_results_)

Accuracy: 0.9777302343674025
Balanced accuracy: 0.9298636552710237
F1 score: 0.9746134675000045


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rf__bootstrap,param_rf__criterion,param_rf__max_depth,param_rf__max_features,param_rf__min_samples_leaf,param_rf__min_samples_split,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,410.814698,5.744129,2.137974,0.420968,True,log_loss,50.0,sqrt,4,5,...,0.982927,0.982374,0.987563,0.986591,0.984383,0.979646,0.990447,0.984365,0.003386,2
1,755.719709,5.586436,0.893432,0.100258,False,log_loss,10.0,,4,2,...,0.970161,0.972567,0.975697,0.979304,0.974837,0.969273,0.979723,0.974588,0.003226,7
2,220.401391,1.727555,1.274765,0.203127,True,log_loss,,log2,1,5,...,0.977016,0.978002,0.984574,0.980239,0.981624,0.97407,0.98778,0.979777,0.004227,5
3,2086.622221,16.019556,3.028257,0.548375,False,gini,20.0,,2,2,...,0.961819,0.965738,0.966049,0.968619,0.971878,0.968714,0.980861,0.969528,0.005008,9
4,1669.297056,12.823786,1.961957,0.308829,False,entropy,20.0,,4,10,...,0.970522,0.97102,0.97668,0.978846,0.973243,0.972875,0.979994,0.974875,0.00311,6
5,617.934965,5.908322,2.323175,0.446837,False,log_loss,,sqrt,4,5,...,0.983545,0.983855,0.98775,0.986899,0.984435,0.978459,0.990746,0.984707,0.003464,1
6,1369.075748,10.225535,1.421042,0.192155,False,log_loss,10.0,,2,5,...,0.970195,0.972627,0.975784,0.979368,0.974968,0.969509,0.97622,0.974329,0.002773,8
7,1417.268032,12.712804,1.798563,0.226206,False,log_loss,20.0,,1,2,...,0.966015,0.954714,0.973728,0.967338,0.964727,0.962777,0.964993,0.964534,0.005308,10
8,241.340702,2.065625,1.055117,0.122203,False,log_loss,50.0,log2,2,2,...,0.982077,0.984345,0.985129,0.980135,0.977419,0.97674,0.990562,0.982076,0.003774,4
9,174.357705,1.637072,1.003581,0.134709,True,log_loss,30.0,sqrt,2,2,...,0.98182,0.984466,0.986173,0.9838,0.984594,0.97893,0.989195,0.983546,0.002961,3


## Apply Support Vector Classifier with HP tuning

In [4]:
from sklearn.linear_model import SGDClassifier

# param_grid = {'C': [0.1, 1, 10, 100, 1000],
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf']}

param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
    'loss': ["hinge"],  # "log_loss", "modified_huber", "squared_hinge", "perceptron"
    'penalty': ["l2", "l1", "elasticnet"],
    'learning_rate': ["optimal", "invscaling", "adaptive"],
    'eta0': [0.01, 0.1, 1, 10],
    'power_t': [0.1, 0.5, 1, 5],
    'average': [True, False]
}

# param_grid = {
#     'C': (0.1, 100, 'log-uniform'),
#     'gamma': (0.001, 10, 'log-uniform'),
#     'kernel': ["rbf"]
# }

grid = BayesSearchCV(
    SGDClassifier(random_state=seed, class_weight="balanced", verbose=0, n_jobs=12),
    param_grid,
    n_iter=10,  # Numero massimo di iterazioni
    cv=ps,
    verbose=1,
    random_state=seed,
    scoring="balanced_accuracy"
)
grid.fit(X, y)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)
best_svm = grid.best_estimator_
with open(f"{filepath}/models/svm.save", "wb") as file:
    pickle.dump(best_svm, file)
y_pred = best_svm.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
print("F1 score:", f1_score(y_val, y_pred, average="weighted"))

pd.DataFrame(grid.cv_results_)

# svm = SVC(random_state=seed, class_weight="balanced", verbose=1, C=100).fit(X_train[:10000], y_train[:10000])
# svm = SGDClassifier(random_state=seed, class_weight="balanced", verbose=2, n_jobs=12).fit(X_train, y_train)
# y_pred = svm.predict(X_val)
# print("Accuracy:", accuracy_score(y_val, y_pred))
# print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
# print("F1 score:", f1_score(y_val, y_pred, average="weighted"))
#
# with open(f"{filepath}/models/svm.save", "wb") as file:
#     pickle.dump(svm, file)



Fitting 10 folds for each of 1 candidates, totalling 10 fits




Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Best parameters: OrderedDict({'alpha': 0.0001, 'average': False, 'eta0': 1, 'learning_rate': 'optimal', 'loss': 'modified_huber', 'penalty': 'l2', 'power_t': 0.5})
Best score: 0.6896217820090957
Accuracy: 0.6394372589062854
Balanced accuracy: 0.5549949558990029
F1 score: 0.6872223688164277


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_average,param_eta0,param_learning_rate,param_loss,param_penalty,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,265.175159,19.581271,0.010093,0.000831,0.01,False,10.0,optimal,squared_hinge,l1,...,0.353302,0.229067,0.207981,0.266864,0.480048,0.618529,0.191627,0.341619,0.133579,6
1,12.501902,0.930198,0.010171,0.001094,1.0,False,0.1,adaptive,perceptron,l2,...,0.697267,0.529499,0.661853,0.717868,0.709812,0.59164,0.651383,0.629937,0.067504,3
2,3.485439,0.557518,0.009635,0.000862,0.01,False,0.01,invscaling,hinge,l1,...,0.619842,0.695728,0.617492,0.656893,0.573519,0.625531,0.609339,0.594834,0.061251,4
3,28.110854,1.598656,0.009873,0.00131,1.0,True,1.0,adaptive,modified_huber,l2,...,0.002123,0.003096,0.002869,0.002626,0.002593,0.002804,0.002755,0.002551,0.000564,10
4,24.963539,2.173775,0.011439,0.003067,0.1,True,1.0,adaptive,perceptron,elasticnet,...,0.030763,0.034134,0.031136,0.033048,0.030649,0.029774,0.035949,0.031991,0.001989,9
5,3.406497,3.480001,0.011394,0.002534,0.1,False,0.01,optimal,perceptron,l1,...,0.001572,0.031217,0.02086,0.001572,0.001556,0.019984,0.001556,0.047422,0.099491,8
6,20.167309,1.075677,0.009573,0.000723,0.1,False,0.1,adaptive,modified_huber,l1,...,0.549029,0.566064,0.564573,0.560261,0.548543,0.557571,0.527424,0.555352,0.011657,5
7,18.337396,1.31213,0.009704,0.000781,0.01,False,0.1,adaptive,log_loss,l2,...,0.645402,0.643684,0.640912,0.642744,0.647444,0.639308,0.639729,0.64317,0.002671,2
8,17.128403,1.893942,0.0095,0.001284,1.0,False,10.0,invscaling,log_loss,l2,...,0.1325,0.024571,0.023891,0.181546,0.024588,0.001021,0.355927,0.130368,0.118101,7
9,8.357687,0.781089,0.009604,0.000704,0.0001,False,1.0,optimal,modified_huber,l2,...,0.692356,0.655289,0.728581,0.635628,0.74453,0.703297,0.583601,0.689622,0.048224,1
