## Load the dataset

In [2]:
import pickle

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import balanced_accuracy_score, f1_score, accuracy_score
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from skopt import BayesSearchCV
from skopt.space import Integer

# Where artifacts (encoders, models, results) are written.
save_in_test_folder = True
if save_in_test_folder:
    filepath = "../TestModule"
else:
    filepath = "."

seed = 42
FILENAME = "dataset/train_dataset.csv"

# Load the raw training data.
df1 = pd.read_csv(FILENAME, sep=",", low_memory=False)

# Keep every column except the targets/identifiers listed below as a feature.
features = list(df1.columns)
features_to_remove = ["label", "ts", "type", "http_referrer"]
features = [feature for feature in features if feature not in features_to_remove]
# .copy() makes df1 an independent frame, so the column assignments below
# never write into a slice of the original DataFrame.
df1 = df1[features + ["type"]].copy()

# Coerce src_bytes to numbers; values that cannot be parsed become NaN ...
df1["src_bytes"] = pd.to_numeric(df1["src_bytes"], errors='coerce')
# ... and the corresponding rows are dropped.
df1 = df1.dropna(subset=["src_bytes"])
# Replace the column (instead of writing through .loc[:, col], which keeps the
# existing float dtype) so the integer conversion actually takes effect.
# int64 is spelled out because the dtype-based column selection later in the
# notebook looks for "int64"/"float64" explicitly.
df1["src_bytes"] = df1["src_bytes"].astype("int64")

print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))
df1 = df1.dropna()
print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))

X = df1[features]
y = df1["type"]

# Encode the string class labels as integers and persist the encoder.
le = preprocessing.LabelEncoder()
le.fit(y)
with open(f"{filepath}/transformer/target_encoder.save", "wb") as f:
    pickle.dump(le, f)

y = le.transform(y)

# Stratified 80/20 hold-out split, expressed as positional row indices into X / y.
indices = np.arange(X.shape[0])
train_idx, val_idx = train_test_split(indices, test_size=0.2, stratify=y, random_state=seed)

# Assign every row of X to one of n_splits stratified folds; `fold[i]` is the
# fold in which row i is used for validation.
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
fold = np.full(len(y), -1)  # -1 means "never used as validation"

# The loop variables deliberately do NOT reuse the names train_idx / val_idx:
# doing so would overwrite the hold-out split above with the last CV fold.
for fold_number, (_, fold_val_idx) in enumerate(skf.split(X, y)):
    fold[fold_val_idx] = fold_number

ps = PredefinedSplit(fold)

# Materialise the hold-out split.
X_val = X.iloc[val_idx]
y_val = y[val_idx]
X_train = X.iloc[train_idx]
y_train = y[train_idx]

#Righe: 616983 #Colonne: 43
#Righe: 616983 #Colonne: 43


In [2]:
# Inspect the cleaned frame; the rendered output elides the middle rows and columns.
df1

Unnamed: 0,src_port,dst_port,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,src_pkts,...,http_version,http_request_body_len,http_response_body_len,http_status_code,http_orig_mime_types,http_resp_mime_types,weird_name,weird_addl,weird_notice,type
0,53972,10502,tcp,-,0.000000,0.0,0,OTH,0,0,...,-,0,0,0,-,-,-,-,-,normal
1,37513,53,udp,dns,0.163608,47.0,423,SF,0,1,...,-,0,0,0,-,-,-,-,-,normal
2,2077,2077,tcp,-,0.208218,0.0,0,S0,0,120,...,-,0,0,0,-,-,-,-,-,normal
3,53972,10502,tcp,-,0.000000,0.0,0,OTH,0,0,...,-,0,0,0,-,-,-,-,-,normal
4,1880,47979,tcp,-,0.000000,0.0,0,OTH,0,1,...,-,0,0,0,-,-,-,-,-,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
616997,53116,53,udp,dns,0.044893,84.0,424,SF,0,2,...,-,0,0,0,-,-,-,-,-,mitm
616998,57669,53,udp,dns,0.002957,84.0,436,SF,0,2,...,-,0,0,0,-,-,-,-,-,mitm
616999,54730,53,udp,dns,0.016624,58.0,178,SF,0,2,...,-,0,0,0,-,-,-,-,-,mitm
617000,59846,443,tcp,ssl,48.271568,3219.0,1212,SF,0,26,...,-,0,0,0,-,-,-,-,-,mitm


## Preprocess the dataset

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, RobustScaler, StandardScaler, OneHotEncoder
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.compose import ColumnTransformer

import pickle

# Partition the feature columns by dtype: object columns are encoded,
# int64/float64 columns are standardised.
categorical_columns = list(X_train.select_dtypes(include=["object"]).columns)
numeric_columns = list(X_train.select_dtypes(include=["int64", "float64"]).columns)

# The first step is a one-hot encoder even though it is registered under the
# name "ordinal"; the name is kept because it is part of the pickled object.
one_hot = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=True)
standardizer = StandardScaler()
ct = ColumnTransformer(
    transformers=[
        ("ordinal", one_hot, categorical_columns),
        ("scale", standardizer, numeric_columns),
    ],
    remainder="passthrough",  # columns in neither list pass through unchanged
)

# Fit on the training split only, then persist the fitted transformer.
ct.fit(X_train)
with open(f"{filepath}/transformer/transformer.save", "wb") as transformer_file:
    pickle.dump(ct, transformer_file)

# Replace both splits with their transformed versions (train first, then validation).
X_train, X_val = ct.transform(X_train), ct.transform(X_val)

## Apply K-Nearest Neighbour

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from skopt.space import Integer, Categorical, Real

# Bayesian hyper-parameter search for a K-Nearest-Neighbours classifier.
param_grid = {
    'knn__n_neighbors': Integer(3, 15),  # Number of neighbors to consider
    'knn__weights': Categorical(['uniform', 'distance']),  # Weight function
    'knn__metric': Categorical(['euclidean', 'manhattan', 'minkowski']),  # Distance metrics
    'knn__p': Categorical([1, 2]),  # Minkowski parameter (1 for Manhattan, 2 for Euclidean)
    'knn__algorithm': Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),  # Algorithm to compute neighbors
    'knn__leaf_size': Integer(10, 60),
}

# The pipeline carries its own preprocessing, so it must be fed RAW feature rows.
pipeline = Pipeline(
    [
        ('ct', ColumnTransformer(
            [
                ("cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns),  # encode categorical columns
                ("scale", StandardScaler(), numeric_columns)  # standardise numeric columns
            ],
            remainder="passthrough"  # keep any other column unchanged
        )),
        ('knn', KNeighborsClassifier())
    ],
    verbose=True
)

# Search on the training rows only, so the validation rows stay unseen.
# X / X_val cannot be used directly here: X also contains the validation rows,
# and X_val has already been transformed by a different (one-hot) transformer.
X_train_raw = X.iloc[train_idx]
X_val_raw = X.iloc[val_idx]
# Reuse the precomputed fold assignment, restricted to the training rows.
ps_train = PredefinedSplit(fold[train_idx])

grid = BayesSearchCV(pipeline, param_grid, cv=ps_train, n_iter=10, random_state=seed, n_jobs=12, verbose=2, scoring="balanced_accuracy")
grid.fit(X_train_raw, y_train)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)
# NOTE(review): only the classifier step is saved. It expects input produced by
# this pipeline's 'ct' step (ordinal encoding), not by the one-hot transformer
# saved in transformer/transformer.save -- confirm what the consumer loads.
best_knn = grid.best_estimator_.named_steps['knn']
with open(f"{filepath}/models/knn.save", "wb") as file:
    pickle.dump(best_knn, file)

# Evaluate through the full fitted pipeline, so the validation rows get the
# same preprocessing the classifier was trained with.
y_pred = grid.best_estimator_.predict(X_val_raw)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
print("F1 score:", f1_score(y_val, y_pred, average="weighted"))

results = pd.DataFrame(grid.cv_results_)
results.to_csv(f"{filepath}/results/knn_results.csv")

# Accuracy: 0.9745696781095011
# Balanced
# accuracy: 0.8729493259671391
# F1
# score: 0.9743348139040916
# knn = KNeighborsClassifier(n_jobs=12).fit(X_train, y_train)
# y_pred = knn.predict(X_val)
# print("Accuracy:", accuracy_score(y_val, y_pred))
# print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
# print("F1 score:", f1_score(y_val, y_pred, average="weighted"))
# with open(f"{filepath}/models/knn2.save", "wb") as file:
#     pickle.dump(knn, file)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


KeyboardInterrupt: 

- Performance: 0.9187280941672238 con minmax
- Performance: 0.9549626207986386 senza minmax
- Accuracy: 0.9745696781095011 Balanced accuracy: 0.8729493259671391 F1 score: 0.9743348139040916

## Apply Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Bayesian hyper-parameter search for a Random Forest classifier.
param_grid = {
    'rf__n_estimators': Integer(50, 400),  # Number of trees in the forest
    # Lower bound is 1: RandomForestClassifier rejects max_depth=0.
    'rf__max_depth': Integer(1, 50),  # Maximum depth of the tree
    # Lower bound is 2: an integer min_samples_split below 2 is rejected.
    'rf__min_samples_split': Integer(2, 15),  # Minimum number of samples required to split a node
    'rf__min_samples_leaf': Integer(1, 10),  # Minimum number of samples required at a leaf node
    'rf__max_features': Categorical(['sqrt', 'log2', None]),  # Number of features to consider when looking for the best split
    'rf__bootstrap': Categorical([True, False]),  # Whether bootstrap samples are used when building trees
    'rf__criterion': Categorical(['gini', 'entropy', 'log_loss']),  # Split quality measure
}

# The pipeline carries its own preprocessing, so it must be fed RAW feature rows.
pipeline = Pipeline(
    [
        ('ct', ColumnTransformer(
            [
                ("cat", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_columns),  # encode categorical columns
                ("scale", StandardScaler(), numeric_columns)  # standardise numeric columns
            ],
            remainder="passthrough"  # keep any other column unchanged
        )),
        ('rf', RandomForestClassifier(random_state=seed, class_weight='balanced'))
    ],
    verbose=True
)

# Search on the training rows only, so the validation rows stay unseen.
# X / X_val cannot be used directly here: X also contains the validation rows,
# and X_val has already been transformed by a different (one-hot) transformer.
X_train_raw = X.iloc[train_idx]
X_val_raw = X.iloc[val_idx]
# Reuse the precomputed fold assignment, restricted to the training rows.
ps_train = PredefinedSplit(fold[train_idx])

grid = BayesSearchCV(pipeline, param_grid, cv=ps_train, n_iter=10, random_state=seed, verbose=2, n_jobs=12, scoring="balanced_accuracy")
grid.fit(X_train_raw, y_train)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)
# NOTE(review): only the classifier step is saved. It expects input produced by
# this pipeline's 'ct' step (ordinal encoding), not by the one-hot transformer
# saved in transformer/transformer.save -- confirm what the consumer loads.
best_rf = grid.best_estimator_.named_steps['rf']
with open(f"{filepath}/models/rf.save", "wb") as file:
    pickle.dump(best_rf, file)

# Evaluate through the full fitted pipeline, so the validation rows get the
# same preprocessing the classifier was trained with.
y_pred = grid.best_estimator_.predict(X_val_raw)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
print("F1 score:", f1_score(y_val, y_pred, average="weighted"))

results = pd.DataFrame(grid.cv_results_)
results.to_csv(f"{filepath}/results/rf_results.csv")


# rf = RandomForestClassifier(random_state=seed, class_weight="balanced", n_jobs=12).fit(X_train, y_train)
# y_pred = rf.predict(X_val)
# print("Accuracy:", accuracy_score(y_val, y_pred))
# print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
# print("F1 score:", f1_score(y_val, y_pred, average="weighted"))
#
# with open(f"{filepath}/models/rf.save", "wb") as file:
#     pickle.dump(rf, file)


Fitting 10 folds for each of 1 candidates, totalling 10 fits


KeyboardInterrupt: 

0.9989123986553292 senza scaling

0.9990112715048448 con scaling

## Apply Support Vector Classifier with HP tuning

In [3]:

from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import Nystroem

# param_grid = {'C': [0.1, 1, 10, 100, 1000],
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf']}

# Bayesian hyper-parameter search for an approximate kernel SVM:
# Nystroem feature map followed by a hinge-loss SGD linear classifier.
param_grid = {
    'svm__alpha': Real(1e-4,1),
    'svm__loss': Categorical(["hinge"]),  # "log_loss", "modified_huber", "squared_hinge", "perceptron"
    'svm__penalty': Categorical(["l2", "l1", "elasticnet"]),
    'svm__learning_rate': Categorical(["optimal", "invscaling", "adaptive"]),
    'svm__eta0': Real(1e-2,10),
    'svm__power_t': Real(1e-1,5),
    'svm__average': Categorical([True, False]),
    'svm__n_iter_no_change': Integer(5,20),
    'nys__gamma': Real(1e-3,1),
    'nys__n_components': Integer(100, 400)
}

# The pipeline carries its own preprocessing, so it must be fed RAW feature rows.
pipeline = Pipeline(
    [
        ('ct', ColumnTransformer(
            [
                ("ordinal", OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=True), categorical_columns),  # one-hot encode categorical columns
                ("scale", StandardScaler(), numeric_columns)  # standardise numeric columns
            ],
            remainder="passthrough"  # keep any other column unchanged
        )),
        ('nys',Nystroem(random_state=seed, n_jobs=12)),
        ('svm', SGDClassifier(random_state=seed, class_weight="balanced", verbose=0, n_jobs=12))
    ],
    verbose=True
)

# Search on the training rows only, so the validation rows stay unseen.
# X / X_val cannot be used directly here: X also contains the validation rows,
# and X_val has been column-transformed but not passed through the Nystroem map.
X_train_raw = X.iloc[train_idx]
X_val_raw = X.iloc[val_idx]
# Reuse the precomputed fold assignment, restricted to the training rows.
ps_train = PredefinedSplit(fold[train_idx])

grid = BayesSearchCV(
    pipeline,
    param_grid,
    n_iter=10,  # number of parameter settings sampled
    cv=ps_train,
    verbose=1,
    random_state=seed,
    scoring="balanced_accuracy"
)
grid.fit(X_train_raw, y_train)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)
best_svm = grid.best_estimator_.named_steps['svm']
with open(f"{filepath}/models/svm.save", "wb") as file:
    pickle.dump(best_svm, file)
# The linear classifier only accepts Nystroem-mapped features, so the fitted
# feature map is persisted next to it; without it svm.save cannot be applied.
with open(f"{filepath}/transformer/nystroem_svm.save", "wb") as file:
    pickle.dump(grid.best_estimator_.named_steps['nys'], file)

# Evaluate through the full fitted pipeline (column transformer -> Nystroem -> SVM).
y_pred = grid.best_estimator_.predict(X_val_raw)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
print("F1 score:", f1_score(y_val, y_pred, average="weighted"))

results = pd.DataFrame(grid.cv_results_)
results.to_csv(f"{filepath}/results/svm_results.csv")


# np.random.seed = seed
# n_estimators = 40

#
# # concatenate X_train and y_train
# Xy_train = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
# # shuffle X_train and y_train
# np.random.shuffle(Xy_train)
#
# # split X_train and y_train into X_train and y_train
# X_train_shuffled = Xy_train[:, :-1]
# y_train_shuffled = Xy_train[:, -1]

# clf = BaggingClassifier(SVC(class_weight='balanced', verbose=0, C=100), max_samples=1.0 / n_estimators, n_estimators=n_estimators, n_jobs=12, random_state=seed)
# clf.fit(X_train, y_train)
# svm = SVC(random_state=seed, gamma="auto", class_weight="balanced", verbose=1, C=100, max_iter=10000).fit(X_train, y_train)




# feature_map_nystroem = Nystroem(gamma=.2,
#                                 random_state=seed,
#                                 n_components=300,
#                                 n_jobs=12)
# ny = feature_map_nystroem.fit(X_train)
# with open(f"{filepath}/transformer/nystroem_svm.save", "wb") as f:
#     pickle.dump(ny, f)
#
# X_train_transformed = ny.transform(X_train)
# X_val_transformed = ny.transform(X_val)

# # clf = svm.LinearSVC(random_state=seed, class_weight="balanced", verbose=2)
# clf = SGDClassifier(random_state=seed, class_weight="balanced", learning_rate="adaptive", eta0=0.1, verbose=2, n_jobs=12)
# clf.fit(X_train_transformed, y_train)
#
#
# y_pred = clf.predict(X_val_transformed)
# print("Accuracy:", accuracy_score(y_val, y_pred))
# print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
# print("F1 score:", f1_score(y_val, y_pred, average="weighted"))
#
# with open(f"{filepath}/models/svm.save", "wb") as file:
#     pickle.dump(clf, file)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Pipeline] ................ (step 1 of 3) Processing ct, total=   4.3s
[Pipeline] ............... (step 2 of 3) Processing nys, total=   8.6s
[Pipeline] ............... (step 3 of 3) Processing svm, total=  23.5s
[Pipeline] ................ (step 1 of 3) Processing ct, total=   3.8s
[Pipeline] ............... (step 2 of 3) Processing nys, total=   8.1s
[Pipeline] ............... (step 3 of 3) Processing svm, total=  25.8s
[Pipeline] ................ (step 1 of 3) Processing ct, total=   4.0s
[Pipeline] ............... (step 2 of 3) Processing nys, total=   7.9s
[Pipeline] ............... (step 3 of 3) Processing svm, total=  24.8s
[Pipeline] ................ (step 1 of 3) Processing ct, total=   5.3s
[Pipeline] ............... (step 2 of 3) Processing nys, total=  11.5s
[Pipeline] ............... (step 3 of 3) Processing svm, total=  25.2s
[Pipeline] ................ (step 1 of 3) Processing ct, total=   3.7s
[Pipeline] .....

KeyboardInterrupt: 

Risultati Bagging con tutte le colonne e shuffle, 40 estimator e C=100
- Accuracy: 0.7577231028558462
- Balanced accuracy: 0.7412778169637153
- F1 score: 0.7809132510222976

Risultati Bagging con tutte le colonne no shuffle, 40 estimator e C=100
- Accuracy: 0.7598949722843528
- Balanced accuracy: 0.7390178640894159
- F1 score: 0.7839818912738935

Risultati con Linear svm C=1 e nystroem gamma=.2, n_components=300 con scaling sbagliato
- Accuracy: 0.9407436221595513
- Balanced accuracy: 0.8814098625404746
- F1 score: 0.9439970075746738

Risultati con Linear svm C=1 e nystroem gamma=.2, n_components=300 con scaling giusto
- Accuracy: 0.8903854257836559
- Balanced accuracy: 0.7418163296742876
- F1 score: 0.9033155670832028