## Load the dataset

In [14]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import PredefinedSplit
from sklearn import preprocessing

# Output location for the fitted artefacts: the TestModule tree when the flag
# is on, otherwise the current working directory.
save_in_test_folder = True
filepath = "../TestModule" if save_in_test_folder else "."

# Seed passed to the random_state arguments used in this notebook.
seed = 42

# CSV file holding the training dataset.
FILENAME = "dataset/train_dataset.csv"

# Prepare the training data.
df1 = pd.read_csv(FILENAME, sep=",", low_memory=False)

# Model features are all columns except the ones listed below; the "type"
# column is re-attached because it is the prediction target (see `y`).
features_to_remove = ["label", "ts", "src_ip", "dst_ip", "dns_query", "ssl_subject", "ssl_issuer", "http_uri", "type", "http_referrer", "http_user_agent"]
features = [feature for feature in df1.columns if feature not in features_to_remove]
df1 = df1[features + ["type"]]

# Clean "src_bytes":
#   1. parse to numbers, turning unparsable entries into NaN,
#   2. drop the rows where parsing failed,
#   3. cast the survivors to a fixed-width integer.
# The cast is done with .astype on the frame (not `df.loc[:, col] = ...`):
# a `.loc` column assignment may write the values in place and silently keep
# the previous float dtype, and plain `int` is platform-dependent. "int64"
# also matches the dtype filter used later to pick the numeric columns.
df1 = (
    df1.assign(src_bytes=pd.to_numeric(df1["src_bytes"], errors="coerce"))
    .dropna(subset=["src_bytes"])
    .astype({"src_bytes": "int64"})
)

# Report the frame size before and after removing rows with any missing value.
print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))
df1 = df1.dropna()
print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))

# Feature matrix and raw (string) target.
X = df1[features]
y = df1["type"]

# Fit the target encoder on the raw class labels and persist it, so the same
# label <-> integer mapping can be reloaded from disk.
le = preprocessing.LabelEncoder().fit(y)
with open(f"{filepath}/transformer/target_encoder.save", "wb") as encoder_file:
    pickle.dump(le, encoder_file)

# From here on `y` holds the integer class codes.
y = le.transform(y)

# --- Hold-out split ----------------------------------------------------------
# Stratified 80/20 split over row positions; these indices define
# X_train / X_val at the bottom of the cell.
indices = np.arange(X.shape[0])
train_idx, val_idx = train_test_split(
    indices, test_size=0.2, stratify=y, random_state=seed
)

# --- Predefined stratified folds for the hyper-parameter searches ------------
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# fold[i] = number of the fold in which sample i is used for validation.
# -1 is the "never used for validation" marker of PredefinedSplit; every
# entry is overwritten below because the k folds cover all samples.
fold = np.full(len(y), -1)

# The loop variable must NOT be named `val_idx`: doing so overwrites the
# hold-out indices computed above, and X_val would then be built from the last
# CV fold, which overlaps train_idx (validation data leaking into training).
for fold_number, (_, fold_val_idx) in enumerate(skf.split(X, y)):
    fold[fold_val_idx] = fold_number

ps = PredefinedSplit(fold)

# Sanity checks: expected number of folds, and a disjoint hold-out split.
assert ps.get_n_splits() == n_splits
assert np.intersect1d(train_idx, val_idx).size == 0

# Materialise the hold-out split (X is a DataFrame, y a numpy array).
X_train = X.iloc[train_idx]
y_train = y[train_idx]
X_val = X.iloc[val_idx]
y_val = y[val_idx]

#Righe: 616983 #Colonne: 36
#Righe: 616983 #Colonne: 36


In [None]:
# Display the cleaned dataframe for a quick visual check.
df1

## Preprocess the dataset

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.impute import KNNImputer, SimpleImputer

import pickle

# Partition the feature columns by dtype: object columns are handled as
# categorical, int64/float64 columns as numeric. Any other dtype is left to
# the transformer's remainder.
# boolean_columns = X_train.select_dtypes(include=["bool"]).columns.tolist()
categorical_columns = list(X_train.select_dtypes(include=["object"]).columns)
numeric_columns = list(X_train.select_dtypes(include=["int64", "float64"]).columns)

ct = ColumnTransformer(
    transformers=[
        # Categorical columns -> ordinal codes; categories unseen at fit time map to -1.
        (
            "cat",
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            categorical_columns,
        ),
        # Alternative kept for reference:
        # ("ordinal", OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False), categorical_columns),
        # Numeric columns -> standardised.
        ("scale", StandardScaler(), numeric_columns),
    ],
    # Columns in neither list are forwarded unchanged.
    remainder="passthrough",
)
ct.set_output(transform="pandas")

# Fit on the training split only, then persist the fitted transformer.
ct.fit(X_train)
with open(f"{filepath}/transformer/transformer.save", "wb") as transformer_file:
    pickle.dump(ct, transformer_file)

# Apply the fitted transformer to the training split, the validation split
# and the full feature matrix (in that order).
X_train, X_val, X = (ct.transform(frame) for frame in (X_train, X_val, X))

In [16]:
# The selector is loaded from disk below rather than refitted. The commented
# lines show a fit recipe kept for reference:
# rf = RandomForestClassifier(n_estimators=3, random_state=seed)
# sfs = SequentialFeatureSelector(estimator=rf, direction="backward", n_features_to_select="auto", scoring="balanced_accuracy")
# sfs.fit(X_train, y_train)

# Columns carrying the "remainder__" prefix are renamed to the "scale__"
# prefix in all three frames.
# NOTE(review): presumably needed so the column names match the ones the
# saved selector was fitted on -- confirm.
X_train.columns = X_train.columns.str.replace("remainder__", "scale__", regex=False)
X_val.columns = X_val.columns.str.replace("remainder__", "scale__", regex=False)
X.columns = X.columns.str.replace("remainder__", "scale__", regex=False)

# Load the previously fitted selector. The path is fixed to the TestModule
# tree and does not follow `filepath`.
# NOTE: pickle.load executes arbitrary code; only load trusted files.
with open("../TestModule/transformer/sfs.save", "rb") as selector_file:
    sfs: SequentialFeatureSelector = pickle.load(selector_file)

# Keep only the selected features in the training split, the validation
# split and the full feature matrix.
X_train = sfs.transform(X_train)
X_val = sfs.transform(X_val)
X = sfs.transform(X)

In [None]:
# Show what the selector keeps: the selected feature names, then the boolean
# support mask.
selected_names = sfs.get_feature_names_out()
print(selected_names)
support_mask = sfs.get_support()
print(support_mask)

# Persist the selector under the active output folder.
with open(f"{filepath}/transformer/sfs.save", "wb") as selector_file:
    pickle.dump(sfs, selector_file)

In [None]:
from sklearn import decomposition
from sklearn import discriminant_analysis

# Supervised projection onto 9 components with Linear Discriminant Analysis.
# The variable and the output file are named "pca", but the fitted object is
# an LDA; the PCA alternative is kept commented out for reference:
# pca = decomposition.PCA(n_components=0.95, random_state=seed)
pca = discriminant_analysis.LinearDiscriminantAnalysis(n_components=9)
pca.set_output(transform="pandas")

# Fit on the training split (LDA uses the labels), then persist the model.
pca.fit(X_train, y_train)
with open(f"{filepath}/transformer/pca.save", "wb") as projection_file:
    pickle.dump(pca, projection_file)

# Project the training split, the validation split and the full matrix.
X_train_pca, X_val_pca, X_pca = (
    pca.transform(part) for part in (X_train, X_val, X)
)

# The projected data is currently NOT substituted for the original features:
# X_train = X_train_pca
# X_val = X_val_pca
# X = X_pca

## Apply K-Nearest Neighbour

In [13]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
import os

# Cap the OpenMP thread count.
# NOTE(review): this is set after numpy/scikit-learn were already imported by
# earlier cells -- confirm that the limit actually takes effect.
os.environ['OMP_NUM_THREADS'] = '4'
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate hyper-parameters for the k-NN classifier.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11],                       # neighbours consulted per query
    weights=['uniform', 'distance'],                    # vote weighting
    metric=['euclidean', 'manhattan', 'minkowski'],     # distance metric
    p=[1, 2],                                           # Minkowski power (1 = Manhattan, 2 = Euclidean)
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],  # neighbour-search structure
)

# Randomised search: 2 sampled configurations, each scored on the predefined folds.
grid = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=param_grid,
    n_iter=2,
    cv=ps,
    random_state=seed,
)
grid.fit(X, y)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

# Persist the best estimator found by the search.
best_knn = grid.best_estimator_
with open(f"{filepath}/models/knn.save", "wb") as model_file:
    pickle.dump(best_knn, model_file)

# Untuned baseline kept for reference:
# knn = KNeighborsClassifier().fit(X_train, y_train)
# print("Performance:", knn.score(X_val, y_val))
# with open( f"{filepath}/models/knn.save", "wb") as file:
#     pickle.dump(knn, file)

# Per-candidate cross-validation results (rendered as the cell output).
pd.DataFrame(grid.cv_results_)

KeyboardInterrupt: 

- Performance: 0.9187280941672238 with MinMax scaling
- Performance: 0.9549626207986386 without MinMax scaling

## Apply Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV 

# Search space for the random forest; every entry is a list of discrete choices.
param_grid = dict(
    n_estimators=[100, 200, 300, 500],            # trees in the forest
    max_depth=[None, 10, 20, 30, 50],             # maximum tree depth
    min_samples_split=[2, 5, 10],                 # samples required to split a node
    min_samples_leaf=[1, 2, 4],                   # samples required at a leaf
    max_features=['sqrt', 'log2', None],          # features examined per split
    bootstrap=[True, False],                      # bootstrap sampling on/off
    criterion=['gini', 'entropy', 'log_loss'],    # split-quality measure
    # class_weight=['balanced', 'balanced_subsample', None]  # class weighting (fixed to 'balanced' below)
)

# Bayesian search: 10 iterations, scored by balanced accuracy on the predefined folds.
grid = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=seed, class_weight='balanced'),
    search_spaces=param_grid,
    n_iter=10,
    scoring="balanced_accuracy",
    cv=ps,
    n_jobs=2,
    verbose=2,
    random_state=seed,
)
grid.fit(X, y)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

# Persist the best estimator found by the search.
best_rf = grid.best_estimator_
with open(f"{filepath}/models/rf.save", "wb") as model_file:
    pickle.dump(best_rf, model_file)

# Untuned baseline evaluated on the hold-out split, kept for reference:
# rf = RandomForestClassifier(random_state=seed, class_weight="balanced").fit(X_train, y_train)
# y_pred = rf.predict(X_val)
# print("Accuracy:", accuracy_score(y_val, y_pred))
# print("Balanced accuracy:", balanced_accuracy_score(y_val, y_pred))
# print("F1 score:", f1_score(y_val, y_pred, average="weighted"))
# with open(f"{filepath}/models/rf.save", "wb") as file:
#     pickle.dump(rf, file)

# Per-candidate cross-validation results (rendered as the cell output).
pd.DataFrame(grid.cv_results_)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] END bootstrap=True, criterion=log_loss, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time= 1.6min
[CV] END bootstrap=True, criterion=log_loss, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time= 1.7min
[CV] END bootstrap=True, criterion=log_loss, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time= 1.6min
[CV] END bootstrap=True, criterion=log_loss, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time= 1.6min
[CV] END bootstrap=True, criterion=log_loss, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time= 1.6min
[CV] END bootstrap=True, criterion=log_loss, max_depth=50, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time= 1.7min
[CV

0.9989123986553292 without scaling

0.9990112715048448 with scaling

## Apply Support Vector Classifier with HP tuning

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV

# Discrete candidates for the RBF-kernel SVC.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

# Continuous alternative kept for reference:
# param_grid = {
#     'C': (0.1, 100, 'log-uniform'),
#     'gamma': (0.001, 10, 'log-uniform'),
#     'kernel': ["rbf"]
# }

# Bayesian search limited to a single iteration, evaluated on the predefined folds.
grid = BayesSearchCV(
    estimator=SVC(random_state=seed),
    search_spaces=param_grid,
    n_iter=1,  # maximum number of iterations
    cv=ps,
    n_jobs=6,
    verbose=1,
    random_state=seed,
)
grid.fit(X, y)

# Randomised-search alternative kept for reference:
# grid = RandomizedSearchCV(SVC(random_state=seed), param_grid, cv=ps)
# grid.fit(X, y)

print("Best parameters:", grid.best_params_)
print("Best score:", grid.best_score_)

# Persist the best estimator found by the search.
best_svm = grid.best_estimator_
with open(f"{filepath}/models/svm.save", "wb") as model_file:
    pickle.dump(best_svm, model_file)

# Fixed-parameter baseline trained on the first 10000 training rows, kept for reference:
# svm = SVC(random_state=seed, class_weight="balanced", verbose=1, C=100).fit(X_train.head(10000), y_train[:10000])
# print("Performance:", svm.score(X_val, y_val))
# with open(f"{filepath}/models/svm.save", "wb") as file:
#     pickle.dump(svm, file)

# Per-candidate cross-validation results (rendered as the cell output).
pd.DataFrame(grid.cv_results_)

