## Load the dataset

In [None]:
import pandas as pd
import numpy as np
import torch
from pytorch_tabular.categorical_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn import preprocessing

seed = 42
FILENAME = "dataset/train_dataset.csv"

# Load the raw training data; low_memory=False makes pandas read the file in
# one pass instead of chunk by chunk.
df1 = pd.read_csv(FILENAME, sep=",", low_memory=False)


# Disabled alternative: drop only the label column.
# df1 = df1.drop(["label"], axis=1)

# Disabled alternative: map the "T"/"F"/"-" markers to 1/0/NaN.
# mapping ={ "T": 1, "F": 0, "-": np.nan}
# df1 = df1.map(lambda x: mapping.get(x, x))

# Disabled alternative: drop every column whose non-null count falls in the
# lowest quartile.
# count = df1.count()
# threshold = count.quantile(0.25)
# print(f"soglia : {threshold}")
# df1 = df1.loc[:, count >= threshold]
# features_to_remove = list(count[count <= threshold].index)

# Columns excluded from the feature set. "type" is the prediction target: it
# is removed from the features here and re-attached to df1 just below.
features_to_remove = ["label", "ts", "src_ip", "dst_ip", "dns_query", "ssl_subject", "ssl_issuer", "http_uri", "type"]

# Disabled alternative exclusion list.
# features_to_remove = [ 'http_trans_depth', 'http_resp_mime_types', 'ssl_cipher', 'dns_query', 'http_version', 'http_user_agent', 'http_uri', 'weird_notice', 'ssl_established', 'weird_addl', 'http_method', 'ssl_resumed', "src_ip", "dst_ip", "type"]

# Fail loudly (ValueError, as list.remove did before) when the CSV schema does
# not contain a column we expect to drop.
missing_columns = [name for name in features_to_remove if name not in df1.columns]
if missing_columns:
    raise ValueError(f"columns not found in {FILENAME}: {missing_columns}")

# Feature names, in the original column order.
features = [name for name in df1.columns if name not in features_to_remove]

# Explicit copy so the column assignment below writes to an independent frame
# rather than to a slice of the raw data.
df1 = df1[features + ["type"]].copy()

# Datatype conversion: turn src_bytes into numbers; values that cannot be
# parsed become NaN and are removed by the dropna() below.
df1["src_bytes"] = pd.to_numeric(df1["src_bytes"], errors="coerce")

print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))
# Drop every row with a missing value, then store src_bytes as integers.
# The cast is applied to df1 itself: previously it was applied to a temporary
# frame that was discarded, so the conversion never reached X.
df1 = df1.dropna().astype({"src_bytes": "int64"})
print("#Righe: " + str(df1.shape[0]) + " #Colonne: " + str(df1.shape[1]))

X = df1[features]
y = df1["type"]

# Encode the string class labels as integers; `le` is kept so predictions can
# be mapped back with le.inverse_transform.
le = preprocessing.LabelEncoder().fit(y)
y = le.transform(y)

# Stratified split on row positions: 20% held out for test, then 20% of the
# remainder held out for validation.
indices = np.arange(X.shape[0])
train_idx, test_idx = train_test_split(
    indices,
    test_size=0.2,
    stratify=y,
    random_state=seed,
)
train_idx, val_idx = train_test_split(
    train_idx,
    test_size=0.2,
    stratify=y[train_idx],
    random_state=seed,
)


# PredefinedSplit convention: -1 means "always in the training set", any other
# value is the index of the fold in which the row is used for evaluation.
# NOTE(review): every row outside train_idx gets fold 0, i.e. validation AND
# test rows together. Using `ps` for a search over (X, y) would therefore
# select hyperparameters on the test rows as well -- confirm before re-enabling.
fold = np.where(np.isin(indices, train_idx), -1.0, 0.0)

ps = PredefinedSplit(fold)
ps.get_n_splits()

# Debug helper (disabled): print the rows of each predefined fold.
# for i, (train_index, test_index) in enumerate(ps.split()):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}")
#     print(f"  Test:  index={test_index}")

# Materialise the three partitions by position.
X_train, y_train = X.iloc[train_idx], y[train_idx]
X_val, y_val = X.iloc[val_idx], y[val_idx]
X_test, y_test = X.iloc[test_idx], y[test_idx]

In [None]:
# Bare expression: when run as a notebook cell this renders the cleaned
# DataFrame for visual inspection; it has no other effect.
df1

## Preprocess the dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.impute import KNNImputer, SimpleImputer

import pickle


# Earlier hand-written column lists (disabled):
#categorical_columns = ["proto", "service", "conn_state"]
#boolean_columns= [ "dns_AA", "dns_RD", "dns_RA", "dns_rejected"]

# Every object-dtype column of the training split is treated as categorical.
categorical_columns = list(X_train.select_dtypes(include=["object"]).columns)
# boolean_columns = X_train.select_dtypes(include=["bool"]).columns.tolist()

# Ordinal-encode the categorical columns; categories not seen during fit are
# encoded as -1. A numeric scaling step could be added to the list as well,
# e.g. ("scale", StandardScaler(), ["feature1", "feature2"]).
ordinal_step = (
    "ordinal",
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
    categorical_columns,
)
# remainder="passthrough" keeps all other columns unchanged.
ct = ColumnTransformer([ordinal_step], remainder="passthrough")
ct.set_output(transform="pandas")

# Fit on the training split only, then persist the fitted transformer so the
# same encoding can be reused later.
ct.fit(X_train)
with open("transformer.save", "wb") as transformer_file:
    pickle.dump(ct, transformer_file)

# Apply the fitted encoding to the train, validation, full and test frames.
X_train, X_val, X, X_test = (
    ct.transform(frame) for frame in (X_train, X_val, X, X_test)
)

# # train set
# ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
# ordinal_encoder = ordinal_encoder.fit(X_train[categorical_columns])
# X_train.loc[:, categorical_columns] = ordinal_encoder.transform(X_train[categorical_columns])
#
# # validation set
# X_val.loc[:, categorical_columns] = ordinal_encoder.transform(X_val[categorical_columns])
#
# # X
# X.loc[:, categorical_columns] = ordinal_encoder.transform(X[categorical_columns])
#
# X_test.loc[:, categorical_columns] = ordinal_encoder.transform(X_test[categorical_columns])

## Apply K-Nearest Neighbour

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Train a k-nearest-neighbours classifier with default settings, report its
# score on the validation split, and persist the fitted model.
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_val_score = knn.score(X_val, y_val)
print("Performance:", knn_val_score)
with open("knn.save", "wb") as model_file:
    pickle.dump(knn, model_file)

## Apply Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Train a random forest (seeded for reproducibility), report its score on the
# validation split, and persist the fitted model.
rf = RandomForestClassifier(random_state=seed)
rf.fit(X_train, y_train)
rf_val_score = rf.score(X_val, y_val)
print("Performance:", rf_val_score)
with open("rf.save", "wb") as model_file:
    pickle.dump(rf, model_file)

## Apply Support Vector Classifier (hyperparameter tuning currently disabled)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# param_grid = {'C': [0.1],  #, 1, 10, 100, 1000],
#               'gamma': [1],  #, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ["linear"]}  #, "poly", "rbf", "sigmoid"]}
#
# grid = RandomizedSearchCV(SVC(random_state=seed), param_grid, cv=ps)
# grid.fit(X, y)
# Train a support vector classifier with default hyperparameters (seeded),
# report its score on the validation split, and persist the fitted model.
svc = SVC(random_state=seed)
svc.fit(X_train, y_train)
svc_val_score = svc.score(X_val, y_val)
print("Performance:", svc_val_score)
with open("svc.save", "wb") as model_file:
    pickle.dump(svc, model_file)

# print("Best hyperparameters: ", grid.best_estimator_)
# print("Best performance:", grid.best_score_)


In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix


def predict(X, y, clf):
    """Score a fitted classifier on a labelled dataset.

    Prints the confusion matrix of the predictions as a side effect.

    Args:
        X: Feature matrix passed to ``clf.predict``.
        y: True labels aligned with the rows of ``X``.
        clf: Fitted estimator exposing a ``predict`` method.

    Returns:
        dict with keys ``"acc"`` (accuracy), ``"bacc"`` (balanced accuracy)
        and ``"f1"`` (weighted-average F1 score).
    """
    predictions = clf.predict(X)

    # Metrics are computed before printing, so a failure in any of them
    # leaves no partial output behind.
    scores = {
        "acc": accuracy_score(y, predictions),
        "bacc": balanced_accuracy_score(y, predictions),
        "f1": f1_score(y, predictions, average="weighted"),
    }
    print(confusion_matrix(y, predictions))

    return scores


# Evaluate the random forest on the held-out test split: prints its confusion
# matrix and yields the accuracy / balanced accuracy / weighted F1 dict.
predict(X_test, y_test, rf)