## Load the dataset

In [None]:
import pandas as pd
import numpy as np
import torch
from pytorch_tabular.categorical_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn import preprocessing

# Seed shared by every randomised step in the notebook (split, models).
seed = 42
FILENAME = "dataset/train_dataset.csv"

# Prepare train data.
# low_memory=False stops pandas from parsing the file in chunks, which avoids
# mixed-type inference within a single column.
df1 = pd.read_csv(FILENAME, sep=",", low_memory=False)

# remove label columns
# df1 = df1.drop(["label"], axis=1)

# Normalise the placeholder tokens used in the CSV:
#   "-" -> NaN (missing value), "T" -> 1, "F" -> 0.
# A single elementwise pass with a lookup table replaces three separate
# full-DataFrame passes; any other value is returned unchanged.
_TOKEN_REPLACEMENTS = {"-": np.nan, "T": 1, "F": 0}
df1 = df1.map(lambda value: _TOKEN_REPLACEMENTS.get(value, value))

# Build the list of feature names: every column of df1, in its original
# order, except the ones listed in features_to_remove.

# count = df1.count()
# threshold = count.quantile(0.25)
# print(f"threshold : {threshold}")
# df1 = df1.loc[:, count >= threshold]

features_to_remove = [
    "label", "ssl_issuer", "weird_name", "ssl_version", "ssl_subject",
    "http_trans_depth", "http_referrer", "http_resp_mime_types", "ssl_cipher",
    "dns_query", "http_version", "http_user_agent", "http_uri", "weird_notice",
    "ssl_established", "weird_addl", "http_method", "http_orig_mime_types",
    "ssl_resumed", "src_ip", "dst_ip", "type",
]

# A set gives O(1) membership tests while filtering the columns.
excluded = set(features_to_remove)

# Fail loudly (still a ValueError, as list.remove raised before) when a column
# that should be excluded is not present, but say which one is missing.
missing = excluded.difference(df1.columns)
if missing:
    raise ValueError(f"Columns to remove not found in the dataset: {sorted(missing)}")

features = [column for column in df1.columns if column not in excluded]


# Conversion of datatype

# Convert src_bytes to numbers; values that cannot be parsed become NaN.
df1["src_bytes"] = pd.to_numeric(df1["src_bytes"], errors="coerce")
# Drop the rows whose src_bytes could not be converted. The result is assigned
# back to df1 (not to a throw-away variable) so that X and y, which are built
# from df1 afterwards, no longer contain those rows. The explicit copy makes
# the following column assignment operate on an independent frame.
df1 = df1.dropna(subset=["src_bytes"]).copy()
# Cast the remaining values to integers.
df1["src_bytes"] = df1["src_bytes"].astype(int)


print(f"#Righe: {df1.shape[0]} #Colonne: {df1.shape[1]}")
# df1 = df1.dropna()

# Feature matrix and target column. X is an explicit copy so that in-place
# column assignments made on it later do not trigger pandas'
# SettingWithCopyWarning.
X = df1[features].copy()
y = df1["type"]

# Encode the class labels in "type" as integers; y becomes a NumPy array.
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

# Stratified 80/20 hold-out split, expressed as positional row indices.
indices = np.arange(X.shape[0])
train_idx, val_idx = train_test_split(indices, test_size=0.2, stratify=y, random_state=seed)

# PredefinedSplit convention: -1 marks rows that are always used for training,
# 0 marks rows that belong to the single validation fold.
fold = np.zeros(X.shape[0], dtype=int)
fold[train_idx] = -1

ps = PredefinedSplit(fold)

# for i, (train_index, test_index) in enumerate(ps.split()):
#     print(f"Fold {i}:")
#     print(f"  Train: index={train_index}")
#     print(f"  Test:  index={test_index}")

# Materialise both splits as independent frames (positional selection), so
# that modifying one of them later never touches X or the other split.
X_val = X.iloc[val_idx].copy()
y_val = y[val_idx]
X_train = X.iloc[train_idx].copy()
y_train = y[train_idx]

In [None]:
# Show the DataFrame as the cell output for a quick visual inspection.
df1

## Preprocess the dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline

import pickle

# X_train_copy= X_train.copy()
# X_train_copy = X_train_copy.replace("-", np.nan)
#
# count = X_train_copy.count()
#
# threshold = count.quantile(0.25)

# print(f"soglia: {threshold}")
#
# X_train_copy = X_train_copy.loc[:, count >= threshold]
# print(f"X_train è {X_train_copy.shape()}")
# print(f"ora X_train_copy è {X_train_copy.shape()}")


categorical_columns = ["proto", "service", "conn_state"]
boolean_columns = ["dns_AA", "dns_RD", "dns_RA", "dns_rejected"]

# Fit ONE encoder on the training split only and reuse it for every frame.
# Fitting a separate encoder per frame would give the same category different
# integer codes in train and validation data, making the encoded features
# incomparable. Categories never seen during fitting are mapped to -1 instead
# of raising an error.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
ordinal_encoder.fit(X_train[categorical_columns])

# Work on independent copies so the column assignments below neither write
# into a slice of another frame nor trigger SettingWithCopyWarning.
X_train = X_train.copy()
X_val = X_val.copy()
X = X.copy()

# train set
X_train[categorical_columns] = ordinal_encoder.transform(X_train[categorical_columns])

# validation set
X_val[categorical_columns] = ordinal_encoder.transform(X_val[categorical_columns])

# X (full feature matrix, used by the grid search with the predefined split)
X[categorical_columns] = ordinal_encoder.transform(X[categorical_columns])

# ct = ColumnTransformer(
#     [
#         ("ordinal", OrdinalEncoder(), ["proto", "service"]),  # Encode the categorical columns
#         ("scale", StandardScaler(), ["feature1", "feature2"])  # Standardise the numeric columns
#     ],
#     remainder="passthrough"  # Keep the remaining columns unchanged
# )
#
# X_train_transformed = ct.fit_transform(X_train)


# ct = ct.fit(X_train)
# df_transformed = ct.transform(X_train)

# label_encoder = LabelEncoder()
# print(X_train["proto"])
# print(label_encoder.fit_transform(X_train["proto"]))
# X_train.loc[:, "proto"] = label_encoder.fit_transform(X_train["proto"])


# file = open("scaler.save","wb")
# scaler = MinMaxScaler(feature_range=(0, 1))
# X_train = scaler.fit_transform(X_train)
# X = scaler.transform(X)
# pickle.dump(scaler, file)

## Apply Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression baseline on the training split.
# NOTE(review): LogisticRegression rejects NaN inputs - confirm that no
# missing values are left in X_train / X_val at this point.
reg = LogisticRegression(random_state=seed).fit(X_train, y_train)
# score() returns the mean accuracy on the validation split.
print("Performance:", reg.score(X_val, y_val))
# Persist the fitted model; the with-block closes the file even if
# pickle.dump raises.
with open("lr.save", "wb") as file:
    pickle.dump(reg, file)

## Apply Support Vector Classifier with HP tuning

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid; the values after each '#' are the candidates that are
# currently switched off. Per the scikit-learn docs, gamma is the kernel
# coefficient for 'rbf', 'poly' and 'sigmoid', so it is not used by "linear".
param_grid = {'C': [0.1], #, 1, 10, 100, 1000],
              'gamma': [1], #, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ["linear"]} #, "poly", "rbf", "sigmoid"]}

# cv=ps evaluates every candidate on the predefined train/validation split.
# With the default refit=True, best_estimator_ is afterwards refitted on all
# of X, y (validation rows included).
grid = GridSearchCV(SVC(random_state=seed), param_grid, cv=ps)
grid.fit(X, y)
# Persist the best model; the with-block closes the file even if
# pickle.dump raises.
with open("svc.save", "wb") as file:
    pickle.dump(grid.best_estimator_, file)
print("Best hyperparameters: ", grid.best_estimator_)
print("Best performance:", grid.best_score_)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the training split. This rebinds `reg`, which
# previously held the logistic-regression model.
reg = DecisionTreeClassifier(random_state=seed).fit(X_train, y_train)

# score() returns the mean accuracy on the validation split.
print("Performance:", reg.score(X_val, y_val))
# Persist the fitted model; the with-block closes the file even if
# pickle.dump raises.
with open("dt.save", "wb") as file:
    pickle.dump(reg, file)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

def predict(X,y, clf):
    """Evaluate a fitted classifier on the samples ``X`` with true labels ``y``.

    Prints the confusion matrix of the predictions and returns a dict with
    the accuracy ("acc"), the balanced accuracy ("bacc") and the weighted
    F1 score ("f1").
    """
    predictions = clf.predict(X)

    perf = {
        "acc": accuracy_score(y, predictions),
        "bacc": balanced_accuracy_score(y, predictions),
        "f1": f1_score(y, predictions, average="weighted"),
    }

    # The confusion matrix is only printed, not returned.
    print(confusion_matrix(y, predictions))

    return perf

# Evaluate on the validation split. `reg` is whichever model was bound last;
# when the cells are run top to bottom that is the DecisionTreeClassifier.
predict(X_val, y_val, reg)