In [None]:
## Imports ##
import pandas as pd
import numpy as np
import os
import sys
from utility import FeatureSelection
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import AgglomerativeClustering
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils import calculate_distance_matrix
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest, chi2
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
import gower
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

In [39]:
## Read the dataset ##
df = pd.read_csv("datasets/kddcup99_csv.csv")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
# Left disabled: would restrict the analysis to the first 5000 rows.
#df=df.head(5000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494020 entries, 0 to 494019
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     494020 non-null  int64  
 1   protocol_type                494020 non-null  object 
 2   service                      494020 non-null  object 
 3   flag                         494020 non-null  object 
 4   src_bytes                    494020 non-null  int64  
 5   dst_bytes                    494020 non-null  int64  
 6   land                         494020 non-null  int64  
 7   wrong_fragment               494020 non-null  int64  
 8   urgent                       494020 non-null  int64  
 9   hot                          494020 non-null  int64  
 10  num_failed_logins            494020 non-null  int64  
 11  logged_in                    494020 non-null  int64  
 12  lnum_compromised             494020 non-null  int64  
 13 

In [40]:
## Data Preprocessing ##
# Work on a copy of the raw frame without the target column; `df` itself is
# left untouched because later cells still read df['label'].
X_encoded = df.drop(columns=['label'])

# Handling different data types
# Categorical columns must be encoded to numerical values (done further below)
categorical_cols = [
  'protocol_type',
  'service',
  'flag'
]
X_encoded[categorical_cols] = X_encoded[categorical_cols].astype('category')

# Flag columns: cast to bool so that the int64/float64 selection below does not
# pick them up as continuous features.
boolean_cols = [
  'land',
  'logged_in',
  'lroot_shell',
  'lsu_attempted',
  'is_host_login',
  'is_guest_login'
]
X_encoded[boolean_cols] = X_encoded[boolean_cols].astype(bool)

# These columns are converted to a logarithmic scale
transform_to_log = [
  'duration',
  'src_bytes',
  'dst_bytes',
  'count',
  'srv_count',
  'dst_host_count',
  'dst_host_srv_count'
]
# Every remaining int64/float64 column is treated as a continuous value.
# This list is built BEFORE the *_log columns exist, so those are not part of it.
continuous_cols = [
  col
  for col in X_encoded.select_dtypes(include=['int64', 'float64']).columns
  if col not in transform_to_log
]

# Conversion to a logarithmic scale (log1p keeps zeros at zero).
# Only the columns actually present are transformed AND dropped; previously the
# loop skipped missing columns but the drop still raised KeyError for them.
log_source_cols = [col for col in transform_to_log if col in X_encoded.columns]
for col in log_source_cols:
    X_encoded[col + '_log'] = np.log1p(X_encoded[col])
X_encoded = X_encoded.drop(columns=log_source_cols)

# Normalization of continuous columns to a range of [0, 1]
# (the *_log columns are not in continuous_cols and are not rescaled here)
encoder_continuous = MinMaxScaler()
X_encoded[continuous_cols] = encoder_continuous.fit_transform(X_encoded[continuous_cols])

# The categorical columns are encoded twice, producing two alternative
# feature frames: one with One-Hot encoding and one with Ordinal encoding.
# One-Hot Encoding
X_cat_onehot = pd.get_dummies(df[categorical_cols], drop_first=True)

# Ordinal Encoding
encoder = OrdinalEncoder()
ordinal_cols = categorical_cols
X_cat_ordinal = pd.DataFrame(
  encoder.fit_transform(df[categorical_cols]),
  columns=[name + '_ordinal' for name in ordinal_cols],
  # Reuse the source index so the axis=1 concat below aligns row by row even
  # if `df` does not carry a default RangeIndex.
  index=df.index,
)

# Join each encoded block with the numeric features and drop the raw
# categorical columns that the encoded columns replace.
X_onehot = pd.concat([X_encoded, X_cat_onehot], axis=1).drop(columns=categorical_cols)
X_ordinal = pd.concat([X_encoded, X_cat_ordinal], axis=1).drop(columns=categorical_cols)

X_onehot.info()
X_ordinal.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494020 entries, 0 to 494019
Columns: 115 entries, land to flag_SH
dtypes: bool(83), float64(32)
memory usage: 159.7 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494020 entries, 0 to 494019
Data columns (total 41 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   land                         494020 non-null  bool   
 1   wrong_fragment               494020 non-null  float64
 2   urgent                       494020 non-null  float64
 3   hot                          494020 non-null  float64
 4   num_failed_logins            494020 non-null  float64
 5   logged_in                    494020 non-null  bool   
 6   lnum_compromised             494020 non-null  float64
 7   lroot_shell                  494020 non-null  bool   
 8   lsu_attempted                494020 non-null  bool   
 9   lnum_root                    494020 non-null  float64
 10  lnum_fi

In [None]:
target = df['label']

# Cached feature rankings. pd.read_csv never returns None (it raises when the
# file is missing), so the cache is detected by checking the path instead;
# otherwise the recompute branch could never run.
rank_ordinal_path = "datasets/feature_selection_ordinal.csv"
rank_onehot_path = "datasets/feature_selection_onehot.csv"

if os.path.exists(rank_ordinal_path):
  df_rank_ordinal = pd.read_csv(rank_ordinal_path)
else:
  # FeatureSelection comes from the local `utility` module; its result is
  # written with .to_csv, so it is presumably a DataFrame -- TODO confirm.
  df_rank_ordinal = FeatureSelection(X_ordinal, target)
  df_rank_ordinal.to_csv(rank_ordinal_path, index=False)

if os.path.exists(rank_onehot_path):
  df_rank_onehot = pd.read_csv(rank_onehot_path)
else:
  df_rank_onehot = FeatureSelection(X_onehot, target)
  df_rank_onehot.to_csv(rank_onehot_path, index=False)


# Display the results
print("df_rank_ordinal")
print(df_rank_ordinal)
print("df_rank_onehot")
print(df_rank_onehot)


df_rank_ordinal
                        Feature        MI        RF  MeanScore
34                    count_log  0.952017  1.000000   0.976009
35                srv_count_log  0.765108  0.869388   0.817248
39              service_ordinal  0.941896  0.539750   0.740823
32                src_bytes_log  1.000000  0.430831   0.715415
38        protocol_type_ordinal  0.677474  0.660095   0.668784
25  dst_host_same_src_port_rate  0.810832  0.437144   0.623988
20                same_srv_rate  0.574710  0.489924   0.532317
21                diff_srv_rate  0.523703  0.315674   0.419689
23       dst_host_same_srv_rate  0.638273  0.184548   0.411411
37       dst_host_srv_count_log  0.636369  0.179369   0.407869
40                 flag_ordinal  0.544463  0.232569   0.388516
24       dst_host_diff_srv_rate  0.601266  0.158725   0.379995
33                dst_bytes_log  0.387967  0.184670   0.286319
27         dst_host_serror_rate  0.387327  0.165333   0.276330
28     dst_host_srv_serror_rate  0.3729

In [42]:
# Keep only the features whose MeanScore is strictly above the threshold
treshold = 0.3

# Names of the retained features, taken from each ranking table
selected_ordinal = df_rank_ordinal.loc[df_rank_ordinal['MeanScore'] > treshold, 'Feature']
selected_onehot = df_rank_onehot.loc[df_rank_onehot['MeanScore'] > treshold, 'Feature']

# Restrict each encoded feature frame to its retained columns
X_ord_final = X_ordinal[selected_ordinal]
X_oh_final = X_onehot[selected_onehot]

X_ord_final.info()
X_oh_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494020 entries, 0 to 494019
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   count_log                    494020 non-null  float64
 1   srv_count_log                494020 non-null  float64
 2   service_ordinal              494020 non-null  int64  
 3   src_bytes_log                494020 non-null  float64
 4   protocol_type_ordinal        494020 non-null  int64  
 5   dst_host_same_src_port_rate  494020 non-null  float64
 6   same_srv_rate                494020 non-null  float64
 7   diff_srv_rate                494020 non-null  float64
 8   dst_host_same_srv_rate       494020 non-null  float64
 9   dst_host_srv_count_log       494020 non-null  float64
 10  flag_ordinal                 494020 non-null  int64  
 11  dst_host_diff_srv_rate       494020 non-null  float64
dtypes: float64(9), int64(3)
memory usage: 45.2 MB
<class 'pand

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def compare_models_on_encodings(X_ord_final, X_oh_final, y, task_name="Classificazione",
                                test_size=0.3, random_state=42):
  """Train a fixed set of classifiers on two encodings and compare macro F1.

  A RandomForestClassifier is fitted on the ordinal-encoded features, while a
  LogisticRegression and an SVC (both with class_weight='balanced') are fitted
  on the one-hot-encoded features. Each dataset is split with a stratified
  train/test split; for every model the classification report is printed and
  the macro-averaged F1 is recorded. A bar plot of the scores is shown.

  Parameters
  ----------
  X_ord_final : feature matrix using the ordinal encoding.
  X_oh_final : feature matrix using the one-hot encoding.
  y : target labels, aligned row by row with both feature matrices.
  task_name : label used in the printed headers and in the plot title.
  test_size : fraction of rows held out for evaluation (default 0.3).
  random_state : seed used for the split and for the random forest (default 42).

  Returns
  -------
  pandas.DataFrame with one row per fitted model and the columns
  'Dataset', 'Model' and 'F1-macro'.
  """
  # Model instances are created on every call so no fitted state is shared
  # between successive invocations.
  datasets = {
    'Ordinal': {
      'dataset': X_ord_final,
      'models': [
        RandomForestClassifier(random_state=random_state)
      ]
    },
    'OneHot': {
      'dataset': X_oh_final,
      'models': [
        LogisticRegression(max_iter=1000, class_weight='balanced'),
        SVC(class_weight='balanced')
      ]
    }
  }

  results = []
  for ds_name, ds_info in datasets.items():
    # Same seed and stratification for every dataset.
    X_train, X_test, y_train, y_test = train_test_split(
      ds_info['dataset'], y, test_size=test_size, stratify=y, random_state=random_state
    )

    for model in ds_info['models']:
      model_name = model.__class__.__name__
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)
      # zero_division=0 yields the same score as the default for classes that
      # are never predicted, but without an UndefinedMetricWarning per class.
      f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
      results.append({
        'Dataset': ds_name,
        'Model': model_name,
        'F1-macro': f1
      })
      print(f"\n📊 {task_name} - {ds_name} - {model_name}")
      print(classification_report(y_test, y_pred, zero_division=0))

  # Tabular comparison
  df_results = pd.DataFrame(results)

  # Bar plot of the macro F1 per model, coloured by encoding
  fig, ax = plt.subplots(figsize=(10, 6))
  sns.barplot(data=df_results, x='Model', y='F1-macro', hue='Dataset', ax=ax)
  ax.set_title(f'Confronto F1-macro - {task_name}')
  ax.set_ylim(0, 1)
  ax.grid(True, axis='y', linestyle='--', alpha=0.5)
  ax.legend(title='Dataset')
  fig.tight_layout()
  plt.show()

  return df_results


In [None]:
from sklearn.preprocessing import LabelEncoder

# Binary target: 0 for 'normal' traffic, 1 for every other label
y_binary = df['label'].ne('normal').astype('int64')
results_bin = compare_models_on_encodings(
  X_ord_final, X_oh_final, y_binary, "Binaria (Normal vs Attack)"
)

# Multiclass target: every distinct label mapped to an integer code
le = LabelEncoder()
y_multi = le.fit_transform(df['label'])

results_multi = compare_models_on_encodings(
  X_ord_final, X_oh_final, y_multi, "Multiclasse (Tutti gli attacchi)"
)



📊 Binaria (Normal vs Attack) - Ordinal - RandomForestClassifier
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     29183
           1       1.00      1.00      1.00    119023

    accuracy                           1.00    148206
   macro avg       1.00      1.00      1.00    148206
weighted avg       1.00      1.00      1.00    148206


📊 Binaria (Normal vs Attack) - OneHot - LogisticRegression
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     29183
           1       1.00      0.99      0.99    119023

    accuracy                           0.99    148206
   macro avg       0.97      0.99      0.98    148206
weighted avg       0.99      0.99      0.99    148206

