In [1]:
import gc
import os
import logging
import csv
import pandas as pd
import numpy as np
from pandas.core.interchange.dataframe_protocol import DataFrame
from sklearn.feature_selection import VarianceThreshold, f_classif
from sklearn.preprocessing import MinMaxScaler

In [2]:
df = pd.read_parquet('../data/CICIDS2017_cleaned.parquet')

X = df.drop(columns = 'Label')
y = df['Label']

scaler = MinMaxScaler().fit_transform(X=X, y=y)
scaled_df = pd.DataFrame(scaler, columns=X.columns)

scaled_df['Label'] = y

scaled_df_X = scaled_df.drop(columns = 'Label')
scaled_df_y = scaled_df['Label']

del df, X, y

## Variance Threshold
Eine einfache Methode, um die Anzahl der Dimensionen zu verringern, ist VarianceThreshold aus sklearn. Dafür werden Features, die keine oder nur eine geringe Varianz aufweisen entfernt. Bei Anwendung in einem Klassifikationsproblem darf VarianceThreshold in seiner einfachsten Form nur mit Vorsicht angewendet werden, da auch Features mit geringer Varianz relevant sein können. Dies kann anhand eines einfachen Beispiels gezeigt werden:
Bei einem Datensatz mit 5 Klassen, sind die Werte innerhalb eines Features für 4 dieser Klassen konstant, die übrige Werte für die übrige Klasse unterscheiden sich distinkt. Dadurch kann die Varianz des gesamten Features gering sein. Da dieses Feature jedoch ein guter Indikator für die 5. Klasse ist, besitzt es eine hohe Relevanz für das Training.

Damit die Features 'homogen' sind, müssen sie aneinander angepasst werden. Da wir VarianceThreshold betrachten, hilft es nicht die Features zu standardisieren, da die Varianz der Features dadurch konstant 1 sind. Stattdessen werden die Features normalisiert.

Wir definieren nun eine Funktion, die für einen vordefinierten Threshold alle Features auflistet, die den threshold laut VarianceThreshold nicht erreichen. Da wir später mehrere thresholds testen, fügen wir den Parameter old_features hinzu, sodass Features, die bei einem geringeren threshold bereits aussortiert wurden, nicht mehrfach genannt werden.

In [3]:
def variance_threshold(old_features:list[str], threshold=float):
    selector = VarianceThreshold(threshold=threshold)
    selector.fit_transform(X=scaled_df_X)
    features = scaled_df_X.columns
    support_mask = selector.get_support()
    features = features[~support_mask]
    
    new_features = []
    
    for feature in features:
        if feature not in old_features:
            old_features.append(feature)
            new_features.append(feature)
    
    return new_features, old_features

In [4]:
var_ranking = {}
all_features = scaled_df_X.columns
old_features = []
for threshold in [0, 0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.3]:
   try:
       new_features, old_features = variance_threshold(threshold=threshold, old_features=old_features)
       var_ranking[f"{threshold}"] = new_features
       print(threshold, new_features)
   except ValueError:
       print(f"Error: No feature in X meets the variance threshold {threshold}")
       remain = [f for f in all_features if f not in old_features]
       print(f"remaining features: {remain}")
       break

0 []
1e-05 ['Total Length of Fwd Packets', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Header Length.1', 'Subflow Fwd Bytes', 'act_data_pkt_fwd', 'min_seg_size_forward']
5e-05 ['Total Fwd Packets', 'Total Backward Packets', 'Total Length of Bwd Packets', 'Fwd URG Flags', 'CWE Flag Count', 'Down/Up Ratio', 'Subflow Fwd Packets', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Active Mean', 'Active Std', 'Active Min']
0.0001 ['Active Max']
0.0005 ['Flow Bytes/s', 'Bwd Packets/s', 'Min Packet Length', 'RST Flag Count', 'ECE Flag Count']
0.001 ['Fwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Min', 'Flow IAT Min']
0.005 ['Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Flow Packets/s', 'Flow IAT Mean', 'Fwd Packets/s', 'Avg Fwd Segment Size', 'Idle Std']
0.01 ['Flow IAT Std', 'Fwd IAT Mean', 'Fwd IAT Min', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Min', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Variance', 'Average Packet Size']
0.05 ['Bwd Packet Length 

So einfach können wir die Features aber nicht entfernen. Wir betrachten nun für jedes resultierende Feature den mean und die std der einzelen Angriffsklassen. Dadurch kann manuell überprüft werden, ob das Feature trotz geringer Varianz eine hohe Relevanz besitzt.

In [5]:
rank_threshold = 10
rank = 1
for threshold in var_ranking:
    print("#################################################")
    features = var_ranking[threshold]
    print(f"threshold: {threshold}")
    rank += len(features)
    for feature in features:
        print("-------------------------------------------------")
        print(feature)
        print(scaled_df.groupby("Label")[feature].agg(["mean", "std"]))
        print("-------------------------------------------------")
    if rank >= rank_threshold:
        print("#################################################")
        break
    print("#################################################")
print(f"number of removed features: {rank - 1}")

#################################################
threshold: 0
#################################################
#################################################
threshold: 1e-05
-------------------------------------------------
Total Length of Fwd Packets
                      mean           std
Label                                   
BENIGN        5.301714e-05  8.585398e-04
Bot           2.069642e-04  1.433224e-03
Brute Force   5.418838e-05  7.042514e-05
Dos/DDos      1.936311e-05  2.668015e-05
Heartbleed    9.697463e-04  2.783574e-04
Infiltration  2.920355e-02  5.795297e-02
PortScan      9.075069e-08  5.769861e-07
Web Attack    1.435906e-04  6.839153e-04
-------------------------------------------------
-------------------------------------------------
Fwd Header Length
                  mean           std
Label                               
BENIGN        0.999855  7.595910e-04
Bot           0.999856  2.605507e-09
Brute Force   0.999856  7.511819e-09
Dos/DDos      0.999856  2.591

Wir erkennen, dass bei manchen Features bestimmte Angriffsarten nur Nullwerte enthalten und andere Features in allen Klassen einen fast gleichwertigen mean habe. Das Feature Fwd URG Flags besitzt zum Beispiel für jede Angriffsart Nullwerte und besitzt dadurch eine theoretisch wichtige Eigenschaft um BENIGN zu erkennen. Nun stellt sich die Frage, wie sehr sich die Werte der einzelnen Klasse vom gesamten Feature unterscheiden müssen, damit das Feature relevant ist. Als Beispiel soll hier das Feature Flow Bytes/s dienen.

| Label         | Mean      | Std          |
|---------------|-----------|--------------|
| BENIGN        | 0.112640  | 0.01247264   |
| Bot           | 0.112046  | 0.000357943  |
| Brute Force   | 0.111932  | 0.000141700  |
| DDoS          | 0.111947  | 0.000628394  |
| Dos/DDos      | 0.111951  | 0.002404996  |
| Heartbleed    | 0.111949  | 0.000000363  |
| Infiltration  | 0.111930  | 0.000042709  |
| PortScan      | 0.112030  | 0.000243349  |
| Web Attack    | 0.111921  | 0.000000307  |


Dieses Feature befindet sich nicht unter den ersten 19 Features, die durch VarianceThreshold aussortiert wurden. Jedoch ist erkennbar, dass sich die einzelnen Klassen nur minimal voneinander unterscheiden; die Varianz der Klassen zum Feature selbst ist also gering. Dieses Feature sollte demnach durch eine andere Methode aussortiert werden. Bevor wir dazu kommen nutzen wir dieses Feature als inverses Beispiel. Genauso wie Feature wie Flow Bytes/s nicht aussortiert wurden, wurden Features mit relativ hoher Varianz unter den einzelnen Klassen aussortiert. Dies ist z.B. der Fall, wenn zwei Klassen mit wenig Instanzen einen mean ungleich 0 besitzen, während der mean aller anderer Klassen gleich 0 ist. Diese Art von Feature könnte demnach ein guter Indikator für diese Klasse sein, würde aber von VarianceThreshold aussortiert werden. Wir müsssen also das Verhältnis aus der Varianz der Klassen und dem Varianz des Features berücksichtigen. Sollten sich die Klassen stark voneinader unterscheiden, wird das Feature vorerst behalten. Eine Möglichkeit dies zu überprüfen ist der F-Test.

Bevor wir dazu kommen, bleiben wir noch kurz bei Varianzen. Da wir nun wissen, dass auf den VarianceThreshold kein definitiver Verlass ist, ranken wir zunächst alle Features nach ihrer Varianz - beginnend mit der niedrigsten - um das Ergebniss mit dieser vergleichen zu können.

In [6]:
variances = scaled_df_X.var()
feature_names = scaled_df_X.columns

var_ranking_unsorted = pd.DataFrame({
    'Feature': feature_names,
    'variance': variances
})

## F-Test

In [7]:
f_scores, p_values = f_classif(scaled_df_X, scaled_df_y)

# feature_names = X_var.columns if isinstance(X_var, pd.DataFrame) else [f'feature_{i}' for i in range(X_var.shape[1])]
feature_names = scaled_df_X.columns

f_ranking_unsorted = pd.DataFrame({
    'Feature': feature_names,
    'F_score': f_scores,
    'p_value': p_values
})

Sehen wir uns die schlechtesten Features laut F-Score mal genauer an

In [13]:
f_ranking = f_ranking_unsorted.sort_values(by='F_score', ascending=True)
for feature in f_ranking.head(10)['Feature']:
        print("-------------------------------------------------")
        print(feature)
        print(scaled_df.groupby("Label")[feature].agg(["mean", "std"]))
        print("-------------------------------------------------")

-------------------------------------------------
Fwd Header Length
                  mean           std
Label                               
BENIGN        0.999855  7.595910e-04
Bot           0.999856  2.605507e-09
Brute Force   0.999856  7.511819e-09
Dos/DDos      0.999856  2.591428e-09
Heartbleed    0.999858  6.576645e-07
Infiltration  0.999856  9.329845e-07
PortScan      0.999856  3.654475e-10
Web Attack    0.999856  3.917707e-08
-------------------------------------------------
-------------------------------------------------
Fwd Header Length.1
                  mean           std
Label                               
BENIGN        0.999855  7.595910e-04
Bot           0.999856  2.605507e-09
Brute Force   0.999856  7.511819e-09
Dos/DDos      0.999856  2.591428e-09
Heartbleed    0.999858  6.576645e-07
Infiltration  0.999856  9.329845e-07
PortScan      0.999856  3.654475e-10
Web Attack    0.999856  3.917707e-08
-------------------------------------------------
----------------------

Wir erkennen, dass erwartete Features wie 'Fwd Header Length' einen schlechten F-Score besitzen. Unter diesen schlechtesten Features laut F-Score befinden sich jedoch auch andere Features, die sich in den means von ein oder zwei Klassen um ein vielfaches zu den anderen Klassen unterscheiden. Wie auch bei der Varianz scheint der F-Score Features auszusortieren, die intuitiv relevant erscheinen.

Schauen wir uns mal die Top10 an

In [None]:
f_ranking = f_ranking_unsorted.sort_values(by='F_score', ascending=False)
for feature in f_ranking.head(10)['Feature']:
        print("-------------------------------------------------")
        print(feature)
        print(scaled_df.groupby("Label")[feature].agg(["mean", "std"]))
        print("-------------------------------------------------")

Die means der einzelnen Klassen unterscheiden sich in diesen Features etwas deutlicher als in den Flop10, aber auch hier treten gerne ein oder zwei means pro Feature heraus, die sich um ein Vielfaches unterscheiden.

### Overlap (hoher F-Score/hohe Varianz)

Generell sind Features mit diesen Eigenschaften erwünscht. Dafür sortieren wir die Ranglisten, sodass wir zwei Listen mit Features besitzen - einmal absteigend nach F-Score und einmal absteigend nach Varianz. Da der F-Score eher die Klassen berücksichtigt, sortieren wir primär nach dieser und betrachten den Overlap mit der Rangliste der Varianz

In [16]:
var_ranking = var_ranking_unsorted.sort_values(by='variance', ascending=False)
f_ranking = f_ranking_unsorted.sort_values(by='F_score', ascending=False)

for top_n in [5, 10, 15, 20, 25, 30, 40, 50]:
    top_var_df = var_ranking.head(top_n)
    top_f_df = f_ranking.head(top_n)

    top_var = set(top_var_df['Feature'])
    top_f = set(top_f_df['Feature'])
    overlap = top_f.intersection(top_var)

    # Index-Mapping
    var_index_map = {feat: idx for idx, feat in enumerate(var_ranking['Feature'])}
    f_index_map = {feat: idx for idx, feat in enumerate(f_ranking['Feature'])}

    # Nur Features, die in einer Top-N-Liste sind, aber nicht in der anderen (einseitig)
    only_in_f = top_f - top_var

    # Differenzen berechnen, nur wenn Feature auch in der jeweils anderen Gesamtliste vorkommt
    diffs = []
    
    for feat in only_in_f:
        diffs.append((feat, f_index_map[feat], var_index_map[feat]))
    print("-------------------------------------------------")
    print(f"Top {top_n} - Overlap count: {len(overlap)} out of {top_n}: - Difference {diffs}")
    for d in diffs:
        print(f"Feature: {d[0]}, f_rank: {d[1]+1}, var_rank: {d[2]+1}") 
    print("-------------------------------------------------")

-------------------------------------------------
Top 5 - Overlap count: 0 out of 5: - Difference [('Avg Bwd Segment Size', 2, 19), ('Packet Length Std', 4, 16), ('Bwd Packet Length Std', 0, 21), ('Bwd Packet Length Max', 3, 22), ('Bwd Packet Length Mean', 1, 20)]
Feature: Avg Bwd Segment Size, f_rank: 3, var_rank: 20
Feature: Packet Length Std, f_rank: 5, var_rank: 17
Feature: Bwd Packet Length Std, f_rank: 1, var_rank: 22
Feature: Bwd Packet Length Max, f_rank: 4, var_rank: 23
Feature: Bwd Packet Length Mean, f_rank: 2, var_rank: 21
-------------------------------------------------
-------------------------------------------------
Top 10 - Overlap count: 0 out of 10: - Difference [('Max Packet Length', 5, 26), ('Packet Length Variance', 8, 30), ('Avg Bwd Segment Size', 2, 19), ('Average Packet Size', 6, 25), ('Packet Length Std', 4, 16), ('Fwd IAT Std', 9, 18), ('Bwd Packet Length Std', 0, 21), ('Bwd Packet Length Max', 3, 22), ('Packet Length Mean', 7, 24), ('Bwd Packet Length Mean'

Sobald die besten 20 oder 25 Features basierend auf dem F-Score betrachtet werden, ist ein bemerkbarer Overlap vorhanden. Die Wahrscheinlichkeit, dass diese Features Relevanz besitzen ist recht hoch. Sie müssen trotzdem vorher überprüft werden, ob die Varianz nicht durch hohe Outliner zustande kommt. Als Richtlinie kann annehmen, dass die Wahrscheinlichkeit auf Outliner gering ist, wenn mean etwa gleich dem median (50% percentile) ist ,und sich min/25% und/oder max/75% nicht stark voneinander unterscheiden. Als Beispiel betrachten wir 'Packet Length Std':

In [17]:
scaled_df.groupby('Label')['Packet Length Std'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BENIGN,2095057.0,0.033678,0.066903,0.0,0.000463,0.006483,0.024658,1.0
Bot,1948.0,0.020421,0.092273,0.0,0.0,0.000678,0.015973,0.995238
Brute Force,9150.0,0.014536,0.017789,0.0,0.002605,0.002679,0.039776,0.067876
Dos/DDos,321759.0,0.326576,0.218401,0.0,0.241248,0.349682,0.451933,1.0
Heartbleed,11.0,0.519407,0.017551,0.502577,0.510466,0.513088,0.525283,0.564171
Infiltration,36.0,0.054304,0.028828,0.0,0.042178,0.059326,0.067937,0.13928
PortScan,90694.0,0.000977,0.011827,0.0,0.000488,0.000488,0.000732,0.451779
Web Attack,2143.0,0.007285,0.028867,0.0,0.0,0.0,0.0,0.320951


In [None]:
Es ist erkennbar, dass die Klassen BENIGN, Bot,  PortScan und Web Attack outliner besitzen, die die Verteilung rechtsseitig verzerren

### Overlap (niedriger F-Score/niedrige Varianz)

Dieser Overlap ist generell unerwünscht, da diese Features keine/nur geringe Unterschiede zwischen den Klassen aufweisen und gleichzeitig eine geringe Streuung besitzen

In [47]:
var_ranking = var_ranking_unsorted.sort_values(by='variance', ascending=True)
f_ranking = f_ranking_unsorted.sort_values(by='F_score', ascending=True)


for top_n in [5, 10, 15, 20, 25, 30, 40, 50]:
    top_var_df = var_ranking.head(top_n)
    top_f_df = f_ranking.head(top_n)

    top_var = set(top_var_df['Feature'])
    top_f = set(top_f_df['Feature'])
    overlap = top_f.intersection(top_var)

    # Index-Mapping
    var_index_map = {feat: idx for idx, feat in enumerate(var_ranking['Feature'])}
    f_index_map = {feat: idx for idx, feat in enumerate(f_ranking['Feature'])}

    # Nur Features, die in einer Top-N-Liste sind, aber nicht in der anderen (einseitig)
    only_in_f = top_f - top_var

    # Differenzen berechnen, nur wenn Feature auch in der jeweils anderen Gesamtliste vorkommt
    diffs = []
    
    for feat in only_in_f:
        diffs.append((feat, f_index_map[feat], var_index_map[feat]))
    print("-------------------------------------------------")
    print(f"Top {top_n} - Overlap count: {len(overlap)} out of {top_n}: - Difference {diffs}")
    for d in diffs:
        print(f"Feature: {d[0]}, f_rank: {d[1]+1}, var_rank: {d[2]+1}") 
    print("-------------------------------------------------")

-------------------------------------------------
Top 5 - Overlap count: 3 out of 5: - Difference [('min_seg_size_forward', 3, 5), ('CWE Flag Count', 4, 16)]
Feature: min_seg_size_forward, f_rank: 4, var_rank: 6
Feature: CWE Flag Count, f_rank: 5, var_rank: 17
-------------------------------------------------
-------------------------------------------------
Top 10 - Overlap count: 6 out of 10: - Difference [('Fwd URG Flags', 5, 17), ('CWE Flag Count', 4, 16), ('Total Length of Bwd Packets', 9, 12), ('Total Backward Packets', 7, 10)]
Feature: Fwd URG Flags, f_rank: 6, var_rank: 18
Feature: CWE Flag Count, f_rank: 5, var_rank: 17
Feature: Total Length of Bwd Packets, f_rank: 10, var_rank: 13
Feature: Total Backward Packets, f_rank: 8, var_rank: 11
-------------------------------------------------
-------------------------------------------------
Top 15 - Overlap count: 11 out of 15: - Difference [('Fwd URG Flags', 5, 17), ('CWE Flag Count', 4, 16), ('RST Flag Count', 11, 21), ('ECE Flag

Wir sehen, dass es bei  Der F-Test erkennt häufig nicht Features, die eine geringe Varianz und schwache Unterschiede im mean, aber starke klassenspezifische Signale besitzen. Solche Eigenschaften sollten mit dem Mutual-Information-Test (MI) gefunden werden.

## Mutual Information (MI)

In [None]:
from sklearn.feature_selection import mutual_info_classif

mi_scores = mutual_info_classif(X_var, y, discrete_features='auto', random_state=0)

In [8]:
mi_df = pd.DataFrame({
    'Feature': X_var.columns,
    'MI_score': mi_scores
}).sort_values(by='MI_score', ascending=False)

print(mi_df)

          Feature  MI_score
5  FIN Flag Count  0.034617
7  URG Flag Count  0.034592
4   Fwd PSH Flags  0.014041
6  SYN Flag Count  0.013937
1  CWE Flag Count  0.000311
0   Fwd URG Flags  0.000141
2  RST Flag Count  0.000079
3  ECE Flag Count  0.000000


In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, mutual_info_classif, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
df = pd.read_parquet('../data/CICIDS2017_cleaned.parquet')

X = df.drop(columns = 'Label')
y = df['Label']


# Scale features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [4]:
k_best_mi = SelectKBest(score_func=mutual_info_classif, k='all')
X_mi = k_best_mi.fit_transform(X_scaled, y)
n_mi = (k_best_mi.scores_ > 0).sum()

[[1.33333321e-07 4.55046005e-06 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.01666657e-06 0.00000000e+00 3.42557258e-06 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [5.41666617e-07 0.00000000e+00 3.42557258e-06 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [7.91666594e-07 4.55046005e-06 3.42557258e-06 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [8.73873253e-03 2.27523003e-05 6.85114517e-06 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [7.91266594e-04 1.36513802e-05 6.85114517e-06 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]


In [10]:
mi_scores = k_best_mi.scores_
feature_names = X.columns  # or list(X_scaled.columns) if using a DataFrame

# Create a DataFrame to view sorted scores
mi_df = pd.DataFrame({
    'Feature': feature_names,
    'MI_score': mi_scores
}).sort_values(by='MI_score', ascending=False)

In [12]:
# Display top 10 features
print(mi_df.head(69))

                        Feature  MI_score
49          Average Packet Size  0.460612
39       Packet Length Variance  0.441232
38            Packet Length Std  0.441178
37           Packet Length Mean  0.432565
4   Total Length of Bwd Packets  0.399019
..                          ...       ...
41               SYN Flag Count  0.013961
42               RST Flag Count  0.000163
30                Fwd URG Flags  0.000020
46               CWE Flag Count  0.000000
47               ECE Flag Count  0.000000

[69 rows x 2 columns]


In [15]:
target_feature = ['FIN Flag Count','URG Flag Count', 'Fwd PSH Flags', 'SYN Flag Count', 
                  'CWE Flag Count', 'Fwd URG Flags', 'RST Flag Count', 'ECE Flag Count']

mi_df = pd.DataFrame({
    'Feature': X.columns,
    'MI_score': k_best_mi.scores_
}).sort_values(by='MI_score', ascending=False).reset_index(drop=True)

# Add rank column (1-based)
mi_df['Rank'] = mi_df.index + 1

# Locate the target feature's rank
for target in target_feature:
    feature_rank = mi_df[mi_df['Feature'] == target]
    
    if not feature_rank.empty:
        print(feature_rank)
    else:
        print(f"Feature '{target}' not found.")

           Feature  MI_score  Rank
59  FIN Flag Count  0.034668    60
           Feature  MI_score  Rank
60  URG Flag Count  0.034322    61
          Feature  MI_score  Rank
63  Fwd PSH Flags   0.01424    64
           Feature  MI_score  Rank
64  SYN Flag Count  0.013961    65
           Feature  MI_score  Rank
67  CWE Flag Count       0.0    68
          Feature  MI_score  Rank
66  Fwd URG Flags   0.00002    67
           Feature  MI_score  Rank
65  RST Flag Count  0.000163    66
           Feature  MI_score  Rank
68  ECE Flag Count       0.0    69


### Test StandardScaler

We expect this to not work since the StandardScaler transforms the Features with the purpose of having a mean of 0 and a variance of 1

In [6]:
df = pd.read_parquet('../data/CICIDS2017_cleaned.parquet')
df = df.drop(columns = 'Destination Port')
print(df.shape)
X = df.drop(columns = 'Label')
y = df['Label']

scaler = StandardScaler().fit_transform(X=X, y=y)
scaled_df = pd.DataFrame(scaler, columns=X.columns)
scaled_df['Label'] = y
print(scaled_df.shape)

old_features = []

for threshold in [0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.8]:
    old_features = variance_threshold(df=scaled_df, threshold=threshold, old_features=old_features)
        
        
del df

(2520798, 70)
(2520798, 70)
threshold: 0
no new features detected
threshold: 0.0001
no new features detected
threshold: 0.0005
no new features detected
threshold: 0.001
no new features detected
threshold: 0.005
no new features detected
threshold: 0.01
no new features detected
threshold: 0.05
no new features detected
threshold: 0.1
no new features detected
threshold: 0.15
no new features detected
threshold: 0.2
no new features detected
threshold: 0.3
no new features detected
threshold: 0.4
no new features detected
threshold: 0.8
no new features detected


### Test MinMaxScaler

Es zeigt sich, dass sich für 4 Features nur die Klasse 'BENIGN' auf die Varianz auswirkt, da die Werte für alle anderen Klassen konstant 0 sind. Dadurch wissen wir, dass es möglich ist drei dieser vier Features zu entfernen. Welches Feature genau entfernt werden sollte, muss noch überprüft werden 

In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif, mutual_info_classif, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
df = pd.read_parquet('../data/CICIDS2017_cleaned.parquet')
df = df.drop(columns = 'Destination Port')

X = df.drop(columns = 'Label')
y = df['Label']


# Scale features
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# 1. VarianceThreshold (Unsupervised)
var_thresh = VarianceThreshold(threshold=0.01)
X_var = var_thresh.fit_transform(X_scaled)
n_var = X_var.shape[1]

# 2. SelectKBest (ANOVA F-test)
k_best_anova = SelectKBest(score_func=f_classif, k='all')
X_anova = k_best_anova.fit_transform(X_scaled, y)
n_anova = (k_best_anova.scores_ > 0).sum()  # Keep features with non-zero score

# 3. SelectKBest (Mutual Information)
k_best_mi = SelectKBest(score_func=mutual_info_classif, k='all')
X_mi = k_best_mi.fit_transform(X_scaled, y)
n_mi = (k_best_mi.scores_ > 0).sum()

# 4. SelectFromModel (RandomForest)
rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median')
rf_selector.fit(X_scaled, y)
X_rf = rf_selector.transform(X_scaled)
n_rf = X_rf.shape[1]

# Combine results
feature_selection_summary = pd.DataFrame({
    'Method': ['VarianceThreshold', 'SelectKBest (ANOVA)', 'SelectKBest (Mutual Info)', 'SelectFromModel (RF)'],
    'Features Selected': [n_var, n_anova, n_mi, n_rf],
    'Total Features': [X.shape[1]] * 4
})

print(feature_selection_summary)

"""
OUTPUT
                      Method  Features Selected  Total Features
0          VarianceThreshold                 23              69
1        SelectKBest (ANOVA)                 69              69
2  SelectKBest (Mutual Info)                 69              69
3       SelectFromModel (RF)                 35              69
"""

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ff44bdb8e20>>
Traceback (most recent call last):
  File "/home/joscha/.cache/pypoetry/virtualenvs/code-LqrHNUld-py3.10/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [8]:
feature_names = X.columns
print(feature_names[k_best_anova.get_support()])

Index(['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets',
       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Fwd URG Flags', 'Fwd Header Length', 'Bwd Header Length',
       'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length',
       'Max Packet Length', 'Packet Length Mean', 'Packet Length Std',
       'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count',
       'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', '

### Test RandomForest (many targets)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# ---------------------------
# 1. Load and Prepare Dataset
# ---------------------------
df = pd.read_parquet('../data/CICIDS2017_cleaned.parquet')
df = df.drop(columns='Destination Port')  # Optional, as before

In [2]:
# Separate features and target
X = df.drop(columns='Label')
y = df['Label']

# ---------------------------
# 2. Split Train and Test Set
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [3]:
# ---------------------------
# 3. Build Pipeline
# ---------------------------
k = 20  # Number of top features to keep

pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('select', SelectKBest(score_func=f_classif, k=k)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [4]:
# ---------------------------
# 4. Train and Evaluate
# ---------------------------
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

      BENIGN       1.00      1.00      1.00    419012
         Bot       0.86      0.47      0.61       389
 Brute Force       1.00      0.96      0.98      1830
        DDoS       1.00      1.00      1.00     25603
    Dos/DDos       0.99      0.99      0.99     38749
  Heartbleed       1.00      1.00      1.00         2
Infiltration       0.75      0.43      0.55         7
    PortScan       0.99      1.00      0.99     18139
  Web Attack       0.99      0.91      0.95       429

    accuracy                           1.00    504160
   macro avg       0.95      0.86      0.90    504160
weighted avg       1.00      1.00      1.00    504160

Accuracy: 0.9973480641066328


In [5]:
import joblib

# Save the pipeline to a file
joblib.dump(pipeline, '../models/anova_rf_pipeline_v1.joblib')


['../models/anova_rf_pipeline_v1.joblib']

### Test RandomForest (Benign/Malicious)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# ---------------------------
# 1. Load and Prepare Dataset
# ---------------------------
df = pd.read_parquet('../data/CICIDS2017_cleaned.parquet')
df = df.drop(columns='Destination Port')  # Optional, as before

new_labels = {
        'BENIGN': 'BENIGN',
        'DDoS': 'Malicious',
        'PortScan':'Malicious',
        'Bot': 'Malicious',
        'Infiltration': 'Malicious',
        'Web Attack': 'Malicious',
        'Brute Force': 'Malicious',
        'Dos/DDos': 'Malicious',
        'Heartbleed':  'Malicious',
    }
df['Label'] = df['Label'].map(new_labels)


In [2]:
# Separate features and target
X = df.drop(columns='Label')
y = df['Label']

# ---------------------------
# 2. Split Train and Test Set
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [3]:
# ---------------------------
# 3. Build Pipeline
# ---------------------------
k = 20  # Number of top features to keep

pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('select', SelectKBest(score_func=f_classif, k=k)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [4]:
# ---------------------------
# 4. Train and Evaluate
# ---------------------------
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

      BENIGN       1.00      1.00      1.00    419012
   Malicious       0.99      0.99      0.99     85148

    accuracy                           1.00    504160
   macro avg       1.00      1.00      1.00    504160
weighted avg       1.00      1.00      1.00    504160

Accuracy: 0.9975027768962235


In [None]:
# ---------------------------
# 5. Optional: Cross-Validation
# ---------------------------
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f"Cross-validated accuracy (mean ± std): {scores.mean():.4f} ± {scores.std():.4f}")

In [5]:
import joblib

# Save the pipeline to a file
joblib.dump(pipeline, '../models/anova_rf_pipeline_v1.joblib')


['../models/anova_rf_pipeline_v1.joblib']

In [None]:
"""df = pd.read_parquet('../data/CICIDS2017_cleaned.parquet')
df[["Fwd URG Flags","RST Flag Count", "CWE Flag Count", "ECE Flag Count"]].corr()"""

In [None]:
"""from sklearn.ensemble import RandomForestClassifier
df = pd.read_parquet('../data/CICIDS2017_cleaned.parquet')
y = df['Label']
X = df.drop(columns = ['Label'])
model = RandomForestClassifier().fit(X, y)
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))"""
