In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import warnings
import time

# Suppress all Python warnings for the rest of the run so the printed tables
# are not interleaved with library warning messages.
warnings.filterwarnings("ignore")

# Wall-clock start time; used at the end of the script to report run time.
seconds = time.time()

# Columns to load from the CSV: 20 candidate features followed by the target
# column 'Label'. 'Label' is kept last so the legend below can skip it.
features = [
    "Bwd Packet Length Std", "Flow Bytes/s",
    "Total Length of Fwd Packets", "Fwd Packet Length Std",
    "Flow IAT Std", "Flow IAT Min",
    "Fwd IAT Total", "Flow Duration",
    "Bwd Packet Length Max", "Flow IAT Max",
    "Flow IAT Mean", "Total Length of Bwd Packets",
    "Fwd Packet Length Min", "Bwd Packet Length Mean",
    "Flow Packets/s", "Fwd Packet Length Mean",
    "Total Backward Packets", "Total Fwd Packets",
    "Fwd Packet Length Max", "Bwd Packet Length Min",
    "Label",
]

# Load only the listed columns to keep memory use down.
df = pd.read_csv("all_data.csv", usecols=features)

# Legend: 1-based feature number next to each feature name (target excluded).
print('%-17s %-17s ' % ("Feature Number", "Feature"))
for number, name in enumerate(features[:-1], start=1):
    print('%-17s %-17s' % (number, name))

print('\n\n\n')

# Binarize the target in place: rows labelled "BENIGN" become 1, every other
# label becomes 0.
df['Label'] = (df['Label'] == "BENIGN").astype("int64")
y = df['Label'].values  # Target vector, one entry per row of df

# State shared with the selection loop further down.
least = 0     # Best F1-score seen so far
my_list = []  # Features currently selected

# Classifiers to evaluate, keyed by the name shown in the results table.
# The single entry labelled "ID3" is a depth-5 decision tree that splits on
# the entropy criterion.
ml_list = {}
ml_list["ID3"] = DecisionTreeClassifier(criterion="entropy", max_depth=5)

# Drop the trailing 'Label' entry so `features` lists predictor columns only.
del features[-1]

# Greedy forward feature selection.
#
# Features are tried one at a time, in the order they appear in `features`.
# A candidate is kept only if a model trained on the kept features plus the
# candidate scores a macro F1 that is at least as high as the best seen so
# far on the hold-out split; otherwise it is discarded.

# 1-based number of every feature in `features`, so the "Feature List" column
# refers to positions in the full feature list rather than positions inside
# the (shorter) selected list.
feature_number = {name: number for number, name in enumerate(features, start=1)}

print('%-17s %-30s %-10s  %-10s %-15s ' % ("ML algorithm", "Feature Name", "F1-score", "Accuracy", "Feature List"))  # Print output header
for j, clf in ml_list.items():  # Run for every machine learning algorithm (only ID3 here)
    my_list = []  # Features retained so far for this algorithm
    least = 0  # Best macro F1 so far; reset here so algorithms never share a baseline
    for i in features:  # Run for every feature
        my_list.append(i)  # Tentatively add the candidate
        X = df.loc[:, my_list].values  # Data restricted to the tried subset

        # Single 80/20 hold-out split (not cross-validation); random_state is
        # fixed so the split is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

        # Fit on the training part and evaluate on the held-out part
        clf.fit(X_train, y_train)
        predict = clf.predict(X_test)
        result = f1_score(y_test, predict, average='macro')
        accuracy = round(clf.score(X_test, y_test), 2)

        # Compact rendering of the tried subset as feature numbers
        temp = "[" + ", ".join(str(feature_number[name]) for name in my_list) + "]"

        if result >= least:  # Ties count as "no worse", so the feature is retained
            least = result
            print('%-17s %-30s %-10s  %-10s %-15s %-15s ' % (j, i, result, accuracy, temp, "------> New feature found!!!"))
        else:  # The candidate lowered the score: drop it again
            my_list.pop()
            print('%-17s %-30s %-10s  %-10s %-15s ' % (j, i, result, accuracy, temp))

    print("F1=", least, j, " The most efficient feature list =", my_list, "\n\n")  # Print maximum F1 and feature list

print("mission accomplished!")
print("operation time: =", time.time() - seconds, "seconds")


Feature Number    Feature           
1                 Bwd Packet Length Std
2                 Flow Bytes/s     
3                 Total Length of Fwd Packets
4                 Fwd Packet Length Std
5                 Flow IAT Std     
6                 Flow IAT Min     
7                 Fwd IAT Total    
8                 Flow Duration    
9                 Bwd Packet Length Max
10                Flow IAT Max     
11                Flow IAT Mean    
12                Total Length of Bwd Packets
13                Fwd Packet Length Min
14                Bwd Packet Length Mean
15                Flow Packets/s   
16                Fwd Packet Length Mean
17                Total Backward Packets
18                Total Fwd Packets
19                Fwd Packet Length Max
20                Bwd Packet Length Min




ML algorithm      Feature Name                   F1-score    Accuracy   Feature List    
ID3               Bwd Packet Length Std          0.7307605704350212  0.88       [1,        