In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Load the dataset
data = {
    "id_flow": ["b2bb77a570fcfa9325eb9e51b6116d2a", "166af8ade1a6674f4bfa69b278a8c1ed", "4ca73e2e3783c0cc29bdf1dfcb9c636a",
                "56f7d6e2c8de911abda556978d4e9f31", "f9be1d45e1a2670f32b972d5c987fd5e", "a2b4e4d6e8f1c3b5d7e9e0f2f4e6d8a0"] * 1000,
    "nw_src": ["172.16.25.104", "172.16.25.104", "197.210.64.107", "172.16.25.105", "192.168.0.10", "10.0.0.1"] * 1000,
    "tp_src": [41402, 38848, 48156, 12345, 54321, 98765] * 1000,
    "destination_ip": ["34.107.221.82", "34.107.221.82", "52.84.77.43", "192.168.1.100", "172.16.25.105", "192.168.0.1"] * 1000,
    "destination_port": [80, 80, 443, 8080, 22, 3389] * 1000,
    "protocol": [6, 6, 6, 17, 6, 17] * 1000,
    "forward_packet_count": [5, 5, 3, 10, 8, 15] * 1000,
    "forward_byte_count": [300, 300, 198, 500, 400, 800] * 1000,
    "forward_packet_length": [60.00, 60.00, 66.00, 50.00, 50.00, 53.33] * 1000,
    "forward_packet_interarrival_time": [6.0, 6.0, 10.0, 5.0, 7.5, 4.0] * 1000,
    "reverse_packet_interarrival_time_max": [10.333333, 10.000000, 10.333333, 8.500000, 12.000000, 9.500000] * 1000,
    "reverse_packet_interarrival_time_min": [6.00, 6.20, 10.00, 4.50, 8.00, 6.80] * 1000,
    "reverse_packet_per_second_max": [0.166667, 0.161290, 0.100000, 0.200000, 0.173913, 0.142857] * 1000,
    "reverse_packet_per_second_min": [0.096774, 0.100000, 0.096774, 0.222222, 0.090909, 0.117647] * 1000,
    "reverse_bit_per_second_max": [15.133333, 15.133333, 6.000000, 20.000000, 18.000000, 12.000000] * 1000,
    "reverse_bit_per_second_min": [5.806452, 6.000000, 5.806452, 25.000000, 10.000000, 15.000000] * 1000,
    "reverse_duration": [121, 121, 91, 200, 150, 180] * 1000,
    "reverse_packet_size": [15, 15, 9, 10, 8, 12] * 1000,
    "reverse_byte_size": [1114, 1114, 540, 1000, 800, 1440] * 1000,
    "category": ["WWW", "P2P", "DNS", "FTP", "SSH", "VOIP"] * 1000
}

df = pd.DataFrame(data)

# Perform one-hot encoding for categorical variables
df_encoded = pd.get_dummies(df, columns=["id_flow", "nw_src", "destination_ip"])

# Split the data into features and target
X = df_encoded.drop("category", axis=1)
y = df_encoded["category"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate the ID3 classifier
id3_classifier = DecisionTreeClassifier(criterion='entropy')
id3_classifier.fit(X_train, y_train)
id3_predictions = id3_classifier.predict(X_test)

# Train and evaluate the CART classifier
cart_classifier = DecisionTreeClassifier(criterion='gini')
cart_classifier.fit(X_train, y_train)
cart_predictions = cart_classifier.predict(X_test)

# Compute classification metrics
id3_metrics = classification_report(y_test, id3_predictions)
cart_metrics = classification_report(y_test, cart_predictions)

# Print the classification metrics
print("ID3 Classifier Metrics:")
print(id3_metrics)
print("------------------------")
print("CART Classifier Metrics:")
print(cart_metrics)


ID3 Classifier Metrics:
              precision    recall  f1-score   support

         DNS       1.00      1.00      1.00       195
         FTP       1.00      1.00      1.00       193
         P2P       1.00      1.00      1.00       212
         SSH       1.00      1.00      1.00       210
        VOIP       1.00      1.00      1.00       208
         WWW       1.00      1.00      1.00       182

    accuracy                           1.00      1200
   macro avg       1.00      1.00      1.00      1200
weighted avg       1.00      1.00      1.00      1200

------------------------
CART Classifier Metrics:
              precision    recall  f1-score   support

         DNS       1.00      1.00      1.00       195
         FTP       1.00      1.00      1.00       193
         P2P       1.00      1.00      1.00       212
         SSH       1.00      1.00      1.00       210
        VOIP       1.00      1.00      1.00       208
         WWW       1.00      1.00      1.00       182

   