In [1]:
# Epoch budget shared by the benchmark cells below: each of them passes this
# value as `epochs=` to model.fit (early stopping may end a run sooner).
epoche = 30

# Addestramento di benchmarking su dataset originale

In [6]:
import argparse
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import pandas as pd
import json

# def get_args():
#     parser = argparse.ArgumentParser(
#         description="Parse the script arguments."
#     )

#     parser.add_argument(
#         "mirage2019_LOPEZ_lopez_lopez_36P_4F_APP_xST_PAD_metadata.pickle",
#         type=str,
#         required=True,
#         help="--dataset-path"
#     )

#     parser.add_argument(
#         "output",
#         type=str,
#         required=True,
#         help="--output-dir"
#     )

#     return parser.parse_args()

def ingest_dataset(path, n_pkts=10, n_features=4):
    """Load biflows and labels from a pickle file and trim every biflow.

    The file is read as two consecutively pickled objects: the biflows first,
    then the labels.

    Args:
        path: Path of the pickle file to read.
        n_pkts: Number of leading entries kept along axis 1.
        n_features: Number of leading entries kept along axis 2.

    Returns:
        Tuple ``(biflows, labels)``: ``biflows`` is a NumPy array sliced to
        ``[:, :n_pkts, :n_features]``; ``labels`` is the second pickled
        object, returned unchanged.
    """
    # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
    with open(path, "rb") as f:
        biflows = pickle.load(f)
        labels = pickle.load(f)
    biflows = np.array(biflows)[:, :n_pkts, :n_features]
    return biflows, labels


if __name__ == "__main__":
    dataset_path = "mirage2019_LOPEZ_lopez_lopez_36P_4F_APP_xST_PAD_metadata.pickle"
    output_dir = "output/"
    # Create the output directory before anything is written into it
    # (the model summary is the first file saved, ahead of the training logs).
    os.makedirs(output_dir, exist_ok=True)

    n_pkts = 10
    n_features = 4

    # Loading and label encoding do not depend on the seed, so they are done
    # once instead of once per run.
    X, y = ingest_dataset(dataset_path, n_pkts=n_pkts, n_features=n_features)
    num_classes = len(np.unique(y))

    le = LabelEncoder()
    y = le.fit_transform(y)

    # Two independent runs that differ only in the random seed; `i` is also
    # the suffix of the files written by each run.
    for i in range(0, 2):
        seed = 2025 + i

        # Reproducibility
        np.random.seed(seed)                    # NumPy
        tf.random.set_seed(seed)                # TensorFlow
        tf.keras.utils.set_random_seed(seed)    # Keras

        # Stratified 80/20 split into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=seed, stratify=y,
        )

        # Min-max scaling per feature column: the scaler is fitted on the
        # training portion only and then applied to train and test.
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaler.fit(np.reshape(X_train, [-1, n_features]))
        res_samples_train = scaler.transform(np.reshape(X_train, [-1, n_features]))
        res_samples_test = scaler.transform(np.reshape(X_test, [-1, n_features]))
        X_train = np.reshape(res_samples_train, [-1, n_pkts, n_features])
        X_test = np.reshape(res_samples_test, [-1, n_pkts, n_features])

        # Stratified 80/20 split of the training portion into train and validation
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train, test_size=0.2, random_state=seed, stratify=y_train,
        )

        ohe_y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
        ohe_y_valid = tf.keras.utils.to_categorical(y_valid, num_classes=num_classes)

        # The first Conv2D declares input_shape=(n_pkts, n_features, 1), so the
        # trailing channel axis is added explicitly instead of relying on any
        # implicit reshaping of 3-D inputs by the framework.
        X_train_cnn = np.reshape(X_train, [-1, n_pkts, n_features, 1])
        X_valid_cnn = np.reshape(X_valid, [-1, n_pkts, n_features, 1])
        X_test_cnn = np.reshape(X_test, [-1, n_pkts, n_features, 1])

        # Model definition (this part can be re-arranged).
        # NOTE(review): both pooling layers pass strides=1 here, whereas the
        # reduced-dataset cells of this notebook do not pass `strides` at all;
        # confirm which architecture is the intended one.
        model = Sequential(name='lopez2017network_CNN_1')
        model.add(Conv2D(filters=32, kernel_size=(4, 2), strides=1, padding='same', activation='relu',
                            input_shape=(n_pkts, n_features, 1)))
        model.add(MaxPooling2D(pool_size=(3, 2), strides=1, padding='same'))
        model.add(BatchNormalization())
        model.add(Conv2D(filters=64, kernel_size=(4, 2), strides=1, padding='same', activation='relu'))
        model.add(MaxPooling2D(pool_size=(3, 1), strides=1, padding='same'))
        model.add(BatchNormalization())
        model.add(Flatten())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(num_classes, activation='softmax'))

        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )
        # NOTE(review): the reduced-dataset cells of this notebook also pass
        # restore_best_weights=True; it is left unset here so the results of
        # this cell do not change silently.
        earlystop = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=10, verbose=1, mode='auto')
        callbacks = [earlystop]

        # Same file name for every seed: each run overwrites the previous summary.
        with open(f"{output_dir}/model_summary.txt", "w", encoding="utf-8") as f:
            model.summary(print_fn=lambda x: f.write(x + "\n"))

        history = model.fit(X_train_cnn, ohe_y_train, validation_data=(X_valid_cnn, ohe_y_valid),
                            epochs=epoche, batch_size=50,
                            callbacks=callbacks, verbose=2)

        df_history = pd.DataFrame(history.history)
        df_history.to_csv(f"{output_dir}/training_history{i}.csv", index=False)

        y_pred_probs = model.predict(X_test_cnn)
        y_pred = np.argmax(y_pred_probs, axis=1)

        # Save the results: per-sample class probabilities and hard predictions,
        # each next to the encoded true label.
        soft_values = [",".join(map(str, probs)) for probs in y_pred_probs]
        df_soft = pd.DataFrame({
            "Actual": y_test,
            "soft_values": soft_values
        })
        df_pred = pd.DataFrame({
            "Actual": y_test,
            "Predicted": y_pred
        })

        df_soft.to_csv(f"{output_dir}/soft_values{i}.dat", sep="\t", index=False)
        df_pred.to_csv(f"{output_dir}/predictions{i}.dat", sep="\t", index=False)

        # Mapping from encoded label (as a string key) back to the original class
        labels_map = {str(enc_c): c for c, enc_c in zip(le.classes_, le.transform(le.classes_))}
        with open(f"{output_dir}/labels_map{i}.json", 'w') as f:
            json.dump(labels_map, f)

Epoch 1/30
1242/1242 - 15s - loss: 2.0537 - accuracy: 0.4548 - val_loss: 1.7157 - val_accuracy: 0.5336 - 15s/epoch - 12ms/step
Epoch 2/30
1242/1242 - 14s - loss: 1.4876 - accuracy: 0.5857 - val_loss: 1.5141 - val_accuracy: 0.5824 - 14s/epoch - 12ms/step
Epoch 3/30
1242/1242 - 13s - loss: 1.2844 - accuracy: 0.6350 - val_loss: 1.4380 - val_accuracy: 0.5997 - 13s/epoch - 11ms/step
Epoch 4/30
1242/1242 - 13s - loss: 1.1629 - accuracy: 0.6635 - val_loss: 1.3231 - val_accuracy: 0.6347 - 13s/epoch - 11ms/step
Epoch 5/30
1242/1242 - 13s - loss: 1.0846 - accuracy: 0.6824 - val_loss: 1.3121 - val_accuracy: 0.6333 - 13s/epoch - 11ms/step
Epoch 6/30
1242/1242 - 13s - loss: 1.0155 - accuracy: 0.6997 - val_loss: 1.2529 - val_accuracy: 0.6566 - 13s/epoch - 11ms/step
Epoch 7/30
1242/1242 - 13s - loss: 0.9592 - accuracy: 0.7132 - val_loss: 1.2127 - val_accuracy: 0.6672 - 13s/epoch - 11ms/step
Epoch 8/30
1242/1242 - 13s - loss: 0.9134 - accuracy: 0.7254 - val_loss: 1.1398 - val_accuracy: 0.6864 - 13s/ep

# Addestramento di benchmarking su dataset ridimensionato a 10k

In [3]:
import argparse
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import pandas as pd
import json

def ingest_dataset(path, n_pkts=10, n_features=4):
    """Read biflows and labels from a pickle file and trim the biflows.

    Two objects are unpickled from the file in sequence: the biflows, then
    the labels. The biflows are converted to a NumPy array and cut down to
    the first ``n_pkts`` entries of axis 1 and the first ``n_features``
    entries of axis 2. The labels are returned exactly as unpickled.
    """
    # NOTE: unpickling runs arbitrary code from the file; use trusted files only.
    with open(path, "rb") as stream:
        raw_flows, raw_labels = pickle.load(stream), pickle.load(stream)
    trimmed_flows = np.array(raw_flows)[:, :n_pkts, :n_features]
    return trimmed_flows, raw_labels

if __name__ == "__main__":

    # Input file and destination folder for everything this run writes.
    dataset_path = "mirage2019_LOPEZ_lopez_lopez_36P_4F_APP_xST_PAD_metadata.pickle"
    output_dir = "output2/"
    os.makedirs(output_dir, exist_ok=True)

    # Run parameters.
    n_pkts, n_features = 10, 4
    sample_train_size = 10000
    epoche = epoche  # no-op rebind of the epoch count defined in the first cell
    seed = 2025

    # Seed NumPy, TensorFlow and Keras.
    np.random.seed(seed)
    tf.random.set_seed(seed)
    tf.keras.utils.set_random_seed(seed)

    # Load the biflows and turn the labels into integer codes.
    X, y = ingest_dataset(dataset_path, n_pkts=n_pkts, n_features=n_features)

    le = LabelEncoder()
    y = le.fit_transform(y)
    num_classes = np.unique(y).size

    # Stratified 80/20 split into a training pool and the test set.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=seed
    )

    # Shrink the training pool to `sample_train_size` samples (stratified)
    # whenever it holds more than that.
    if len(X_train_full) > sample_train_size:
        X_train_full, _, y_train_full, _ = train_test_split(
            X_train_full, y_train_full,
            stratify=y_train_full,
            train_size=sample_train_size,
            random_state=seed
        )

    # Min-max scaling per feature column, fitted on the (sub-sampled) training
    # pool and then applied to the test set.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train_full_scaled = np.reshape(
        scaler.fit_transform(np.reshape(X_train_full, (-1, n_features))),
        (-1, n_pkts, n_features),
    )
    X_test_scaled = np.reshape(
        scaler.transform(np.reshape(X_test, (-1, n_features))),
        (-1, n_pkts, n_features),
    )

    # Stratified 80/20 split of the scaled pool into train and validation.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_full_scaled, y_train_full,
        stratify=y_train_full, test_size=0.2, random_state=seed
    )

    # One-hot targets for the categorical cross-entropy loss.
    ohe_y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    ohe_y_valid = tf.keras.utils.to_categorical(y_valid, num_classes=num_classes)

    # CNN: two Conv2D -> MaxPooling2D -> BatchNormalization stages followed by
    # a dense head with a softmax over the classes.
    model = Sequential([
        Conv2D(filters=32, kernel_size=(4, 2), padding='same', activation='relu',
               input_shape=(n_pkts, n_features, 1)),
        MaxPooling2D(pool_size=(3, 2), padding='same'),
        BatchNormalization(),
        Conv2D(filters=64, kernel_size=(4, 2), padding='same', activation='relu'),
        MaxPooling2D(pool_size=(3, 1), padding='same'),
        BatchNormalization(),
        Flatten(),
        Dense(units=200, activation='relu'),
        Dense(units=num_classes, activation='softmax'),
    ])

    model.compile(
        loss="categorical_crossentropy",
        optimizer=Adam(learning_rate=0.001),
        metrics=["accuracy"],
    )

    # Stop after 10 epochs without improvement of val_accuracy and restore the
    # best weights.
    earlystop = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

    # Dump the textual model summary, one line per write.
    with open(f"{output_dir}/model_summary.txt", "w") as f:
        model.summary(print_fn=lambda x: f.write(x + "\n"))

    # Add the trailing channel axis declared by the first Conv2D layer.
    X_train_cnn = np.reshape(X_train, (-1, n_pkts, n_features, 1))
    X_valid_cnn = np.reshape(X_valid, (-1, n_pkts, n_features, 1))
    X_test_cnn  = np.reshape(X_test_scaled, (-1, n_pkts, n_features, 1))

    history = model.fit(
        X_train_cnn, ohe_y_train,
        validation_data=(X_valid_cnn, ohe_y_valid),
        batch_size=50,
        epochs=epoche,
        callbacks=[earlystop],
        verbose=2,
    )

    # Per-epoch metrics.
    pd.DataFrame(history.history).to_csv(f"{output_dir}/training_history.csv", index=False)

    # Class probabilities on the test set and the corresponding arg-max class.
    y_pred_probs = model.predict(X_test_cnn)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Probabilities are stored as one comma-separated string per sample.
    df_soft = pd.DataFrame({
        "Actual": y_test,
        "soft_values": [",".join(str(v) for v in p) for p in y_pred_probs],
    })
    df_soft.to_csv(f"{output_dir}/soft_values.dat", sep="\t", index=False)

    df_pred = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
    df_pred.to_csv(f"{output_dir}/predictions.dat", sep="\t", index=False)

    # Encoded label (string key) -> original class.
    labels_map = dict(zip(map(str, le.transform(le.classes_)), le.classes_))
    with open(f"{output_dir}/labels_map.json", "w") as f:
        json.dump(labels_map, f)

    print("✓ Training completato e file salvati.")


Epoch 1/30
160/160 - 2s - loss: 2.8314 - accuracy: 0.2699 - val_loss: 3.5347 - val_accuracy: 0.0835 - 2s/epoch - 9ms/step
Epoch 2/30
160/160 - 1s - loss: 2.2595 - accuracy: 0.4049 - val_loss: 3.6440 - val_accuracy: 0.0840 - 694ms/epoch - 4ms/step
Epoch 3/30
160/160 - 1s - loss: 2.0035 - accuracy: 0.4618 - val_loss: 3.1257 - val_accuracy: 0.1975 - 619ms/epoch - 4ms/step
Epoch 4/30
160/160 - 1s - loss: 1.8078 - accuracy: 0.5082 - val_loss: 2.4069 - val_accuracy: 0.3690 - 652ms/epoch - 4ms/step
Epoch 5/30
160/160 - 1s - loss: 1.6905 - accuracy: 0.5305 - val_loss: 2.0743 - val_accuracy: 0.4650 - 621ms/epoch - 4ms/step
Epoch 6/30
160/160 - 1s - loss: 1.5606 - accuracy: 0.5669 - val_loss: 1.9800 - val_accuracy: 0.4980 - 654ms/epoch - 4ms/step
Epoch 7/30
160/160 - 1s - loss: 1.4676 - accuracy: 0.5811 - val_loss: 1.9686 - val_accuracy: 0.5035 - 644ms/epoch - 4ms/step
Epoch 8/30
160/160 - 1s - loss: 1.3770 - accuracy: 0.6085 - val_loss: 2.0615 - val_accuracy: 0.4885 - 654ms/epoch - 4ms/step
Epo

# Addestramento di benchmarking su dataset ridimensionato a 30k

In [4]:
import argparse
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import pandas as pd
import json

def ingest_dataset(path, n_pkts=10, n_features=4):
    """Load a pickled dataset and keep the first n_pkts and n_features.

    The file must contain two pickled objects in a row (biflows, then
    labels). Returns ``(biflows, labels)`` where ``biflows`` is a NumPy array
    sliced to ``[:, :n_pkts, :n_features]`` and ``labels`` is left untouched.
    """
    # NOTE: unpickling runs arbitrary code from the file; use trusted files only.
    with open(path, "rb") as handle:
        flows_obj = pickle.load(handle)
        labels_obj = pickle.load(handle)
    flows_arr = np.array(flows_obj)
    return flows_arr[:, :n_pkts, :n_features], labels_obj

if __name__ == "__main__":

    # Input file and destination folder for everything this run writes.
    dataset_path = "mirage2019_LOPEZ_lopez_lopez_36P_4F_APP_xST_PAD_metadata.pickle"
    output_dir = "output3/"
    os.makedirs(output_dir, exist_ok=True)

    # Run parameters.
    n_pkts, n_features = 10, 4
    sample_train_size = 30000
    epoche = epoche  # no-op rebind of the epoch count defined in the first cell
    seed = 2025

    # Seed NumPy, TensorFlow and Keras.
    np.random.seed(seed)
    tf.random.set_seed(seed)
    tf.keras.utils.set_random_seed(seed)

    # Load the biflows and turn the labels into integer codes.
    X, y = ingest_dataset(dataset_path, n_pkts=n_pkts, n_features=n_features)

    le = LabelEncoder()
    y = le.fit_transform(y)
    num_classes = np.unique(y).size

    # Stratified 80/20 split into a training pool and the test set.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=seed
    )

    # Shrink the training pool to `sample_train_size` samples (stratified)
    # whenever it holds more than that.
    if len(X_train_full) > sample_train_size:
        X_train_full, _, y_train_full, _ = train_test_split(
            X_train_full, y_train_full,
            stratify=y_train_full,
            train_size=sample_train_size,
            random_state=seed
        )

    # Min-max scaling per feature column, fitted on the (sub-sampled) training
    # pool and then applied to the test set.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train_full_scaled = np.reshape(
        scaler.fit_transform(np.reshape(X_train_full, (-1, n_features))),
        (-1, n_pkts, n_features),
    )
    X_test_scaled = np.reshape(
        scaler.transform(np.reshape(X_test, (-1, n_features))),
        (-1, n_pkts, n_features),
    )

    # Stratified 80/20 split of the scaled pool into train and validation.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_full_scaled, y_train_full,
        stratify=y_train_full, test_size=0.2, random_state=seed
    )

    # One-hot targets for the categorical cross-entropy loss.
    ohe_y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    ohe_y_valid = tf.keras.utils.to_categorical(y_valid, num_classes=num_classes)

    # CNN: two Conv2D -> MaxPooling2D -> BatchNormalization stages followed by
    # a dense head with a softmax over the classes.
    model = Sequential([
        Conv2D(filters=32, kernel_size=(4, 2), padding='same', activation='relu',
               input_shape=(n_pkts, n_features, 1)),
        MaxPooling2D(pool_size=(3, 2), padding='same'),
        BatchNormalization(),
        Conv2D(filters=64, kernel_size=(4, 2), padding='same', activation='relu'),
        MaxPooling2D(pool_size=(3, 1), padding='same'),
        BatchNormalization(),
        Flatten(),
        Dense(units=200, activation='relu'),
        Dense(units=num_classes, activation='softmax'),
    ])

    model.compile(
        loss="categorical_crossentropy",
        optimizer=Adam(learning_rate=0.001),
        metrics=["accuracy"],
    )

    # Stop after 10 epochs without improvement of val_accuracy and restore the
    # best weights.
    earlystop = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

    # Dump the textual model summary, one line per write.
    with open(f"{output_dir}/model_summary.txt", "w") as f:
        model.summary(print_fn=lambda x: f.write(x + "\n"))

    # Add the trailing channel axis declared by the first Conv2D layer.
    X_train_cnn = np.reshape(X_train, (-1, n_pkts, n_features, 1))
    X_valid_cnn = np.reshape(X_valid, (-1, n_pkts, n_features, 1))
    X_test_cnn  = np.reshape(X_test_scaled, (-1, n_pkts, n_features, 1))

    history = model.fit(
        X_train_cnn, ohe_y_train,
        validation_data=(X_valid_cnn, ohe_y_valid),
        batch_size=50,
        epochs=epoche,
        callbacks=[earlystop],
        verbose=2,
    )

    # Per-epoch metrics.
    pd.DataFrame(history.history).to_csv(f"{output_dir}/training_history.csv", index=False)

    # Class probabilities on the test set and the corresponding arg-max class.
    y_pred_probs = model.predict(X_test_cnn)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Probabilities are stored as one comma-separated string per sample.
    df_soft = pd.DataFrame({
        "Actual": y_test,
        "soft_values": [",".join(str(v) for v in p) for p in y_pred_probs],
    })
    df_soft.to_csv(f"{output_dir}/soft_values.dat", sep="\t", index=False)

    df_pred = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
    df_pred.to_csv(f"{output_dir}/predictions.dat", sep="\t", index=False)

    # Encoded label (string key) -> original class.
    labels_map = dict(zip(map(str, le.transform(le.classes_)), le.classes_))
    with open(f"{output_dir}/labels_map.json", "w") as f:
        json.dump(labels_map, f)

    print("✓ Training completato e file salvati.")


Epoch 1/30
480/480 - 3s - loss: 2.4303 - accuracy: 0.3709 - val_loss: 2.8603 - val_accuracy: 0.2582 - 3s/epoch - 6ms/step
Epoch 2/30
480/480 - 2s - loss: 1.8583 - accuracy: 0.5011 - val_loss: 1.8224 - val_accuracy: 0.5057 - 2s/epoch - 4ms/step
Epoch 3/30
480/480 - 2s - loss: 1.6418 - accuracy: 0.5480 - val_loss: 1.6411 - val_accuracy: 0.5458 - 2s/epoch - 4ms/step
Epoch 4/30
480/480 - 2s - loss: 1.4936 - accuracy: 0.5839 - val_loss: 1.5951 - val_accuracy: 0.5642 - 2s/epoch - 4ms/step
Epoch 5/30
480/480 - 2s - loss: 1.3892 - accuracy: 0.6074 - val_loss: 1.6822 - val_accuracy: 0.5418 - 2s/epoch - 4ms/step
Epoch 6/30
480/480 - 2s - loss: 1.3069 - accuracy: 0.6306 - val_loss: 1.5215 - val_accuracy: 0.5867 - 2s/epoch - 4ms/step
Epoch 7/30
480/480 - 2s - loss: 1.2326 - accuracy: 0.6438 - val_loss: 1.4647 - val_accuracy: 0.5975 - 2s/epoch - 4ms/step
Epoch 8/30
480/480 - 2s - loss: 1.1784 - accuracy: 0.6562 - val_loss: 1.4188 - val_accuracy: 0.6013 - 2s/epoch - 4ms/step
Epoch 9/30
480/480 - 2s 

# Addestramento di benchmarking su dataset ridimensionato a 45k

In [5]:
import argparse
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import pandas as pd
import json

def ingest_dataset(path, n_pkts=10, n_features=4):
    """Unpickle biflows and labels from ``path`` and trim the biflows.

    The biflows (first pickled object) are converted to a NumPy array and
    restricted to the first ``n_pkts`` entries of axis 1 and the first
    ``n_features`` entries of axis 2. The labels (second pickled object) are
    passed through as they are. Returns the pair ``(biflows, labels)``.
    """
    # NOTE: unpickling runs arbitrary code from the file; use trusted files only.
    with open(path, "rb") as src:
        payload = [pickle.load(src) for _ in range(2)]
    flows, tags = payload
    return np.array(flows)[:, :n_pkts, :n_features], tags

if __name__ == "__main__":

    # Input file and destination folder for everything this run writes.
    dataset_path = "mirage2019_LOPEZ_lopez_lopez_36P_4F_APP_xST_PAD_metadata.pickle"
    output_dir = "output4/"
    os.makedirs(output_dir, exist_ok=True)

    # Run parameters.
    n_pkts, n_features = 10, 4
    sample_train_size = 45000
    epoche = epoche  # no-op rebind of the epoch count defined in the first cell
    seed = 2025

    # Seed NumPy, TensorFlow and Keras.
    np.random.seed(seed)
    tf.random.set_seed(seed)
    tf.keras.utils.set_random_seed(seed)

    # Load the biflows and turn the labels into integer codes.
    X, y = ingest_dataset(dataset_path, n_pkts=n_pkts, n_features=n_features)

    le = LabelEncoder()
    y = le.fit_transform(y)
    num_classes = np.unique(y).size

    # Stratified 80/20 split into a training pool and the test set.
    X_train_full, X_test, y_train_full, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=seed
    )

    # Shrink the training pool to `sample_train_size` samples (stratified)
    # whenever it holds more than that.
    if len(X_train_full) > sample_train_size:
        X_train_full, _, y_train_full, _ = train_test_split(
            X_train_full, y_train_full,
            stratify=y_train_full,
            train_size=sample_train_size,
            random_state=seed
        )

    # Min-max scaling per feature column, fitted on the (sub-sampled) training
    # pool and then applied to the test set.
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train_full_scaled = np.reshape(
        scaler.fit_transform(np.reshape(X_train_full, (-1, n_features))),
        (-1, n_pkts, n_features),
    )
    X_test_scaled = np.reshape(
        scaler.transform(np.reshape(X_test, (-1, n_features))),
        (-1, n_pkts, n_features),
    )

    # Stratified 80/20 split of the scaled pool into train and validation.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_full_scaled, y_train_full,
        stratify=y_train_full, test_size=0.2, random_state=seed
    )

    # One-hot targets for the categorical cross-entropy loss.
    ohe_y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
    ohe_y_valid = tf.keras.utils.to_categorical(y_valid, num_classes=num_classes)

    # CNN: two Conv2D -> MaxPooling2D -> BatchNormalization stages followed by
    # a dense head with a softmax over the classes.
    model = Sequential([
        Conv2D(filters=32, kernel_size=(4, 2), padding='same', activation='relu',
               input_shape=(n_pkts, n_features, 1)),
        MaxPooling2D(pool_size=(3, 2), padding='same'),
        BatchNormalization(),
        Conv2D(filters=64, kernel_size=(4, 2), padding='same', activation='relu'),
        MaxPooling2D(pool_size=(3, 1), padding='same'),
        BatchNormalization(),
        Flatten(),
        Dense(units=200, activation='relu'),
        Dense(units=num_classes, activation='softmax'),
    ])

    model.compile(
        loss="categorical_crossentropy",
        optimizer=Adam(learning_rate=0.001),
        metrics=["accuracy"],
    )

    # Stop after 10 epochs without improvement of val_accuracy and restore the
    # best weights.
    earlystop = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

    # Dump the textual model summary, one line per write.
    with open(f"{output_dir}/model_summary.txt", "w") as f:
        model.summary(print_fn=lambda x: f.write(x + "\n"))

    # Add the trailing channel axis declared by the first Conv2D layer.
    X_train_cnn = np.reshape(X_train, (-1, n_pkts, n_features, 1))
    X_valid_cnn = np.reshape(X_valid, (-1, n_pkts, n_features, 1))
    X_test_cnn  = np.reshape(X_test_scaled, (-1, n_pkts, n_features, 1))

    history = model.fit(
        X_train_cnn, ohe_y_train,
        validation_data=(X_valid_cnn, ohe_y_valid),
        batch_size=50,
        epochs=epoche,
        callbacks=[earlystop],
        verbose=2,
    )

    # Per-epoch metrics.
    pd.DataFrame(history.history).to_csv(f"{output_dir}/training_history.csv", index=False)

    # Class probabilities on the test set and the corresponding arg-max class.
    y_pred_probs = model.predict(X_test_cnn)
    y_pred = np.argmax(y_pred_probs, axis=1)

    # Probabilities are stored as one comma-separated string per sample.
    df_soft = pd.DataFrame({
        "Actual": y_test,
        "soft_values": [",".join(str(v) for v in p) for p in y_pred_probs],
    })
    df_soft.to_csv(f"{output_dir}/soft_values.dat", sep="\t", index=False)

    df_pred = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
    df_pred.to_csv(f"{output_dir}/predictions.dat", sep="\t", index=False)

    # Encoded label (string key) -> original class.
    labels_map = dict(zip(map(str, le.transform(le.classes_)), le.classes_))
    with open(f"{output_dir}/labels_map.json", "w") as f:
        json.dump(labels_map, f)

    print("✓ Training completato e file salvati.")


Epoch 1/30
720/720 - 4s - loss: 2.2789 - accuracy: 0.4031 - val_loss: 2.1084 - val_accuracy: 0.4303 - 4s/epoch - 5ms/step
Epoch 2/30
720/720 - 3s - loss: 1.7300 - accuracy: 0.5313 - val_loss: 1.7370 - val_accuracy: 0.5367 - 3s/epoch - 4ms/step
Epoch 3/30
720/720 - 3s - loss: 1.5229 - accuracy: 0.5788 - val_loss: 1.5818 - val_accuracy: 0.5612 - 3s/epoch - 4ms/step
Epoch 4/30
720/720 - 3s - loss: 1.3944 - accuracy: 0.6057 - val_loss: 1.5995 - val_accuracy: 0.5502 - 3s/epoch - 4ms/step
Epoch 5/30
720/720 - 3s - loss: 1.2977 - accuracy: 0.6318 - val_loss: 1.4541 - val_accuracy: 0.5968 - 3s/epoch - 4ms/step
Epoch 6/30
720/720 - 3s - loss: 1.2251 - accuracy: 0.6472 - val_loss: 1.4259 - val_accuracy: 0.6089 - 3s/epoch - 4ms/step
Epoch 7/30
720/720 - 3s - loss: 1.1698 - accuracy: 0.6603 - val_loss: 1.4629 - val_accuracy: 0.6138 - 3s/epoch - 4ms/step
Epoch 8/30
720/720 - 3s - loss: 1.1177 - accuracy: 0.6729 - val_loss: 1.3460 - val_accuracy: 0.6341 - 3s/epoch - 4ms/step
Epoch 9/30
720/720 - 3s 

# Addestramento di benchmarking su dataset ridimensionato a 50k

In [5]:
import argparse
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
import pandas as pd
import json

def ingest_dataset(path, n_pkts=10, n_features=4):
    """Load a pickled dataset and keep the first n_pkts and n_features.

    The file is read as two consecutively pickled objects: the biflows first,
    then the labels.

    Args:
        path: Path of the pickle file to read.
        n_pkts: Number of leading entries kept along axis 1.
        n_features: Number of leading entries kept along axis 2.

    Returns:
        Tuple ``(biflows, labels)``: ``biflows`` is a NumPy array sliced to
        ``[:, :n_pkts, :n_features]``; ``labels`` is the second pickled
        object, returned unchanged.
    """
    # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
    with open(path, "rb") as f:
        biflows = pickle.load(f)
        labels = pickle.load(f)
    biflows = np.array(biflows)[:, :n_pkts, :n_features]
    return biflows, labels


if __name__ == "__main__":

    dataset_path = "mirage2019_LOPEZ_lopez_lopez_36P_4F_APP_xST_PAD_metadata.pickle"
    output_dir = "output5/"
    os.makedirs(output_dir, exist_ok=True)

    n_pkts = 10
    n_features = 4
    sample_train_size = 50000
    # `epoche` (epoch budget passed to model.fit) comes from the first notebook cell.

    # Loading and label encoding do not depend on the seed, so they are done
    # once instead of once per run.
    X, y = ingest_dataset(dataset_path, n_pkts=n_pkts, n_features=n_features)

    le = LabelEncoder()
    y = le.fit_transform(y)
    num_classes = len(np.unique(y))

    # Two independent runs that differ only in the random seed; `i` is also
    # the suffix of the files written by each run.
    for i in range(0, 2):
        seed = 2025 + i

        # Seed NumPy, TensorFlow and Keras.
        np.random.seed(seed)
        tf.random.set_seed(seed)
        tf.keras.utils.set_random_seed(seed)

        # Stratified 80/20 split into a training pool and the test set.
        X_train_full, X_test, y_train_full, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=seed
        )

        # Shrink the training pool to `sample_train_size` samples (stratified)
        # whenever it holds more than that.
        if sample_train_size < len(X_train_full):
            X_train_full, _, y_train_full, _ = train_test_split(
                X_train_full, y_train_full,
                train_size=sample_train_size,
                stratify=y_train_full,
                random_state=seed
            )

        # Min-max scaling per feature column, fitted on the (sub-sampled)
        # training pool and then applied to the test set.
        scaler = MinMaxScaler(feature_range=(0, 1))
        X_train_full_scaled = scaler.fit_transform(X_train_full.reshape(-1, n_features)).reshape(-1, n_pkts, n_features)
        X_test_scaled = scaler.transform(X_test.reshape(-1, n_features)).reshape(-1, n_pkts, n_features)

        # Stratified 80/20 split of the scaled pool into train and validation.
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train_full_scaled, y_train_full,
            test_size=0.2, stratify=y_train_full, random_state=seed
        )

        ohe_y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
        ohe_y_valid = tf.keras.utils.to_categorical(y_valid, num_classes=num_classes)

        # NOTE(review): the full-dataset cell of this notebook passes strides=1
        # to both pooling layers, while `strides` is not passed here; confirm
        # which architecture is the intended one.
        model = Sequential([
            Conv2D(32, (4, 2), padding='same', activation='relu', input_shape=(n_pkts, n_features, 1)),
            MaxPooling2D((3, 2), padding='same'),
            BatchNormalization(),
            Conv2D(64, (4, 2), padding='same', activation='relu'),
            MaxPooling2D((3, 1), padding='same'),
            BatchNormalization(),
            Flatten(),
            Dense(200, activation='relu'),
            Dense(num_classes, activation='softmax')
        ])

        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss="categorical_crossentropy",
            metrics=["accuracy"]
        )

        # Stop after 10 epochs without improvement of val_accuracy and restore
        # the best weights.
        earlystop = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

        # Same file name for every seed: each run overwrites the previous summary.
        with open(f"{output_dir}/model_summary.txt", "w", encoding="utf-8") as f:
            model.summary(print_fn=lambda x: f.write(x + "\n"))

        # Add the trailing channel axis declared by the first Conv2D layer.
        X_train_cnn = X_train.reshape(-1, n_pkts, n_features, 1)
        X_valid_cnn = X_valid.reshape(-1, n_pkts, n_features, 1)
        X_test_cnn  = X_test_scaled.reshape(-1, n_pkts, n_features, 1)

        history = model.fit(
            X_train_cnn, ohe_y_train,
            validation_data=(X_valid_cnn, ohe_y_valid),
            epochs=epoche,
            batch_size=50,
            callbacks=[earlystop],
            verbose=2
        )

        # Per-epoch metrics.
        pd.DataFrame(history.history).to_csv(f"{output_dir}/training_history{i}.csv", index=False)

        # Class probabilities on the test set and the corresponding arg-max class.
        y_pred_probs = model.predict(X_test_cnn)
        y_pred = np.argmax(y_pred_probs, axis=1)

        # Probabilities are stored as one comma-separated string per sample.
        df_soft = pd.DataFrame({
            "Actual": y_test,
            "soft_values": [",".join(map(str, p)) for p in y_pred_probs]
        })
        df_soft.to_csv(f"{output_dir}/soft_values{i}.dat", sep="\t", index=False)

        df_pred = pd.DataFrame({
            "Actual": y_test,
            "Predicted": y_pred
        })
        df_pred.to_csv(f"{output_dir}/predictions{i}.dat", sep="\t", index=False)

        # Encoded label (string key) -> original class.
        labels_map = {str(enc): cls for cls, enc in zip(le.classes_, le.transform(le.classes_))}
        with open(f"{output_dir}/labels_map{i}.json", "w") as f:
            json.dump(labels_map, f)

        print("✓ Training completato e file salvati.")


Epoch 1/30
800/800 - 4s - loss: 2.2305 - accuracy: 0.4160 - val_loss: 1.8889 - val_accuracy: 0.5048 - 4s/epoch - 5ms/step
Epoch 2/30
800/800 - 3s - loss: 1.6894 - accuracy: 0.5394 - val_loss: 1.6331 - val_accuracy: 0.5480 - 3s/epoch - 4ms/step
Epoch 3/30
800/800 - 3s - loss: 1.4919 - accuracy: 0.5851 - val_loss: 1.5085 - val_accuracy: 0.5871 - 3s/epoch - 3ms/step
Epoch 4/30
800/800 - 3s - loss: 1.3676 - accuracy: 0.6144 - val_loss: 1.4332 - val_accuracy: 0.6071 - 3s/epoch - 3ms/step
Epoch 5/30
800/800 - 3s - loss: 1.2754 - accuracy: 0.6352 - val_loss: 1.3846 - val_accuracy: 0.6217 - 3s/epoch - 3ms/step
Epoch 6/30
800/800 - 3s - loss: 1.2017 - accuracy: 0.6558 - val_loss: 1.4595 - val_accuracy: 0.5942 - 3s/epoch - 3ms/step
Epoch 7/30
800/800 - 3s - loss: 1.1454 - accuracy: 0.6680 - val_loss: 1.2788 - val_accuracy: 0.6468 - 3s/epoch - 3ms/step
Epoch 8/30
800/800 - 3s - loss: 1.0895 - accuracy: 0.6816 - val_loss: 1.4724 - val_accuracy: 0.5981 - 3s/epoch - 3ms/step
Epoch 9/30
800/800 - 3s 