In [1]:
import os
import sys
# Make the project packages importable from this notebook: strip the notebook's
# sub-directory from the working directory and put the result on sys.path.
# NOTE(review): if the kernel is not started in "notebooks/split_models" the
# replace is a no-op and the current working directory itself is appended.
project_root = os.getcwd().replace("notebooks/split_models", "")
sys.path.append(project_root)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from config import Config
from data.dataset import NetworkDataset, load_datasets
from model_config import MLP_Models, LSTM_Models, GRU_Models
from utils.benchmark import SplitBenchmark

In [2]:
# Shared configuration plus one container object per model family.
conf = Config()
mlp = MLP_Models()
lstm = LSTM_Models()
gru = GRU_Models()

# Single table of the split variants to benchmark: each family object is paired
# with the attribute names of its variants, in benchmark order.
_split_variants = (
    (mlp, ("split_mlp_3", "split_mlp_2", "split_mlp_1")),
    (lstm, ("split_lstm_3", "split_lstm_2", "split_lstm_1", "split_lstm_0")),
    (gru, ("split_gru_3", "split_gru_2", "split_gru_1", "split_gru_0")),
)

# `models[k]` and `model_names[k]` always describe the same variant because both
# lists are derived from the table above.
models = [getattr(family, attr) for family, attrs in _split_variants for attr in attrs]
model_names = [attr for _, attrs in _split_variants for attr in attrs]


In [3]:
# Data in the "lstm" format.
# NOTE(review): despite the "train_" prefix, this dataset/loader is built from
# the *test* split returned by load_datasets.
(X_train, y_train, X_val, y_val, X_test, y_test) = load_datasets(conf.datasets, "lstm")
train_dataset = NetworkDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=conf.batch_size)

# Data in the "mlp" format. The X_*/y_* names are rebound here, so after this
# cell they refer to the "mlp"-format splits. As above, the loader wraps the
# test split.
(X_train, y_train, X_val, y_val, X_test, y_test) = load_datasets(conf.datasets, "mlp")
mlp_train_dataset = NetworkDataset(X_test, y_test)
mlp_train_loader = DataLoader(mlp_train_dataset, batch_size=conf.batch_size)

In [4]:
# Upper bound on how many sequence batches are pushed through the DPU half of
# each recurrent (LSTM/GRU) model when building the host-side benchmark input.
MAX_SEQ_BATCHES = 1000

for model, name in zip(models, model_names):
    # set parameters
    model.load()
    result_path = os.path.join(conf.benchmark_host, "split_model", name + ".txt")

    # MLP variants read the "mlp"-format loader and use all of it; the LSTM/GRU
    # variants read the "lstm"-format loader, capped at MAX_SEQ_BATCHES batches.
    # Selecting by name keeps this correct if the model list is reordered.
    if name.startswith("split_mlp"):
        source_loader, batch_limit = mlp_train_loader, None
    else:
        source_loader, batch_limit = train_loader, MAX_SEQ_BATCHES

    # create dpu model feature
    # Run the DPU-side half once over the data and keep its outputs; they become
    # the inputs of the host-side half that is benchmarked below.
    features, labels = [], []
    # Inference only: no_grad keeps the cached features free of autograd state.
    with torch.no_grad():
        # Iterate the loader itself so every step yields the *next* batch.
        # (Calling next(iter(loader)) in a loop restarts the loader each time
        # and would return its first batch over and over.)
        for batch_idx, (data, label) in enumerate(source_loader):
            if batch_limit is not None and batch_idx >= batch_limit:
                break
            feat, _ = model.model(data, split="dpu")
            features.append(feat)
            labels.append(label)

    features, labels = torch.cat(features, dim=0), torch.cat(labels, dim=0)
    dataset = NetworkDataset(features, labels)
    loader = DataLoader(dataset, conf.batch_size)

    # run benchmark
    benchmark = SplitBenchmark(model, loader, conf.batch_size, name, result_path, split="host")
    benchmark()

    # print and save result
    benchmark.print_result()
    benchmark.save()

Checkpoint loaded from /global/D1/homes/jorgetf/Network-Packet-ML-Model/checkpoint/split_model/split_mlp_3.pth!
Checkpoint loaded from /global/D1/homes/jorgetf/Network-Packet-ML-Model/checkpoint/split_model/split_mlp_3.pth!


[W1217 17:02:24.657721629 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
ERROR:2025-12-17 17:02:24 1390461:1390461 DeviceProperties.cpp:47] gpuGetDeviceCount failed with code 35


Benchmark - split_mlp_3 model:

Memory usage (MB):
Avg memory usage: 0.106MB
Peak memory usage: 0.627MB

Model inference latency on one batch (batch size = 32):
Avg latency: 0.289ms
Min latency: 0.269ms
Max latency: 0.703ms

Model inference throughput (batch size = 32):
Throughput: 55153.82 samples/sec

Model inference CPU usage (number of logical cores) during runtime:
CPU runtime: 0.03 seconds
Average CPU usage: 48.77/96 cores

Model (split_mlp_3) inference accuracy (%):
Accuracy: 81.25%



Checkpoint loaded from /global/D1/homes/jorgetf/Network-Packet-ML-Model/checkpoint/split_model/split_mlp_2.pth!
Checkpoint loaded from /global/D1/homes/jorgetf/Network-Packet-ML-Model/checkpoint/split_model/split_mlp_2.pth!
Benchmark - split_mlp_2 model:

Memory usage (MB):
Avg memory usage: 0.057MB
Peak memory usage: 0.315MB

Model inference latency on one batch (batch size = 32):
Avg latency: 0.210ms
Min latency: 0.203ms
Max latency: 0.226ms

Model inference throughput (batch size = 32):
Through