In [1]:
import os
import sys
# Strip the notebook sub-directory from the current working directory and add
# what remains to the module search path, so that the project-local imports
# below can be resolved (presumably this leaves the repository root — confirm).
_repo_root = os.getcwd().replace("notebooks/split_models", "")
sys.path.append(_repo_root)

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from config import Config
from data.dataset import NetworkDataset, load_datasets
from model_config import CNN_models, MLP_Models, LSTM_Models, GRU_Models
from model.copy_param import host_copy_model
from transfer.transfer_tensors import HostSocket
from utils.benchmark import SplitBenchmark

In [2]:
# Side of the split this notebook represents; forwarded to the split model
# here and to the benchmark later on.
location = "host"

# Project configuration and the CNN model registry.
conf = Config()
load_models = CNN_models()

# Pick the split variant to benchmark and the full model it is derived from.
split_conf, model_conf = load_models.split_cnn_0, load_models.cnn_4

# Build both models; the full model additionally calls load(), which reports
# the checkpoint it restored.
split_model = load_models.get_model(split_conf)
model = load_models.get_model(model_conf)
model.load()
split_model.model.split = location

# Socket wrapper handed to the benchmark for the host side.
host_sock = HostSocket(so_file=conf.sock_so)

# Benchmark results are written to <benchmark_host>/split_model/<name>.txt.
name = split_conf["name"]
result_path = os.path.join(conf.benchmark_host, "split_model", f"{name}.txt")

Checkpoint loaded from /global/D1/homes/jorgetf/Network-Packet-ML-Model/checkpoint/large_model/cnn_4.pth!


In [3]:
# Load the train / validation / test arrays for this model family.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = load_datasets(conf.datasets, load_models.type)

# Wrap each split in a dataset ...
train_dataset = NetworkDataset(X_train, y_train)
val_dataset = NetworkDataset(X_val, y_val)
test_dataset = NetworkDataset(X_test, y_test)

# ... and expose it through a loader. Train and validation batches are
# shuffled; the test loader keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=conf.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=conf.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=conf.batch_size)

In [4]:
# Sanity check: draw a single batch from the training loader and show the
# shapes of its inputs and labels.
data, labels = next(iter(train_loader))
print(*(tensor.shape for tensor in (data, labels)))

torch.Size([1, 1, 513]) torch.Size([1])


In [5]:
# Transfer parameters from the loaded full model into the host part of the
# split model, using the split index declared in the split configuration.
split_idx = split_conf["split_idx"]
host_model = split_model.model.host_model
split_model.model.host_model = host_copy_model(
    model.model,
    host_model,
    split_idx,
    type=load_models.type,
)

In [6]:
# Run the split-model benchmark on the test set.
benchmark = SplitBenchmark(
    split_model,
    test_loader,
    conf.batch_size,
    name,
    result_path,
    socket=host_sock,
    split=location,
)

# open() is paired with close() in a finally block so that a failure in the
# middle of the run does not leave the benchmark open when the cell is
# re-executed.
benchmark.open()
try:
    benchmark()
    benchmark.transfer_time()
finally:
    benchmark.close()

# Print the collected results (they are saved in the next cell).
# NOTE(review): the recorded run reports transfer times of ~1.58e9 ms and a
# Macro ROC AUC of nan — verify the timing/metric code before trusting them.
benchmark.print_result()

Success, opend host socket and listening on port (8065)!
Success, accepted connection from dpu!


[W116 11:41:03.344566862 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
ERROR:2026-01-16 11:41:03 4078612:4078612 DeviceProperties.cpp:47] gpuGetDeviceCount failed with code 35


Benchmark - split_cnn_0 model:

Memory usage (MB):
Avg memory usage: 0.006MB
Peak memory usage: 0.156MB

Model inference latency on one batch (batch size = 1):
Avg latency: 0.430ms
Min latency: 0.229ms
Max latency: 0.471ms

Model inference throughput (batch size = 1):
Throughput: 21.00 samples/sec

Model inference CPU usage (number of logical cores) during runtime:
CPU runtime: 2.32 seconds
Average CPU usage: 0.01/96 cores

Model (split_cnn_0) Macro-F1, Micro-F1 and Macro ROC AUC scores:
Macro-F1 score: 0.04
Micro-F1 score: 1.00
Macro ROC AUC score: nan

Split Model transfer time from dpu to host (batch size = 1):
Avg transfer time: 1584363739.600ms
Min transfer time: 1584363231.221ms
Max transfer time: 1584364223.973ms



In [7]:
# Persist the benchmark results (presumably to `result_path`, which was passed
# to SplitBenchmark above — confirm against SplitBenchmark.save).
benchmark.save()