In [1]:
import os
import sys
# Strip the "notebooks/split_models" segment from the current working directory
# and append the remainder to sys.path so the project-local imports below
# (config, data, model_config, transfer, utils) can be resolved.
sys.path.append(os.getcwd().replace("notebooks/split_models", ""))

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from config import Config
from data.dataset import NetworkDataset, load_datasets
from model_config import MLP_Models
from transfer.transfer_tensors import DPUSocket
import time
from utils.benchmark import SplitBenchmark

In [2]:
# Project-wide settings and the MLP model registry.
conf = Config()
model_conf = MLP_Models()

# Build the split_mlp_3 model and load its checkpoint (load() reports the path).
model = model_conf.get_model(model_conf.split_mlp_3)
model.load()

# Socket used to exchange tensors with the host; presumably backed by the
# shared object at conf.sock_so -- confirm against DPUSocket.
dpu_sock = DPUSocket(so_file=conf.sock_so)

# Identity of this benchmark run: the side of the split executed here, the
# result label, and the text file the results are written to.
location = "dpu"
name = f"split_{model_conf.type}"
result_path = os.path.join(conf.benchmark_dpu, "split_model", f"{name}.txt")

Checkpoint loaded from /global/D1/homes/jorgetf/Network-Packet-ML-Model/checkpoint/split_model/split_mlp_3.pth!


In [3]:
# Load the train / validation / test splits for the "mlp" variant.
X_train, y_train, X_val, y_val, X_test, y_test = load_datasets(conf.datasets, "mlp")

# Wrap every split in a NetworkDataset ...
train_dataset = NetworkDataset(X_train, y_train)
val_dataset = NetworkDataset(X_val, y_val)
test_dataset = NetworkDataset(X_test, y_test)

# ... and expose each one through a DataLoader. Train and validation batches
# are shuffled; the test loader iterates in dataset order.
train_loader = DataLoader(train_dataset, batch_size=conf.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=conf.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=conf.batch_size, shuffle=False)

In [4]:
# Sanity check: pull one batch from the training loader and show the shapes of
# the feature and label tensors.
first_batch = next(iter(train_loader))
data, labels = first_batch
print(f"{data.shape} {labels.shape}")

torch.Size([32, 513]) torch.Size([32])


In [5]:
# run benchmark
# The benchmark is driven over the test loader; `socket` and `split` pass the
# DPU socket and the "dpu" location configured earlier in the notebook.
benchmark = SplitBenchmark(model, test_loader, conf.batch_size, name, result_path, socket=dpu_sock, split=location)
benchmark.open()
try:
    benchmark()
finally:
    # Always pair open() with close(): if the run raises part-way through, the
    # connection would otherwise stay open in the live kernel and a re-run of
    # this cell would start from a dirty state.
    benchmark.close()

# print and save result
benchmark.print_result()

Success, opend dpu socket and connected to host (127.0.0.1) at port (8065)!


[W107 18:39:47.202391143 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
ERROR:2026-01-07 18:39:48 1478489:1478489 DeviceProperties.cpp:47] gpuGetDeviceCount failed with code 35


Benchmark - split_mlp model:

Memory usage (MB):
Avg memory usage: 0.098MB
Peak memory usage: 0.629MB

Model inference latency on one batch (batch size = 32):
Avg latency: 0.651ms
Min latency: 0.174ms
Max latency: 5.333ms

Model inference throughput (batch size = 32):
Throughput: 2515.49 samples/sec

Model inference CPU usage (number of logical cores) during runtime:
CPU runtime: 2.40 seconds
Average CPU usage: 10.29/96 cores



In [6]:
"""
dpu_sock.open()
model.model.eval()

for i in range(100):
    # wait for host to be ready for new batch of tensors
    if i > 0:
        dpu_sock.wait()

    # run inference on batch
    data, labels = next(iter(train_loader))
    with torch.no_grad():   
        features, _ = model.model(data, split="dpu")

    # send tensors to host
    dpu_sock.send(features)
    dpu_sock.send(labels.to(dtype=torch.float))

dpu_sock.close()
"""

'\ndpu_sock.open()\nmodel.model.eval()\n\nfor i in range(100):\n    # wait for host to be ready for new batch of tensors\n    if i > 0:\n        dpu_sock.wait()\n\n    # run inference on batch\n    data, labels = next(iter(train_loader))\n    with torch.no_grad():   \n        features, _ = model.model(data, split="dpu")\n\n    # send tensors to host\n    dpu_sock.send(features)\n    dpu_sock.send(labels.to(dtype=torch.float))\n\ndpu_sock.close()\n'