In [1]:
import os
import sys
import json
sys.path.append(os.path.join(os.getcwd().replace("model_inference", "")))

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.profiler import profiler
import matplotlib.pyplot as plt
import time
from parse_dataset import binary_dataset, NetworkDataset
from split_model import SplitModelDPU, SplitModelHost
from load_models import models
from transfer_tensors import DPUSocket

In [2]:
conf = {
    "batch_size": 128,
    "epochs": 10,
    "learning_rate": 0.0001,
    "dpu": True
}

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [4]:
dataset_path = os.path.join(os.getcwd().replace("model_inference", ""), "datasets")

# load label_dict
json_file = os.path.join(dataset_path, "label_index.json")
with open(json_file, 'r') as file:
    label_dict = json.load(file)

# load train, val and test datasets
train_dataset_file = os.path.join(dataset_path, "train_dataset.pt")
X_train, y_train = torch.load(train_dataset_file)

val_dataset_file = os.path.join(dataset_path, "val_dataset.pt")
X_val, y_val = torch.load(val_dataset_file)

test_dataset_file = os.path.join(dataset_path, "test_dataset.pt")
X_test, y_test = torch.load(test_dataset_file)

X_train, X_val, X_test = X_train.unsqueeze(-1), X_val.unsqueeze(-1), X_test.unsqueeze(-1)
print(X_train.shape)
print(y_train.shape)

# create train, val and test datasets
train_dataset = NetworkDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=conf["batch_size"], shuffle=True)

val_dataset = NetworkDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=conf["batch_size"], shuffle=True)

test_dataset = NetworkDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=conf["batch_size"])

torch.Size([356334, 513, 1])
torch.Size([356334])


In [5]:
class BenchmarkDPU:
    def __init__(self, dpu_model, dpu_model_path):
        self.dpu_model = dpu_model
        self.dpu_model.load(dpu_model_path)
    
    def run(self, loader):
        """Run benchmark measuring model accuracy, cpu time, memory usage and total runtime"""
        self.dpu_model.model.eval()

        # warm-up before profiling:
        for i in range(5):
            with torch.no_grad():
                data, labels = next(iter(loader))
                pred, _ = self.dpu_model.model(data)

        with torch.profiler.profile(
            activities=[profiler.ProfilerActivity.CPU],
            record_shapes=True,
            profile_memory=True,
            with_stack=True
        ) as prof:
            data, labels = next(iter(loader))
            bin_labels = binary_dataset(labels, label_dict)
            start = time.perf_counter()
            # warm-up before profiling:
            with torch.no_grad():
                pred, logits = self.dpu_model.model(data)
                logits = logits.detach()  # break the graph here
                logits = logits.unsqueeze(1)
            end = time.perf_counter()
        
        accuracy = (pred.round() == bin_labels).float().mean()
        
        # Wall-clock time
        wall_clock_time = end - start

        # CPU compute time (µs → s)
        cpu_time_total_s = sum([e.self_cpu_time_total for e in prof.key_averages()]) / 1e6  # microseconds
        cpu_utilization = cpu_time_total_s / wall_clock_time

        # peak memory during profiling
        peak_mem = max([e.cpu_memory_usage for e in prof.key_averages()])
        peak_mem = peak_mem / 1024**2 #MB

        return 100*accuracy, cpu_utilization, peak_mem, wall_clock_time, logits, labels

In [6]:
light_dpu_path = os.path.join(os.getcwd().replace("model_inference", ""), "checkpoint", "light_dpu_split_model.pth")
light_dpu_model = models["light_dpu"]
benchmark = BenchmarkDPU(light_dpu_model, light_dpu_path)

Checkpoint loaded from /home/ubuntu/Network-Packet-ML-Model/checkpoint/light_dpu_split_model.pth!


In [7]:
so_file = os.path.join(os.getcwd().replace("model_inference", ""), "socket_transfer", "socket_transfer.so")
address = "10.128.14.17"
socket = DPUSocket(so_file, address)

In [8]:
acc, cpu, mem, runtime, logits, targets = benchmark.run(test_loader)

[W1024 10:08:29.593027846 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event


In [9]:
print(f"Benchmark (dpu): Accuracy: {acc:.2f}%, CPU Usage: {cpu:.2f} cores, Memory Usage: {mem:.2f}MB, Runtime: {runtime:.2f}s")

Benchmark (dpu): Accuracy: 66.71%, CPU Usage: 1.01 cores, Memory Usage: 192.39MB, Runtime: 1.64s


In [10]:
print(logits.shape, targets.shape)

torch.Size([128, 1, 128]) torch.Size([128])


In [11]:
# send logits
socket.send(logits)

Buffer of size 65536 bytes sent to host! 


In [12]:
targets = targets.to(dtype=torch.float)
#send targets
socket.send(targets)

Buffer of size 512 bytes sent to host! 


In [13]:
print(logits[15])
print(targets[:10])
print(logits.shape, targets.shape)

tensor([[ 0.7519, -0.7116,  0.7584, -0.7286, -0.8710, -0.7440, -0.7528, -0.6558,
         -0.7405,  0.7365,  0.0868,  0.6960,  0.5404,  0.4739, -1.0177, -0.7227,
          0.2148, -0.8008,  0.4949,  0.7224,  0.3411, -0.8695,  0.9343,  0.7056,
          0.5952, -0.6922, -0.4433,  0.2624,  0.7037,  0.1183,  0.7128, -0.7314,
          0.8568,  0.8124, -0.8194,  0.8370,  0.8752,  0.0959,  0.8442, -0.8577,
         -0.2609,  0.8041,  0.8916,  0.4259,  0.4938, -0.2489,  0.7129,  0.8262,
         -0.7765,  0.5222, -0.5039, -0.6198, -0.7860,  0.8677,  0.0428,  0.7230,
         -0.7287,  0.1237,  0.6157, -0.4312,  0.8740,  0.7747,  0.2759, -0.9036,
          0.8885, -0.7883, -0.9757,  0.7501,  0.3384, -0.9236, -1.0406, -0.8356,
         -0.8641, -1.2406,  0.8834, -0.8488,  0.9031,  1.1479,  0.8998, -0.9417,
         -0.0137, -1.3901, -0.9367,  1.2979,  1.2158, -1.1115,  0.7707,  0.5537,
          0.9766,  1.9593,  0.8900, -0.9053,  0.1653, -0.8392,  1.2220,  0.8334,
          1.0840,  0.0981,  