In [1]:
import os
import sys

# Put the project root on sys.path (presumably so the project-local modules
# imported further down in this cell can be resolved -- confirm layout).
# When the notebook is started from a directory named "model_inference" the
# root is that directory's parent; otherwise the working directory is used
# unchanged. Only the final path component is inspected, so a parent folder
# whose name merely contains "model_inference" is never rewritten (a plain
# str.replace on the whole path would cut the substring out of it).
project_root = os.getcwd()
if os.path.basename(project_root) == "model_inference":
    project_root = os.path.dirname(project_root)
sys.path.append(project_root)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.profiler import profiler
import matplotlib.pyplot as plt
import time
from parse_dataset import NetworkDataset, parse_dataset, split_datasets, binary_dataset
from split_model import SplitModelDPU, SplitModelHost
from load_models import models
from transfer_tensors import HostSocket

In [2]:
# Run configuration for this notebook, kept in one place.
# NOTE(review): none of these keys is read by the cells visible here --
# confirm which of them the host benchmark actually needs.
conf = dict(
    batch_size=16,
    epochs=10,
    learning_rate=1e-4,
    dpu=False,
)

In [3]:
# Select the CUDA device when PyTorch can see one, otherwise use the CPU,
# and echo the choice so it is recorded in the notebook output.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [4]:
class BenchmarkHost:
    """Benchmark the host-side model on inputs that are already in memory.

    The wrapped ``host_model`` must provide ``load(path)`` and a callable
    ``model`` attribute with an ``eval()`` method whose call returns a
    ``(pred, logits)`` pair.
    """

    def __init__(self, host_model, host_model_path):
        """Keep a reference to ``host_model`` and load ``host_model_path`` into it."""
        self.host_model = host_model
        self.host_model.load(host_model_path)

    def run(self, logits, labels):
        """Run one profiled forward pass and report accuracy and resource usage.

        Args:
            logits: Input passed unchanged to ``self.host_model.model``.
            labels: Tensor compared element-wise with ``pred.argmax(dim=1)``.

        Returns:
            A 6-tuple ``(accuracy, cpu_utilization, peak_mem, wall_clock_time,
            out_logits, labels)`` where

            * ``accuracy`` is the percentage of matching predictions
              (a 0-dim tensor),
            * ``cpu_utilization`` is the summed self CPU time of the profiled
              events divided by the wall-clock time (0.0 if the measured
              interval is zero),
            * ``peak_mem`` is the largest ``cpu_memory_usage`` of any
              aggregated profiler event, in MiB (0 when nothing was
              recorded); this is a per-event figure, not a process-wide peak,
            * ``wall_clock_time`` is the duration of the forward pass in
              seconds,
            * ``out_logits`` is the second value returned by the model,
            * ``labels`` is the argument, returned unchanged.
        """
        self.host_model.model.eval()

        with torch.profiler.profile(
            activities=[torch.profiler.ProfilerActivity.CPU],
            record_shapes=True,
            profile_memory=True,
            with_stack=True,
        ) as prof:
            # This is the measured pass itself (there is no separate warm-up).
            # The timer runs while the profiler is active, so the wall-clock
            # figure includes whatever overhead the profiler adds.
            start = time.perf_counter()
            with torch.no_grad():
                # Use a distinct name so the ``logits`` argument is not shadowed.
                pred, out_logits = self.host_model.model(logits)
            end = time.perf_counter()

        accuracy = (pred.argmax(dim=1) == labels).float().mean()

        # Wall-clock time of the forward pass, in seconds.
        wall_clock_time = end - start

        events = prof.key_averages()

        # The profiler reports self CPU time in microseconds; convert to seconds.
        cpu_time_total_s = sum(e.self_cpu_time_total for e in events) / 1e6
        cpu_utilization = (
            cpu_time_total_s / wall_clock_time if wall_clock_time > 0 else 0.0
        )

        # ``default=0`` keeps this from raising when the model executed no
        # profiled operations. Bytes -> MiB.
        peak_mem = max((e.cpu_memory_usage for e in events), default=0) / 1024**2

        return 100 * accuracy, cpu_utilization, peak_mem, wall_clock_time, out_logits, labels

In [5]:
# Locate the host checkpoint at <project root>/checkpoint/host_split_model.pth.
# The project root is the parent of the working directory when the notebook is
# run from a directory named "model_inference", and the working directory
# itself otherwise. Only the final path component is checked, so a parent
# folder whose name merely contains "model_inference" is left intact.
project_root = os.getcwd()
if os.path.basename(project_root) == "model_inference":
    project_root = os.path.dirname(project_root)
host_path = os.path.join(project_root, "checkpoint", "host_split_model.pth")

# No explicit load here: BenchmarkHost.__init__ calls host_model.load(host_path).
host_model = models["host"]

In [6]:
benchmark = BenchmarkHost(host_model, host_path)

Checkpoint loaded from /home/jorgetf/testmodel/Network-Packet-ML-Model/checkpoint/host_split_model.pth!


In [7]:
# Locate the shared library at <project root>/socket_transfer/socket_transfer.so
# and hand its path to HostSocket. The project root is the parent of the
# working directory when the notebook is run from a directory named
# "model_inference", and the working directory itself otherwise. Only the final
# path component is checked, so a parent folder whose name merely contains
# "model_inference" is left intact.
project_root = os.getcwd()
if os.path.basename(project_root) == "model_inference":
    project_root = os.path.dirname(project_root)
so_file = os.path.join(project_root, "socket_transfer", "socket_transfer.so")
socket = HostSocket(so_file)

In [8]:
# First receive() on the host socket; the result is later passed to
# benchmark.run(logits=...) as the model input.
# NOTE(review): presumably the sender transmits the inputs before the labels,
# so this cell has to run before the second receive() -- confirm with the sender.
logits = socket.receive()

In [9]:
# Second receive() on the host socket: the values used as labels for the
# benchmark. They are cast to torch.long on arrival.
# NOTE(review): presumably sent right after the inputs -- confirm with the sender.
targets = socket.receive().to(dtype=torch.long)

In [12]:
# Run the host benchmark on the received tensors. The fifth return value (the
# model's second output) is not needed here and is discarded into "_".
acc, cpu, mem, runtime, _, labels = benchmark.run(
    logits=logits,
    labels=targets,
)

In [13]:
# Report the benchmark figures. The runtime is printed with six decimals
# because a single forward pass can finish in well under 10 ms, which a
# two-decimal format would show as "0.00s".
print(
    f"Benchmark (Host): Accuracy: {acc:.2f}%, CPU Usage: {cpu:.2f} cores, "
    f"Memory Usage: {mem:.2f}MB, Runtime: {runtime:.6f}s"
)

Benchmark (Host): Accuracy: 100.00%, CPU Usage: 0.63 cores, Memory Usage: 0.10MB, Runtime: 0.00s


In [14]:
# Quick look at what came over the socket: the first entry of the inputs, the
# first ten labels, and then both shapes.
print(logits[0], targets[:10], sep="\n")
print(logits.shape, targets.shape)

tensor([[ 0.7089, -0.7476, -0.0036,  1.4039, -0.9515,  0.7841,  1.7679, -0.7348,
          0.9583, -0.7040, -1.2197, -0.9904, -0.7988,  0.2641, -0.5364, -0.3622,
          0.7251, -0.8590,  0.7065,  0.7263, -0.7797, -0.3498, -0.6212,  0.8428,
          0.8782,  0.7267, -0.9448, -0.7076,  0.8949, -0.8892,  0.7929,  0.2626,
         -0.6879,  0.7814,  0.7038,  0.4580,  1.3909,  0.7223, -0.7195,  0.6840,
         -0.7028, -0.7435, -0.8354,  0.8158,  0.7670, -1.2488, -0.8105,  0.8674,
         -0.5246,  0.7223, -1.3902,  1.2885,  0.6975, -0.4087, -0.6764, -1.6874,
          0.8298, -0.6905,  0.7101, -0.7583, -0.8911, -0.6338,  0.9207,  0.8254,
         -0.0968,  0.8207,  0.7985,  0.7873, -0.3857, -0.3739, -0.3412,  1.2092,
         -0.5098, -0.3415,  1.1584, -0.3423,  1.3440,  0.5424,  1.2486,  0.7654,
          0.4450, -0.4075,  0.0813,  0.8742,  1.1671,  0.4403, -0.2603,  0.5596,
         -1.2576,  0.6712,  0.3430,  0.5415, -0.0528, -1.0890,  0.3648, -1.1970,
         -1.2779,  0.3953,  