In [1]:
import os
import sys

# Make the project's top-level packages (config, data, utils, compact, ...)
# importable when the kernel is started from notebooks/pruning_quantization:
# the notebook sub-directory is stripped from the working directory and the
# remainder is put on sys.path.
# The sub-directory is assembled with os.path.join so the separator matches
# the host OS, and the membership guard keeps sys.path free of duplicate
# entries when this cell is re-run in the same kernel.
_notebook_subdir = os.path.join("notebooks", "pruning_quantization")
_project_root = os.getcwd().replace(_notebook_subdir, "")
if _project_root not in sys.path:
    sys.path.append(_project_root)

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from config import Config
from data.dataset import NetworkDataset, load_datasets
from model_config import MLP_Models, LSTM_Models, GRU_Models, CNN_models
from utils.benchmark import Benchmark
from compact.pruning import prune_mlp_model, prune_rnn_model, prune_cnn_model
from compact.quantization import dynamic_quantize, static_quantization
import copy
import warnings
# Ignore every warning category for the rest of the session (presumably to keep
# the cell outputs readable).
# NOTE(review): a blanket "ignore" also hides deprecation and runtime warnings
# raised by the libraries used below; consider a narrower filter.
warnings.filterwarnings("ignore")

In [2]:
# Project-wide settings; later cells read the device, dataset and checkpoint
# locations and the optimizer hyper-parameters from this object.
conf = Config()

# Model family under test. To process another architecture, swap in
# CNN_models(), MLP_Models() or GRU_Models() here and pick a matching
# configuration attribute on the line below.
load_model = LSTM_Models()
model_conf = load_model.lstm_4

# Instantiate the wrapper for the chosen configuration, then restore its
# saved checkpoint.
model = load_model.get_model(model_conf)
model.load()

# Show which device this run is configured for.
print(conf.device)

Checkpoint loaded from /global/D1/homes/jorgetf/Network-Packet-ML-Model/checkpoint/large_model/lstm_4.pth!
cpu


In [3]:
# Read the train / validation / test splits, passing the model type through to
# the dataset loader.
X_train, y_train, X_val, y_val, X_test, y_test = load_datasets(
    conf.datasets,
    model_type=load_model.type,
)

# Wrap each split in a dataset object first ...
train_dataset = NetworkDataset(X_train, y_train)
val_dataset = NetworkDataset(X_val, y_val)
test_dataset = NetworkDataset(X_test, y_test)

# ... then create one shuffled loader per split.
# NOTE(review): the validation and test loaders are shuffled as well; confirm
# that the benchmark does not rely on a fixed sample order.
train_loader = DataLoader(train_dataset, batch_size=conf.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=conf.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=conf.batch_size, shuffle=True)

# Independent deep copy of the training set, iterated in a fixed order; it is
# the loader handed to the (optional) static-quantization calibration step.
train_dataset_no_aug = copy.deepcopy(train_dataset)
calibration_loader = DataLoader(
    train_dataset_no_aug,
    shuffle=False,
    batch_size=conf.batch_size,
)

In [4]:
# Point the wrapper at the pruned checkpoint so the load() call at the end of
# this cell reads that file.
pruned_filename = "pruned_" + model_conf["name"] + ".pth"
checkpoint_path = os.path.join(conf.checkpoint, "pruned_quantized", pruned_filename)
model.checkpoint_path = checkpoint_path

# Select the pruning routine that matches the model family.
pruners = {
    "mlp": prune_mlp_model,
    "rnn": prune_rnn_model,
    "cnn": prune_cnn_model,
}
prune = pruners.get(load_model.type)
if prune is None:
    raise ValueError("model type must be 'mlp', 'rnn' or 'cnn'!")
pruned_model = prune(model.model)

# Install the pruned network and give the wrapper a fresh optimizer and
# scheduler built over the pruned network's parameters
# (presumably so load() can restore their state as well; confirm).
model.model = pruned_model.to(conf.device)
model.optimizer = torch.optim.AdamW(
    model.model.parameters(),
    lr=conf.learning_rate,
    weight_decay=conf.weight_decay,
)
model.scheduler = torch.optim.lr_scheduler.ExponentialLR(model.optimizer, gamma=conf.gamma)

# Restore the checkpoint stored at checkpoint_path.
model.load()

Checkpoint loaded from /global/D1/homes/jorgetf/Network-Packet-ML-Model/checkpoint/pruned_quantized/pruned_lstm_4.pth!


In [5]:
# Move the network to the CPU and switch it to inference mode before
# quantizing it.
fp32_model = model.model.cpu().eval()

# Settings for the alternative static-quantization path (kept for reference):
#   mlp: fp32_modules = ["embedding", "ln1", "output"]
#        example_input = torch.randn(1, 513)
#   cnn: fp32_modules = ["embedding", "ln1", "ln2", "output"]
#        example_input = torch.randn(1, 1, 513)
#   int8_model = static_quantization(fp32_model, calibration_loader, fp32_modules, example_input)
example_input = torch.randn(1, 1, 513)  # only consumed by the static path above

# Active path: dynamic quantization of the fp32 network.
int8_model = dynamic_quantize(fp32_model)

In [6]:
# Hand the quantized network to the wrapper, then show its layer structure.
model.model = int8_model
print(model.model)

LSTM(
  (embedding): L2ByteNorm()
  (rnn): DynamicQuantizedLSTM(1, 51, num_layers=4, batch_first=True, dropout=0.2, bidirectional=True)
  (lh): LastHidden()
  (ln1): LayerNorm((102,), eps=1e-05, elementwise_affine=True)
  (linear): ModuleList(
    (0): Sequential(
      (0): DynamicQuantizedLinear(in_features=102, out_features=102, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
    )
  )
  (ln2): LayerNorm((102,), eps=1e-05, elementwise_affine=True)
  (output): DynamicQuantizedLinear(in_features=102, out_features=24, dtype=torch.qint8, qscheme=torch.per_tensor_affine)
)


In [7]:
# Benchmark the model on the test loader; the report is labelled with the
# configuration name and written under conf.benchmark_host.
name = model_conf["name"]
result_file = "prune_quant_" + name + ".txt"
result_path = os.path.join(conf.benchmark_host, "pruned_quantized_model", result_file)

benchmark = Benchmark(model, test_loader, conf.batch_size, name, result_path)
benchmark()

# Show the collected measurements, then persist them.
benchmark.print_result()
benchmark.save()

[W125 18:04:10.543071555 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
ERROR:2026-01-25 18:04:18 2928050:2928050 DeviceProperties.cpp:47] gpuGetDeviceCount failed with code 35


Benchmark - lstm_4 model:

Memory usage (MB):
Avg memory usage: 5.596MB
Peak memory usage: 135.404MB

Model inference latency on one batch (batch size = 1):
Avg latency: 150.574ms
Min latency: 124.834ms
Max latency: 176.165ms

Model inference throughput (batch size = 1):
Throughput: 5.93 samples/sec

Model inference CPU usage (number of logical cores) during runtime:
CPU runtime: 8.35 seconds
Average CPU usage: 47.98/96 cores

Model (lstm_4) Macro-F1, Micro-F1 and Macro ROC AUC scores:
Macro-F1 score: 0.43
Micro-F1 score: 0.84
Macro ROC AUC score: nan



