In [None]:
import os
import sys
import platform
sys.path.append(os.path.join(os.getcwd().replace("notebooks/pruning_quantization", "")))

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from config import Config
from data.dataset import NetworkDataset, load_datasets
from model_config import MLP_Models, LSTM_Models, GRU_Models
from utils.benchmark import Benchmark
from torch.ao.quantization.quantize_fx import prepare_fx, convert_fx
import torch.ao.quantization as quant

In [2]:
conf = Config()
load_model = LSTM_Models()
model_conf = load_model.lstm_4
model = load_model.get_model(model_conf)

In [3]:
def eager_dynamic_quantize(model: nn.Module) -> nn.Module:
    model.eval()

    quantized_model = torch.quantization.quantize_dynamic(
        model,
        {nn.Linear, nn.LSTM},       # only quantize safe ops
        dtype=torch.qint8
    )

    return quantized_model

In [4]:
def static_quantization(model: nn.Module, calibration_loader: DataLoader) -> nn.Module:
    model.eval()
    cpu = platform.processor()
    if cpu == "x86_64" or cpu == "AMD64":
        backend = "fbgemm"
    elif cpu == "arm64" or cpu == "aarch64":
        backend = "qnnpack"
    else:
        raise ValueError(f"Platform architecture must be 'x86_64'/'AMD64' or 'arm64'/'aarch64' but got {cpu}!")
    
    qconfig = quant.get_default_qconfig(backend)

    x, _ = next(iter(calibration_loader))

    model_prepared = prepare_fx(
        model,
        {"": qconfig},
        x
    )

    # Calibration
    with torch.no_grad():
        for x, _ in calibration_loader:
            model_prepared(x)

    model_quantized = convert_fx(model_prepared)

    return model_quantized

In [5]:
# data loader
X_train, y_train, X_val, y_val, X_test, y_test = load_datasets(conf.datasets, model_type=load_model.type)
X_test, y_test = X_test[:500*conf.batch_size], y_test[:500*conf.batch_size]
print(X_test.shape)
dataset = NetworkDataset(X_test, y_test)
loader = DataLoader(dataset, conf.batch_size, shuffle=True)

torch.Size([16000, 513, 1])


In [6]:
data, label = next(iter(loader))
print(data.shape, label.shape)

torch.Size([32, 513, 1]) torch.Size([32])


In [7]:
# quantize model
name = model_conf["name"]
result_path = os.path.join(conf.benchmark_host, "pruned_quantized_model", "quantized_" + name + ".txt")
model.load()

quantized_model = static_quantization(model.model, loader)
#quantized_model = eager_dynamic_quantize(model.model)
model.model = quantized_model

Checkpoint loaded from /global/D1/homes/jorgetf/Network-Packet-ML-Model/checkpoint/large_model/lstm_4.pth!


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_prepared = prepare_fx(
  prepared = prepare(
For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization A

In [8]:
# benchmark model
benchmark = Benchmark(model, loader, conf.batch_size, name, result_path)
benchmark()

# print and save result
benchmark.print_result()
#benchmark.save()

[W1222 17:08:47.694143693 CPUAllocator.cpp:245] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event
ERROR:2025-12-22 17:08:51 1929514:1929514 DeviceProperties.cpp:47] gpuGetDeviceCount failed with code 35


Benchmark - lstm_4 model:

Memory usage (MB):
Avg memory usage: 23.264MB
Peak memory usage: 326.506MB

Model inference latency on one batch (batch size = 32):
Avg latency: 391.238ms
Min latency: 369.107ms
Max latency: 408.711ms

Model inference throughput (batch size = 32):
Throughput: 81.14 samples/sec

Model inference CPU usage (number of logical cores) during runtime:
CPU runtime: 19.86 seconds
Average CPU usage: 48.00/96 cores

Model (lstm_4) Macro-F1, Micro-F1 and Macro ROC AUC scores:
Macro-F1 score: 0.77
Micro-F1 score: 0.96
Macro ROC AUC score: nan







In [9]:
#loss, metrics = model.evaluate(loader)
#print(f"Macro-F1 score: {metrics['f1_macro']:.2f}, Micro-F1 score: {metrics['f1_micro']:.2f}, Macro ROC AUC score: {metrics['roc_auc_macro']:.2f}")