# Environment:
beautifulsoup4 
bleach 
dm-tree==0.1.8
flatbuffers==23.5.26
google-auth==2.22.0
google-auth-oauthlib==1.0.0
google-pasta==0.2.0
h5py==3.9.0
ipykernel 
ipython 
jupyter
keras==2.13.1
Markdown==3.4.1
matplotlib==3.5.3
notebook
numpy 
onnx==1.14.0
onnx-simplifier==0.4.33
onnxoptimizer==0.3.13
onnxruntime==1.15.1
opencv-python==4.6.0.66
opt-einsum==3.3.0
packaging 
pandas==1.4.3
Pillow==9.4.0
platformdirs 
protobuf==4.23.4
psutil 
ptyprocess 
pycocotools==2.0.4
requests 
scikit-learn==1.1.2
scipy==1.9.1
sklearn==0.0
tensorboard==2.13.0
tensorboard-data-server==0.7.1
tensorboard-plugin-wit==1.8.1
tensorflow==2.13.0
tensorflow-estimator==2.13.0
tensorflow-io-gcs-filesystem==0.33.0
termcolor==2.3.0
torch==1.12.1
torchaudio==0.12.1
torchvision==0.13.1
tornado 
tqdm==4.64.0

## 1. Train a model

In [2]:
import torch
from torch import nn, optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import datasets

In [3]:
# Hyperparameters
batch_size = 128
learning_rate = 0.01
num_epoches = 30


# MNIST train / test splits, stored under ./data (download=True)
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)
test_dataset = datasets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
    download=True,
)

# Only the training batches are shuffled; the test set keeps its order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


# Model definition
class Cnn(nn.Module):
    """Four 5x5 convolutions with ReLU, followed by three linear layers.

    Args:
        in_dim: Number of channels of the input images.
        n_class: Number of output classes (width of the last linear layer).
    """

    def __init__(self, in_dim, n_class):
        super().__init__()
        # Conv2d arguments: in channels, out channels, kernel size, stride, padding.
        # Only the first convolution pads (padding=2 with a 5x5 kernel keeps the
        # spatial size); each later one shrinks height and width by 4.
        conv_layers = [
            nn.Conv2d(in_dim, 6, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 16, kernel_size=5, stride=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 6, kernel_size=5, stride=1, padding=0),
            nn.ReLU(inplace=True),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # 1536 = 6 channels * 16 * 16, the flattened conv output for a 28x28 input
        fc_layers = [
            nn.Linear(in_features=1536, out_features=120),
            nn.Linear(in_features=120, out_features=84),
            nn.Linear(in_features=84, out_features=n_class),
        ]
        self.fc = nn.Sequential(*fc_layers)

    def forward(self, x):
        """Return class scores of shape (batch, n_class) for a batch of images."""
        features = self.conv(x)
        # Collapse channel, height and width into a single feature axis
        flat = features.flatten(start_dim=1, end_dim=3)
        return self.fc(flat)

In [4]:
# Build the model: 1 input channel (28x28 images), 10 output classes
model = Cnn(1, 10)
use_gpu = torch.cuda.is_available()  # run on the GPU when one is available
if use_gpu:
    model = model.cuda()

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Each epoch: one pass over the training set, then one evaluation pass
for epoch in range(num_epoches):
    print('epoch {}'.format(epoch + 1))
    print('*' * 10)
    # The evaluation pass below leaves the model in eval mode, so switch
    # back to training mode at the start of every epoch.
    model.train()
    running_loss = 0.0
    running_acc = 0.0
    for img, label in train_loader:
        if use_gpu:
            img = img.cuda()
            label = label.cuda()
        out = model(img)
        loss = criterion(out, label)
        # The criterion returns a batch mean; weight it by the batch size so
        # the running total is a per-sample sum.
        running_loss += loss.item() * label.size(0)
        _, pred = torch.max(out, 1)
        running_acc += (pred == label).sum().item()
        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluation pass over the test set, without gradient tracking
    model.eval()
    eval_loss = 0
    eval_acc = 0
    with torch.no_grad():
        for img, label in test_loader:
            if use_gpu:
                img = img.cuda()
                label = label.cuda()
            out = model(img)
            loss = criterion(out, label)
            eval_loss += loss.item() * label.size(0)
            _, pred = torch.max(out, 1)
            eval_acc += (pred == label).sum().item()
    print('Finish {} epoch, Train Loss: {:.6f}, Acc: {:.6f}, Test Loss: {:.6f}, Acc: {:.6f} '.format(
        epoch + 1,
        running_loss / (len(train_dataset)),
        running_acc / (len(train_dataset)),
        eval_loss / (len(test_dataset)),
        eval_acc / (len(test_dataset))))
    print()

# Save the trained weights
torch.save(model.state_dict(), './Pytorch_pth2onnx_onnxQuant_summarize1010.pth')


epoch 1
**********
Finish 1 epoch, Train Loss: 2.299097, Acc: 0.123550, Test Loss: 2.283773, Acc: 0.128200 

epoch 2
**********
Finish 2 epoch, Train Loss: 1.006178, Acc: 0.661183, Test Loss: 0.307233, Acc: 0.911000 

epoch 3
**********
Finish 3 epoch, Train Loss: 0.277282, Acc: 0.918467, Test Loss: 0.208130, Acc: 0.940700 

epoch 4
**********
Finish 4 epoch, Train Loss: 0.190588, Acc: 0.944383, Test Loss: 0.145093, Acc: 0.955200 

epoch 5
**********
Finish 5 epoch, Train Loss: 0.143336, Acc: 0.957783, Test Loss: 0.126834, Acc: 0.961000 

epoch 6
**********
Finish 6 epoch, Train Loss: 0.118813, Acc: 0.964850, Test Loss: 0.117879, Acc: 0.964300 

epoch 7
**********
Finish 7 epoch, Train Loss: 0.105171, Acc: 0.968783, Test Loss: 0.096067, Acc: 0.972100 

epoch 8
**********
Finish 8 epoch, Train Loss: 0.095082, Acc: 0.971817, Test Loss: 0.086646, Acc: 0.974600 

epoch 9
**********
Finish 9 epoch, Train Loss: 0.087431, Acc: 0.974017, Test Loss: 0.087894, Acc: 0.972600 

epoch 10
**********

## 2. Export the .pth model to ONNX

In [5]:
import torch
import torch.onnx
import os

def pth_to_onnx(input, checkpoint, onnx_path, input_names=None, output_names=None):
    """Load a ``Cnn(1, 10)`` checkpoint and export it to an ONNX file.

    Args:
        input: Example input tensor handed to ``torch.onnx.export``.
        checkpoint: Path of the saved ``state_dict`` (.pth) to load.
        onnx_path: Destination path; must end with ``.onnx``.
        input_names: Names for the graph inputs; defaults to ``['input']``.
        output_names: Names for the graph outputs; defaults to ``['output']``.

    Returns:
        0 when ``onnx_path`` does not end with ``.onnx`` (nothing is exported),
        otherwise None.
    """
    if not onnx_path.endswith('.onnx'):
        print("Warning! The onnx model name is not correct, "
              "please give a name that ends with '.onnx'!")
        return 0

    # Resolve defaults here rather than using mutable lists in the signature
    input_names = ['input'] if input_names is None else list(input_names)
    output_names = ['output'] if output_names is None else list(output_names)

    model = Cnn(1, 10)
    # map_location='cpu' lets a checkpoint saved from a CUDA model load on a
    # machine without a GPU.
    model.load_state_dict(torch.load(checkpoint, map_location='cpu'))
    model.eval()

    torch.onnx.export(
        model,
        input,
        onnx_path,
        verbose=True,
        input_names=input_names,
        output_names=output_names,
        opset_version=15,
    )
    print("Exporting .pth model to onnx model has been successful!")

if __name__ == '__main__':
    # Example input: one single-channel 28x28 image of random values
    input = torch.randn(1, 1, 28, 28)
    onnx_path = './Pytorch_pth2onnx_onnxQuant_summarize1010_op15.onnx'
    checkpoint = './Pytorch_pth2onnx_onnxQuant_summarize1010.pth'
    pth_to_onnx(input=input, checkpoint=checkpoint, onnx_path=onnx_path)

Exported graph: graph(%input : Float(1, 1, 28, 28, strides=[784, 784, 28, 1], requires_grad=0, device=cpu),
      %conv.0.weight : Float(6, 1, 5, 5, strides=[25, 25, 5, 1], requires_grad=1, device=cpu),
      %conv.0.bias : Float(6, strides=[1], requires_grad=1, device=cpu),
      %conv.2.weight : Float(16, 6, 5, 5, strides=[150, 25, 5, 1], requires_grad=1, device=cpu),
      %conv.2.bias : Float(16, strides=[1], requires_grad=1, device=cpu),
      %conv.4.weight : Float(16, 16, 5, 5, strides=[400, 25, 5, 1], requires_grad=1, device=cpu),
      %conv.4.bias : Float(16, strides=[1], requires_grad=1, device=cpu),
      %conv.6.weight : Float(6, 16, 5, 5, strides=[400, 25, 5, 1], requires_grad=1, device=cpu),
      %conv.6.bias : Float(6, strides=[1], requires_grad=1, device=cpu),
      %fc.0.weight : Float(120, 1536, strides=[1536, 1], requires_grad=1, device=cpu),
      %fc.0.bias : Float(120, strides=[1], requires_grad=1, device=cpu),
      %fc.1.weight : Float(84, 120, strides=[120, 1

## 3. Quantize the PyTorch model and export it to ONNX

In [10]:
# Convolutional network wrapped with quantization stubs
class Cnn(nn.Module):
    """Same conv + linear stack as the fp32 model, with quant/dequant stubs.

    Args:
        in_dim: Number of channels of the input images.
        n_class: Number of output classes (width of the last linear layer).
    """

    def __init__(self, in_dim, n_class):
        super().__init__()
        # Entry stub: marks where the input is quantized for static quantization
        self.quant = torch.quantization.QuantStub()

        # Conv2d arguments: in channels, out channels, kernel size, stride, padding.
        # Only the first convolution pads (padding=2 with a 5x5 kernel keeps the
        # spatial size); each later one shrinks height and width by 4.
        conv_layers = [
            nn.Conv2d(in_dim, 6, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 16, kernel_size=5, stride=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 6, kernel_size=5, stride=1, padding=0),
            nn.ReLU(inplace=True),
        ]
        self.conv = nn.Sequential(*conv_layers)

        # 1536 = 6 channels * 16 * 16, the flattened conv output for a 28x28 input
        fc_layers = [
            nn.Linear(in_features=1536, out_features=120),
            nn.Linear(in_features=120, out_features=84),
            nn.Linear(in_features=84, out_features=n_class),
        ]
        self.fc = nn.Sequential(*fc_layers)

        # Exit stub: marks where the output is converted back to float
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        """Return class scores of shape (batch, n_class) for a batch of images."""
        quantized = self.quant(x)
        features = self.conv(quantized)
        # Collapse channel, height and width into a single feature axis
        flat = features.flatten(start_dim=1, end_dim=3)
        scores = self.fc(flat)
        return self.dequant(scores)

In [12]:
# Load the trained fp32 weights into the stub-wrapped Cnn
model_fp32 = Cnn(1, 10)
checkpoint = './Pytorch_pth2onnx_onnxQuant_summarize1010.pth'
# map_location='cpu' lets a checkpoint saved from a CUDA model load on a
# machine without a GPU.
model_fp32.load_state_dict(torch.load(checkpoint, map_location='cpu'))
model_fp32.eval()  # inference mode
# The backend name is 'fbgemm'; it was previously misspelled as 'fbjemm'.
# NOTE(review): the misspelled name may have silently selected a different
# default qconfig on older torch releases; re-check the exported model.
model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')
# Module fusion is skipped; the unfused model is prepared as-is
model_fp32_fused = model_fp32
model_fp32_prepared = torch.quantization.prepare(model_fp32_fused)
# Random tensor used as the single calibration sample.
# NOTE(review): random noise is unlikely to match real MNIST activation
# ranges; consider calibrating on real samples.
input_fp32 = torch.randn(1, 1, 28, 28)
model_fp32_prepared(input_fp32)  # calibration pass
model_int8 = torch.quantization.convert(model_fp32_prepared)

torch.onnx.export(
    model_int8,                # model being exported
    input_fp32,                # model input (or a tuple for multiple inputs)
    './Pytorch_pth2onnx_pthQuant_summarize1010_op16.onnx',  # output file path
    export_params=True,        # store the trained weights inside the model file
    opset_version=16,          # ONNX opset version to export with
    input_names=['input'],     # names of the model inputs
)




# ONNX simplifier

### 1. https://convertmodel.com/ online converter and simplifier.

### 2. Using onnx-simplifier: `pip install onnx-simplifier`
(https://blog.csdn.net/Mrrunsen/article/details/122870507?utm_medium=distribute.pc_relevant.none-task-blog-2~default~baidujs_baidulandingword~default-1-122870507-blog-128111258.235^v38^pc_relevant_default_base&spm=1001.2101.3001.4242.2&utm_relevant_index=4)
Then run `python -m onnxsim input_onnx_model output_onnx_model` to simplify the model.

(IMX500) zya@SSSLXSRVLS002:~/zya/AI/NNet/classification/MNIST_recog$ python -m  onnxsim /home/zya/zya/AI/NNet/classification/MNIST_recog/Pytorch_pth2onnx_pthQuant_summarize1010_op15.onnx /home/zya/zya/AI/NNet/classification/MNIST_recog/Pytorch_pth2onnx_pthQuant_summarize1010_op15_sim.onnx
Your model contains "Tile" ops or/and "ConstantOfShape" ops. Folding these ops can make the simplified model much larger. If it is not expected, please specify "--no-large-tensor" (which will lose 
some optimization chances)
Simplifying...
Finish! Here is the difference:
┏━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃                  ┃ Original Model ┃ Simplified Model ┃
┡━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ Cast             │ 20             │ 0                │
│ Constant         │ 45             │ 45               │
│ ConstantOfShape  │ 7              │ 0                │
│ Conv             │ 4              │ 4                │
│ DequantizeLinear │ 27             │ 27               │
│ Flatten          │ 1              │ 1                │
│ Gemm             │ 3              │ 3                │
│ Identity         │ 39             │ 0                │
│ QuantizeLinear   │ 13             │ 13               │
│ Relu             │ 4              │ 4                │
│ Model Size       │ 216.5KiB       │ 213.0KiB         │
└──────────────────┴────────────────┴──────────────────┘

Alternatively, run the simplifier from a Jupyter notebook:

https://blog.csdn.net/hjxu2016/article/details/127265957?utm_medium=distribute.pc_relevant.none-task-blog-2~default~baidujs_baidulandingword~default-1-127265957-blog-122870507.235^v38^pc_relevant_default_base&spm=1001.2101.3001.4242.2&utm_relevant_index=4