In [5]:

import torch
import torch.nn as nn
import torch.optim as optim

import time
from contextlib import contextmanager
@contextmanager
def timed_block(label):
    start = time.perf_counter()
    try:
        yield
    finally:
        end = time.perf_counter()
        print(f"{label} : {end - start:.6f}秒")

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda")
# 定义模型
class LinearModel(nn.Module):
    def __init__(self):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(2048, 2048,dtype=torch.float16)  # 定义一个线性层

    def forward(self, x):
        return self.linear(x)

# 实例化模型
model = LinearModel()
model.train()

# 定义损失函数和优化器
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=10)

# 数据和标签（假设）
data = torch.randn(1000, 2048,dtype=torch.float16)  # 随机生成数据（1000个样本，每个样本2048维）
labels = torch.randn(1000, 2048,dtype=torch.float16)  # 随机生成标签

with timed_block("model move"):
    model = model.to(device)
with timed_block("data move"):
    data = data.to(device)
with timed_block("labels move"):
    labels = labels.to(device)


# 训练过程
num_epochs = 100  # 训练轮数
with timed_block("train"):
    for epoch in range(num_epochs):
        # 前向传播
        outputs = model(data)
        loss = criterion(outputs, labels)

        # 反向传播和优化
        optimizer.zero_grad()  # 清除之前的梯度
        loss.backward()        # 反向传播
        optimizer.step()       # 更新参数

        # print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
# for name, param in model.named_parameters():
#     print(f"{name}: {param.dtype}")
print("Float16 cpu Done!")



model move : 0.002658秒
data move : 0.001898秒
labels move : 0.001698秒
train : 0.053036秒
Float16 cpu Done!


In [37]:
import torch
import torch.nn as nn
from contextlib import contextmanager
import time

@contextmanager
def timed_block(label):
    start = time.perf_counter()
    try:
        yield
    finally:
        end = time.perf_counter()
        print(f"{label} : {end - start:.6f}秒")

with timed_block("total"):
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")

    # 定义模型
    class LinearModel(nn.Module):
        def __init__(self):
            super(LinearModel, self).__init__()
            # 使用浮点数类型
            self.linear = nn.Linear(2048, 2048,dtype=torch.float32)  

        def forward(self, x):
            return self.linear(x)

    # 实例化模型
    model = LinearModel()
    model.eval()

    quantized_model = torch.quantization.quantize_dynamic(model, dtype=torch.quint4x2)

    data = torch.randn(1000, 2048, dtype=torch.float32)  

    with timed_block("model"):
        # model = model.to(device)
        quantized_model = quantized_model.to(device)
    with timed_block("data"):
        data = data.to(device)

    # 推理过程
    num_epochs = 20
    with timed_block("inference"):
        # 前向传播
        # outputs = model(data)
        for epoch in range(num_epochs):
            outputs = quantized_model(data)
            # outputs = model(data)

print("Inference cpu quint4 Done!")


model : 0.000092秒
data : 0.000002秒
inference : 0.437242秒
total : 0.499184秒
Inference cpu quint4 Done!
