In [1]:
# ================================================================
# 1. Импорты и проверка устройства
# ================================================================
import torch, torchvision, time, onnx, onnxruntime, numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [2]:
# ================================================================
# 2. Преобразования и загрузка CIFAR-10
# ================================================================
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

train_ds = datasets.CIFAR10(root="data", train=True,  download=True, transform=transform)
val_ds   = datasets.CIFAR10(root="data", train=False, download=True, transform=transform)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, num_workers=2)

In [3]:
# ================================================================
# 3. Берём предобученную «сверхточную» модель
#    EfficientNet-B3 даёт ≈ 84 % Top-1 на ImageNet → хороший выбор
# ================================================================
model = torchvision.models.efficientnet_b3(pretrained=True)
model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, 10)
model = model.to(device)



In [4]:
from tqdm import tqdm   # pip install tqdm

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

def accuracy(net, loader):
    net.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            correct += (net(x).argmax(1) == y).sum().item()
            total   += y.size(0)
    return correct/total

for epoch in range(3):
    model.train()
    # оборачиваем именно тренировочный loader
    with tqdm(train_loader, desc=f"Epoch {epoch+1}") as pbar:
        for x, y in pbar:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            optimizer.step()

            # обновляем строку прогресса текущим лоссом
            pbar.set_postfix(loss=loss.item())

    # после эпохи — метрика
    print(f"Epoch {epoch+1}  val-acc={accuracy(model, val_loader):.3f}")

Epoch 1: 100%|█████████████████████████████████████████████████████████| 1563/1563 [03:31<00:00,  7.38it/s, loss=0.511]


Epoch 1  val-acc=0.963


Epoch 2: 100%|█████████████████████████████████████████████████████████| 1563/1563 [03:30<00:00,  7.41it/s, loss=0.218]


Epoch 2  val-acc=0.962


Epoch 3: 100%|████████████████████████████████████████████████████████| 1563/1563 [03:31<00:00,  7.40it/s, loss=0.0348]


Epoch 3  val-acc=0.960


In [6]:
torch.save(model, "test1.pt")

In [11]:
model1 = torch.load("test1.pt", weights_only = False ,map_location=torch.device("cuda"))

In [12]:
print(f'accuracy: {accuracy(model1, val_loader)}') 

accuracy: 0.9599


In [13]:
# ================================================================
# 7. Сравнение скорости инференса
#    Будем прогонять 1000 батчей по 32 картинки = 32 000 изображений
# ================================================================
def pytorch_inference_time(net, loader, batches=1000):
    net.eval()
    torch.cuda.synchronize()
    t0 = time.time()
    with torch.no_grad():
        for i, (x, _) in enumerate(loader):
            if i == batches: break
            x = x.to(device)
            _ = net(x)
        torch.cuda.synchronize()
    return time.time() - t0

def onnx_inference_time(sess, loader, batches=1000):
    input_name  = sess.get_inputs()[0].name
    torch.cuda.synchronize() if device.type=="cuda" else None
    t0 = time.time()
    for i, (x, _) in enumerate(loader):
        if i == batches: break
        x = x.numpy()
        _ = sess.run(None, {input_name: x})
    torch.cuda.synchronize() if device.type=="cuda" else None
    return time.time() - t0

In [14]:
# 7а) PyTorch
pytorch_gpu_time = pytorch_inference_time(model, val_loader)
print(f"PyTorch GPU: {pytorch_gpu_time:.2f} с")

# 7б) ONNX Runtime GPU
ort_session_gpu = onnxruntime.InferenceSession(onnx_path,
                providers=["CUDAExecutionProvider"])
onnx_gpu_time = onnx_inference_time(ort_session_gpu, val_loader)
print(f"ONNX  GPU:   {onnx_gpu_time:.2f} с")

# 7в) ONNX Runtime CPU
ort_session_cpu = onnxruntime.InferenceSession(onnx_path,
                providers=["CPUExecutionProvider"])
onnx_cpu_time = onnx_inference_time(ort_session_cpu, val_loader)
print(f"ONNX  CPU:   {onnx_cpu_time:.2f} с")

PyTorch GPU: 17.48 с


NameError: name 'onnx_path' is not defined

In [15]:
for batch in val_loader:
    # batch содержит (inputs, labels)
    images, labels = batch
    
    # Берем первую картинку из батча
    dummy_input = images[0:1]  # размер: [1, 3, 224, 224]
    
    print(f"✓ Найден dummy_input из CIFAR10 датасета")
    print(f"  Форма: {dummy_input.shape}")
    print(f"  Тип: {dummy_input.dtype}")
    print(f"  Диапазон значений: [{dummy_input.min():.3f}, {dummy_input.max():.3f}]")
    
    # Можно сохранить для дальнейшего использования
    torch.save(dummy_input, 'dummy_input.pt')
    print(f"  Сохранен в 'dummy_input.pt'")
    
    break  # берем только первый батч

✓ Найден dummy_input из CIFAR10 датасета
  Форма: torch.Size([1, 3, 224, 224])
  Тип: torch.float32
  Диапазон значений: [-1.707, 2.570]
  Сохранен в 'dummy_input.pt'


In [34]:
import torch.onnx
import onnx
import onnxruntime as ort
# Создаем dummy_input
dummy_input = torch.randn(32, 3, 224, 224).to(device)
print(f"✓ Dummy input создан: {dummy_input.shape}")

# Экспорт в ONNX
onnx_path = "efficientnet_b3_cifar10.onnx"
model.eval()
torch.onnx.export(
    model,                      # модель
    dummy_input,                # пример входных данных
    onnx_path,                 # путь сохранения
    export_params=True,        # сохранять обученные веса
    opset_version=11,          # версия ONNX (рекомендуется 11+)
    do_constant_folding=True,  # оптимизация констант
    input_names=['input'],     # имя входного узла
    output_names=['output'],   # имя выходного узла
    
)

print(f"\n✓ Модель экспортирована в: {onnx_path}")

W1202 12:30:25.785000 8708 site-packages\torch\onnx\_internal\exporter\_compat.py:114] Setting ONNX exporter to use operator set version 18 because the requested opset_version 11 is a lower version than we have implementations for. Automatic version conversion will be performed, which may not be successful at converting to the requested version. If version conversion is unsuccessful, the opset version of the exported model will be kept at 18. Please consider setting opset_version >=18 to leverage latest ONNX features


✓ Dummy input создан: torch.Size([32, 3, 224, 224])
[torch.onnx] Obtain model graph for `EfficientNet([...]` with `torch.export.export(..., strict=False)`...
[torch.onnx] Obtain model graph for `EfficientNet([...]` with `torch.export.export(..., strict=False)`... ✅
[torch.onnx] Run decomposition...
[torch.onnx] Run decomposition... ✅
[torch.onnx] Translate the graph into ONNX...


The model version conversion is not supported by the onnxscript version converter and fallback is enabled. The model will be converted using the onnx C API (target version: 11).
Failed to convert the model to the target version 11 using the ONNX C API. The model was not modified
Traceback (most recent call last):
  File "C:\Users\Sanit\miniconda3\envs\neyro\Lib\site-packages\onnxscript\version_converter\__init__.py", line 127, in call
    converted_proto = _c_api_utils.call_onnx_api(
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Sanit\miniconda3\envs\neyro\Lib\site-packages\onnxscript\version_converter\_c_api_utils.py", line 65, in call_onnx_api
    result = func(proto)
             ^^^^^^^^^^^
  File "C:\Users\Sanit\miniconda3\envs\neyro\Lib\site-packages\onnxscript\version_converter\__init__.py", line 122, in _partial_convert_version
    return onnx.version_converter.convert_version(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Sanit\minico

[torch.onnx] Translate the graph into ONNX... ✅
Applied 156 of general pattern rewrite rules.

✓ Модель экспортирована в: efficientnet_b3_cifar10.onnx


In [35]:
# 7а) PyTorch
#pytorch_gpu_time = pytorch_inference_time(model, val_loader) #= 20.51c
#print(f"PyTorch GPU: {pytorch_gpu_time:.2f} с")

# 7б) ONNX Runtime GPU
ort_session_gpu = onnxruntime.InferenceSession(onnx_path, 
                providers=["CUDAExecutionProvider"])
onnx_gpu_time = onnx_inference_time(ort_session_gpu, val_loader)
print(f"ONNX  GPU:   {onnx_gpu_time:.2f} с") # 204.60c

# 7в) ONNX Runtime CPU
# ort_session_cpu = onnxruntime.InferenceSession(onnx_path,
#                providers=["CPUExecutionProvider"])
# onnx_cpu_time = onnx_inference_time(ort_session_cpu, val_loader)
# print(f"ONNX  CPU:   {onnx_cpu_time:.2f} с") # 210c

InvalidArgument: [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Got invalid dimensions for input: input for the following indices
 index: 0 Got: 16 Expected: 32
 Please fix either the inputs/outputs or the model.

eFFICIENT b0

In [3]:
model_b0 = torchvision.models.efficientnet_b0(pretrained=True)
model_b0.classifier[1] = torch.nn.Linear(model_b0.classifier[1].in_features, 10)
model_b0 = model_b0.to(device)



In [1]:
# ================================================================
# 1. Импорты и проверка устройства
# ================================================================
import torch, torchvision, time, onnx, onnxruntime, numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
from tqdm import tqdm   # pip install tqdm



# ================================================================
# 2. Преобразования и загрузка CIFAR-10
# ================================================================
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

train_ds = datasets.CIFAR10(root="data", train=True,  download=True, transform=transform)
val_ds   = datasets.CIFAR10(root="data", train=False, download=True, transform=transform)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, num_workers=2)

model_b0 = torchvision.models.efficientnet_b0(pretrained=True)
model_b0.classifier[1] = torch.nn.Linear(model_b0.classifier[1].in_features, 10)
model_b0 = model_b0.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model_b0.parameters(), lr=3e-4)

def accuracy(net, loader):
    net.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            correct += (net(x).argmax(1) == y).sum().item()
            total   += y.size(0)
    return correct/total
    
for epoch in range(3):
    model_b0.train()
    # оборачиваем именно тренировочный loader
    with tqdm(train_loader, desc=f"Epoch {epoch+1}") as pbar:
        for x, y in pbar:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model_b0(x), y)
            loss.backward()
            optimizer.step()

            # обновляем строку прогресса текущим лоссом
            pbar.set_postfix(loss=loss.item())

    # после эпохи — метрика
    print(f"Epoch {epoch+1}  val-acc={accuracy(model_b0, val_loader):.3f}")

Device: cuda


Epoch 1: 100%|█████████████████████████████████████████████████████████| 1563/1563 [02:00<00:00, 12.93it/s, loss=0.249]


Epoch 1  val-acc=0.945


Epoch 2: 100%|████████████████████████████████████████████████████████| 1563/1563 [02:01<00:00, 12.85it/s, loss=0.0937]


Epoch 2  val-acc=0.953


Epoch 3: 100%|██████████████████████████████████████████████████████████| 1563/1563 [02:01<00:00, 12.83it/s, loss=0.12]


Epoch 3  val-acc=0.952


In [3]:
#GPU_INFO
# gpu_info.py
import torch

def get_gpu_info():
    if torch.cuda.is_available():
        device_name = torch.cuda.get_device_name(0)
        vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        
        info = f"""
## Информация о GPU

**Устройство:** {device_name}
**VRAM:** {vram_gb:.2f} GB
**CUDA доступно:** Да
**Текущее устройство:** cuda:{torch.cuda.current_device()}
        """
    else:
        info = "## GPU не доступен\nИспользуется CPU"
    
    return info

if __name__ == "__main__":
    info = get_gpu_info()
    print(info)
    
    # Сохраняем в файл
    with open("docs/gpu_info.md", "w", encoding="utf-8") as f:
        f.write(info)


## Информация о GPU

**Устройство:** NVIDIA GeForce RTX 3070 Ti
**VRAM:** 8.59 GB
**CUDA доступно:** Да
**Текущее устройство:** cuda:0
        


In [1]:
# ================================================================
# 1. Импорты и проверка устройства
# ================================================================
import torch, torchvision, time, onnx, onnxruntime, numpy as np
from torchvision import datasets, transforms
from torch.utils.data import DataLoader


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
from tqdm import tqdm   # pip install tqdm



# ================================================================
# 2. Преобразования и загрузка CIFAR-10
# ================================================================
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],
                         [0.229, 0.224, 0.225])
])

train_ds = datasets.CIFAR10(root="data", train=True,  download=True, transform=transform)
val_ds   = datasets.CIFAR10(root="data", train=False, download=True, transform=transform)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  num_workers=2)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, num_workers=2)

model_b0 = torchvision.models.efficientnet_b0(pretrained=True)
model_b0.classifier[1] = torch.nn.Linear(model_b0.classifier[1].in_features, 10)
model_b0 = model_b0.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model_b0.parameters(), lr=3e-4)

def accuracy(net, loader):
    net.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            correct += (net(x).argmax(1) == y).sum().item()
            total   += y.size(0)
    return correct/total
    
for epoch in range(3):
    model_b0.train()
    # оборачиваем именно тренировочный loader
    with tqdm(train_loader, desc=f"Epoch {epoch+1}") as pbar:
        for x, y in pbar:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model_b0(x), y)
            loss.backward()
            optimizer.step()

            # обновляем строку прогресса текущим лоссом
            pbar.set_postfix(loss=loss.item())
    torch.save(model_b0, f"model_b0_epochs/model_b0_epoch_{epoch + 1}.pt")

    # после эпохи — метрика
    print(f"Epoch {epoch+1}  val-acc={accuracy(model_b0, val_loader):.3f}")

Device: cuda


Epoch 1: 100%|█████████████████████████████████████████████████████████| 1563/1563 [01:59<00:00, 13.08it/s, loss=0.144]


Epoch 1  val-acc=0.944


Epoch 2: 100%|█████████████████████████████████████████████████████████| 1563/1563 [01:57<00:00, 13.28it/s, loss=0.371]


Epoch 2  val-acc=0.948


Epoch 3: 100%|████████████████████████████████████████████████████████| 1563/1563 [01:58<00:00, 13.18it/s, loss=0.0368]


Epoch 3  val-acc=0.949


LATENCY


БЕЗОПАСНОЕ ИЗМЕРЕНИЕ ЛАТЕНЦИИ
НАЧАЛО ИЗМЕРЕНИЯ ЛАТЕНЦИИ
Попытка измерения на GPU...
Warm-up...

Измерение латенции (100 прогонов)...
Прогон 1/100: 15.345 мс
  Память GPU: 0.05 GB / 0.09 GB
Прогон 11/100: 16.567 мс
  Память GPU: 0.05 GB / 0.09 GB
Прогон 21/100: 16.531 мс
  Память GPU: 0.05 GB / 0.09 GB
Прогон 31/100: 16.803 мс
  Память GPU: 0.05 GB / 0.09 GB
Прогон 41/100: 16.219 мс
  Память GPU: 0.05 GB / 0.09 GB
Прогон 51/100: 16.402 мс
  Память GPU: 0.05 GB / 0.09 GB
Прогон 61/100: 16.628 мс
  Память GPU: 0.05 GB / 0.09 GB
Прогон 71/100: 16.980 мс
  Память GPU: 0.05 GB / 0.09 GB
Прогон 81/100: 16.383 мс
  Память GPU: 0.05 GB / 0.09 GB
