In [34]:
# Memoria GPU
import torch
print(torch.cuda.get_device_name(0))
print("Total Memory: ",round(torch.cuda.mem_get_info(0)[1]/1024**3,1),'GB')
print('Memory Usage:')
# x = torch.rand(100,3,512,512).cuda() # simulacion de 100 imagenes de 512x512 a color...
print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

# algo raro, Nvidia dice que hay 24 GB, en la industria se acostumbra a que GB es potencias de 10 y GiB es potencias de 1024**3 en byes.
# pero aca usaron los GB como GiB, por lo que se tienen 24576 MiB. 
# se scan las cuentas como si fueran 24000 igualmente.
mem_rtx_3090 = 24000

NVIDIA GeForce RTX 3090
Total Memory:  23.7 GB
Memory Usage:
Allocated: 0.3 GB
Cached:    2.2 GB


Determinar de forma general un batch size apropiado para nuestro modelo.

https://discuss.pytorch.org/t/how-to-determine-the-largest-batch-size-of-a-given-model-saturating-the-gpu/146075

Aplicando una busqueda binaria con la funcion de Suraj.

En principio siempre se quiere conseguir el tamaño maximo de batch size que soporte el hardware.


In [39]:
import os
import time
import torch

dimension_imagen = [3,32,32]

def proc_time(b_sz, model, n_iter=10):
    x = torch.rand(b_sz, *dimension_imagen).cuda()
    model.cuda()
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(n_iter):
        model(x)
        print(torch.cuda.mem_get_info())
        print(os.popen("nvidia-smi").read())
        
    torch.cuda.synchronize()
    end = time.time() - start
    throughput = b_sz * n_iter / end
    print(f"Batch: {b_sz} \t {throughput} samples/sec")
    return (b_sz, throughput, )


https://stackoverflow.com/questions/46654424/how-to-calculate-optimal-batch-size 

Use the summaries provided by pytorchsummary (pip install) or keras (builtin).

E.g.

from torchsummary import summary
summary(model)
.....
.....
================================================================
Total params: 1,127,495
Trainable params: 1,127,495
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.02
Forward/backward pass size (MB): 13.93
Params size (MB): 4.30
Estimated Total Size (MB): 18.25
----------------------------------------------------------------
Each instance you put in the batch will require a full forward/backward pass in memory, your model you only need once. People seem to prefer batch sizes of powers of two, probably because of automatic layout optimization on the gpu.

Don't forget to linearly increase your learning rate when increasing the batch size.

Let's assume we have a Tesla P100 at hand with 16 GB memory.

(16000 - model_size) / (forward_back_ward_size)
(16000 - 4.3) / 18.25 = 1148.29
rounded to powers of 2 results in batch size 1024

In [48]:
from torchinfo import summary
import torch.nn as nn

model = nn.Sequential(
          nn.Conv2d(3,20,3),
          nn.ReLU(),
          nn.Conv2d(20,64,3),
          nn.ReLU()
        )
batch_size = 10
resumen = summary(model, input_size=(batch_size, *dimension_imagen))
print(resumen)

import re
model_size = re.search(r"Params size \(MB\): (\d*\.?\d*)",resumen.__repr__()).group(1)
forward_back_ward_size = re.search(r"Estimated Total Size \(MB\): (\d*\.?\d*)",resumen.__repr__()).group(1)
print("model_size",model_size)
print("forward_back_ward_size",forward_back_ward_size)
max_batch_size = (mem_rtx_3090 - float(model_size)) / float(forward_back_ward_size)
print("maximum possible batch size:",round(max_batch_size))


Layer (type:depth-idx)                   Output Shape              Param #
Sequential                               [10, 64, 28, 28]          --
├─Conv2d: 1-1                            [10, 20, 30, 30]          560
├─ReLU: 1-2                              [10, 20, 30, 30]          --
├─Conv2d: 1-3                            [10, 64, 28, 28]          11,584
├─ReLU: 1-4                              [10, 64, 28, 28]          --
Total params: 12,144
Trainable params: 12,144
Non-trainable params: 0
Total mult-adds (M): 95.86
Input size (MB): 0.12
Forward/backward pass size (MB): 5.45
Params size (MB): 0.05
Estimated Total Size (MB): 5.63
model_size 0.05
forward_back_ward_size 5.63
maximum possible batch size: 4263


In [52]:
# revisamos la estimacion con el valor maximo obtenido de la cuenta anterior.
max_batch_size_recommended = 4263
max_batch_size_recommended = 42630


resumen = summary(model, input_size=(max_batch_size_recommended, *dimension_imagen))
print(resumen)

import re
model_size = re.search(r"Params size \(MB\): (\d*\.?\d*)",resumen.__repr__()).group(1)
forward_back_ward_size = re.search(r"Estimated Total Size \(MB\): (\d*\.?\d*)",resumen.__repr__()).group(1)
print("model_size",model_size)
print("forward_back_ward_size",forward_back_ward_size)
max_batch_size = (mem_rtx_3090 - float(model_size)) / float(forward_back_ward_size)
print("maximum possible batch size:",round(max_batch_size))

Layer (type:depth-idx)                   Output Shape              Param #
Sequential                               [42630, 64, 28, 28]       --
├─Conv2d: 1-1                            [42630, 20, 30, 30]       560
├─ReLU: 1-2                              [42630, 20, 30, 30]       --
├─Conv2d: 1-3                            [42630, 64, 28, 28]       11,584
├─ReLU: 1-4                              [42630, 64, 28, 28]       --
Total params: 12,144
Trainable params: 12,144
Non-trainable params: 0
Total mult-adds (G): 408.65
Input size (MB): 523.84
Forward/backward pass size (MB): 23250.74
Params size (MB): 0.05
Estimated Total Size (MB): 23774.63
model_size 0.05
forward_back_ward_size 23774.63
maximum possible batch size: 1


In [40]:
# probemos en la practica
proc_time(max_batch_size_recommended,model)

(19564134400, 25423577088)
Sat Oct 15 23:21:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.65.01    Driver Version: 515.65.01    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0  On |                  N/A |
|  0%   52C    P2    45W / 350W |   5588MiB / 24576MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+------------------------------------------------------------

(4263, 48902.25671696654)

# NOTA IMPORTANTE

revisando la documentacion, el hilo de stackoverflow, veo que la version es vieja.
la nueva version summary, ya te dice cuanto va a a pesar el modelo considerando su input ... 
por eso es que la nueva version requiere eso,
la anterior solamente te decia cuanto iba a pesar el modelo.
pero la nueva considera la cantidad de datos de entramiento (batch) cuanto pesa cada uno dimension_imagen, y lo que pesa el propio modelo.