In [2]:
import os
import sys
import tempfile

import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

In [3]:
# Build a ResNet-18 with the constructor's default arguments and a random
# input batch of shape (5, 3, 224, 224) to feed through it while profiling.
model = models.resnet18()
inputs = torch.rand((5, 3, 224, 224))

In [4]:
# Profile a single forward pass on the CPU. `record_shapes=True` keeps operator
# input shapes so the results can later be grouped by shape, and the
# `record_function` scope labels the whole pass as "model_inference".
cpu_profiler = profile(activities=[ProfilerActivity.CPU], record_shapes=True)
with cpu_profiler as prof, record_function("model_inference"):
    model(inputs)


In [5]:
# Aggregate the recorded events and show the ten rows with the largest
# total CPU time.
cpu_stats = prof.key_averages()
print(cpu_stats.table(sort_by="cpu_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  model_inference         6.35%      12.812ms       100.00%     201.804ms     201.804ms             1  
                     aten::conv2d         0.56%       1.138ms        70.81%     142.900ms       7.145ms            20  
                aten::convolution         0.75%       1.520ms        70.25%     141.762ms       7.088ms            20  
               aten::_convolution         0.54%       1.091ms        69.49%     140.242ms       7.012ms            20  
         aten::mkldnn_convolution        68.21%     137.652ms        68.95%     139.151ms       6.958ms            20  
                 aten::batch_norm       

In [6]:
# Same report as before, but events are additionally grouped by the shapes of
# their inputs, so one operator can appear on several rows.
shape_stats = prof.key_averages(group_by_input_shape=True)
print(shape_stats.table(sort_by="cpu_time_total", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls                                                                      Input Shapes  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  --------------------------------------------------------------------------------  
                  model_inference         6.35%      12.812ms       100.00%     201.804ms     201.804ms             1                                                                                []  
                     aten::conv2d         0.37%     751.879us        20.13%      40.619ms      40.619ms             1                             [[5, 3, 224, 224], [64, 3, 7, 7], [], [], [], 

In [7]:
# Choose the accelerator to profile. The CPU is always traced; the CUDA or XPU
# activity is added only when that backend is actually available, so the
# profiler is never asked to trace a backend that is not present.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    device = 'cuda'
    activities.append(ProfilerActivity.CUDA)
elif hasattr(torch, 'xpu') and torch.xpu.is_available():
    # The hasattr guard avoids an AttributeError if this torch build has no
    # `torch.xpu` module.
    device = 'xpu'
    activities.append(ProfilerActivity.XPU)
else:
    print('Neither CUDA nor XPU devices are available to demonstrate profiling on acceleration devices')
    sys.exit(0)

# Sort the report by the total time spent on the selected device.
sort_by_keyword = device + "_time_total"

# Move both the model and the input batch to the selected device.
model = models.resnet18().to(device)
inputs = torch.randn(5, 3, 224, 224).to(device)

with profile(activities=activities, record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

print(prof.key_averages().table(sort_by=sort_by_keyword, row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         0.00%       0.000us         0.00%       0.000us       0.000us     254.887ms      7142.60%     254.887ms     127.443ms             2  
                                        model_inference         0.59%       2.912ms       100.00%     496.705ms     496.705ms       0.000us         0.00%       3.569ms       3.569ms             1  
         

In [14]:
# Re-create the model and input without moving them to an accelerator, then
# profile one forward pass with memory tracking enabled.
model = models.resnet18()
inputs = torch.rand((5, 3, 224, 224))

memory_profiler = profile(
    activities=[ProfilerActivity.CPU],
    profile_memory=True,
    record_shapes=True,
)
with memory_profiler as prof, record_function("model_inference"):
    model(inputs)

# Rank operators by the CPU memory attributed to the operator itself.
memory_stats = prof.key_averages()
print(memory_stats.table(sort_by="self_cpu_memory_usage", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::empty         0.51%     626.548us         0.51%     626.548us       3.133us      94.86 Mb      94.86 Mb           200  
    aten::max_pool2d_with_indices         8.33%      10.255ms         8.33%      10.255ms      10.255ms      11.48 Mb      11.48 Mb             1  
                      aten::addmm         0.10%     123.735us         0.11%     138.343us     138.343us      19.53 Kb      19.53 Kb             1  
                       aten::mean         0.02%      25.539us         0.10%     121.905us     121.905us      10.

In [15]:
# Same memory profile, this time ranked by the "cpu_memory_usage" column
# rather than the self-only figure.
total_memory_stats = prof.key_averages()
print(total_memory_stats.table(sort_by="cpu_memory_usage", row_limit=10))

---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
---------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                      aten::empty         0.51%     626.548us         0.51%     626.548us       3.133us      94.86 Mb      94.86 Mb           200  
                 aten::batch_norm         0.10%     119.814us        10.62%      13.070ms     653.481us      47.41 Mb           0 b            20  
     aten::_batch_norm_impl_index         0.18%     224.671us        10.52%      12.950ms     647.491us      47.41 Mb           0 b            20  
          aten::native_batch_norm         9.99%      12.301ms        10.30%      12.685ms     634.231us      47.

In [16]:
# Detect the device instead of hard-coding 'cuda', so this cell also runs on
# XPU-only and CPU-only machines. Only the activity matching the detected
# device is requested, in addition to the CPU.
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    device = 'cuda'
    activities.append(ProfilerActivity.CUDA)
elif hasattr(torch, 'xpu') and torch.xpu.is_available():
    device = 'xpu'
    activities.append(ProfilerActivity.XPU)
else:
    # No accelerator: fall back to a CPU-only trace.
    device = 'cpu'

model = models.resnet18().to(device)
inputs = torch.randn(5, 3, 224, 224).to(device)

with profile(activities=activities) as prof:
    model(inputs)

# Write the collected events to "trace.json" in the current working directory.
prof.export_chrome_trace("trace.json")

In [18]:
# Sort key for the report: self time on whichever device was selected earlier.
sort_by_keyword = f"self_{device}_time_total"

# `with_stack=True` asks the profiler to record source-location information
# for the profiled operators.
stack_profiler = profile(activities=activities, with_stack=True)
with stack_profiler as prof:
    model(inputs)

# Print aggregated stats, grouped by the top five stack frames and limited to
# the two most expensive rows.
stack_stats = prof.key_averages(group_by_stack_n=5)
print(stack_stats.table(sort_by=sort_by_keyword, row_limit=2))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                aten::cudnn_convolution        22.90%       1.443ms        37.89%       2.387ms     119.368us       2.168ms        61.31%       2.168ms     108.399us            20  
void cudnn::engines_precompiled::nchwToNhwcKernel<fl...         0.00%       0.000us         0.00%       0.000us       0.000us     766.033us        21.66%     766.033us      22.530us            34  
---------

In [None]:
from torch.profiler import schedule

# Example profiling schedule, read step by step:
#   skip_first=10 -> ignore the first 10 steps entirely
#   wait=5        -> then stay idle for 5 steps
#   warmup=1      -> warm the profiler up for 1 step
#   active=3      -> record for 3 steps
#   repeat=2      -> run the wait/warmup/active cycle twice
my_schedule = schedule(skip_first=10, wait=5, warmup=1, active=3, repeat=2)

In [19]:
sort_by_keyword = "self_" + device + "_time_total"

def trace_handler(p):
    """Print a summary table for a finished trace and save it as a Chrome trace.

    Args:
        p: The profiler object passed to the ``on_trace_ready`` callback. Its
            ``key_averages``, ``export_chrome_trace`` and ``step_num`` members
            are used.

    The trace is written to ``trace_<step_num>.json`` inside the platform's
    temporary directory (``tempfile.gettempdir()``) rather than a hard-coded
    ``/tmp``, so the handler also works where ``/tmp`` does not exist.
    """
    output = p.key_averages().table(sort_by=sort_by_keyword, row_limit=10)
    print(output)
    trace_path = os.path.join(tempfile.gettempdir(), "trace_" + str(p.step_num) + ".json")
    p.export_chrome_trace(trace_path)

# Each schedule cycle is 1 idle step, 1 warm-up step and 2 recorded steps;
# `p.step()` tells the profiler that one iteration has finished.
with profile(
    activities=activities,
    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=2),
    on_trace_ready=trace_handler
) as p:
    for idx in range(8):
        model(inputs)
        p.step()

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          ProfilerStep*         0.00%       0.000us         0.00%       0.000us       0.000us      13.804ms       174.85%      13.804ms       6.902ms             2  
                                aten::cudnn_convolution        12.61%       1.927ms        20.31%       3.104ms      77.610us       5.170ms        65.48%       5.170ms     129.242us            40  
void cudn