In [1]:
import torch
import os
from torch.utils.cpp_extension import load
from torch.profiler import profile, ProfilerActivity

os.environ['CUDA_LAUNCH_BLOCKING']='1'

In [2]:
%load_ext wurlitzer

### Matrix Multiplication - Global Access

In [3]:
mmul_module = load(
    name="ops",
    sources=["csrc/matrix_multiply.cu"], 
    extra_cuda_cflags=['--ptxas-options=-v', "-O2", "-Xcompiler", "-Werror", "-Xcompiler", "-Wall"], 
    verbose=True
)

Using /home/ganesh/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/ganesh/.cache/torch_extensions/py310_cu121/ops/build.ninja...
Building extension module ops...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.


Loading extension module ops...


In [4]:
gen = torch.Generator(device='cuda:0')
gen.manual_seed(42)

m = 1024
n = 1024
k = 1024

a = torch.randn(size=(m, k), dtype=torch.float32, device='cuda:0', generator=gen).contiguous()
b = torch.randn(size=(k, n), dtype=torch.float32, device='cuda:0', generator=gen).contiguous()

In [5]:
%%time 

mmul_global_memory = mmul_module.ops.matrix_multiply_2d_op(a, b)

CPU times: user 2.01 ms, sys: 0 ns, total: 2.01 ms
Wall time: 2.03 ms


In [6]:
mmul_global_memory.shape

torch.Size([1024, 1024])

In [7]:
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    for i in range(10000):
        mmul_module.ops.matrix_multiply_2d_op(a, b)
        torch.cuda.synchronize()

print(prof.key_averages())

STAGE:2024-03-18 21:23:32 221396:221396 ActivityProfilerController.cpp:314] Completed Stage: Warm Up


STAGE:2024-03-18 21:23:45 221396:221396 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-03-18 21:23:45 221396:221396 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::empty         0.52%      67.157ms         0.52%      67.157ms       6.716us       0.000us         0.00%       0.000us       0.000us         10000  
                                       cudaLaunchKernel        99.37%       12.784s        99.37%       12.784s       1.278ms       0.000us         0.00%       0.000us       0.000us         10000  
matrix_mu

In [8]:
ai = m*n*k / (m*n + n*k + m*n)
print(f"{ai = }")

ai = 341.3333333333333


In [9]:
mmul_tiled_module = load(
    name="ops",
    sources=["csrc/matrix_multiply_tiled.cu"], 
    extra_cuda_cflags=['--ptxas-options=-v', "-O2", "-Xcompiler", "-Werror", "-Xcompiler", "-Wall"], 
    verbose=True
)

Using /home/ganesh/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...
The input conditions for extension module ops have changed. Bumping to version 1 and re-building as ops_v1...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/ganesh/.cache/torch_extensions/py310_cu121/ops/build.ninja...
Building extension module ops_v1...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


[1/2] /usr/local/cuda/bin/nvcc --generate-dependencies-with-compile --dependency-output matrix_multiply_tiled.cuda.o.d -DTORCH_EXTENSION_NAME=ops_v1 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/ganesh/.local/share/virtualenvs/cudamode-lectures-SlOoH9rC/lib/python3.10/site-packages/torch/include -isystem /home/ganesh/.local/share/virtualenvs/cudamode-lectures-SlOoH9rC/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -isystem /home/ganesh/.local/share/virtualenvs/cudamode-lectures-SlOoH9rC/lib/python3.10/site-packages/torch/include/TH -isystem /home/ganesh/.local/share/virtualenvs/cudamode-lectures-SlOoH9rC/lib/python3.10/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /home/ganesh/.pyenv/versions/3.10.13/include/python3.10 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS

Loading extension module ops_v1...


In [10]:
%%time 

mmu_tiled = mmul_tiled_module.ops.matrix_multiply_tiled(a, b)

CPU times: user 2.31 ms, sys: 11.9 ms, total: 14.2 ms
Wall time: 14.4 ms


In [11]:
with torch.profiler.profile() as prof:
    for i in range(10000):
        mmul_tiled_module.ops.matrix_multiply_tiled(a, b)
        torch.cuda.synchronize()
print(prof.key_averages())

STAGE:2024-03-18 21:24:40 221396:221396 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-03-18 21:24:51 221396:221396 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-03-18 21:24:51 221396:221396 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::empty         0.62%      62.154ms         0.62%      62.154ms       6.215us       0.000us         0.00%       0.000us       0.000us         10000  
                                       cudaLaunchKernel        99.26%        9.956s        99.26%        9.956s     995.632us       0.000us         0.00%       0.000us       0.000us         10000  
matrix_mu

In [12]:
%%time 

c = a @ b

CPU times: user 16.9 ms, sys: 8.11 ms, total: 25 ms
Wall time: 23.2 ms


In [13]:
torch.allclose(mmu_tiled, c)

False