In this notebook, I profile Simon's cuda-c kernels with the torch-profiler.

**Result:** Using the torch-profiler gives ~same runtimes as using cuda-events. So this is not the reasons why the runtimes of my numba-cuda kernels are so large.

In [1]:
import numpy as np
from fastcore.basics import tuplify
from numba import cuda
from torch.profiler import profile, record_function, ProfilerActivity
from torch.profiler import schedule as profiler_schedule
from torch import allclose, tensor

from util import to_d, to_h, array_like, cdiv

dtype = 'float32'

os.makedirs('tmp', exist_ok=True) # for tmp file from load_inline

In [2]:
@cuda.jit()
def matmul_2(a,b,c,m,n,k,bs):
    # we defined blocks of size bs*bs
    x = cuda.blockIdx.x * bs + (cuda.threadIdx.x // bs)
    y = cuda.blockIdx.y * bs + (cuda.threadIdx.x % bs)
    if x>=m or y>=n: return 
    tmp = 0
    for i in range(k): tmp += a[x,i] * b[i,y]
    c[x, y] = tmp

This is the c-code from Simon, which runs at ~220ms instead of ~1270ms like my code:
```cpp 
template <const uint BLOCKSIZE>
__global__ void matmul_global_mem_coalesce(int M, int N, int K, float alpha,
                                          const float *A, const float *B,
                                          float beta, float *C) {
  const int cRow = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE);
  const int cCol = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE);

  // if statement is necessary to make things work under tile quantization
  if (cRow < M && cCol < N) {
    float tmp = 0.0;
    for (int i = 0; i < K; ++i) {
      tmp += A[cRow * K + i] * B[i * N + cCol];
    }
    C[cRow * N + cCol] = tmp;
  }
}
```

In [3]:
m,n,k = 4092,4092,4092

a = to_d(np.ones((m,k), dtype=dtype))
b = to_d(np.ones((k,n), dtype=dtype))
c = to_d(np.empty((m,n), dtype=dtype))
bs=32
nthreads = bs*bs # 1d block ...
nblocks = cdiv(c.shape, (bs,bs)) # ... in 2d grid 
matmul_2[nblocks, nthreads](a,b,c,m,n,k,bs)

In [4]:
# using the standard scheduler runs the code only once
with profile(activities=[ProfilerActivity.CUDA]) as p:
    c = to_d(np.empty((m,n), dtype=dtype))
    matmul_2[nblocks, nthreads](a,b,c,m,n,k,bs)

print(p.key_averages().table(sort_by="cuda_time_total", row_limit=10))

STAGE:2024-05-05 18:44:51 2445:2445 ActivityProfilerController.cpp:314] Completed Stage: Warm Up


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
cudapy::__main__::matmul_2[abi:v1][abi:cw51cXTLSUwv1...         0.00%       0.000us         0.00%       0.000us       0.000us        1.258s        98.98%        1.258s        1.258s             1  
                       Memcpy HtoD (Pageable -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      12.944ms         1.02%      12.944ms      12.944ms             1  
         

STAGE:2024-05-05 18:44:53 2445:2445 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-05-05 18:44:53 2445:2445 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [5]:
wait, warmup = 1,1 # 1 wait cycle to ensure kernel is compiled, 1 warmup cycle to not have overhead of profiler start afterwards
runs = 3

with profile(activities=[ProfilerActivity.CUDA], schedule=profiler_schedule(wait=1, warmup=1, active=runs)) as prof:
    for _ in range(wait+warmup+runs):
        c = to_d(np.empty((m,n), dtype=dtype))
        matmul_2[nblocks, nthreads](a,b,c,m,n,k,bs)
        p.step()

print(p.key_averages().table(sort_by="cuda_time_total", row_limit=10))

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
cudapy::__main__::matmul_2[abi:v1][abi:cw51cXTLSUwv1...         0.00%       0.000us         0.00%       0.000us       0.000us        1.258s        98.98%        1.258s        1.258s             1  
                       Memcpy HtoD (Pageable -> Device)         0.00%       0.000us         0.00%       0.000us       0.000us      12.944ms         1.02%      12.944ms      12.944ms             1  
         

In [6]:
def cuda_mean_runtime(prof_log, kernel_name, do_print=False):
    # extract cuda mean runtime from a torch.profiler log
    kernels = [o for o in prof_log.key_averages() if kernel_name in o.key]
    names = [k.key for k in kernels]
    if len(names)==0: raise RuntimeError(f"Profiling logs have no kernel with 'f{kernel_name}' in its name")
    if len(names)>1: raise RuntimeError(f"Profiling logs have multiple kernel with 'f{kernel_name}' in its name: f{names}. Please be more precise.")
    
    mean_runtime = kernels[0].cuda_time/1e3 # use ms instea of µs
    if do_print: print(f'{mean_runtime/1e3:.3f}s') # print in s
    return mean_runtime

In [7]:
cuda_mean_runtime(p, 'matmul',do_print=True);

1.258s


Let's check what profiling Simon's kernel with torch-profile returns

Let's first if calling kernels from Python works:

In [8]:
import torch
from torch.utils.cpp_extension import load_inline

In [9]:
# Define the CUDA kernel and C++ wrapper
cuda_src = '''
__global__ void square_matrix_kernel(const float* matrix, float* result, int width, int height) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < height && col < width) {
        int idx = row * width + col;
        result[idx] = matrix[idx] * matrix[idx];
    }
}

torch::Tensor square_matrix(torch::Tensor matrix) {
    const auto height = matrix.size(0);
    const auto width = matrix.size(1);

    auto result = torch::empty_like(matrix);

    dim3 threads_per_block(16, 16);
    dim3 number_of_blocks((width + threads_per_block.x - 1) / threads_per_block.x,
                          (height + threads_per_block.y - 1) / threads_per_block.y);

    square_matrix_kernel<<<number_of_blocks, threads_per_block>>>(
        matrix.data_ptr<float>(), result.data_ptr<float>(), width, height);

    return result;
}
'''
cpp_src = "torch::Tensor square_matrix(torch::Tensor matrix);"

# Load the CUDA kernel as a PyTorch extension
square_matrix_extension = load_inline(
    name='square_matrix_extension',
    cpp_sources=cpp_src,
    cuda_sources=cuda_src,
    functions=['square_matrix'],
    with_cuda=True,
    extra_cuda_cflags=["-O2"],
    build_directory='./tmp',
    # extra_cuda_cflags=['--expt-relaxed-constexpr']
)

a = torch.tensor([[1., 2., 3.], [4., 5., 6.]], device='cuda')

print(square_matrix_extension.square_matrix(a))

tensor([[ 1.,  4.,  9.],
        [16., 25., 36.]], device='cuda:0')


It does!

Let's now run Simon's matmul kernel 2

In [10]:
cuda_src = '''
template <const uint BLOCKSIZE>
__global__ void matmul_global_mem_coalesce(const float *A, const float *B, float *C, int M, int N, int K) {
  const int cRow = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE);
  const int cCol = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE);

  if (cRow < M && cCol < N) {
    float tmp = 0.0;
    for (int i = 0; i < K; ++i) {
      tmp += A[cRow * K + i] * B[i * N + cCol];
    }
    C[cRow * N + cCol] = tmp;
  }
}

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}

torch::Tensor matmul(torch::Tensor a, torch::Tensor b) {
    constexpr uint bs = 32;
    
    //CHECK_INPUT(a); CHECK_INPUT(b);
    int m = a.size(0);
    int n = b.size(1);
    int k = a.size(1);
    //TORCH_CHECK(k==b.size(0), "Size mismatch!");
    auto outp = torch::zeros({m, n}, a.options());

    dim3 tpb(bs*bs);
    dim3 blocks(cdiv(m, bs), cdiv(n, bs));
    matmul_global_mem_coalesce<bs><<<blocks, tpb>>>(
        a.data_ptr<float>(), b.data_ptr<float>(), outp.data_ptr<float>(),
        m, n, k
    );
    //C10_CUDA_KERNEL_LAUNCH_CHECK();
    return outp;
}
'''
cpp_src = "torch::Tensor matmul(torch::Tensor a, torch::Tensor b);"

In [11]:
# Load the CUDA kernel as a PyTorch extension
matmul_2 = load_inline(
    name='matmul',
    cpp_sources=cpp_src,
    cuda_sources=cuda_src,
    functions=['matmul'],
    with_cuda=True,
    extra_cuda_cflags=["-O2"],
    build_directory='tmp/matmul_2/',
    # extra_cuda_cflags=['--expt-relaxed-constexpr']
)

In [12]:
a = torch.ones((2,3), device='cuda')
b = torch.ones((3,4), device='cuda')

In [13]:
matmul_2.matmul(a,b)

tensor([[3., 3., 3., 3.],
        [3., 3., 3., 3.]], device='cuda:0')

In [14]:
m,n,k = 4092,4092,4092

a = torch.ones((m,k), dtype=torch.float32, device='cuda')
b = torch.ones((k,n), dtype=torch.float32, device='cuda')

In [15]:
c = matmul_2.matmul(a,b)
c.shape

torch.Size([4092, 4092])

In [16]:
(a@b==c).all()

tensor(True, device='cuda:0')

In [17]:
wait, warmup = 1,1 # 1 wait cycle to ensure kernel is compiled, 1 warmup cycle to not have overhead of profiler start afterwards
runs = 3

with profile(activities=[ProfilerActivity.CUDA], schedule=profiler_schedule(wait=1, warmup=1, active=runs)) as p:
    for _ in range(wait+warmup+runs):
        matmul_2.matmul(a,b)
        p.step()

print(p.key_averages().table(sort_by="cuda_time_total", row_limit=10))

STAGE:2024-05-05 18:46:23 2445:2445 ActivityProfilerController.cpp:314] Completed Stage: Warm Up


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
void matmul_global_mem_coalesce<32u>(float const*, f...         0.00%       0.000us         0.00%       0.000us       0.000us     894.970ms        99.87%     894.970ms     223.743ms             4  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.125ms         0.13%       1.125ms     281.250us             4  
         

STAGE:2024-05-05 18:46:24 2445:2445 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-05-05 18:46:24 2445:2445 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


Simon's kernel 2 on a T4, measured with torch-profile gives **~200ms**. So the measurement method (torch profile vs cudaevents) is not the source of weirdness.

Let's also measure his kernel 1 and 3:

In [18]:
# kernel 1
cuda_src = '''
__global__ void matmul_naive(const float *A, const float *B, float *C, int M, int N, int K) {
  const uint x = blockIdx.x * blockDim.x + threadIdx.x;
  const uint y = blockIdx.y * blockDim.y + threadIdx.y;

  // if statement is necessary to make things work under tile quantization
  if (x < M && y < N) {
    float tmp = 0.0;
    for (int i = 0; i < K; ++i) {
      tmp += A[x * K + i] * B[i * N + y];
    }
    C[x * N + y] = tmp;
  }
}

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}

torch::Tensor matmul(torch::Tensor a, torch::Tensor b) {
    constexpr uint bs = 32;
    
    //CHECK_INPUT(a); CHECK_INPUT(b);
    int m = a.size(0);
    int n = b.size(1);
    int k = a.size(1);
    //TORCH_CHECK(k==b.size(0), "Size mismatch!");
    auto outp = torch::zeros({m, n}, a.options());

    dim3 tpb(bs,bs);
    dim3 blocks(cdiv(m, bs), cdiv(n, bs));
    matmul_naive<<<blocks, tpb>>>(
        a.data_ptr<float>(), b.data_ptr<float>(), outp.data_ptr<float>(),
        m, n, k
    );
    //C10_CUDA_KERNEL_LAUNCH_CHECK();
    return outp;
}
'''
cpp_src = "torch::Tensor matmul(torch::Tensor a, torch::Tensor b);"

In [19]:
matmul_1 = load_inline(
    name='matmul',
    cpp_sources=cpp_src,
    cuda_sources=cuda_src,
    functions=['matmul'],
    with_cuda=True,
    extra_cuda_cflags=["-O2"],
    build_directory='tmp/matmul_1/',
    # extra_cuda_cflags=['--expt-relaxed-constexpr']
)

In [20]:
m,n,k = 4092,4092,4092
a = torch.ones((m,k), dtype=torch.float32, device='cuda')
b = torch.ones((k,n), dtype=torch.float32, device='cuda')

In [21]:
c = matmul_1.matmul(a,b)
c.shape

torch.Size([4092, 4092])

In [22]:
(a@b==c).all()

tensor(True, device='cuda:0')

In [23]:
wait, warmup = 1,1 # 1 wait cycle to ensure kernel is compiled, 1 warmup cycle to not have overhead of profiler start afterwards
runs = 3

with profile(activities=[ProfilerActivity.CUDA], schedule=profiler_schedule(wait=1, warmup=1, active=runs)) as p:
    for _ in range(wait+warmup+runs):
        matmul_1.matmul(a,b)
        p.step()

print(p.key_averages().table(sort_by="cuda_time_total", row_limit=10))

STAGE:2024-05-05 18:47:49 2445:2445 ActivityProfilerController.cpp:314] Completed Stage: Warm Up


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
matmul_naive(float const*, float const*, float*, int...         0.00%       0.000us         0.00%       0.000us       0.000us        3.351s        99.97%        3.351s     837.869ms             4  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.120ms         0.03%       1.120ms     280.000us             4  
         

STAGE:2024-05-05 18:47:54 2445:2445 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-05-05 18:47:54 2445:2445 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [24]:
# kernel 3
cuda_src = '''
template <const int BLOCKSIZE>
__global__ void matmul_shared_mem_block(const float *A, const float *B, float *C, int M, int N, int K) {
  // the output block that we want to compute in this threadblock
  const uint cRow = blockIdx.x;
  const uint cCol = blockIdx.y;

  // allocate buffer for current block in fast shared mem
  // shared mem is shared between all threads in a block
  __shared__ float As[BLOCKSIZE * BLOCKSIZE];
  __shared__ float Bs[BLOCKSIZE * BLOCKSIZE];

  // the inner row & col that we're accessing in this thread
  const uint threadCol = threadIdx.x % BLOCKSIZE;
  const uint threadRow = threadIdx.x / BLOCKSIZE;

  // advance pointers to the starting positions
  A += cRow * BLOCKSIZE * K;                    // row=cRow, col=0
  B += cCol * BLOCKSIZE;                        // row=0, col=cCol
  C += cRow * BLOCKSIZE * N + cCol * BLOCKSIZE; // row=cRow, col=cCol

  float tmp = 0.0;
  for (int bkIdx = 0; bkIdx < K; bkIdx += BLOCKSIZE) {
    // Have each thread load one of the elements in A & B
    // Make the threadCol (=threadIdx.x) the consecutive index
    // to allow global memory access coalescing
    As[threadRow * BLOCKSIZE + threadCol] = A[threadRow * K + threadCol];
    Bs[threadRow * BLOCKSIZE + threadCol] = B[threadRow * N + threadCol];

    // block threads in this block until cache is fully populated
    __syncthreads();
    A += BLOCKSIZE;
    B += BLOCKSIZE * N;

    // execute the dotproduct on the currently cached block
    for (int dotIdx = 0; dotIdx < BLOCKSIZE; ++dotIdx) {
      tmp += As[threadRow * BLOCKSIZE + dotIdx] *
             Bs[dotIdx * BLOCKSIZE + threadCol];
    }
    // need to sync again at the end, to avoid faster threads
    // fetching the next block into the cache before slower threads are done
    __syncthreads();
  }
  C[threadRow * N + threadCol] = tmp;
}

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}

torch::Tensor matmul(torch::Tensor a, torch::Tensor b) {
    constexpr uint bs = 32;
    
    //CHECK_INPUT(a); CHECK_INPUT(b);
    int m = a.size(0);
    int n = b.size(1);
    int k = a.size(1);
    //TORCH_CHECK(k==b.size(0), "Size mismatch!");
    auto outp = torch::zeros({m, n}, a.options());

    dim3 tpb(bs,bs);
    dim3 blocks(cdiv(m, bs), cdiv(n, bs));
    matmul_shared_mem_block<bs><<<blocks, tpb>>>(
        a.data_ptr<float>(), b.data_ptr<float>(), outp.data_ptr<float>(),
        m, n, k
    );
    //C10_CUDA_KERNEL_LAUNCH_CHECK();
    return outp;
}
'''
cpp_src = "torch::Tensor matmul(torch::Tensor a, torch::Tensor b);"

In [25]:
matmul_3 = load_inline(
    name='matmul',
    cpp_sources=cpp_src,
    cuda_sources=cuda_src,
    functions=['matmul'],
    with_cuda=True,
    extra_cuda_cflags=["-O2"],
    build_directory='tmp/matmul_1/',
    # extra_cuda_cflags=['--expt-relaxed-constexpr']
)

In [26]:
m,n,k = 4092,4092,4092
a = torch.ones((m,k), dtype=torch.float32, device='cuda')
b = torch.ones((k,n), dtype=torch.float32, device='cuda')

In [27]:
c = matmul_3.matmul(a,b)
c.shape

torch.Size([4092, 4092])

In [28]:
(a@b==c).all()

tensor(False, device='cuda:0')

In [29]:
wait, warmup = 1,1 # 1 wait cycle to ensure kernel is compiled, 1 warmup cycle to not have overhead of profiler start afterwards
runs = 3

with profile(activities=[ProfilerActivity.CUDA], schedule=profiler_schedule(wait=1, warmup=1, active=runs)) as p:
    for _ in range(wait+warmup+runs):
        matmul_3.matmul(a,b)
        p.step()

print(p.key_averages().table(sort_by="cuda_time_total", row_limit=10))

STAGE:2024-05-05 18:49:19 2445:2445 ActivityProfilerController.cpp:314] Completed Stage: Warm Up


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
void matmul_shared_mem_block<32>(float const*, float...         0.00%       0.000us         0.00%       0.000us       0.000us     465.614ms        99.76%     465.614ms     116.403ms             4  
void at::native::vectorized_elementwise_kernel<4, at...         0.00%       0.000us         0.00%       0.000us       0.000us       1.123ms         0.24%       1.123ms     280.750us             4  
         

STAGE:2024-05-05 18:49:19 2445:2445 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-05-05 18:49:19 2445:2445 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


----

To generate the ptx, I want to remove all dependencies on torch. Let's assert that code doesn't have a different runtime

In [30]:
cuda_src = '''
#include <cuda.h>
#include <cuda_runtime.h>

template <const uint BLOCKSIZE>
__global__ void matmul_global_mem_coalesce(const float *A, const float *B, float *C, int M, int N, int K) {
  const int cRow = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE);
  const int cCol = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE);

  if (cRow < M && cCol < N) {
    float tmp = 0.0;
    for (int i = 0; i < K; ++i) { tmp += A[cRow * K + i] * B[i * N + cCol]; }
    C[cRow * N + cCol] = tmp;
  }
}

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }

void matmul(int M, int N, int K) {
    constexpr uint bs = 32;

    // Allocate memory for A,B,C on device
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, M * K * sizeof(float));
    cudaMalloc((void **)&d_B, K * N * sizeof(float));
    cudaMalloc((void **)&d_C, M * N * sizeof(float));

    // Initialize A,B to ones
    cudaMemset(d_A, 1, M * K * sizeof(float));
    cudaMemset(d_B, 1, K * N * sizeof(float));

    // Initialize C to zeros
    cudaMemset(d_C, 0, M * N * sizeof(float));

    // Configure the grid and block dimensions
    dim3 tpb(bs * bs);
    dim3 blocks(cdiv(M, bs), cdiv(N, bs));

    // Launch the matrix multiplication kernel
    matmul_global_mem_coalesce<bs><<<blocks, tpb>>>(d_A, d_B, d_C, M, N, K);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
'''
cpp_src = "void matmul(int M, int N, int K);"

In [31]:
matmul_2_cudac_notorch = load_inline(
    name='matmul',
    cpp_sources=cpp_src,
    cuda_sources=cuda_src,
    functions=['matmul'],
    with_cuda=True,
    extra_cuda_cflags=["-O2"],
    build_directory='tmp/matmul_2/',
    # extra_cuda_cflags=['--expt-relaxed-constexpr']
)

In [32]:
m,n,k = 4092,4092,4092

In [33]:
matmul_2_cudac_notorch.matmul(m,n,k)

In [34]:
wait, warmup = 1,1 # 1 wait cycle to ensure kernel is compiled, 1 warmup cycle to not have overhead of profiler start afterwards
runs = 3

with profile(activities=[ProfilerActivity.CUDA], schedule=profiler_schedule(wait=1, warmup=1, active=runs)) as p:
    for _ in range(wait+warmup+runs):
        matmul_2_cudac_notorch.matmul(m,n,k)
        p.step()

print(p.key_averages().table(sort_by="cuda_time_total", row_limit=10))

STAGE:2024-05-05 18:50:34 2445:2445 ActivityProfilerController.cpp:314] Completed Stage: Warm Up


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
void matmul_global_mem_coalesce<32u>(float const*, f...         0.00%       0.000us         0.00%       0.000us       0.000us     665.441ms        99.56%     665.441ms     221.814ms             3  
                                        Memset (Device)         0.00%       0.000us         0.00%       0.000us       0.000us       2.919ms         0.44%       2.919ms     324.333us             9  
         

STAGE:2024-05-05 18:50:35 2445:2445 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-05-05 18:50:35 2445:2445 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


Cool, still ~200ms. So the removing torch dependencies doesn't change the runtime of the kernel (as expected).