You can tell nvcc how agressively to optimize the cuda-c code (trading off compilation speed).

**Q:** In this nb, I investigate the effect of different optimization levels on runtime.<br/>
**A:** No effect at all.

In [1]:
import torch
from torch.utils.cpp_extension import load_inline
from torch.profiler import profile, record_function, ProfilerActivity
from torch.profiler import schedule as profiler_schedule

In [2]:
cuda_src = '''
#include <cuda.h>
#include <cuda_runtime.h>

template <const uint BLOCKSIZE>
__global__ void matmul_global_mem_coalesce(const float *A, const float *B, float *C, int M, int N, int K) {
  const int cRow = blockIdx.x * BLOCKSIZE + (threadIdx.x / BLOCKSIZE);
  const int cCol = blockIdx.y * BLOCKSIZE + (threadIdx.x % BLOCKSIZE);

  if (cRow < M && cCol < N) {
    float tmp = 0.0;
    for (int i = 0; i < K; ++i) { tmp += A[cRow * K + i] * B[i * N + cCol]; }
    C[cRow * N + cCol] = tmp;
  }
}

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b; }

void matmul(int M, int N, int K) {
    constexpr uint bs = 32;

    // Allocate memory for A,B,C on device
    float *d_A, *d_B, *d_C;
    cudaMalloc((void **)&d_A, M * K * sizeof(float));
    cudaMalloc((void **)&d_B, K * N * sizeof(float));
    cudaMalloc((void **)&d_C, M * N * sizeof(float));

    // Initialize A,B to ones
    cudaMemset(d_A, 1, M * K * sizeof(float));
    cudaMemset(d_B, 1, K * N * sizeof(float));

    // Initialize C to zeros
    cudaMemset(d_C, 0, M * N * sizeof(float));

    // Configure the grid and block dimensions
    dim3 tpb(bs * bs);
    dim3 blocks(cdiv(M, bs), cdiv(N, bs));

    // Launch the matrix multiplication kernel
    matmul_global_mem_coalesce<bs><<<blocks, tpb>>>(d_A, d_B, d_C, M, N, K);

    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}
'''
cpp_src = "void matmul(int M, int N, int K);"

In [3]:
import time

matmuls = {}

for opt_lv in ['-O0', '-O1', '-O2', '-O3']:
    print(f'🫡 Starting to compile with optimization level {opt_lv}')
    start_time = time.time()
    matmuls[opt_lv] = load_inline(
        name='matmul',
        cpp_sources=cpp_src,
        cuda_sources=cuda_src,
        functions=['matmul'],
        with_cuda=True,
        extra_cuda_cflags=[opt_lv],
        build_directory='tmp/matmul_2/',
        # extra_cuda_cflags=['--expt-relaxed-constexpr']
    )
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f'✅ Done compilation with optimization level {opt_lv} in {elapsed_time:.2f} seconds')


🫡 Starting to compile with optimization level -O0
✅ Done compilation with optimization level -O0 in 51.18 seconds
🫡 Starting to compile with optimization level -O1
✅ Done compilation with optimization level -O1 in 84.41 seconds
🫡 Starting to compile with optimization level -O2
✅ Done compilation with optimization level -O2 in 84.10 seconds
🫡 Starting to compile with optimization level -O3
✅ Done compilation with optimization level -O3 in 82.69 seconds


In [4]:
m,n,k = 4092,4092,4092

In [7]:
matmuls

{'-O0': <module 'matmul' from '/teamspace/studios/this_studio/tmp/matmul_2/matmul.so'>,
 '-O1': <module 'matmul_v1' from '/teamspace/studios/this_studio/tmp/matmul_2/matmul_v1.so'>,
 '-O2': <module 'matmul_v2' from '/teamspace/studios/this_studio/tmp/matmul_2/matmul_v2.so'>,
 '-O3': <module 'matmul_v3' from '/teamspace/studios/this_studio/tmp/matmul_2/matmul_v3.so'>}

In [6]:
# check it runs
for opt_lv in ['-O0', '-O1', '-O2', '-O3']:
    matmuls[opt_lv].matmul(m,n,k)

TypeError: matmul(): incompatible function arguments. The following argument types are supported:
    1. (arg0: torch.Tensor, arg1: torch.Tensor) -> torch.Tensor

Invoked with: 4092, 4092, 4092

In [None]:
wait, warmup = 1,1 # 1 wait cycle to ensure kernel is compiled, 1 warmup cycle to not have overhead of profiler start afterwards
runs = 3

for opt_lv in ['-O0', '-O1', '-O2', '-O3']:
    print(f'🔍 Profiling matmul with optimization level {opt_lv}')
    with profile(activities=[ProfilerActivity.CUDA], schedule=profiler_schedule(wait=1, warmup=1, active=runs)) as p:
        for _ in range(wait+warmup+runs):
            matmuls[opt_lv].matmul(m,n,k)
            p.step()

    print(f'📊 Results for optimization level {opt_lv}:')
    print(p.key_averages().table(sort_by="cuda_time_total", row_limit=10))