<a href="https://colab.research.google.com/github/Chirag005/CUDA-Kernel-project/blob/main/CUDA_0_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Enable GPU Runtime
# Go to: Runtime > Change runtime type > Hardware accelerator > GPU > Save

# Cell 2: Verify CUDA availability
!nvcc --version
!nvidia-smi

import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Device: {torch.cuda.get_device_name(0)}")


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0
Sun Oct 26 18:29:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   63C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                       

In [3]:
# Cell 3: Install nvcc4jupyter
!pip install nvcc4jupyter

Detected platform "Colab". Running its setup...
Source files will be saved in "/tmp/tmp210u15u5".


In [6]:
# Cell 4: Custom ReLU CUDA Kernel
%load_ext nvcc4jupyter
%%cu
#include <torch/extension.h>

__global__ void relu_cuda_kernel(const float* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = input[idx] > 0.0f ? input[idx] : 0.0f;
    }
}

torch::Tensor relu_cuda_forward(torch::Tensor input) {
    const int size = input.numel();
    auto output = torch::empty_like(input);

    const int threads = 256;
    const int blocks = (size + threads - 1) / threads;

    relu_cuda_kernel<<<blocks, threads>>>(
        input.data_ptr<float>(),
        output.data_ptr<float>(),
        size
    );

    cudaDeviceSynchronize();
    return output;
}

SyntaxError: invalid decimal literal (ipython-input-3793857798.py, line 9)

In [12]:
print("\n" + "=" * 80)
print("STEP 2: Compiling Custom CUDA Kernel")
print("=" * 80)

from torch.utils.cpp_extension import load_inline

# CUDA kernel source code
cuda_source = """
#include <torch/extension.h>

__global__ void relu_cuda_kernel(const float* input, float* output, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx] = input[idx] > 0.0f ? input[idx] : 0.0f;
    }
}

torch::Tensor relu_forward(torch::Tensor input) {
    const int size = input.numel();
    auto output = torch::empty_like(input);

    const int threads = 256;
    const int blocks = (size + threads - 1) / threads;

    relu_cuda_kernel<<<blocks, threads>>>(
        input.data_ptr<float>(),
        output.data_ptr<float>(),
        size
    );

    return output;
}
"""

cpp_source = """
torch::Tensor relu_forward(torch::Tensor input);
"""

# Compile the extension
print("\n🔨 Compiling CUDA extension...")
relu_cuda = load_inline(
    name='relu_cuda',
    cpp_sources=cpp_source,
    cuda_sources=cuda_source,
    functions=['relu_forward'],
    verbose=False,
    extra_cflags=['-O3'],
    extra_cuda_cflags=['-O3', '--use_fast_math']
)
print("✅ CUDA extension compiled successfully!")

W1026 18:37:35.468000 424 torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
W1026 18:37:35.468000 424 torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.



STEP 2: Compiling Custom CUDA Kernel

🔨 Compiling CUDA extension...
✅ CUDA extension compiled successfully!


In [10]:
# Install ninja
!pip install ninja

Collecting ninja
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/180.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ninja
Successfully installed ninja-1.13.0


In [13]:
print("\n" + "=" * 80)
print("STEP 3: Testing Correctness")
print("=" * 80)

# Test with simple values
test_input = torch.tensor([-2.0, -1.0, 0.0, 1.0, 2.0], device='cuda')
custom_output = relu_cuda.relu_forward(test_input)
pytorch_output = torch.relu(test_input)

print(f"\n🧪 Test Input:     {test_input.cpu().numpy()}")
print(f"✅ Custom Output:  {custom_output.cpu().numpy()}")
print(f"✅ PyTorch Output: {pytorch_output.cpu().numpy()}")
print(f"✅ Outputs Match:  {torch.allclose(custom_output, pytorch_output)}")



STEP 3: Testing Correctness

🧪 Test Input:     [-2. -1.  0.  1.  2.]
✅ Custom Output:  [0. 0. 0. 1. 2.]
✅ PyTorch Output: [0. 0. 0. 1. 2.]
✅ Outputs Match:  True


In [14]:
print("\n" + "=" * 80)
print("STEP 4: Performance Benchmarking")
print("=" * 80)

def benchmark(size=1000000, iterations=100):
    input_tensor = torch.randn(size, device='cuda')

    # Warmup
    for _ in range(10):
        _ = relu_cuda.relu_forward(input_tensor)
    torch.cuda.synchronize()

    # Custom ReLU timing
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(iterations):
        output = relu_cuda.relu_forward(input_tensor)
    torch.cuda.synchronize()
    custom_time = (time.time() - start) / iterations

    # PyTorch ReLU timing
    torch.cuda.synchronize()
    start = time.time()
    for _ in range(iterations):
        output = torch.relu(input_tensor)
    torch.cuda.synchronize()
    pytorch_time = (time.time() - start) / iterations

    print(f"\n📊 Benchmark Results (Size: {size:,} elements, {iterations} iterations):")
    print(f"   Custom CUDA ReLU:  {custom_time*1000:.4f} ms")
    print(f"   PyTorch ReLU:      {pytorch_time*1000:.4f} ms")
    print(f"   Speedup:           {pytorch_time/custom_time:.2f}x")

    return custom_time, pytorch_time

# Run benchmarks with different sizes
for size in [100000, 1000000, 10000000]:
    benchmark(size=size, iterations=50)


STEP 4: Performance Benchmarking

📊 Benchmark Results (Size: 100,000 elements, 50 iterations):
   Custom CUDA ReLU:  0.0111 ms
   PyTorch ReLU:      0.0112 ms
   Speedup:           1.01x

📊 Benchmark Results (Size: 1,000,000 elements, 50 iterations):
   Custom CUDA ReLU:  0.0380 ms
   PyTorch ReLU:      0.0366 ms
   Speedup:           0.96x

📊 Benchmark Results (Size: 10,000,000 elements, 50 iterations):
   Custom CUDA ReLU:  0.3381 ms
   PyTorch ReLU:      0.3375 ms
   Speedup:           1.00x


In [15]:
print("\n" + "=" * 80)
print("STEP 5: PyTorch Profiler Analysis")
print("=" * 80)

from torch.profiler import profile, ProfilerActivity

input_tensor = torch.randn(1000000, device='cuda')

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
) as prof:
    for _ in range(10):
        output = relu_cuda.relu_forward(input_tensor)

print("\n🔍 Profiler Results (Top 10 by CUDA time):")
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))


STEP 5: PyTorch Profiler Analysis

🔍 Profiler Results (Top 10 by CUDA time):
-----------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-----------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
    relu_cuda_kernel(float const*, float*, int)         0.00%       0.000us         0.00%       0.000us       0.000us     414.644us       100.00%     414.644us      41.464us            10  
                               aten::empty_like        49.03%       2.811ms        94.73%       5.431ms     543.096us       0.000us         0.00%       0.000us  

In [16]:
print("\n" + "=" * 80)
print("STEP 6: Optimized CUDA Kernel (Vectorized with float4)")
print("=" * 80)

cuda_source_optimized = """
#include <torch/extension.h>

__global__ void relu_cuda_kernel_optimized(const float* input, float* output, int size) {
    int idx = (blockIdx.x * blockDim.x + threadIdx.x) * 4;

    // Process 4 elements at once using float4
    if (idx + 3 < size) {
        float4 val = reinterpret_cast<const float4*>(input)[idx/4];
        float4 result;
        result.x = val.x > 0.0f ? val.x : 0.0f;
        result.y = val.y > 0.0f ? val.y : 0.0f;
        result.z = val.z > 0.0f ? val.z : 0.0f;
        result.w = val.w > 0.0f ? val.w : 0.0f;
        reinterpret_cast<float4*>(output)[idx/4] = result;
    }

    // Handle remaining elements
    for (int i = (size/4)*4 + threadIdx.x; i < size; i += blockDim.x) {
        if (i < size) {
            output[i] = input[i] > 0.0f ? input[i] : 0.0f;
        }
    }
}

torch::Tensor relu_forward_optimized(torch::Tensor input) {
    const int size = input.numel();
    auto output = torch::empty_like(input);

    const int threads = 256;
    const int blocks = (size/4 + threads - 1) / threads;

    relu_cuda_kernel_optimized<<<blocks, threads>>>(
        input.data_ptr<float>(),
        output.data_ptr<float>(),
        size
    );

    return output;
}
"""

print("\n🔨 Compiling optimized CUDA kernel...")
relu_cuda_opt = load_inline(
    name='relu_cuda_opt',
    cpp_sources="torch::Tensor relu_forward_optimized(torch::Tensor input);",
    cuda_sources=cuda_source_optimized,
    functions=['relu_forward_optimized'],
    verbose=False,
    extra_cuda_cflags=['-O3', '--use_fast_math']
)
print("✅ Optimized CUDA extension compiled successfully!")

# Test optimized version
test_input = torch.randn(1000000, device='cuda')
opt_output = relu_cuda_opt.relu_forward_optimized(test_input)
pytorch_output = torch.relu(test_input)
print(f"✅ Optimized version correctness: {torch.allclose(opt_output, pytorch_output, atol=1e-5)}")


W1026 18:41:07.690000 424 torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
W1026 18:41:07.690000 424 torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.



STEP 6: Optimized CUDA Kernel (Vectorized with float4)

🔨 Compiling optimized CUDA kernel...
✅ Optimized CUDA extension compiled successfully!
✅ Optimized version correctness: True


In [19]:
print("\n" + "=" * 80)
print("STEP 7: Performance Comparison (All Versions)")
print("=" * 80)

def benchmark_all_versions(size=10000000, iterations=50):
    input_tensor = torch.randn(size, device='cuda')

    # Warmup
    for _ in range(10):
        _ = relu_cuda.relu_forward(input_tensor)
        _ = relu_cuda_opt.relu_forward_optimized(input_tensor)
        _ = torch.relu(input_tensor)
    torch.cuda.synchronize()

    # Naive custom
    start = time.time()
    for _ in range(iterations):
        _ = relu_cuda.relu_forward(input_tensor)
    torch.cuda.synchronize()
    naive_time = (time.time() - start) / iterations

    # Optimized custom
    start = time.time()
    for _ in range(iterations):
        _ = relu_cuda_opt.relu_forward_optimized(input_tensor)
    torch.cuda.synchronize()
    opt_time = (time.time() - start) / iterations

    # PyTorch native
    start = time.time()
    for _ in range(iterations):
        _ = torch.relu(input_tensor)
    torch.cuda.synchronize()
    pytorch_time = (time.time() - start) / iterations

    print(f"\n📊 Final Benchmark (Size: {size:,} elements):")
    print(f"   {'Method':<30} {'Time (ms)':<15} {'Speedup':<10}")
    print(f"   {'-'*55}")
    print(f"   {'Naive Custom CUDA':<30} {naive_time*1000:>10.4f} ms   {pytorch_time/naive_time:>6.2f}x")
    print(f"   {'Optimized CUDA (float4)':<30} {opt_time*1000:>10.4f} ms   {pytorch_time/opt_time:>6.2f}x")
    print(f"   {'PyTorch Native':<30} {pytorch_time*1000:>10.4f} ms   {'1.00x':>10}")
    print(f"\n   🚀 Optimization gain: {naive_time/opt_time:.2f}x faster than naive version")

benchmark_all_versions()


STEP 7: Performance Comparison (All Versions)

📊 Final Benchmark (Size: 10,000,000 elements):
   Method                         Time (ms)       Speedup   
   -------------------------------------------------------
   Naive Custom CUDA                  0.3365 ms     1.01x
   Optimized CUDA (float4)            0.3311 ms     1.02x
   PyTorch Native                     0.3385 ms        1.00x

   🚀 Optimization gain: 1.02x faster than naive version


In [17]:
print("\n" + "=" * 80)
print("STEP 8: Memory Usage Analysis")
print("=" * 80)

torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

size = 10000000
input_tensor = torch.randn(size, device='cuda')
output = relu_cuda.relu_forward(input_tensor)

print(f"\n📈 Memory Statistics (for {size:,} elements):")
print(f"   Input size:       {input_tensor.element_size() * input_tensor.nelement() / 1024**2:.2f} MB")
print(f"   Output size:      {output.element_size() * output.nelement() / 1024**2:.2f} MB")
print(f"   Allocated memory: {torch.cuda.memory_allocated()/1024**2:.2f} MB")
print(f"   Reserved memory:  {torch.cuda.memory_reserved()/1024**2:.2f} MB")
print(f"   Peak memory:      {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")



STEP 8: Memory Usage Analysis

📈 Memory Statistics (for 10,000,000 elements):
   Input size:       38.15 MB
   Output size:      38.15 MB
   Allocated memory: 88.67 MB
   Reserved memory:  102.00 MB
   Peak memory:      92.48 MB


In [18]:
print("\n" + "=" * 80)
print("PROJECT SUMMARY")
print("=" * 80)

print("""
✅ Successfully implemented custom CUDA ReLU kernel in PyTorch
✅ Compiled and tested on GPU
✅ Benchmarked performance against PyTorch native implementation
✅ Profiled with PyTorch Profiler
✅ Implemented optimized version with vectorized memory access (float4)
✅ Analyzed memory usage patterns

Key Achievements:
- Custom CUDA kernel integration with PyTorch
- Performance profiling and bottleneck identification
- Code optimization with vectorization
- Memory-efficient GPU computation

Skills Demonstrated:
- PyTorch + CUDA programming
- GPU kernel optimization
- Performance profiling and analysis
- Memory management on GPU
""")

print("=" * 80)
print("🎉 CUDA KERNEL PROJECT COMPLETED SUCCESSFULLY!")
print("=" * 80)


PROJECT SUMMARY

✅ Successfully implemented custom CUDA ReLU kernel in PyTorch
✅ Compiled and tested on GPU
✅ Benchmarked performance against PyTorch native implementation
✅ Profiled with PyTorch Profiler
✅ Implemented optimized version with vectorized memory access (float4)
✅ Analyzed memory usage patterns

Key Achievements:
- Custom CUDA kernel integration with PyTorch
- Performance profiling and bottleneck identification
- Code optimization with vectorization
- Memory-efficient GPU computation

Skills Demonstrated:
- PyTorch + CUDA programming
- GPU kernel optimization
- Performance profiling and analysis
- Memory management on GPU

🎉 CUDA KERNEL PROJECT COMPLETED SUCCESSFULLY!
