In [23]:
%%writefile no_divergence.cu
#include <iostream>
#include <cuda_runtime.h>

#define N 1000000

// Doubles each of the first N elements of `data` in place.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// overshoot N; surplus threads exit without touching memory.
// Every in-range thread runs the same instruction sequence, so this kernel is
// the baseline with no data-dependent branching.
__global__ void no_divergence_kernel(int* data) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: the last block can extend past the end of the array.
    if (gid >= N) {
        return;
    }

    data[gid] = data[gid] * 2;
}

// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and returns 1 from the enclosing function (intended for use inside main()).
// Buffers still allocated at that point are left to process teardown.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            std::cerr << "CUDA error at line " << __LINE__ << ": "          \
                      << cudaGetErrorString(status_) << std::endl;          \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Step 1 driver: fills a host array with 1..N, runs no_divergence_kernel on a
// device copy, times the kernel with CUDA events, and prints the elapsed time
// plus the first three results.
//
// Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
int main() {
    const size_t bytes = N * sizeof(int);

    int* h_data = new int[N];
    int* d_data = nullptr;

    for (int i = 0; i < N; i++) {
        h_data[i] = i + 1;
    }

    CUDA_CHECK(cudaMalloc(&d_data, bytes));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice));

    // One thread per element; ceil-division so the tail of the array is covered.
    dim3 blockSize(256);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    no_divergence_kernel<<<gridSize, blockSize>>>(d_data);
    // A launch returns no status itself; configuration errors show up here.
    CUDA_CHECK(cudaGetLastError());
    // Record `stop` in the same stream right after the launch so the interval
    // brackets the kernel only, not a host-side device synchronisation.
    CUDA_CHECK(cudaEventRecord(stop));
    // Blocks until the kernel and the stop event have completed.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost));

    std::cout << "Step 1: No Divergence\n";
    std::cout << "Execution Time: " << ms << " ms\n";
    std::cout << "Output: " << h_data[0] << ", " << h_data[1] << ", " << h_data[2] << ", ..." << std::endl;

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_data));
    delete[] h_data;

    return 0;
}


Overwriting no_divergence.cu


In [None]:
!nvcc no_divergence.cu -o no_divergence && ./no_divergence

In [30]:
%%writefile divergence.cu
#include <iostream>
#include <cuda_runtime.h>

#define N 1000000

// Scales each of the first N elements of `data` in place: elements at even
// global indices are doubled, elements at odd global indices are tripled.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// overshoot N; surplus threads exit without touching memory.
// The branch condition alternates between adjacent threads, so threads that
// sit next to each other in a warp take different paths.
__global__ void divergence_kernel(int* data) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: the last block can extend past the end of the array.
    if (gid >= N) {
        return;
    }

    const bool evenIndex = (gid % 2) == 0;
    if (evenIndex) {
        data[gid] = data[gid] * 2;
    } else {
        data[gid] = data[gid] * 3;
    }
}

// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and returns 1 from the enclosing function (intended for use inside main()).
// Buffers still allocated at that point are left to process teardown.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            std::cerr << "CUDA error at line " << __LINE__ << ": "          \
                      << cudaGetErrorString(status_) << std::endl;          \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Step 2 driver: fills a host array with 1..N, runs divergence_kernel on a
// device copy, times the kernel with CUDA events, and prints the elapsed time
// plus the first three results.
//
// Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
int main() {
    const size_t bytes = N * sizeof(int);

    int* h_data = new int[N];
    int* d_data = nullptr;

    for (int i = 0; i < N; i++) {
        h_data[i] = i + 1;
    }

    CUDA_CHECK(cudaMalloc(&d_data, bytes));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice));

    // One thread per element; ceil-division so the tail of the array is covered.
    dim3 blockSize(256);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    divergence_kernel<<<gridSize, blockSize>>>(d_data);
    // A launch returns no status itself; configuration errors show up here.
    CUDA_CHECK(cudaGetLastError());
    // Record `stop` in the same stream right after the launch so the interval
    // brackets the kernel only, not a host-side device synchronisation.
    CUDA_CHECK(cudaEventRecord(stop));
    // Blocks until the kernel and the stop event have completed.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost));

    std::cout << "Step 2: Divergence\n";
    std::cout << "Execution Time: " << ms << " ms\n";
    std::cout << "Output: " << h_data[0] << ", " << h_data[1] << ", " << h_data[2] << ", ..." << std::endl;

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_data));
    delete[] h_data;

    return 0;
}


Overwriting divergence.cu


In [None]:
!nvcc divergence.cu -o divergence && ./divergence

In [37]:
%%writefile warp_aligned.cu
#include <iostream>
#include <cuda_runtime.h>

#define N 1000000

// Scales each of the first N elements of `data` in place, choosing the factor
// per group of 32 consecutive global indices: groups with an even group number
// are doubled, groups with an odd group number are tripled.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. The grid may
// overshoot N; surplus threads do nothing.
// Precondition for the branch to be uniform within a warp: blockDim.x must be
// a multiple of 32, so that each 32-index group maps onto exactly one warp.
__global__ void warp_aligned_kernel(int* data) {
    // Deliberately not named `warpSize`: a local with that name would shadow
    // CUDA's built-in `warpSize` device variable.
    const int kLanesPerWarp = 32;

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        // The group number is identical for all 32 indices of a group, so the
        // branch direction does not vary inside the group.
        if ((idx / kLanesPerWarp) % 2 == 0) {
            data[idx] *= 2;
        } else {
            data[idx] *= 3;
        }
    }
}

// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and returns 1 from the enclosing function (intended for use inside main()).
// Buffers still allocated at that point are left to process teardown.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t status_ = (call);                                       \
        if (status_ != cudaSuccess) {                                       \
            std::cerr << "CUDA error at line " << __LINE__ << ": "          \
                      << cudaGetErrorString(status_) << std::endl;          \
            return 1;                                                       \
        }                                                                   \
    } while (0)

// Step 3 driver: fills a host array with 1..N, runs warp_aligned_kernel on a
// device copy, times the kernel with CUDA events, and prints the elapsed time
// plus the first three results.
//
// Returns 0 on success, 1 if any CUDA runtime call or the kernel launch fails.
int main() {
    const size_t bytes = N * sizeof(int);

    int* h_data = new int[N];
    int* d_data = nullptr;

    for (int i = 0; i < N; i++) {
        h_data[i] = i + 1;
    }

    CUDA_CHECK(cudaMalloc(&d_data, bytes));
    CUDA_CHECK(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice));

    // One thread per element; ceil-division so the tail of the array is covered.
    // 256 is a multiple of 32, which keeps the kernel's 32-index groups aligned
    // with warps.
    dim3 blockSize(256);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    warp_aligned_kernel<<<gridSize, blockSize>>>(d_data);
    // A launch returns no status itself; configuration errors show up here.
    CUDA_CHECK(cudaGetLastError());
    // Record `stop` in the same stream right after the launch so the interval
    // brackets the kernel only, not a host-side device synchronisation.
    CUDA_CHECK(cudaEventRecord(stop));
    // Blocks until the kernel and the stop event have completed.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float ms = 0;
    CUDA_CHECK(cudaEventElapsedTime(&ms, start, stop));

    CUDA_CHECK(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost));

    std::cout << "Step 3: Warp-Aligned Branching\n";
    std::cout << "Execution Time: " << ms << " ms\n";
    std::cout << "Output: " << h_data[0] << ", " << h_data[1] << ", " << h_data[2] << ", ..." << std::endl;

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(d_data));
    delete[] h_data;

    return 0;
}



Overwriting warp_aligned.cu


In [None]:
!nvcc warp_aligned.cu -o warp_aligned && ./warp_aligned