2D array addition

In [3]:
%%writefile addition.cu
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

__global__ void add_arrays(int *a, int *b, int *c, int M, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int idx = row * N + col;

    if (row < M && col < N) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int M = 10; // Number of rows
    int N = 10; // Number of columns

    // Allocate host memory for arrays a and b
    int *h_a, *h_b;
    h_a = (int *)malloc(M * N * sizeof(int));
    h_b = (int *)malloc(M * N * sizeof(int));
    if (h_a == nullptr || h_b == nullptr) {
        fprintf(stderr, "Failed to allocate host memory!\n");
        return 1;
    }

    // Initialize arrays a and b with sample values (replace with your initialization logic)
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            h_a[i * N + j] = i * j;
            h_b[i * N + j] = i + j;
        }
    }

    // Allocate device memory for arrays a, b, and c
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, M * N * sizeof(int));
    cudaMalloc((void **)&d_b, M * N * sizeof(int));
    cudaMalloc((void **)&d_c, M * N * sizeof(int));
    if (d_a == nullptr || d_b == nullptr || d_c == nullptr) {
        fprintf(stderr, "Failed to allocate device memory!\n");
        cudaFree(h_a);
        cudaFree(h_b);
        return 1;
    }

    // Copy data from host to device memory
    cudaMemcpy(d_a, h_a, M * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, M * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize(); // Ensure data transfer is complete

    // Define kernel launch parameters
    int threadsPerBlock = 256; // Adjust based on your GPU architecture
    dim3 blocksInGrid(ceil(float(N) / threadsPerBlock), ceil(float(M) / threadsPerBlock));

    // Launch the kernel
    add_arrays<<<blocksInGrid, threadsPerBlock>>>(d_a, d_b, d_c, M, N);

    // Wait for kernel execution to finish
    cudaDeviceSynchronize();

    // Optional: Copy results from device to host memory
    // int *h_c;
    // h_c = (int *)malloc(M * N * sizeof(int));
    // cudaMemcpy(h_c, d_c, M * N * sizeof(int), cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Free host memory
    free(h_a);
    free(h_b);

    // Optional: Print the results on the host (if h_c was allocated)
    // for (int i = 0; i < M; i++) {
    //     for (int j = 0; j < N; j++) {
    //         printf("%d ", h_c[i * N + j]);
    //     }
    //     printf("\n");
    // }

    return 0;
}


Overwriting addition.cu


In [4]:
!nvcc -o addition addition.cu

In [13]:
!./a.out

Thread 0 (0, 0) is adding elements at index 0
Thread 1 (0, 0) is adding elements at index 1
Thread 2 (0, 0) is adding elements at index 2
Thread 3 (0, 0) is adding elements at index 3
Thread 4 (0, 0) is adding elements at index 4
Thread 5 (0, 0) is adding elements at index 5
Thread 6 (0, 0) is adding elements at index 6
Thread 7 (0, 0) is adding elements at index 7
Thread 8 (0, 0) is adding elements at index 8
Thread 9 (0, 0) is adding elements at index 9


In [9]:
%%writefile add.cu
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

__global__ void add_arrays(int *a, int *b, int *c, int M, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    int idx = row * N + col;

    if (row < M && col < N) {
        c[idx] = a[idx] + b[idx];
        printf("Thread %d (%d, %d) is adding elements at index %d\n",
               blockIdx.x * blockDim.x + threadIdx.x + blockIdx.y * blockDim.y * gridDim.x + threadIdx.y,
               blockIdx.x, blockIdx.y, idx);
    }
}

int main() {
    int M = 10; // Number of rows
    int N = 10; // Number of columns

    // Allocate host memory for arrays a and b
    int *h_a, *h_b;
    h_a = (int *)malloc(M * N * sizeof(int));
    h_b = (int *)malloc(M * N * sizeof(int));
    if (h_a == nullptr || h_b == nullptr) {
        fprintf(stderr, "Failed to allocate host memory!\n");
        return 1;
    }

    // Initialize arrays a and b with sample values (replace with your initialization logic)
    for (int i = 0; i < M; i++) {
        for (int j = 0; j < N; j++) {
            h_a[i * N + j] = i * j;
            h_b[i * N + j] = i + j;
        }
    }

    // Allocate device memory for arrays a, b, and c
    int *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, M * N * sizeof(int));
    cudaMalloc((void **)&d_b, M * N * sizeof(int));
    cudaMalloc((void **)&d_c, M * N * sizeof(int));
    if (d_a == nullptr || d_b == nullptr || d_c == nullptr) {
        fprintf(stderr, "Failed to allocate device memory!\n");
        cudaFree(h_a);
        cudaFree(h_b);
        return 1;
    }

    // Copy data from host to device memory
    cudaMemcpy(d_a, h_a, M * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, M * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaDeviceSynchronize(); // Ensure data transfer is complete

    // Define kernel launch parameters
    int threadsPerBlock = 256; // Adjust based on your GPU architecture
    dim3 blocksInGrid(ceil(float(N) / threadsPerBlock), ceil(float(M) / threadsPerBlock));

    // Launch the kernel
    add_arrays<<<blocksInGrid, threadsPerBlock>>>(d_a, d_b, d_c, M, N);

    // Wait for kernel execution to finish
    cudaDeviceSynchronize();

    // Free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Free host memory
    free(h_a);
    free(h_b);

    return 0;
}

Writing add.cu


In [11]:
!nvcc add.cu

In [12]:
!./a.out

Thread 0 (0, 0) is adding elements at index 0
Thread 1 (0, 0) is adding elements at index 1
Thread 2 (0, 0) is adding elements at index 2
Thread 3 (0, 0) is adding elements at index 3
Thread 4 (0, 0) is adding elements at index 4
Thread 5 (0, 0) is adding elements at index 5
Thread 6 (0, 0) is adding elements at index 6
Thread 7 (0, 0) is adding elements at index 7
Thread 8 (0, 0) is adding elements at index 8
Thread 9 (0, 0) is adding elements at index 9
