In [3]:
%%writefile MatrixAdd.cu
#include <iostream>
#include <stdio.h>

__global__ void MatrixAdd_C(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N)  {
        printf("Kernel C - Thread (%d, %d) in Block (%d, %d): i = %d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, i);
        for(int j=0;j<N;j++){
          C[i*N+j] = A[i*N+j] + B[i*N+j];
        }
    return;
    }
}



__global__ void MatrixAdd_B(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if ((i >= N) && (j >= N)) { return ; }

    printf("Kernel B - Thread (%d, %d) in Block (%d, %d): i = %d, j = %d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, i, j);
    C[i*N+j] = A[i*N+j] + B[i*N+j];

    }

__global__ void MatrixAdd_D(const float* A, const float* B, float* C, int N) {
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (j < N)  {
        printf("Kernel D - Thread (%d, %d) in Block (%d, %d): j = %d\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, j);
        for(int i=0;i<N;i++){

          C[i*N+j] = A[i*N+j] + B[i*N+j];

        }

    }

}



int main() {
    const int N = 10;
    float *A, *B, *C;

    // initialize the input matrices
    A = (float *)malloc( N*N* sizeof(float));
    B = (float *)malloc(N*N* sizeof(float));
    C = (float *)malloc(N*N * sizeof(float));


    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            A[i * N + j] = 1.0f;
            B[i * N + j] = 2.0f;
            C[i * N + j] = 0.0f;
        }
    }

    float *d_a, *d_b,*d_c;
    cudaMalloc((void **)&d_a,N*N*sizeof(float));
    cudaMalloc((void **)&d_b,N*N*sizeof(float));
    cudaMalloc((void **)&d_c,N*N*sizeof(float));
    cudaMemcpy(d_a,A,N*N*sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_b,B,N*N*sizeof(float),cudaMemcpyHostToDevice);

    dim3 dimBlock(32, 16);
    dim3 dimGrid(ceil(N / 32.0f), ceil(N/ 16.0f));
    MatrixAdd_B<<<dimGrid, dimBlock>>>(d_a, d_b, d_c,N);
    cudaDeviceSynchronize();

    cudaMemcpy(C,d_c,N*N*sizeof(float),cudaMemcpyDeviceToHost);
    printf("C:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {

            printf("%.2f ",C[i * N + j]); // Prints each element with 2 decimal precision
        }
        printf("\n"); // Adds a newline after each row
    }
     printf("A:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {

            printf("%.2f ", A[i * N + j]); // Prints each element with 2 decimal precision
        }
        printf("\n"); // Adds a newline after each row
    }
     printf("B:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {

            printf("%.2f ", B[i * N + j]); // Prints each element with 2 decimal precision
        }
        printf("\n"); // Adds a newline after each row
    }
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

}

Overwriting MatrixAdd.cu


In [4]:
# Compile with the specified architecture
!nvcc MatrixAdd.cu -o MatrixAdd -gencode arch=compute_75,code=sm_75

# Run the executable
!./MatrixAdd

Kernel B - Thread (0, 12) in Block (0, 0): i = 0, j = 12
Kernel B - Thread (1, 12) in Block (0, 0): i = 1, j = 12
Kernel B - Thread (2, 12) in Block (0, 0): i = 2, j = 12
Kernel B - Thread (3, 12) in Block (0, 0): i = 3, j = 12
Kernel B - Thread (4, 12) in Block (0, 0): i = 4, j = 12
Kernel B - Thread (5, 12) in Block (0, 0): i = 5, j = 12
Kernel B - Thread (6, 12) in Block (0, 0): i = 6, j = 12
Kernel B - Thread (7, 12) in Block (0, 0): i = 7, j = 12
Kernel B - Thread (8, 12) in Block (0, 0): i = 8, j = 12
Kernel B - Thread (9, 12) in Block (0, 0): i = 9, j = 12
Kernel B - Thread (0, 13) in Block (0, 0): i = 0, j = 13
Kernel B - Thread (1, 13) in Block (0, 0): i = 1, j = 13
Kernel B - Thread (2, 13) in Block (0, 0): i = 2, j = 13
Kernel B - Thread (3, 13) in Block (0, 0): i = 3, j = 13
Kernel B - Thread (4, 13) in Block (0, 0): i = 4, j = 13
Kernel B - Thread (5, 13) in Block (0, 0): i = 5, j = 13
Kernel B - Thread (6, 13) in Block (0, 0): i = 6, j = 13
Kernel B - Thread (7, 13) in Bl