In [0]:
%%cu
#include <stdio.h>
#include <stdlib.h>
#define SQMATWIDTH 3

// Kernel: computes d_productMat = d_inMat1 * d_inMat2 for square
// SQMATWIDTH x SQMATWIDTH matrices stored row-major (element (r, c) lives at
// index r * SQMATWIDTH + c).
//
// Launch layout: a 2D arrangement of threads in which x maps to the output
// column and y maps to the output row; each in-range thread produces exactly
// one output element. Any grid/block shape is valid as long as
// gridDim * blockDim is at least SQMATWIDTH in both x and y; surplus threads
// exit without touching memory. No shared memory is used.
//
// d_inMat1, d_inMat2 : input matrices (only read here)
// d_productMat       : output matrix (every element is overwritten)
__global__ void MatrixMultiply (float * d_inMat1, float * d_inMat2, float * d_productMat)
{
    const int N = SQMATWIDTH;

    // The global coordinate must advance by the block size (blockDim), not by
    // the matrix width, otherwise a multi-block launch skips or repeats cells.
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int column = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that fall outside the matrix have nothing to compute.
    if (row >= N || column >= N)
        return;

    // Dot product of row `row` of the first matrix with column `column` of the second.
    float sum = 0.0f;
    for (int i = 0; i < N; i++)
        sum += d_inMat1[row * N + i] * d_inMat2[i * N + column];

    d_productMat[row * N + column] = sum;
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCuda (cudaError_t status, const char * what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Reads up to SQMATWIDTH * SQMATWIDTH floats from stdin into `matrix` in
// row-major order. Stops at the first value that cannot be read and leaves the
// remaining entries untouched. Returns the number of values actually stored.
static int readMatrix (float * matrix)
{
    const int total = SQMATWIDTH * SQMATWIDTH;
    int count = 0;

    while (count < total && scanf("%f", &matrix[count]) == 1)
        count++;

    return count;
}

// Prints `title` followed by the SQMATWIDTH x SQMATWIDTH row-major `matrix`,
// one row per line, two decimals per element.
static void printMatrix (const char * title, const float * matrix)
{
    printf("%s", title);

    for (int i = 0; i < SQMATWIDTH; i++)
    {
        for (int j = 0; j < SQMATWIDTH; j++)
        {
            printf("%0.2f \t", matrix[i*SQMATWIDTH + j]);
        }
        printf("\n");
    }
}

// Reads two square matrices from stdin (falling back to preset values where
// input is missing), multiplies them on the GPU and prints the inputs and the
// product. Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
int main()
{
    const int totalElements = SQMATWIDTH * SQMATWIDTH;
    const size_t matrixsize = totalElements * sizeof(float);

    //declaring input matrices; the preset values are used for any element
    //that cannot be read from stdin (e.g. when no stdin is attached)
    float h_inMat1[totalElements] = {1,2,3,4,5,6,7,8,9};
    float h_inMat2[totalElements] = {1,2,3,4,5,6,7,8,9};
    float h_productMat [totalElements] = {0};

    //generating input matrix 1 from user input
    printf ("Please enter values of %d elements for Input Matrix 1:", totalElements);
    const int read1 = readMatrix(h_inMat1);
    if (read1 < totalElements)
        fprintf(stderr, "\nNote: only %d of %d values were read for Input Matrix 1; preset values are kept for the rest.\n", read1, totalElements);

    //generating input matrix 2 from user input
    printf ("Please enter values of %d elements for Input Matrix 2:", totalElements);
    const int read2 = readMatrix(h_inMat2);
    if (read2 < totalElements)
        fprintf(stderr, "\nNote: only %d of %d values were read for Input Matrix 2; preset values are kept for the rest.\n", read2, totalElements);

    //displaying input matrices
    printMatrix("\n \n Input Matrix 1: \n", h_inMat1);
    printMatrix("\n \n Input Matrix 2: \n", h_inMat2);

    //declare GPU memory pointers
    float * d_inMat1 = NULL;
    float * d_inMat2 = NULL;
    float * d_productMat = NULL;

    //allocate memory on GPU
    checkCuda(cudaMalloc ((void**) &d_inMat1, matrixsize), "cudaMalloc(d_inMat1)");
    checkCuda(cudaMalloc ((void**) &d_inMat2, matrixsize), "cudaMalloc(d_inMat2)");
    checkCuda(cudaMalloc ((void**) &d_productMat, matrixsize), "cudaMalloc(d_productMat)");

    //transfer input matrices to GPU
    checkCuda(cudaMemcpy (d_inMat1, h_inMat1, matrixsize, cudaMemcpyHostToDevice), "copy of Input Matrix 1 to device");
    checkCuda(cudaMemcpy (d_inMat2, h_inMat2, matrixsize, cudaMemcpyHostToDevice), "copy of Input Matrix 2 to device");

    //launch configuration: one thread per output element. Matrices up to
    //32 x 32 fit in a single block (32 * 32 = 1024 threads); larger ones are
    //tiled with 32 x 32 blocks and a ceil-divided grid.
    const int blockEdge = (SQMATWIDTH < 32) ? SQMATWIDTH : 32;
    const int blocksPerSide = (SQMATWIDTH + blockEdge - 1) / blockEdge;
    const dim3 threadsPerBlock(blockEdge, blockEdge);
    const dim3 blocksPerGrid(blocksPerSide, blocksPerSide);

    //launching kernel
    MatrixMultiply<<<blocksPerGrid, threadsPerBlock>>>(d_inMat1, d_inMat2, d_productMat);
    //a launch does not return a status; configuration errors must be polled
    checkCuda(cudaGetLastError(), "MatrixMultiply launch");

    //copy output matrix back to host (blocking copy: it also waits for the
    //kernel and reports any error raised while it executed)
    checkCuda(cudaMemcpy (h_productMat, d_productMat, matrixsize, cudaMemcpyDeviceToHost), "copy of product matrix to host");

    //display output matrix
    printMatrix("\n \n The Product Matrix is : \n", h_productMat);

    //release GPU memory
    checkCuda(cudaFree(d_inMat1), "cudaFree(d_inMat1)");
    checkCuda(cudaFree(d_inMat2), "cudaFree(d_inMat2)");
    checkCuda(cudaFree(d_productMat), "cudaFree(d_productMat)");

    return 0;
}
