In [1]:
%%writefile matrix_vec_mult.cu
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

// Computes the matrix-vector product C = A * B, one thread per output row.
//
// Launch layout: 1D grid of 1D blocks. Any configuration that supplies at
// least N threads in total is valid; threads whose global index is >= N exit
// without touching memory. No shared memory is used.
//
// Parameters (all pointers are dereferenced on the device):
//   A  N*N floats; element (row, col) is read from A[row * N + col].
//   B  N floats.
//   C  N floats; C[row] receives the dot product of row `row` of A with B.
//   N  number of rows/columns of A and length of B and C.
//
// The printf calls trace every multiply-accumulate step and the final result
// of each row. Device printf is slow, so this is only practical for small N.
__global__ void vectorMatrixMult(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) {
        return;  // surplus thread in the last block: nothing to compute
    }

    // Widen before multiplying so the row offset cannot overflow a 32-bit int
    // when N * N exceeds INT_MAX.
    const size_t rowStart = (size_t)i * (size_t)N;

    float sum = 0.0f;
    for (int j = 0; j < N; j++) {
        // Load each operand once and reuse it for both the update and the trace.
        const float a = A[rowStart + j];
        const float b = B[j];
        sum += a * b;
        printf("Thread %d, row %d, col %d: A[%d][%d] * B[%d] = %.2f * %.2f = %.2f, current sum = %.2f\n",
               threadIdx.x, i, j, i, j, j, a, b, a * b, sum);
    }

    C[i] = sum;
    // Report the value just stored; using `sum` avoids re-reading C[i].
    printf("Thread %d, row %d: Final sum for C[%d] = %.2f\n", threadIdx.x, i, i, sum);
}

// Terminates the program with a diagnostic when a CUDA runtime call fails.
// `what` names the operation so the message identifies the failing step.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Demo driver: fills a 10x10 matrix A with 1.0 and a vector B with 2.0,
// computes C = A * B on the GPU with vectorMatrixMult, and prints A, C and B.
// Returns 0 on success; exits with a non-zero status if a host allocation or
// any CUDA call fails.
int main() {
    const int N = 10;
    const size_t matrixBytes = (size_t)N * N * sizeof(float);
    const size_t vectorBytes = (size_t)N * sizeof(float);

    // Host buffers.
    float* A = (float*)malloc(matrixBytes);
    float* B = (float*)malloc(vectorBytes);
    float* C = (float*)malloc(vectorBytes);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(A);  // free(NULL) is a no-op, so partial failures are handled too
        free(B);
        free(C);
        return EXIT_FAILURE;
    }

    // Fill the inputs, tracing every assignment.
    for (int i = 0; i < N; i++)
    {
        printf("Initializing row %d of matrix A\n", i);
        for (int j = 0; j < N; j++)
        {
            A[i * N + j] = 1.0f;
            printf("  Initializing element A[%d][%d] to %.2f\n", i, j, A[i * N + j]);
        }
        B[i] = 2.0f;
        C[i] = 0.0f;
        printf("Initializing element B[%d] to %.2f and C[%d] to %.2f\n", i, B[i], i, C[i]);
    }

    // Device buffers and input upload.
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc(&d_a, matrixBytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc(&d_b, vectorBytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc(&d_c, vectorBytes), "cudaMalloc(d_c)");
    checkCuda(cudaMemcpy(d_a, A, matrixBytes, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(d_b, B, vectorBytes, cudaMemcpyHostToDevice), "copy of B to device");

    // One thread per row; round the block count up so every row is covered.
    const int blocksize = 256;
    const int gridsize = (N + blocksize - 1) / blocksize;
    vectorMatrixMult<<<gridsize, blocksize>>>(d_a, d_b, d_c, N);
    // A launch does not return a status: configuration errors are reported by
    // cudaGetLastError, execution errors by the following synchronization.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    checkCuda(cudaMemcpy(C, d_c, vectorBytes, cudaMemcpyDeviceToHost), "copy of C to host");

    printf("A:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%.2f ", A[i * N + j]);
        }
        printf("\n");  // one matrix row per output line
    }

    printf("C:\n");
    for (int i = 0; i < N; i++) {
        printf("%.2f ", C[i]);
    }
    printf("\n");

    printf("B:\n");
    for (int i = 0; i < N; i++) {
        printf("%.2f ", B[i]);
    }
    printf("\n");  // terminate the last output line

    // Release device and host memory.
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");
    free(A);
    free(B);
    free(C);

    return 0;
}

Writing matrix_vec_mult.cu


In [2]:
# Compile with the specified architecture
!nvcc matrix_vec_mult.cu -o matrix_vec_mult -gencode arch=compute_75,code=sm_75

# Run the executable
!./matrix_vec_mult

Initializing row 0 of matrix A
  Initializing element A[0][0] to 1.00
  Initializing element A[0][1] to 1.00
  Initializing element A[0][2] to 1.00
  Initializing element A[0][3] to 1.00
  Initializing element A[0][4] to 1.00
  Initializing element A[0][5] to 1.00
  Initializing element A[0][6] to 1.00
  Initializing element A[0][7] to 1.00
  Initializing element A[0][8] to 1.00
  Initializing element A[0][9] to 1.00
Initializing element B[0] to 2.00 and C[0] to 0.00
Initializing row 1 of matrix A
  Initializing element A[1][0] to 1.00
  Initializing element A[1][1] to 1.00
  Initializing element A[1][2] to 1.00
  Initializing element A[1][3] to 1.00
  Initializing element A[1][4] to 1.00
  Initializing element A[1][5] to 1.00
  Initializing element A[1][6] to 1.00
  Initializing element A[1][7] to 1.00
  Initializing element A[1][8] to 1.00
  Initializing element A[1][9] to 1.00
Initializing element B[1] to 2.00 and C[1] to 0.00
Initializing row 2 of matrix A
  Initializing element A[