In [35]:
%%writefile matrices_multiply.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Compile-time problem and launch configuration.
// main() sizes the grid with a ceiling division over these block dimensions,
// so N does not have to be a multiple of either one.
#define N 512                  // Matrix dimension: every matrix is N x N ints
#define THREADS_PER_BLOCK_X 32    // Block width in threads (maps to matrix columns in the kernel)
#define THREADS_PER_BLOCK_Y 32    // Block height in threads (maps to matrix rows in the kernel)
// 32 x 32 = 1024 threads per block.
// NOTE(review): 1024 is the usual per-block thread limit — confirm for the
// target GPU before raising either dimension.

// Integer matrix product C = A * B for square, row-major width x width matrices.
//
// Launch layout: a 2D grid of 2D blocks. The x dimension of the launch selects
// the output column and the y dimension selects the output row; each thread
// produces exactly one element of C. The grid may overshoot the matrix, since
// surplus threads exit without touching memory.
//
//   A, B  - input matrices (read only by this kernel)
//   C     - output matrix (each in-range thread writes one element)
//   width - number of rows and columns of every matrix
__global__ void matrixMul(int *A, int *B, int *C, int width) {
    const int r = blockIdx.y * blockDim.y + threadIdx.y;
    const int c = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads that land outside the matrix have nothing to compute.
    if (r >= width || c >= width) {
        return;
    }

    // Offset of the first element of row r; shared by the A reads and the C write.
    const int rowBase = r * width;

    // Dot product of row r of A with column c of B.
    int acc = 0;
    for (int k = 0; k < width; ++k) {
        acc += A[rowBase + k] * B[k * width + c];
    }

    C[rowBase + c] = acc;
}

// Fill the first `size` entries of `array` with pseudo-random digits.
//
// Each entry is drawn with rand() and reduced modulo 10, so every stored value
// lies in 0..9. The sequence depends on the current rand() seed; this function
// does not seed the generator itself. A non-positive `size` leaves the array
// untouched.
void random_ints(int *array, int size) {
    int idx = 0;
    while (idx < size) {
        const int digit = rand() % 10;  // keep only the last decimal digit
        array[idx] = digit;
        ++idx;
    }
}

// Report a failed CUDA runtime call (with the call text and source location)
// on stderr and terminate the program.
static void checkCuda(cudaError_t status, const char *what, const char *file, int line) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error at %s:%d in '%s': %s\n",
                file, line, what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Wrap every CUDA runtime call so that failures are never silently ignored.
#define CUDA_CHECK(call) checkCuda((call), #call, __FILE__, __LINE__)

// Multiply two random N x N integer matrices on the GPU and print the first
// few entries of the product.
//
// Returns 0 on success. Exits with EXIT_FAILURE if a host allocation or any
// CUDA runtime call (including the kernel launch or its execution) fails.
int main(void) {
    int *A = NULL, *B = NULL, *C = NULL;        // Host copies of A, B, C
    int *d_A = NULL, *d_B = NULL, *d_C = NULL;  // Device copies of A, B, C
    const size_t size = (size_t)N * N * sizeof(int);  // Size in bytes; size_t avoids int overflow for large N

    // Allocate space for device copies of A, B, C
    CUDA_CHECK(cudaMalloc((void **)&d_A, size));
    CUDA_CHECK(cudaMalloc((void **)&d_B, size));
    CUDA_CHECK(cudaMalloc((void **)&d_C, size));

    // Allocate space for host copies of A, B, C
    A = (int *)malloc(size);
    B = (int *)malloc(size);
    C = (int *)malloc(size);
    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(A); free(B); free(C);
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        return EXIT_FAILURE;
    }

    // Initialize input values (A first, then B, so the rand() sequence is
    // consumed in a fixed order)
    random_ints(A, N * N);
    random_ints(B, N * N);

    // Copy inputs to device
    CUDA_CHECK(cudaMemcpy(d_A, A, size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, B, size, cudaMemcpyHostToDevice));

    // Define grid and block dimensions; the ceiling division guarantees the
    // grid covers the whole matrix even when N is not a multiple of the block
    // size (the kernel bounds-checks the surplus threads).
    dim3 threadsPerBlock(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y);
    dim3 blocksPerGrid((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (N + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Launch matrixMul() kernel on GPU
    matrixMul<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // Copy result back to host
    CUDA_CHECK(cudaMemcpy(C, d_C, size, cudaMemcpyDeviceToHost));

    // Print a few results for verification (at most 10, never more than the
    // matrix actually holds)
    const int total = N * N;
    const int shown = total < 10 ? total : 10;
    printf("Result of Matrix multiplication with blocks and threads:\n");
    for (int i = 0; i < shown; i++) {
        printf("C[%d][%d] = %d\n", i / N, i % N, C[i]);
    }

    // Cleanup
    free(A); free(B); free(C);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    return 0;
}

Overwriting matrices_multiply.cu


In [36]:
!nvcc -arch=sm_75 matrices_multiply.cu -o matrices_multiply

In [37]:
!./matrices_multiply

Result of Matrix multiplication with blocks and threads:
C[0][0] = 11057
C[0][1] = 10652
C[0][2] = 10663
C[0][3] = 10558
C[0][4] = 10841
C[0][5] = 9932
C[0][6] = 9859
C[0][7] = 10535
C[0][8] = 10708
C[0][9] = 10692


In [38]:
!nvprof ./matrices_multiply

==5731== NVPROF is profiling process 5731, command: ./matrices_multiply
Result of Matrix multiplication with blocks and threads:
C[0][0] = 11057
C[0][1] = 10652
C[0][2] = 10663
C[0][3] = 10558
C[0][4] = 10841
C[0][5] = 9932
C[0][6] = 9859
C[0][7] = 10535
C[0][8] = 10708
C[0][9] = 10692
==5731== Profiling application: ./matrices_multiply
==5731== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   76.35%  885.64us         1  885.64us  885.64us  885.64us  matrixMul(int*, int*, int*, int)
                   15.10%  175.20us         2  87.598us  87.518us  87.678us  [CUDA memcpy HtoD]
                    8.55%  99.166us         1  99.166us  99.166us  99.166us  [CUDA memcpy DtoH]
      API calls:   97.10%  93.809ms         3  31.270ms  3.3130us  93.734ms  cudaMalloc
                    2.35%  2.2709ms         3  756.97us  234.98us  1.7360ms  cudaMemcpy
                    0.25%  237.38us         3  79.128us  14.053us  118.01u