In [None]:
%%writefile mult.cu

#include <iostream>         // For standard input/output operations
#include <cuda_runtime.h>   // For CUDA runtime API
#include <device_launch_parameters.h> // For defining CUDA kernel launch parameters
#include <stdlib.h>



#define TILE_SIZE 16  // Tile edge length; thread blocks are TILE_SIZE x TILE_SIZE


/**
 * Computes C = A * B for square, row-major N x N float matrices using
 * shared-memory tiling.
 *
 * Launch requirements:
 *   - blockDim must be (TILE_SIZE, TILE_SIZE): threadIdx.x/y index the tiles directly.
 *   - gridDim.x and gridDim.y must each be at least ceil(N / TILE_SIZE) so that
 *     every element of C is owned by some thread.
 *   - Uses 2 * TILE_SIZE * TILE_SIZE floats of static shared memory per block.
 *
 * N does not have to be a multiple of TILE_SIZE: elements of a tile that fall
 * outside the matrices are loaded as 0, so they contribute nothing to the sum,
 * and threads whose (row, col) lies outside C do not store a result.
 *
 * @param A  device pointer to the left operand  (N * N floats, row-major)
 * @param B  device pointer to the right operand (N * N floats, row-major)
 * @param C  device pointer to the output        (N * N floats, row-major)
 * @param N  matrix dimension
 */
__global__ void matrixMulSharedMemory(float *A, float *B, float *C, int N) {
    __shared__ float tile_A[TILE_SIZE][TILE_SIZE];  // Current tile of A
    __shared__ float tile_B[TILE_SIZE][TILE_SIZE];  // Current tile of B

    int tx = threadIdx.x;  // Thread x-coordinate inside the block
    int ty = threadIdx.y;  // Thread y-coordinate inside the block
    int row = blockIdx.y * TILE_SIZE + ty;  // Row index of C owned by this thread
    int col = blockIdx.x * TILE_SIZE + tx;  // Column index of C owned by this thread
    float Cvalue = 0.0f;

    // Round up so a trailing partial tile is processed when N % TILE_SIZE != 0.
    int numTiles = (N + TILE_SIZE - 1) / TILE_SIZE;

    for (int t = 0; t < numTiles; ++t) {
        int aCol = t * TILE_SIZE + tx;  // Column of A this thread loads
        int bRow = t * TILE_SIZE + ty;  // Row of B this thread loads

        // Guarded loads: out-of-range positions are zero-padded instead of
        // reading past the end of the matrices. Flat indices are computed in
        // size_t so row * N cannot overflow int for large N.
        tile_A[ty][tx] = (row < N && aCol < N) ? A[(size_t)row * N + aCol] : 0.0f;
        tile_B[ty][tx] = (bRow < N && col < N) ? B[(size_t)bRow * N + col] : 0.0f;

        // Every thread of the block reaches this barrier (the guards above are
        // expressions, not branches around it), so the tile is complete before use.
        __syncthreads();

        // Accumulate the partial dot product contributed by this tile
        for (int k = 0; k < TILE_SIZE; ++k) {
            Cvalue += tile_A[ty][k] * tile_B[k][tx];
        }

        // Do not overwrite the tiles until every thread has finished reading them
        __syncthreads();
    }

    if (row < N && col < N) {
        C[(size_t)row * N + col] = Cvalue;  // Store the result in matrix C
    }
}

// Prints `what` followed by the CUDA error string to stderr and terminates the
// process with a failure status when `status` is not cudaSuccess.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        std::cerr << what << ": " << cudaGetErrorString(status) << std::endl;
        exit(EXIT_FAILURE);
    }
}

/**
 * Multiplies two N x N matrices on the GPU with matrixMulSharedMemory and
 * prints the top-left corner of the result.
 *
 * A is filled with 1 and B with 2, so every element of C is expected to be 2 * N.
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main() {
    const int N = 32;  // Size of the matrix (NxN)
    // Widen before multiplying so N * N cannot overflow int for large N
    const size_t size = (size_t)N * N * sizeof(float);

    // Allocate memory on the host
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        std::cerr << "Host allocation failed" << std::endl;
        free(h_A);  // free(NULL) is a no-op, so this is safe for partial failure
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Initialize matrices A and B with constant values (A = 1, B = 2)
    for (size_t i = 0; i < (size_t)N * N; ++i) {
        h_A[i] = 1.0f;
        h_B[i] = 2.0f;
    }

    // Allocate memory on the device
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCuda(cudaMalloc((void**)&d_A, size), "cudaMalloc(d_A) failed");
    checkCuda(cudaMalloc((void**)&d_B, size), "cudaMalloc(d_B) failed");
    checkCuda(cudaMalloc((void**)&d_C, size), "cudaMalloc(d_C) failed");

    // Copy matrices A and B to device memory
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(A, host to device) failed");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(B, host to device) failed");

    // Define grid and block dimensions. The block count is rounded up so the
    // grid covers the whole output; it equals N / TILE_SIZE when N is a
    // multiple of TILE_SIZE.
    const unsigned int blocksPerDim = (N + TILE_SIZE - 1) / TILE_SIZE;
    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 numBlocks(blocksPerDim, blocksPerDim);

    // Launch the kernel
    matrixMulSharedMemory<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C, N);

    // A launch does not return a status: configuration errors are reported by
    // cudaGetLastError(), execution errors by the following synchronization.
    checkCuda(cudaGetLastError(), "CUDA Kernel Launch Error");
    checkCuda(cudaDeviceSynchronize(), "CUDA Kernel Execution Error");

    // Copy result matrix C back to host memory
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy(C, device to host) failed");

    // Print the top-left corner (at most 5 x 5) of the result for verification.
    // Only the printed rows emit a newline, so no blank lines follow the preview.
    const int preview = (N < 5) ? N : 5;
    for (int i = 0; i < preview; i++) {
        for (int j = 0; j < preview; j++) {
            std::cout << h_C[(size_t)i * N + j] << " ";
        }
        std::cout << std::endl;
    }

    // Free memory
    free(h_A);
    free(h_B);
    free(h_C);
    checkCuda(cudaFree(d_A), "cudaFree(d_A) failed");
    checkCuda(cudaFree(d_B), "cudaFree(d_B) failed");
    checkCuda(cudaFree(d_C), "cudaFree(d_C) failed");

    return 0;
}


Writing mult.cu


In [None]:
!nvcc -gencode=arch=compute_75,code=sm_75 -o mult mult.cu

In [None]:

!nvprof ./mult

==608== NVPROF is profiling process 608, command: ./mult
64 64 64 64 64 
64 64 64 64 64 
64 64 64 64 64 
64 64 64 64 64 
64 64 64 64 64 



























==608== Profiling application: ./mult
==608== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   52.58%  5.2150us         1  5.2150us  5.2150us  5.2150us  matrixMulSharedMemory(float*, float*, float*, int)
                   24.52%  2.4320us         2  1.2160us     992ns  1.4400us  [CUDA memcpy HtoD]
                   22.91%  2.2720us         1  2.2720us  2.2720us  2.2720us  [CUDA memcpy DtoH]
      API calls:   99.76%  186.34ms         3  62.112ms  3.4990us  186.33ms  cudaMalloc
                    0.07%  136.18us       114  1.1940us     105ns  58.062us  cuDeviceGetAttribute
                    0.06%  115.45us         3  38.483us  5.5650us  99.516us  cudaFree
                    0.06%  111.50us         1  111.50us  111.50us  111.50us  cudaLaunchKernel
    

In [None]:
!ncu --version

NVIDIA (R) Nsight Compute Command Line Profiler
Copyright (c) 2018-2024 NVIDIA Corporation
Version 2024.2.1.0 (build 34372528) (public-release)


In [None]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Jun__6_02:18:23_PDT_2024
Cuda compilation tools, release 12.5, V12.5.82
Build cuda_12.5.r12.5/compiler.34385749_0


In [1]:
!lscpu

Architecture:             x86_64
  CPU op-mode(s):         32-bit, 64-bit
  Address sizes:          46 bits physical, 48 bits virtual
  Byte Order:             Little Endian
CPU(s):                   2
  On-line CPU(s) list:    0,1
Vendor ID:                GenuineIntel
  Model name:             Intel(R) Xeon(R) CPU @ 2.20GHz
    CPU family:           6
    Model:                79
    Thread(s) per core:   2
    Core(s) per socket:   1
    Socket(s):            1
    Stepping:             0
    BogoMIPS:             4399.99
    Flags:                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge m
                          ca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht sysc
                          all nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xt
                          opology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq
                           ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt
                           aes xsave avx f16c rdrand hypervisor 