In [24]:
# Refresh the package index, then install the CUDA toolkit and the Nsight tools.
# -y is passed to every install because a notebook cell cannot answer apt's
# interactive confirmation prompt.
!apt-get update
!apt-get install -y nvidia-cuda-toolkit
!sudo apt-get install -y nvidia-nsight-compute
!sudo apt-get install -y nvidia-nsight-systems

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,734 kB]
Fetched 2,991 kB in 5s (658 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.il

In [34]:
%%writefile task6.cu
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cassert>
#include <cfloat>
#include <cmath>
#include <cstdlib>
#include <iostream>
using namespace std;

// Matrix dimensions used by main(): A is M x K, B is K x N, C is M x N
#define M 512
#define K 512
#define N 512
// Edge length of the shared-memory tiles in the kernel; main() also uses it as
// the edge length of each 2D thread block.
#define TILE_SIZE 16
#define PADDED_TILE_SIZE (TILE_SIZE + 1) // Padding to avoid bank conflicts

// Tiled matrix multiplication C = A * B, staged through shared memory.
//
// Layout:  A is m x k, B is k x n, C is m x n, all row-major.
// Launch:  2D blocks of exactly TILE_SIZE x TILE_SIZE threads (the indexing
//          below uses TILE_SIZE, not blockDim), with a grid of
//          ceil(n / TILE_SIZE) x ceil(m / TILE_SIZE) blocks.
// Shared:  two static TILE_SIZE x PADDED_TILE_SIZE float tiles per block.
//
// Dimensions need not be multiples of TILE_SIZE: out-of-range tile entries are
// zero-filled and out-of-range threads skip the final store.
__global__ void matrixMultiplyShared(const float* A, const float* B, float* C, int m, int k, int n) {
    // Only the row stride is padded. Rows at index >= TILE_SIZE are never
    // accessed, so padding the outer dimension too would only waste shared memory.
    __shared__ float tileA[TILE_SIZE][PADDED_TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][PADDED_TILE_SIZE];

    const int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    const int col = blockIdx.x * TILE_SIZE + threadIdx.x;

    float sum = 0.0f;

    // Walk the shared k dimension one tile at a time.
    const int numTiles = (k + TILE_SIZE - 1) / TILE_SIZE;
    for (int t = 0; t < numTiles; t++) {
        const int aCol = t * TILE_SIZE + threadIdx.x;  // column of A loaded by this thread
        const int bRow = t * TILE_SIZE + threadIdx.y;  // row of B loaded by this thread

        // Flat offsets are formed in size_t: row * k (and friends) can exceed
        // INT_MAX for large matrices even though each factor fits in an int.
        tileA[threadIdx.y][threadIdx.x] =
            (row < m && aCol < k) ? A[(size_t)row * k + aCol] : 0.0f;
        tileB[threadIdx.y][threadIdx.x] =
            (col < n && bRow < k) ? B[(size_t)bRow * n + col] : 0.0f;

        // Every thread must finish loading before any thread reads the tiles.
        __syncthreads();

        // Loop unrolling: Unroll loop by 4 (adjust based on profiling feedback)
        #pragma unroll 4
        for (int i = 0; i < TILE_SIZE; i++) {
            sum += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];
        }

        // Every thread must finish reading before the next iteration overwrites the tiles.
        __syncthreads();
    }

    // Write the result back to global memory
    if (row < m && col < n) {
        C[(size_t)row * n + col] = sum;
    }
}

// Checks a GPU matrix product against a CPU reference.
//
// A, B and C are treated as square n x n row-major matrices. For every element
// the reference dot product is recomputed on the host and compared with C.
// On the first mismatch a diagnostic is written to stderr and the process
// exits with EXIT_FAILURE; the function returns normally when all elements agree.
//
// Tolerance: a fixed absolute floor of 1e-4 plus a rounding allowance of
// n * FLT_EPSILON * sum(|a * b|). The allowance scales with the magnitude of
// the terms, so results that differ only by accumulation rounding are accepted
// even when values are large.
void verify_result(float* A, float* B, float* C, int n) {
    const float abs_tol = 1e-4f;
    const float rel_tol = (float)n * FLT_EPSILON;

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            float ref = 0.0f;  // CPU reference value for C[i][j]
            float mag = 0.0f;  // sum of |term|, scales the rounding allowance
            for (int k = 0; k < n; k++) {
                const float prod = A[(size_t)i * n + k] * B[(size_t)k * n + j];
                ref += prod;
                mag += std::fabs(prod);
            }

            const float got = C[(size_t)i * n + j];
            const float diff = std::fabs(ref - got);

            // Written as !(diff <= tol) so that a NaN result fails the check;
            // "diff > tol" is false for NaN and would let it through.
            if (!(diff <= abs_tol + rel_tol * mag)) {
                fprintf(stderr, "Verification failed at row %d, column %d: CPU = %f, GPU = %f\n",
                        i, j, ref, got);
                exit(EXIT_FAILURE);
            }
        }
    }
}

// Evaluates a CUDA runtime call and aborts with file/line context on failure.
// Without this, a failed allocation, copy or launch goes unnoticed and the
// program carries on with invalid data.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_check_err_ = (call);                                 \
        if (cuda_check_err_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                     \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Host driver: fills A (M x K) and B (K x N) with ones, multiplies them on the
// GPU with matrixMultiplyShared, times the kernel with CUDA events, and checks
// the result against a CPU reference.
// Returns 0 on success; exits with EXIT_FAILURE on any allocation, CUDA or
// verification error.
int main() {
    // verify_result receives a single dimension and indexes all three matrices
    // with it, so the check is only meaningful for square matrices.
    static_assert(M == K && K == N, "verify_result assumes square matrices (M == K == N)");

    int m = M, k = K, n = N;

    // Widen before multiplying so the element count is not computed in int.
    size_t sizeA = (size_t)m * k * sizeof(float);
    size_t sizeB = (size_t)k * n * sizeof(float);
    size_t sizeC = (size_t)m * n * sizeof(float);
    float *h_A = (float*)malloc(sizeA);
    float *h_B = (float*)malloc(sizeB);
    float *h_C = (float*)malloc(sizeC);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < (size_t)m * k; i++) h_A[i] = 1.0f;
    for (size_t i = 0; i < (size_t)k * n; i++) h_B[i] = 1.0f;

    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void**)&d_A, sizeA));
    CUDA_CHECK(cudaMalloc((void**)&d_B, sizeB));
    CUDA_CHECK(cudaMalloc((void**)&d_C, sizeC));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice));

    // One thread per output element; the grid is rounded up to cover partial tiles.
    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 blocksPerGrid((n + TILE_SIZE - 1) / TILE_SIZE, (m + TILE_SIZE - 1) / TILE_SIZE);

    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    CUDA_CHECK(cudaEventRecord(start));
    matrixMultiplyShared<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, m, k, n);
    // A kernel launch returns nothing; configuration errors are only visible here.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(stop));
    // Blocks until the kernel has finished, and reports any error raised while it ran.
    CUDA_CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Kernel Execution Time: %f ms\n", milliseconds);

    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    CUDA_CHECK(cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost));
    verify_result(h_A, h_B, h_C, n);

    printf("Matrix Multiplication Successful and Verified!\n");

    free(h_A);
    free(h_B);
    free(h_C);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return 0;
}


Overwriting task6.cu


In [35]:
!nvcc -o task6 task6.cu

In [39]:
!nvprof ./task6

==17205== NVPROF is profiling process 17205, command: ./task6
Kernel Execution Time: 1.145632 ms
Matrix Multiplication Successful and Verified!
==17205== Profiling application: ./task6
==17205== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   79.26%  979.18us         1  979.18us  979.18us  979.18us  matrixMultiplyShared(float const *, float const *, float*, int, int, int)
                   14.20%  175.48us         2  87.741us  87.645us  87.837us  [CUDA memcpy HtoD]
                    6.54%  80.766us         1  80.766us  80.766us  80.766us  [CUDA memcpy DtoH]
      API calls:   96.84%  96.783ms         3  32.261ms  3.7950us  96.695ms  cudaMalloc
                    1.45%  1.4459ms         3  481.95us  245.74us  871.34us  cudaMemcpy
                    0.98%  982.12us         1  982.12us  982.12us  982.12us  cudaEventSynchronize
                    0.31%  313.36us         3  104.45us  38.071us  158.98us  cudaFree
  