In [58]:
%%writefile matrix_multiply_shared_memory.cu
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Added
#include <cassert>
#include <cmath>
#include <cstdlib>
#include <iostream>
using namespace std;

// Matrix dimensions used by main(): A is M x K, B is K x N, C is M x N
#define M 512
#define K 512
#define N 512
#define TILE_SIZE 16 // Edge length of the square shared-memory tile (and of the thread block)

// Tiled matrix multiplication C = A * B using shared memory.
//
// A is m x k, B is k x n and C is m x n, all stored row-major in global memory.
//
// Launch requirements:
//   - blockDim must be (TILE_SIZE, TILE_SIZE): the kernel computes its global
//     row/column from TILE_SIZE, not from blockDim.
//   - the grid must provide at least ceil(n / TILE_SIZE) blocks in x and
//     ceil(m / TILE_SIZE) blocks in y; surplus threads are masked by bounds checks.
//   - uses 2 * TILE_SIZE * TILE_SIZE floats of static shared memory per block.
//
// Global-memory offsets are computed in size_t so that matrices with more than
// 2^31 elements do not overflow 32-bit int arithmetic.
__global__ void matrixMultiplyShared(const float* A, const float* B, float* C, int m, int k, int n) {
    // Per-block staging tiles for the current slice of A and B
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    // Element of C this thread is responsible for
    const int row = blockIdx.y * TILE_SIZE + threadIdx.y;
    const int col = blockIdx.x * TILE_SIZE + threadIdx.x;

    float sum = 0.0f;

    // Walk the shared dimension k one tile at a time (ceil-div handles a partial last tile)
    const int numTiles = (k + TILE_SIZE - 1) / TILE_SIZE;
    for (int t = 0; t < numTiles; t++) {
        // Column of A / row of B that this thread loads for the current tile
        const int aCol = t * TILE_SIZE + threadIdx.x;
        const int bRow = t * TILE_SIZE + threadIdx.y;

        // Out-of-range positions are zero-filled so they contribute nothing to the sum
        if (row < m && aCol < k)
            tileA[threadIdx.y][threadIdx.x] = A[(size_t)row * k + aCol];
        else
            tileA[threadIdx.y][threadIdx.x] = 0.0f;

        if (col < n && bRow < k)
            tileB[threadIdx.y][threadIdx.x] = B[(size_t)bRow * n + col];
        else
            tileB[threadIdx.y][threadIdx.x] = 0.0f;

        __syncthreads(); // Both tiles must be fully written before any thread reads them

        // Partial dot product over this tile
        for (int i = 0; i < TILE_SIZE; i++) {
            sum += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];
        }

        __syncthreads(); // All reads must finish before the next iteration overwrites the tiles
    }

    // Only threads that map to a real element of C write a result
    if (row < m && col < n) {
        C[(size_t)row * n + col] = sum;
    }
}

// CPU reference check for a square matrix product.
//
// Recomputes A * B on the host for n x n row-major matrices and compares every
// element with the corresponding entry of C.
//
// The reference is accumulated in double and compared with a combined
// absolute + relative tolerance: a GPU result is not expected to be bit-identical
// to a sequential CPU sum (different summation order, fused multiply-add).
//
// Parameters:
//   A, B - input matrices, n * n floats each
//   C    - result to validate, n * n floats
//   n    - matrix dimension (used for rows, columns and the inner dimension)
//
// On the first element outside the tolerance (or NaN), prints the mismatch to
// stderr and aborts the process. Returns normally when everything matches.
void verify_result(float* A, float* B, float* C, int n) {
    const double kRelTol = 1e-4;
    const double kAbsTol = 1e-5;

    // for every row
    for (int i = 0; i < n; i++) {
        // for every column
        for (int j = 0; j < n; j++) {
            // dot product of row i of A with column j of B
            double ref = 0.0;
            for (int k = 0; k < n; k++) {
                ref += (double)A[(size_t)i * n + k] * (double)B[(size_t)k * n + j];
            }

            const double got = (double)C[(size_t)i * n + j];
            const double err = std::fabs(ref - got);

            // Negated <= so that a NaN in either value is reported as a mismatch
            if (!(err <= kAbsTol + kRelTol * std::fabs(ref))) {
                fprintf(stderr, "Verification failed at C[%d][%d]: expected %f, got %f\n",
                        i, j, ref, got);
                std::abort();
            }
        }
    }
}

// Evaluates a CUDA runtime call and terminates with file/line and the CUDA error
// string if it did not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

// Entry point: fills A (M x K) and B (K x N) with ones, multiplies them on the
// GPU with matrixMultiplyShared, reports the kernel time measured with CUDA
// events, and validates the result against a CPU reference.
// Returns 0 on success; exits with a failure status on any allocation or CUDA error.
int main() {
    int m = M, k = K, n = N;

    // Element counts are computed in size_t so large dimensions do not overflow int
    const size_t countA = (size_t)m * k;
    const size_t countB = (size_t)k * n;
    const size_t countC = (size_t)m * n;

    // Host allocation and initialization
    size_t sizeA = countA * sizeof(float);
    size_t sizeB = countB * sizeof(float);
    size_t sizeC = countC * sizeof(float);
    float *h_A = (float*)malloc(sizeA);
    float *h_B = (float*)malloc(sizeB);
    float *h_C = (float*)malloc(sizeC);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < countA; i++) h_A[i] = 1.0f;
    for (size_t i = 0; i < countB; i++) h_B[i] = 1.0f;

    // Device allocation
    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void**)&d_A, sizeA));
    CUDA_CHECK(cudaMalloc((void**)&d_B, sizeB));
    CUDA_CHECK(cudaMalloc((void**)&d_C, sizeC));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice));

    // One TILE_SIZE x TILE_SIZE block per output tile; ceil-div covers partial tiles
    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 blocksPerGrid((n + TILE_SIZE - 1) / TILE_SIZE, (m + TILE_SIZE - 1) / TILE_SIZE);

    // Events used to time the kernel
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Start of the timed region
    CUDA_CHECK(cudaEventRecord(start));

    // Kernel launch
    matrixMultiplyShared<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, m, k, n);
    CUDA_CHECK(cudaGetLastError()); // launches do not return errors; query them explicitly

    // End of the timed region; waiting on the stop event also reports failures during kernel execution
    CUDA_CHECK(cudaEventRecord(stop));
    CUDA_CHECK(cudaEventSynchronize(stop));

    // Elapsed time between the two events
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("Temps d'exécution du kernel : %f ms\n", milliseconds);

    // Release the events
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));

    // Copy the result back to the host
    CUDA_CHECK(cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost));

    printf("C[0][0] = %f\n", h_C[0]);

    // CPU verification: verify_result takes a single dimension, so it is only
    // meaningful when all three dimensions are equal
    if (m == n && k == n) {
        verify_result(h_A, h_B, h_C, n);
        cout << "Matrix Multiplication Successfully Calculated on GPU and Verified by the CPU" << endl;
    } else {
        fprintf(stderr, "CPU verification skipped: verify_result only supports square matrices\n");
    }

    // Cleanup
    free(h_A);
    free(h_B);
    free(h_C);
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    return 0;
}

Overwriting matrix_multiply_shared_memory.cu


In [59]:
!nvcc matrix_multiply_shared_memory.cu -o matrix_multiply_shared_memory

In [60]:
!./matrix_multiply_shared_memory

Temps d'exécution du kernel : 0.900704 ms
C[0][0] = 512.000000
Matrix Multiplication Successfully Calculated on GPU and Verified by the CPU


In [61]:
!nvprof ./matrix_multiply_shared_memory

==15722== NVPROF is profiling process 15722, command: ./matrix_multiply_shared_memory
Temps d'exécution du kernel : 0.875360 ms
C[0][0] = 512.000000
Matrix Multiplication Successfully Calculated on GPU and Verified by the CPU
==15722== Profiling application: ./matrix_multiply_shared_memory
==15722== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   73.36%  742.38us         1  742.38us  742.38us  742.38us  matrixMultiplyShared(float const *, float const *, float*, int, int, int)
                   17.36%  175.64us         2  87.821us  87.325us  88.318us  [CUDA memcpy HtoD]
                    9.29%  93.981us         1  93.981us  93.981us  93.981us  [CUDA memcpy DtoH]
      API calls:   96.15%  70.824ms         3  23.608ms  3.3880us  70.750ms  cudaMalloc
                    1.90%  1.4011ms         3  467.03us  240.86us  830.41us  cudaMemcpy
                    1.01%  745.13us         1  745.13us  745.13us  745.13us  cud