In [None]:
!apt-get update
!apt-get install -y nvidia-cuda-toolkit

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,113 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,482 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://security.

In [43]:
%%writefile Task4.cu
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>

// Added includes
#include <cassert>
#include <iostream>
using namespace std;

// Matrix dimensions: A is M x K, B is K x N, C is M x N
#define M 512
#define K 512
#define N 512
#define TILE_SIZE 16 // Taille d'un bloc partagé
#define PADDED_TILE_SIZE (TILE_SIZE + 1) // Add padding to prevent bank conflicts

// Shared-memory tiled matrix multiplication: C = A * B.
//
// A is read as an m x k row-major matrix, B as k x n, and C is written as
// m x n. Each thread produces at most one element of C. The indexing below
// uses TILE_SIZE for both the block-to-matrix mapping and the tile walk, so
// the kernel is meant to be launched with TILE_SIZE x TILE_SIZE thread blocks.
// Dimensions that are not multiples of TILE_SIZE are handled by zero-filling
// the out-of-range tile entries and by guarding the final store.
__global__ void matrixMultiplyShared(const float* A, const float* B, float* C, int m, int k, int n) {
    // Per-block staging tiles, declared with one extra (padding) row/column.
    __shared__ float tileA[PADDED_TILE_SIZE][PADDED_TILE_SIZE];
    __shared__ float tileB[PADDED_TILE_SIZE][PADDED_TILE_SIZE];

    // Position of this thread inside its block.
    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;

    // Element of C this thread is responsible for.
    int row = blockIdx.y * TILE_SIZE + ty;
    int col = blockIdx.x * TILE_SIZE + tx;

    // Number of tiles needed to cover the shared dimension k (ceil-div).
    const int numTiles = (k + TILE_SIZE - 1) / TILE_SIZE;

    float sum = 0.0f;

    for (int t = 0; t < numTiles; ++t) {
        // Column of A / row of B that this thread stages for the current tile.
        const unsigned int aCol = t * TILE_SIZE + tx;
        const unsigned int bRow = t * TILE_SIZE + ty;

        // Out-of-range entries are zero so they add nothing to the sum.
        tileA[ty][tx] = (row < m && aCol < k) ? A[row * k + aCol] : 0.0f;
        tileB[ty][tx] = (col < n && bRow < k) ? B[bRow * n + col] : 0.0f;

        // Every thread must have finished writing before any thread reads.
        __syncthreads();

        // Partial dot product over this tile, accumulated in ascending order.
        const float* rowOfA = tileA[ty];
        for (int e = 0; e != TILE_SIZE; ++e) {
            sum += rowOfA[e] * tileB[e][tx];
        }

        // Keep the tiles intact until every thread is done reading them.
        __syncthreads();
    }

    // Threads mapped outside C have nothing to store.
    if (row >= m || col >= n) {
        return;
    }
    C[row * n + col] = sum;
}

// CPU reference check for a matrix product C = A * B.
//
// Recomputes every element of the product on the host and compares it with
// the value stored in C. On the first mismatch a diagnostic is written to
// stderr and the process exits with EXIT_FAILURE; if all elements match the
// function simply returns.
//
// Parameters:
//   A, B, C - row-major host arrays: A is rows x inner, B is inner x n,
//             C is rows x n.
//   n       - number of columns of B and C.
//   rows    - number of rows of A and C; a negative value (the default)
//             means "same as n".
//   inner   - shared dimension of A and B; a negative value (the default)
//             means "same as n".
// Calling verify_result(A, B, C, n) therefore checks square n x n matrices.
void verify_result(float* A, float* B, float* C, int n, int rows = -1, int inner = -1) {
    if (rows < 0) rows = n;
    if (inner < 0) inner = n;

    // Mixed tolerance: the absolute term covers results near zero, the
    // relative term covers large results where one float ulp exceeds 1e-4.
    const float abs_tol = 1e-4f;
    const float rel_tol = 1e-4f;

    // for every row
    for (int i = 0; i < rows; i++) {
        // for every column
        for (int j = 0; j < n; j++) {
            // dot product of row i of A with column j of B
            float tmp = 0.0f;
            for (int p = 0; p < inner; p++) {
                // size_t indexing so large matrices do not overflow int
                tmp += A[(size_t)i * inner + p] * B[(size_t)p * n + j];
            }

            const float gpu = C[(size_t)i * n + j];
            const float diff = fabs(tmp - gpu);
            const float tol = abs_tol + rel_tol * fabs(tmp);

            // "!(diff <= tol)" instead of "diff > tol" so that a NaN in the
            // GPU result is reported as a failure rather than silently passing;
            // the "tmp != gpu" test lets exactly equal values (including
            // matching infinities) through.
            if (tmp != gpu && !(diff <= tol)) {
                fprintf(stderr, "Verification failed at row %d, column %d: CPU = %f, GPU = %f\n",
                        i, j, tmp, gpu);
                exit(EXIT_FAILURE);
            }
        }
    }
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Program entry point.
//
// Builds an M x K matrix A and a K x N matrix B filled with ones on the host,
// multiplies them on the GPU with matrixMultiplyShared, times the kernel with
// CUDA events, copies the result back, prints C[0][0] and checks the product
// against a CPU reference. Returns 0 on success; any allocation or CUDA
// failure aborts with a message on stderr.
int main() {
    int m = M, k = K, n = N;

    // Element counts and byte sizes are computed in size_t so the products
    // cannot overflow int for large dimensions.
    size_t countA = (size_t)m * k;
    size_t countB = (size_t)k * n;
    size_t countC = (size_t)m * n;
    size_t sizeA = countA * sizeof(float);
    size_t sizeB = countB * sizeof(float);
    size_t sizeC = countC * sizeof(float);

    // Host allocation and initialization
    float *h_A = (float*)malloc(sizeA);
    float *h_B = (float*)malloc(sizeB);
    float *h_C = (float*)malloc(sizeC);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < countA; i++) h_A[i] = 1.0f; // Initialize A with all 1s
    for (size_t i = 0; i < countB; i++) h_B[i] = 1.0f; // Initialize B with all 1s

    // Device allocation
    float *d_A, *d_B, *d_C;
    checkCuda(cudaMalloc((void**)&d_A, sizeA), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, sizeB), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, sizeC), "cudaMalloc(d_C)");

    checkCuda(cudaMemcpy(d_A, h_A, sizeA, cudaMemcpyHostToDevice), "copy of A to device");
    checkCuda(cudaMemcpy(d_B, h_B, sizeB, cudaMemcpyHostToDevice), "copy of B to device");

    // Launch configuration: one thread per element of C, ceil-div so that
    // dimensions that are not multiples of TILE_SIZE are still covered.
    dim3 threadsPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 blocksPerGrid((n + TILE_SIZE - 1) / TILE_SIZE, (m + TILE_SIZE - 1) / TILE_SIZE);

    // Events used to time the kernel
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Start of the measurement
    checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");

    // Kernel launch
    matrixMultiplyShared<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, m, k, n);
    // A launch does not return a status itself; a bad configuration is only
    // visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "kernel launch");

    // End of the measurement; waiting on the stop event also waits for the kernel.
    checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "kernel execution");

    // Elapsed time between the two events
    float milliseconds = 0;
    checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");
    printf("Temps d'exécution du kernel : %f ms\n", milliseconds);

    // Event cleanup
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    // Copy the result back to the host
    checkCuda(cudaMemcpy(h_C, d_C, sizeC, cudaMemcpyDeviceToHost), "copy of C to host");

    printf("C[0][0] = %f\n", h_C[0]);

    // CPU verification. verify_result is called with the single dimension n,
    // i.e. for square matrices, so it is only invoked when m, k and n agree.
    if (m == n && k == n) {
        verify_result(h_A, h_B, h_C, n);
        cout << "Matrix Multiplication Successfully Calculated on GPU and Verified by the CPU" << endl;
    } else {
        fprintf(stderr, "CPU verification skipped: matrices are not square (m=%d, k=%d, n=%d)\n",
                m, k, n);
    }

    // Release host and device memory
    free(h_A);
    free(h_B);
    free(h_C);
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");

    return 0;
}


Overwriting Task4.cu


In [44]:
!nvcc Task4.cu -o Task4

In [45]:
!./Task4

Temps d'exécution du kernel : 1.115232 ms
C[0][0] = 512.000000
Matrix Multiplication Successfully Calculated on GPU and Verified by the CPU


In [46]:
!nvprof ./Task4

==8966== NVPROF is profiling process 8966, command: ./Task4
Temps d'exécution du kernel : 1.120896 ms
C[0][0] = 512.000000
Matrix Multiplication Successfully Calculated on GPU and Verified by the CPU
==8966== Profiling application: ./Task4
==8966== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   70.65%  976.14us         1  976.14us  976.14us  976.14us  matrixMultiplyShared(float const *, float const *, float*, int, int, int)
                   14.78%  204.25us         1  204.25us  204.25us  204.25us  [CUDA memcpy DtoH]
                   14.57%  201.34us         2  100.67us  94.942us  106.40us  [CUDA memcpy HtoD]
      API calls:   98.29%  199.05ms         3  66.351ms  3.3950us  198.96ms  cudaMalloc
                    0.85%  1.7195ms         3  573.16us  275.63us  1.1067ms  cudaMemcpy
                    0.49%  984.56us         1  984.56us  984.56us  984.56us  cudaEventSynchronize
                    0.16%  333.12u