In [None]:
!nvidia-smi

Sun Nov 24 02:07:54 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!apt-get update

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [4]:
%%writefile lab4_2.cu
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cassert>
#include <iostream>
using namespace std;

#define BLOCK_SIZE 32 // Taille du bloc pour une bonne utilisation de la mémoire partagée

// Kernel sans padding
__global__ void matrixTransposeNoPadding(float* input, float* output, int width, int height) {
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE];

    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    if (x < width && y < height) {
        tile[threadIdx.y][threadIdx.x] = input[y * width + x];
    }

    __syncthreads();

    x = blockIdx.y * BLOCK_SIZE + threadIdx.x;
    y = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    if (x < height && y < width) {
        output[y * height + x] = tile[threadIdx.x][threadIdx.y];
    }
}

// Kernel avec padding
__global__ void matrixTransposeWithPadding(float* input, float* output, int width, int height) {
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE + 1]; // Ajout d'un padding pour éviter les conflits de banque

    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;

    if (x < width && y < height) {
        tile[threadIdx.y][threadIdx.x] = input[y * width + x];
    }
    __syncthreads();

    x = blockIdx.y * BLOCK_SIZE + threadIdx.x;
    y = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    if (x < height && y < width) {
        output[y * height + x] = tile[threadIdx.x][threadIdx.y];
    }
}

// Kernel de réduction parallèle optimisée
__global__ void reduceOptimized(const float* input, float* output, int n) {
    __shared__ double sharedMem[BLOCK_SIZE]; // Mémoire partagée en double précision pour minimiser les erreurs

    int tid = threadIdx.x;                // Identifiant du thread dans le bloc
    int globalIdx = blockIdx.x * blockDim.x + threadIdx.x; // Index global

    // Charger les données dans la mémoire partagée
    sharedMem[tid] = (globalIdx < n) ? static_cast<double>(input[globalIdx]) : 0.0;
    __syncthreads();

    // Réduction dans la mémoire partagée
    for (int stride = blockDim.x / 2; stride > 32; stride /= 2) {
        if (tid < stride) {
            sharedMem[tid] += sharedMem[tid + stride];
        }
        __syncthreads();
    }

    // Réduction finale dans un warp
    if (tid < 32) {
        double val = sharedMem[tid];
        for (int offset = 16; offset > 0; offset /= 2) {
            val += __shfl_down_sync(0xffffffff, val, offset);
        }
        sharedMem[tid] = val;
    }

    // Le thread 0 écrit le résultat final
    if (tid == 0) {
        output[blockIdx.x] = static_cast<float>(sharedMem[0]);
    }
}

void transposeVerifyCPU(float* input, float* output, int width, int height) {
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            assert(output[y * width + x] == input[x * height + y]);
        }
    }
}

int main() {
    int width = 1024;
    int height = 1024;

    size_t size = width * height * sizeof(float);

    float* h_input = (float*)malloc(size);
    float* h_output = (float*)malloc(size);

    // Remplir la matrice avec des valeurs arbitraires
    for (int i = 0; i < width * height; i++) {
        h_input[i] = static_cast<float>(i);
    }

    float *d_input, *d_output;
    cudaMalloc(&d_input, size);
    cudaMalloc(&d_output, size);

    cudaMemcpy(d_input, h_input, size, cudaMemcpyHostToDevice);

    dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridSize((width + BLOCK_SIZE - 1) / BLOCK_SIZE, (height + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Profiler et comparer les deux implémentations
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Transposition sans padding
    cudaEventRecord(start);
    matrixTransposeNoPadding<<<gridSize, blockSize>>>(d_input, d_output, width, height);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float noPaddingTime;
    cudaEventElapsedTime(&noPaddingTime, start, stop);
    printf("Temps sans padding : %f ms\n", noPaddingTime);
    cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost);
    transposeVerifyCPU(h_input, h_output, width, height);

    // Transposition avec padding
    cudaEventRecord(start);
    matrixTransposeWithPadding<<<gridSize, blockSize>>>(d_input, d_output, width, height);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float withPaddingTime;
    cudaEventElapsedTime(&withPaddingTime, start, stop);
    printf("Temps avec padding : %f ms\n", withPaddingTime);
    cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost);
    transposeVerifyCPU(h_input, h_output, width, height);

    // Réduction parallèle optimisée
    int n = 1024;
    size_t reductionSize = n * sizeof(float);
    float* h_reductionInput = (float*)malloc(reductionSize);
    float* h_reductionOutput = (float*)malloc(reductionSize / BLOCK_SIZE);

    for (int i = 0; i < n; i++) {
        h_reductionInput[i] = 1.0f; // Exemple : Tableau rempli avec des 1
    }

    float *d_reductionInput, *d_reductionOutput;
    cudaMalloc(&d_reductionInput, reductionSize);
    cudaMalloc(&d_reductionOutput, reductionSize / BLOCK_SIZE);

    cudaMemcpy(d_reductionInput, h_reductionInput, reductionSize, cudaMemcpyHostToDevice);

    dim3 reductionBlockSize(BLOCK_SIZE);
    dim3 reductionGridSize((n + BLOCK_SIZE - 1) / BLOCK_SIZE);

    cudaEventRecord(start);
    reduceOptimized<<<reductionGridSize, reductionBlockSize>>>(d_reductionInput, d_reductionOutput, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float reductionTime;
    cudaEventElapsedTime(&reductionTime, start, stop);
    printf("Temps de réduction optimisée : %f ms\n", reductionTime);
    cudaMemcpy(h_reductionOutput, d_reductionOutput, reductionSize / BLOCK_SIZE, cudaMemcpyDeviceToHost);

    float reductionSum = 0;
    for (int i = 0; i < reductionGridSize.x; i++) {
        reductionSum += h_reductionOutput[i];
    }
    cout << "ReductionSum of GPU : " << reductionSum << endl;

    float reductionSumCPU = 0;
    for (int i = 0; i < n; i++) {
        reductionSumCPU += h_reductionInput[i];
    }
    cout << "Reduction of CPU : " << reductionSumCPU << endl;

    // Nettoyage
    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_reductionInput);
    cudaFree(d_reductionOutput);
    free(h_input);
    free(h_output);
    free(h_reductionInput);
    free(h_reductionOutput);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    return 0;
}


Writing lab4_2.cu


In [5]:
!nvcc -arch=sm_75 -gencode=arch=compute_75,code=sm_75 lab4_2.cu -o lab4_2


In [6]:
!./lab4_2

Temps sans padding : 0.209632 ms
Temps avec padding : 0.094400 ms
Temps de réduction optimisée : 0.032704 ms
ReductionSum of GPU : 1024
Reduction of CPU : 1024


In [7]:
!nvprof ./lab4_2

==3764== NVPROF is profiling process 3764, command: ./lab4_2
Temps sans padding : 0.341536 ms
Temps avec padding : 0.115264 ms
Temps de réduction optimisée : 0.053728 ms
ReductionSum of GPU : 1024
Reduction of CPU : 1024
==3764== Profiling application: ./lab4_2
==3764== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   48.98%  2.7054ms         3  901.81us  1.6640us  2.0390ms  [CUDA memcpy DtoH]
                   47.38%  2.6170ms         2  1.3085ms  1.4720us  2.6156ms  [CUDA memcpy HtoD]
                    2.44%  134.91us         1  134.91us  134.91us  134.91us  matrixTransposeNoPadding(float*, float*, int, int)
                    1.13%  62.207us         1  62.207us  62.207us  62.207us  matrixTransposeWithPadding(float*, float*, int, int)
                    0.08%  4.2560us         1  4.2560us  4.2560us  4.2560us  reduceOptimized(float const *, float*, int)
      API calls:   95.77%  210.37ms         4  52.593ms  1