# **Laboratoire 2 de CEG 4536**


# Tache 3

In [46]:
%%writefile Tache3_implementation.cu
#include <stdio.h>
#include <cuda.h>

// D√©claration pr√©alable de reduction_kernel
__global__ void reduction_kernel(int *input, int *output, int n);

__device__ void nested_reduction(int *output, int grid_size) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        reduction_kernel<<<1, grid_size, grid_size * sizeof(int)>>>(output, output, grid_size);
    }
}

__global__ void reduction_kernel(int *input, int *output, int n) {
    extern __shared__ int sdata[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < n) ? input[i] : 0;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];

    // Appel de la r√©duction imbriqu√©e si n√©cessaire
    if (blockIdx.x == 0) nested_reduction(output, gridDim.x);
}

int main() {
    const int n = 1024;
    int *h_input, *h_output, *d_input, *d_output;

    h_input = (int*)malloc(n * sizeof(int));
    h_output = (int*)malloc(sizeof(int));

    for (int i = 0; i < n; ++i) h_input[i] = 1;

    cudaMalloc((void**)&d_input, n * sizeof(int));
    cudaMalloc((void**)&d_output, sizeof(int));

    cudaMemcpy(d_input, h_input, n * sizeof(int), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    // D√©marrer l'√©v√©nement pour mesurer le temps
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    // Lancer le kernel
    reduction_kernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_input, d_output, n);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    cudaMemcpy(h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost);

    printf("Sum of array elements with nested execution: %d\n", *h_output);
    printf("Kernel Execution Time: %f ms\n", milliseconds);

    free(h_input);
    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}




Overwriting Tache3_implementation.cu


In [30]:
!nvcc -dc Tache3_implementation.cu -o Tache3_implementation.o

In [31]:
!nvcc -o Tache3_implementation Tache3_implementation.o


In [32]:
!./Tache3_implementation

Sum of array elements with nested execution: 1024
Kernel Execution Time: 77.724510 ms


In [33]:
!nvprof ./Tache3_implementation


==8222== NVPROF is profiling process 8222, command: ./Tache3_implementation
Sum of array elements with nested execution: 1024
Kernel Execution Time: 82.947037 ms
==8222== Profiling application: ./Tache3_implementation
==8222== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   91.32%  39.744us         1  39.744us  39.744us  39.744us  reduction_kernel(int*, int*, int)
                    5.37%  2.3360us         1  2.3360us  2.3360us  2.3360us  [CUDA memcpy DtoH]
                    3.31%  1.4400us         1  1.4400us  1.4400us  1.4400us  [CUDA memcpy HtoD]
      API calls:   67.33%  171.73ms         2  85.864ms  4.2580us  171.72ms  cudaMalloc
                   26.69%  68.081ms         1  68.081ms  68.081ms  68.081ms  cudaEventSynchronize
                    5.83%  14.870ms         1  14.870ms  14.870ms  14.870ms  cudaLaunchKernel
                    0.05%  132.59us       114  1.1630us     144ns  52.363us  cuDeviceGetAt

# **Impl√©mentation du mod√®le avec une reduction parall√®le sans une execution imbriqu√©**

In [34]:
%%writefile Tache3_SansExeImbriq.cu
#include <stdio.h>
#include <cuda.h>

__global__ void reduction_kernel(int *input, int *output, int n) {
    extern __shared__ int sdata[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < n) ? input[i] : 0;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];
}

int main() {
    const int n = 1024;
    int *h_input, *h_output, *d_input, *d_output;

    h_input = (int*)malloc(n * sizeof(int));
    h_output = (int*)malloc(sizeof(int) * ((n + 255) / 256));  // pour les r√©sultats interm√©diaires

    for (int i = 0; i < n; ++i) h_input[i] = 1;

    cudaMalloc((void**)&d_input, n * sizeof(int));
    cudaMalloc((void**)&d_output, sizeof(int) * ((n + 255) / 256));

    cudaMemcpy(d_input, h_input, n * sizeof(int), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    reduction_kernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_input, d_output, n);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Copier les r√©sultats interm√©diaires vers l'h√¥te
    cudaMemcpy(h_output, d_output, sizeof(int) * blocksPerGrid, cudaMemcpyDeviceToHost);

    // R√©duction finale sur le CPU
    int total_sum = 0;
    for (int i = 0; i < blocksPerGrid; ++i) {
        total_sum += h_output[i];
    }

    printf("Sum of array elements without nested execution: %d\n", total_sum);
    printf("Kernel Execution Time without nested execution: %f ms\n", milliseconds);

    free(h_input);
    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}


Overwriting Tache3_SansExeImbriq.cu


In [35]:
!nvcc Tache3_SansExeImbriq.cu -o Tache3_SansExeImbriq


In [36]:
!./Tache3_SansExeImbriq

Sum of array elements without nested execution: 1024
Kernel Execution Time without nested execution: 0.229792 ms


In [37]:
!nvprof ./Tache3_SansExeImbriq

==8357== NVPROF is profiling process 8357, command: ./Tache3_SansExeImbriq
Sum of array elements without nested execution: 1024
Kernel Execution Time without nested execution: 0.174976 ms
==8357== Profiling application: ./Tache3_SansExeImbriq
==8357== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   59.09%  4.9910us         1  4.9910us  4.9910us  4.9910us  reduction_kernel(int*, int*, int)
                   24.25%  2.0480us         1  2.0480us  2.0480us  2.0480us  [CUDA memcpy DtoH]
                   16.67%  1.4080us         1  1.4080us  1.4080us  1.4080us  [CUDA memcpy HtoD]
      API calls:   99.69%  180.84ms         2  90.418ms  4.0340us  180.83ms  cudaMalloc
                    0.10%  184.06us       114  1.6140us     137ns  94.302us  cuDeviceGetAttribute
                    0.09%  167.75us         1  167.75us  167.75us  167.75us  cudaLaunchKernel
                    0.06%  101.26us         2  50.630us  7.4390us

# **Observation**
L'ex√©cution imbriqu√©e des kernels entra√Æne une surcharge significative, ce qui ralentit le traitement pour les petites tailles de tableau, comme 1024. Sans ex√©cution imbriqu√©e, le kernel s'ex√©cute efficacement en 4.96 ¬µs, alors qu'avec ex√©cution imbriqu√©e, il faut environ 39.615 ¬µs, principalement en raison de la gestion et de la synchronisation des kernels.
Les appels API, notamment **cudaEventSynchronize** et **cudaMalloc**, consomment aussi plus de temps avec l'ex√©cution imbriqu√©e. Bien que cela soit inefficace pour les petits tableaux, cette approche pourrait √™tre b√©n√©fique pour de grandes tailles de tableau, permettant d'exploiter un parall√©lisme accru.

# **Impl√©mentation du mod√®le avec une reduction parall√®le en utilisant un tableau de plus grande taille**

**soit la taille d'√©l√©ment n = 33687

In [62]:
%%writefile Tache3_Tab_de_grande_taille.cu
#include <stdio.h>
#include <cuda.h>

// D√©claration pr√©alable de reduction_kernel
__global__ void reduction_kernel(int *input, int *output, int n);

__device__ void nested_reduction(int *output, int grid_size) {
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        reduction_kernel<<<1, grid_size, grid_size * sizeof(int)>>>(output, output, grid_size);
    }
}

__global__ void reduction_kernel(int *input, int *output, int n) {
    extern __shared__ int sdata[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < n) ? input[i] : 0;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];

    // Appel de la r√©duction imbriqu√©e si n√©cessaire
    if (blockIdx.x == 0) nested_reduction(output, gridDim.x);
}

int main() {
    const int n = 33687;
    int *h_input, *h_output, *d_input, *d_output;

    h_input = (int*)malloc(n * sizeof(int));
    h_output = (int*)malloc(sizeof(int));

    for (int i = 0; i < n; ++i) h_input[i] = 1;

    cudaMalloc((void**)&d_input, n * sizeof(int));
    cudaMalloc((void**)&d_output, sizeof(int));

    cudaMemcpy(d_input, h_input, n * sizeof(int), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    // D√©marrer l'√©v√©nement pour mesurer le temps
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    // Lancer le kernel
    reduction_kernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_input, d_output, n);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    cudaMemcpy(h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost);

    printf("Sum of array elements with nested execution: %d\n", *h_output);
    printf("Kernel Execution Time: %f ms\n", milliseconds);

    free(h_input);
    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}




Overwriting Tache3_Tab_de_grande_taille.cu


In [63]:
!nvcc -dc Tache3_Tab_de_grande_taille.cu -o Tache3_Tab_de_grande_taille.o

In [64]:
!nvcc -o Tache3_Tab_de_grande_taille Tache3_Tab_de_grande_taille.o

In [65]:
!./Tache3_Tab_de_grande_taille

Sum of array elements with nested execution: 32768
Kernel Execution Time: 78.051933 ms


In [66]:
!nvprof ./Tache3_Tab_de_grande_taille

==11017== NVPROF is profiling process 11017, command: ./Tache3_Tab_de_grande_taille
Sum of array elements with nested execution: 32768
Kernel Execution Time: 81.197601 ms
==11017== Profiling application: ./Tache3_Tab_de_grande_taille
==11017== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   72.15%  40.959us         1  40.959us  40.959us  40.959us  reduction_kernel(int*, int*, int)
                   24.24%  13.760us         1  13.760us  13.760us  13.760us  [CUDA memcpy HtoD]
                    3.61%  2.0480us         1  2.0480us  2.0480us  2.0480us  [CUDA memcpy DtoH]
      API calls:   68.99%  181.59ms         2  90.797ms  4.9870us  181.59ms  cudaMalloc
                   25.81%  67.938ms         1  67.938ms  67.938ms  67.938ms  cudaEventSynchronize
                    5.04%  13.263ms         1  13.263ms  13.263ms  13.263ms  cudaLaunchKernel
                    0.06%  163.33us         2  81.666us  16.726us  146.61

# **Impl√©mentation du mod√®le avec une reduction parall√®le en utilisant un tableau de plus grande taille* sans execution imbriquee*


In [67]:
%%writefile Tache3_TabDeGrandeTaille_SansExeImbriq.cu
#include <stdio.h>
#include <cuda.h>

__global__ void reduction_kernel(int *input, int *output, int n) {
    extern __shared__ int sdata[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    sdata[tid] = (i < n) ? input[i] : 0;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if (tid == 0) output[blockIdx.x] = sdata[0];
}

int main() {
    const int n = 33687;
    int *h_input, *h_output, *d_input, *d_output;

    h_input = (int*)malloc(n * sizeof(int));
    h_output = (int*)malloc(sizeof(int) * ((n + 255) / 256));  // pour les r√©sultats interm√©diaires

    for (int i = 0; i < n; ++i) h_input[i] = 1;

    cudaMalloc((void**)&d_input, n * sizeof(int));
    cudaMalloc((void**)&d_output, sizeof(int) * ((n + 255) / 256));

    cudaMemcpy(d_input, h_input, n * sizeof(int), cudaMemcpyHostToDevice);

    int threadsPerBlock = 256;
    int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    reduction_kernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_input, d_output, n);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);

    // Copier les r√©sultats interm√©diaires vers l'h√¥te
    cudaMemcpy(h_output, d_output, sizeof(int) * blocksPerGrid, cudaMemcpyDeviceToHost);

    // R√©duction finale sur le CPU
    int total_sum = 0;
    for (int i = 0; i < blocksPerGrid; ++i) {
        total_sum += h_output[i];
    }

    printf("Sum of array elements without nested execution: %d\n", total_sum);
    printf("Kernel Execution Time without nested execution: %f ms\n", milliseconds);

    free(h_input);
    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}


Overwriting Tache3_TabDeGrandeTaille_SansExeImbriq.cu


In [68]:
!nvcc Tache3_TabDeGrandeTaille_SansExeImbriq.cu -o Tache3_TabDeGrandeTaille_SansExeImbriq

In [69]:
!./Tache3_TabDeGrandeTaille_SansExeImbriq

Sum of array elements without nested execution: 33687
Kernel Execution Time without nested execution: 0.168896 ms


In [70]:
!nvprof ./Tache3_TabDeGrandeTaille_SansExeImbriq

==11158== NVPROF is profiling process 11158, command: ./Tache3_TabDeGrandeTaille_SansExeImbriq
Sum of array elements without nested execution: 33687
Kernel Execution Time without nested execution: 0.178848 ms
==11158== Profiling application: ./Tache3_TabDeGrandeTaille_SansExeImbriq
==11158== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   60.39%  13.952us         1  13.952us  13.952us  13.952us  [CUDA memcpy HtoD]
                   30.61%  7.0720us         1  7.0720us  7.0720us  7.0720us  reduction_kernel(int*, int*, int)
                    9.01%  2.0810us         1  2.0810us  2.0810us  2.0810us  [CUDA memcpy DtoH]
      API calls:   99.72%  192.16ms         2  96.082ms  4.7930us  192.16ms  cudaMalloc
                    0.09%  168.56us         1  168.56us  168.56us  168.56us  cudaLaunchKernel
                    0.07%  139.17us       114  1.2200us     131ns  52.745us  cuDeviceGetAttribute
                    0.05

# **Observation et resultats**

L'ex√©cution imbriqu√©e des kernels entra√Æne une surcharge importante, avec un temps d'ex√©cution nettement sup√©rieur pour les petites tailles de tableau, comme 1024, par rapport √† la version sans ex√©cution imbriqu√©e. Cette surcharge est principalement due aux besoins accrus en synchronisation et gestion des appels de kernels. Bien que l'ex√©cution imbriqu√©e permette un parall√©lisme suppl√©mentaire, elle n‚Äôest pas avantageuse pour les petites tailles de tableau et pourrait n'√™tre justifi√©e que pour des tailles beaucoup plus grandes, o√π elle compenserait les frais de gestion.

# Tache 4

In [None]:
!nvidia-smi

Tue Nov  5 05:32:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!apt-get update

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [59.5 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,107 kB]
Get:7 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,605 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,389 kB]
Hit:12 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
H

# **Partie 1** :  Profiling

In [None]:
%%writefile Tache4_Optimisation1.cu
#include <stdio.h>
#include <cuda.h>

__global__ void optimizewithKernel(int *input, int *output, int size) {
    extern __shared__ int sharedData[];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Charge les √©l√©ments dans la m√©moire partag√©e
    if (idx < size) {
        sharedData[tid] = input[idx];
    } else {
        sharedData[tid] = 0;
    }
    __syncthreads();

    // R√©duction parall√®le
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sharedData[tid] += sharedData[tid + stride];
        }
        __syncthreads();
    }

    // Le premier thread de chaque bloc stocke le r√©sultat
    if (tid == 0) {
        output[blockIdx.x] = sharedData[0];
    }
}

int main() {
    const int size = 1024;
    int *h_input, *h_output, *d_in, *d_out;
    h_input = (int*)malloc(size * sizeof(int));
    h_output = (int*)malloc(sizeof(int));

    // Initialisation des donn√©es
    for (int i = 0; i < size; i++) {
        h_input[i] = 1;
    }

    // Allocation de la m√©moire sur le GPU
    cudaMalloc(&d_in, size * sizeof(int));
    cudaMalloc(&d_out, sizeof(int));

    // Copie des donn√©es de l'h√¥te vers le GPU
    cudaMemcpy(d_in, h_input, size * sizeof(int), cudaMemcpyHostToDevice);

    // Lancer le kernel
    optimizewithKernel<<<4, 256, 256 * sizeof(int)>>>(d_in, d_out, size);

    // Copie du r√©sultat du GPU vers l'h√¥te
    cudaMemcpy(h_output, d_out, sizeof(int), cudaMemcpyDeviceToHost);

    // Affichage du r√©sultat
    printf("Sum: %d\n", *h_output);

    // Lib√©ration de la m√©moire
    cudaFree(d_in);
    cudaFree(d_out);
    free(h_input);
    free(h_output);

    return 0;
}


Writing Tache4_Optimisation1.cu


In [None]:
!nvcc Tache4_Optimisation1.cu -o Tache4_Optimisation1

In [None]:
!nvprof ./Tache4_Optimisation1

==1750== NVPROF is profiling process 1750, command: ./Tache4_Optimisation1
Sum: 256
==1750== Profiling application: ./Tache4_Optimisation1
==1750== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   58.43%  4.9920us         1  4.9920us  4.9920us  4.9920us  optimizewithKernel(int*, int*, int)
                   25.09%  2.1440us         1  2.1440us  2.1440us  2.1440us  [CUDA memcpy DtoH]
                   16.48%  1.4080us         1  1.4080us  1.4080us  1.4080us  [CUDA memcpy HtoD]
      API calls:   50.88%  117.17ms         1  117.17ms  117.17ms  117.17ms  cudaLaunchKernel
                   48.95%  112.72ms         2  56.361ms  4.7850us  112.72ms  cudaMalloc
                    0.07%  159.46us         2  79.729us  14.392us  145.07us  cudaFree
                    0.06%  137.57us       114  1.2060us     145ns  55.073us  cuDeviceGetAttribute
                    0.02%  51.727us         2  25.863us  21.225us  30.502us  cuda

# **Partie 2** : Optimisation pour maximiser l'occupation des warps et minimiser les latences

In [None]:
%%writefile Tache4_maximisation.cu
#include <stdio.h>
#include <cuda.h>

__global__ void optimizedKernelLatency(int *input, int *output, int size) {
    extern __shared__ int sharedData[];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < size) {
        sharedData[tid] = input[idx];
    } else {
        sharedData[tid] = 0;
    }
    __syncthreads();

    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            sharedData[tid] += sharedData[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        output[blockIdx.x] = sharedData[0];
    }
}

int main() {
    const int size = 1024;
    int *h_input, *h_output, *d_input, *d_output;

    h_input = (int*)malloc(size * sizeof(int));
    h_output = (int*)malloc(sizeof(int));

    for (int i = 0; i < size; i++) {
        h_input[i] = 1;
    }

    cudaMalloc(&d_input, size * sizeof(int));
    cudaMalloc(&d_output, sizeof(int));

    cudaMemcpy(d_input, h_input, size * sizeof(int), cudaMemcpyHostToDevice);

    int threadsPerBlock = 128; // Ajustement de la taille pour maximiser l'occupation
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    optimizedKernelLatency<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_input, d_output, size);

    cudaMemcpy(h_output, d_output, sizeof(int), cudaMemcpyDeviceToHost);

    printf("Sum: %d\n", *h_output);

    free(h_input);
    free(h_output);
    cudaFree(d_input);
    cudaFree(d_output);

    return 0;
}


Writing Tache4_maximisation.cu


In [None]:
!nvcc Tache4_maximisation.cu -o Tache4_maximisation

In [None]:
!nvprof ./Tache4_maximisation

==2589== NVPROF is profiling process 2589, command: ./Tache4_maximisation
Sum: 128
==2589== Profiling application: ./Tache4_maximisation
==2589== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   56.52%  4.5760us         1  4.5760us  4.5760us  4.5760us  optimizedKernelLatency(int*, int*, int)
                   26.09%  2.1120us         1  2.1120us  2.1120us  2.1120us  [CUDA memcpy DtoH]
                   17.39%  1.4080us         1  1.4080us  1.4080us  1.4080us  [CUDA memcpy HtoD]
      API calls:   71.06%  95.842ms         2  47.921ms  3.9480us  95.838ms  cudaMalloc
                   28.67%  38.672ms         1  38.672ms  38.672ms  38.672ms  cudaLaunchKernel
                    0.10%  138.57us       114  1.2150us     137ns  54.853us  cuDeviceGetAttribute
                    0.10%  130.79us         2  65.396us  11.281us  119.51us  cudaFree
                    0.05%  66.977us         2  33.488us  25.398us  41.579us  cu

# Nous pouvons souligner a partir de cette tache üá∞

**Analyse des R√©sultats et Documentation** :  L'utilisation de nvprof a r√©v√©l√© des diff√©rences notables entre la version initiale et la version optimis√©e du kernel. Le temps total d'ex√©cution sur le GPU est pass√© de 4.9600 ¬µs dans la version de base √† 4.5760 ¬µs dans la version optimis√©e, montrant une gestion plus efficace des op√©rations. De plus, le temps pour cudaMalloc a √©t√© r√©duit de 129.19 ms √† 99.406 ms, soulignant une meilleure allocation de la m√©moire. L'ajustement de la taille des blocs et l'optimisation de l'occupation des warps ont contribu√© √† diminuer la latence sans affecter les transferts de donn√©es (cudaMemcpy).

***Rapport d'Am√©lioration L'optimisation a permis plusieurs avanc√©es ***: un temps de calcul r√©duit et une meilleure r√©partition des charges ont am√©lior√© l'ex√©cution du kernel. L'alignement des tailles de blocs sur la taille du warp a maximis√© l'occupation et r√©duit les cycles inactifs, augmentant l'efficacit√© globale. L'utilisation de la m√©moire partag√©e et la synchronisation des threads ont renforc√© la scalabilit√©, permettant au programme de g√©rer de plus grandes quantit√©s de donn√©es sans d√©grader les performances.


*Pour conclure* on peut souligner que : *texte en italique*

**Pour le Profiling de la 1ere optimisation** : L'utilisation du profiling a permis d'identifier les goulets d'√©tranglement et d'√©valuer les performances pour optimiser le programme.

**le profiling a partir d'une Optimisation bas√©e sur les profil**s : Les techniques appliqu√©es ont maximis√© l'occupation des warps et r√©duit la latence, am√©liorant l'efficacit√© et la scalabilit√© pour des traitements intensifs sur GPU.