In [4]:
%%writefile min_max_avg.cu
#include <iostream>
#include <cstdlib>
#include <ctime>
#include <float.h>

__global__ void find_min(int *arr, int *result, int n) {
    __shared__ int shared_min[256];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int local_min = INT_MAX;

    while (tid < n) {
        if (arr[tid] < local_min)
            local_min = arr[tid];
        tid += blockDim.x * gridDim.x;
    }

    shared_min[threadIdx.x] = local_min;
    __syncthreads();

    // Reduction within block
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s)
            shared_min[threadIdx.x] = min(shared_min[threadIdx.x], shared_min[threadIdx.x + s]);
        __syncthreads();
    }

    if (threadIdx.x == 0)
        atomicMin(result, shared_min[0]);
}

__global__ void find_max(int *arr, int *result, int n) {
    __shared__ int shared_max[256];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int local_max = INT_MIN;

    while (tid < n) {
        if (arr[tid] > local_max)
            local_max = arr[tid];
        tid += blockDim.x * gridDim.x;
    }

    shared_max[threadIdx.x] = local_max;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s)
            shared_max[threadIdx.x] = max(shared_max[threadIdx.x], shared_max[threadIdx.x + s]);
        __syncthreads();
    }

    if (threadIdx.x == 0)
        atomicMax(result, shared_max[0]);
}

__global__ void find_sum(int *arr, int *sum, int n) {
    __shared__ int shared_sum[256];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int local_sum = 0;

    while (tid < n) {
        local_sum += arr[tid];
        tid += blockDim.x * gridDim.x;
    }

    shared_sum[threadIdx.x] = local_sum;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s)
            shared_sum[threadIdx.x] += shared_sum[threadIdx.x + s];
        __syncthreads();
    }

    if (threadIdx.x == 0)
        atomicAdd(sum, shared_sum[0]);
}

int main() {
    int n;
    std::cout << "Enter number of elements: ";
    std::cin >> n;

    int *h_arr = new int[n];
    srand(time(0));
    std::cout << "Array: ";
    for (int i = 0; i < n; i++) {
        h_arr[i] = rand() % 100;
        std::cout << h_arr[i] << " ";
    }
    std::cout << "\n";

    int *d_arr, *d_min, *d_max, *d_sum;
    int h_min = INT_MAX, h_max = INT_MIN, h_sum = 0;

    cudaMalloc(&d_arr, n * sizeof(int));
    cudaMalloc(&d_min, sizeof(int));
    cudaMalloc(&d_max, sizeof(int));
    cudaMalloc(&d_sum, sizeof(int));

    cudaMemcpy(d_arr, h_arr, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_min, &h_min, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_max, &h_max, sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_sum, &h_sum, sizeof(int), cudaMemcpyHostToDevice);

    find_min<<<1, 256>>>(d_arr, d_min, n);
    find_max<<<1, 256>>>(d_arr, d_max, n);
    find_sum<<<1, 256>>>(d_arr, d_sum, n);

    cudaMemcpy(&h_min, d_min, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&h_max, d_max, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);

    std::cout << "Minimum = " << h_min << "\n";
    std::cout << "Maximum = " << h_max << "\n";
    std::cout << "Average = " << (float)h_sum / n << "\n";

    cudaFree(d_arr);
    cudaFree(d_min);
    cudaFree(d_max);
    cudaFree(d_sum);
    delete[] h_arr;
    return 0;
}


Writing min_max_avg.cu


In [5]:
!nvcc min_max_avg.cu -o min_max_avg


In [6]:
!./min_max_avg



Enter number of elements: 5
Array: 51 93 87 86 97 
Minimum = 2147483647
Maximum = -2147483648
Average = 0
