<a href="https://colab.research.google.com/github/Abhi-v-b/LP-5/blob/main/HPC/Assignment%203/Min_Max_Sum_Avg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [2]:
!pip install git+https://github.com/afnan47/cuda.git

Collecting git+https://github.com/afnan47/cuda.git
  Cloning https://github.com/afnan47/cuda.git to /tmp/pip-req-build-g5gm2mhv
  Running command git clone --filter=blob:none --quiet https://github.com/afnan47/cuda.git /tmp/pip-req-build-g5gm2mhv
  Resolved https://github.com/afnan47/cuda.git to commit aac710a35f52bb78ab34d2e52517237941399eff
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... done
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4289 sha256=c345e3cbb88e515d1c6a84b39c723f8b3f7fa8549fa12b1b7d9e9a3007802cca
  Stored in directory: /tmp/pip-ephem-wheel-cache-fk2ztzo7/wheels/aa/f3/44/e10c1d226ec561d971fcd4b0463f6bff08602afa928a3e7bc7
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [4]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [19]:
%%writefile operation.cu
#include <cuda_runtime.h>

#include <algorithm> // For min and max operations
#include <climits>   // For INT_MAX and INT_MIN
#include <cstdlib>   // For std::exit and EXIT_FAILURE
#include <iostream>
#include <stdexcept> // For std::invalid_argument and std::runtime_error
#include <string>
#include <vector>

using namespace std;

// CUDA kernel: per-block minimum reduction.
//
// Each block copies up to blockDim.x consecutive elements of d_arr into shared
// memory, folds them down to one minimum, and thread 0 writes that value to
// d_result[blockIdx.x]. The caller is responsible for combining the per-block
// partial results.
//
// Launch requirements:
//   - 1D grid of 1D blocks; elements at index >= gridDim.x * blockDim.x are
//     never read.
//   - Dynamic shared memory of blockDim.x * sizeof(int) bytes.
//   - d_result must have room for gridDim.x ints.
// blockDim.x may be any valid block size; it need not be a power of two.
__global__ void min_reduction(int* d_arr, int* d_result, int size) {
    extern __shared__ int sdata[];
    const int tid = threadIdx.x;
    const int block_threads = blockDim.x;
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the input contribute INT_MAX, the identity for
    // min, so they can take part in the reduction without affecting it.
    sdata[tid] = (global_id < size) ? d_arr[global_id] : INT_MAX;
    __syncthreads();

    // Start from the smallest power of two >= blockDim.x so that no slot is
    // skipped when the block size is not itself a power of two.
    int stride = 1;
    while (stride < block_threads) {
        stride <<= 1;
    }

    // Tree reduction in shared memory. The second condition only keeps the
    // read inside the block's shared array; padding slots already hold the
    // identity, so no check against `size` is needed here.
    for (stride >>= 1; stride > 0; stride >>= 1) {
        if (tid < stride && tid + stride < block_threads) {
            sdata[tid] = min(sdata[tid], sdata[tid + stride]);
        }
        // Reached by every thread in the block on every iteration.
        __syncthreads();
    }

    // Thread 0 publishes this block's partial result.
    if (tid == 0) {
        d_result[blockIdx.x] = sdata[0];
    }
}

// CUDA kernel: per-block maximum reduction.
//
// Each block copies up to blockDim.x consecutive elements of d_arr into shared
// memory, folds them down to one maximum, and thread 0 writes that value to
// d_result[blockIdx.x]. The caller is responsible for combining the per-block
// partial results.
//
// Launch requirements:
//   - 1D grid of 1D blocks; elements at index >= gridDim.x * blockDim.x are
//     never read.
//   - Dynamic shared memory of blockDim.x * sizeof(int) bytes.
//   - d_result must have room for gridDim.x ints.
// blockDim.x may be any valid block size; it need not be a power of two.
__global__ void max_reduction(int* d_arr, int* d_result, int size) {
    extern __shared__ int sdata[];
    const int tid = threadIdx.x;
    const int block_threads = blockDim.x;
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the input contribute INT_MIN, the identity for
    // max, so they can take part in the reduction without affecting it.
    sdata[tid] = (global_id < size) ? d_arr[global_id] : INT_MIN;
    __syncthreads();

    // Start from the smallest power of two >= blockDim.x so that no slot is
    // skipped when the block size is not itself a power of two.
    int stride = 1;
    while (stride < block_threads) {
        stride <<= 1;
    }

    // Tree reduction in shared memory. The second condition only keeps the
    // read inside the block's shared array; padding slots already hold the
    // identity, so no check against `size` is needed here.
    for (stride >>= 1; stride > 0; stride >>= 1) {
        if (tid < stride && tid + stride < block_threads) {
            sdata[tid] = max(sdata[tid], sdata[tid + stride]);
        }
        // Reached by every thread in the block on every iteration.
        __syncthreads();
    }

    // Thread 0 publishes this block's partial result.
    if (tid == 0) {
        d_result[blockIdx.x] = sdata[0];
    }
}

// CUDA kernel: per-block sum reduction.
//
// Each block copies up to blockDim.x consecutive elements of d_arr into shared
// memory, adds them together, and thread 0 writes the total to
// d_result[blockIdx.x]. The caller is responsible for combining the per-block
// partial sums. Accumulation is done in int, so large inputs can overflow.
//
// Launch requirements:
//   - 1D grid of 1D blocks; elements at index >= gridDim.x * blockDim.x are
//     never read.
//   - Dynamic shared memory of blockDim.x * sizeof(int) bytes.
//   - d_result must have room for gridDim.x ints.
// blockDim.x may be any valid block size; it need not be a power of two.
__global__ void sum_reduction(int* d_arr, int* d_result, int size) {
    extern __shared__ int sdata[];
    const int tid = threadIdx.x;
    const int block_threads = blockDim.x;
    const int global_id = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the input contribute 0, the identity for
    // addition, so they can take part in the reduction without affecting it.
    sdata[tid] = (global_id < size) ? d_arr[global_id] : 0;
    __syncthreads();

    // Start from the smallest power of two >= blockDim.x so that no slot is
    // skipped when the block size is not itself a power of two.
    int stride = 1;
    while (stride < block_threads) {
        stride <<= 1;
    }

    // Tree reduction in shared memory. The second condition only keeps the
    // read inside the block's shared array; padding slots already hold the
    // identity, so no check against `size` is needed here.
    for (stride >>= 1; stride > 0; stride >>= 1) {
        if (tid < stride && tid + stride < block_threads) {
            sdata[tid] += sdata[tid + stride];
        }
        // Reached by every thread in the block on every iteration.
        __syncthreads();
    }

    // Thread 0 publishes this block's partial sum.
    if (tid == 0) {
        d_result[blockIdx.x] = sdata[0];
    }
}

// Combines per-block partial results on the host.
//
// Copies `size` ints from the device buffer d_result into host memory and
// folds them left to right with `op`, starting from the first element.
//
// Parameters:
//   d_result - device pointer to at least `size` ints.
//   size     - number of partial results; must be positive.
//   op       - binary combiner applied as op(accumulated, next).
// Returns the folded value.
// Throws std::invalid_argument if `size` is not positive or `op` is null, and
// std::runtime_error if the device-to-host copy fails.
int final_reduction(int* d_result, int size, int (*op)(int, int)) {
    // There is no generic identity for an arbitrary `op`, so an empty input
    // cannot be reduced; reject it instead of reading past an empty vector.
    if (size <= 0) {
        throw std::invalid_argument("final_reduction: size must be positive");
    }
    if (op == nullptr) {
        throw std::invalid_argument("final_reduction: op must not be null");
    }

    std::vector<int> h_result(static_cast<size_t>(size));
    // cudaMemcpy is blocking, so the data is ready to read once it returns.
    const cudaError_t status = cudaMemcpy(h_result.data(), d_result,
                                          static_cast<size_t>(size) * sizeof(int),
                                          cudaMemcpyDeviceToHost);
    if (status != cudaSuccess) {
        throw std::runtime_error(std::string("final_reduction: cudaMemcpy failed: ") +
                                 cudaGetErrorString(status));
    }

    // Perform the final reduction on the host.
    int final_result = h_result[0];
    for (int i = 1; i < size; i++) {
        final_result = op(final_result, h_result[i]);
    }

    return final_result;
}

// Signature shared by the three block-level reduction kernels.
typedef void (*ReductionKernel)(int*, int*, int);

// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Launches one reduction kernel over d_arr, times the launch with the given
// events, and combines the per-block partial results on the host with `op`.
// The elapsed kernel time in milliseconds is written to *elapsed_ms.
static int run_reduction(ReductionKernel kernel, int (*op)(int, int), int* d_arr, int n,
                         int gridSize, int blockSize, cudaEvent_t start, cudaEvent_t stop,
                         float* elapsed_ms) {
    // One partial result per block.
    int* d_partial = nullptr;
    check_cuda(cudaMalloc(&d_partial, gridSize * sizeof(int)), "cudaMalloc (partial results)");

    check_cuda(cudaEventRecord(start), "cudaEventRecord (start)");
    kernel<<<gridSize, blockSize, blockSize * sizeof(int)>>>(d_arr, d_partial, n);
    // Launch-configuration errors are only reported through cudaGetLastError.
    check_cuda(cudaGetLastError(), "kernel launch");
    check_cuda(cudaEventRecord(stop), "cudaEventRecord (stop)");
    // Waiting on `stop` also surfaces errors raised while the kernel ran.
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
    check_cuda(cudaEventElapsedTime(elapsed_ms, start, stop), "cudaEventElapsedTime");

    const int result = final_reduction(d_partial, gridSize, op);
    check_cuda(cudaFree(d_partial), "cudaFree (partial results)");
    return result;
}

// Computes the minimum, maximum, sum and average of a small fixed array on the
// GPU and prints each result together with its measured time.
int main() {
    int arr[] = {1, 2, 3, 4, 5};
    // Derive the element count from the array so the two cannot drift apart.
    const int n = static_cast<int>(sizeof(arr) / sizeof(arr[0]));

    // Allocate memory on the GPU and upload the input
    int* d_arr = nullptr;
    check_cuda(cudaMalloc(&d_arr, n * sizeof(int)), "cudaMalloc (input)");
    check_cuda(cudaMemcpy(d_arr, arr, n * sizeof(int), cudaMemcpyHostToDevice),
               "cudaMemcpy (host to device)");

    const int blockSize = 256;
    const int gridSize = (n + blockSize - 1) / blockSize;  // ceil(n / blockSize)

    // Timing events, reused for every measurement below
    cudaEvent_t start, stop;
    check_cuda(cudaEventCreate(&start), "cudaEventCreate (start)");
    check_cuda(cudaEventCreate(&stop), "cudaEventCreate (stop)");

    // Minimum value
    float min_time_ms = 0.0f;
    int minval = run_reduction(min_reduction, [](int a, int b) { return std::min(a, b); },
                               d_arr, n, gridSize, blockSize, start, stop, &min_time_ms);

    // Maximum value
    float max_time_ms = 0.0f;
    int maxval = run_reduction(max_reduction, [](int a, int b) { return std::max(a, b); },
                               d_arr, n, gridSize, blockSize, start, stop, &max_time_ms);

    // Summation
    float sum_time_ms = 0.0f;
    int sumval = run_reduction(sum_reduction, [](int a, int b) { return a + b; },
                               d_arr, n, gridSize, blockSize, start, stop, &sum_time_ms);

    // Timing for average computation.
    // NOTE(review): no GPU work is enqueued between these two events, so the
    // reported time likely reflects event overhead rather than the host-side
    // division itself.
    check_cuda(cudaEventRecord(start), "cudaEventRecord (start)");
    double average = (double)sumval / n;
    check_cuda(cudaEventRecord(stop), "cudaEventRecord (stop)");
    check_cuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
    float average_time_ms = 0.0f;
    check_cuda(cudaEventElapsedTime(&average_time_ms, start, stop), "cudaEventElapsedTime");

    // Print results
    std::cout << "The minimum value is: " << minval << " (Computed in " << min_time_ms << " ms)" << std::endl;
    std::cout << "The maximum value is: " << maxval << " (Computed in " << max_time_ms << " ms)" << std::endl;
    std::cout << "The summation is: " << sumval << " (Computed in " << sum_time_ms << " ms)" << std::endl;
    std::cout << "The average is: " << average << " (Computed in " << average_time_ms << " ms)" << std::endl;

    // Release events and device memory
    check_cuda(cudaEventDestroy(start), "cudaEventDestroy (start)");
    check_cuda(cudaEventDestroy(stop), "cudaEventDestroy (stop)");
    check_cuda(cudaFree(d_arr), "cudaFree (input)");

    return 0;
}


Overwriting operation.cu


In [20]:
!nvcc operation.cu -o opt

In [21]:
!./opt

The minimum value is: 1 (Computed in 0.245568 ms)
The maximum value is: 5 (Computed in 0.023136 ms)
The summation is: 15 (Computed in 0.0176 ms)
The average is: 3 (Computed in 0.002496 ms)
