<a href="https://colab.research.google.com/github/Andrey-AUF/HPC-2022/blob/main/LR1(update).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/andreinechaev/nvcc4jupyter.git
  Cloning https://github.com/andreinechaev/nvcc4jupyter.git to /tmp/pip-req-build-hoyjtgxj
  Running command git clone -q https://github.com/andreinechaev/nvcc4jupyter.git /tmp/pip-req-build-hoyjtgxj
Building wheels for collected packages: NVCCPlugin
  Building wheel for NVCCPlugin (setup.py) ... [?25l[?25hdone
  Created wheel for NVCCPlugin: filename=NVCCPlugin-0.0.2-py3-none-any.whl size=4306 sha256=946a518810dacc8eaf54ce9c5b15a5b91c4f560699d657abc8ea2271ca2ae566
  Stored in directory: /tmp/pip-ephem-wheel-cache-nr46ai9a/wheels/ca/33/8d/3c86eb85e97d2b6169d95c6e8f2c297fdec60db6e84cb56f5e
Successfully built NVCCPlugin
Installing collected packages: NVCCPlugin
Successfully installed NVCCPlugin-0.0.2


In [2]:
%load_ext nvcc_plugin

created output directory at /content/src
Out bin /content/result.out


In [6]:
%%cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <chrono>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>

void reductionWithCudaImproved(float* result, const float* input, int SIZE);
__global__ void reductionKernelImproved(float* result, const float* input, int SIZE);
void reductionCPU(float* result, const float* input, int SIZE);

#define TILE 32
#define ILP 8
#define BLOCK_X_IMPR (TILE / ILP)
#define BLOCK_Y_IMPR 32
#define BLOCK_COUNT_X_IMPR 100


void reductionCPU(float* result, const float* input, int SIZE)
{
    for (int i = 0; i < SIZE; i++)
        *result += input[i];
}

__global__ void reductionKernelImproved(float* result, const float* input, int SIZE)
{
    int i;
    int col = (blockDim.x * blockIdx.x + threadIdx.x) * ILP;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int index = row * blockDim.x * gridDim.x * ILP + col;
    __shared__ float interResult;

    if (threadIdx.x == 0 && threadIdx.y == 0)
        interResult = 0.0;

    __syncthreads();

#pragma unroll
    for (i = 0; i < ILP; i++)
    {
        if (index < SIZE)
        {
            atomicAdd(&interResult, input[index]);
            index++;
        }
    }

    __syncthreads();

    if (threadIdx.x == 0 && threadIdx.y == 0)
        atomicAdd(result, interResult);
}

void reductionWithCudaImproved(float* result, const float* input, int SIZE, float cpuTime)
{
    dim3 dim_grid, dim_block;

    float* dev_input = 0;
    float* dev_result = 0;
    cudaEvent_t start, stop;
    float elapsed = 0;
    double gpuBandwidth;

    dim_block.x = BLOCK_X_IMPR;
    dim_block.y = BLOCK_Y_IMPR;
    dim_block.z = 1;

    dim_grid.x = BLOCK_COUNT_X_IMPR;
    dim_grid.y = (int)ceil((float)SIZE / (float)(TILE * dim_block.y * BLOCK_COUNT_X_IMPR));
    dim_grid.z = 1;

    cudaSetDevice(0);

    cudaMalloc((void**)&dev_input, SIZE * sizeof(float));
    cudaMalloc((void**)&dev_result, sizeof(float));
    cudaMemcpy(dev_input, input, SIZE * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_result, result, sizeof(float), cudaMemcpyHostToDevice);

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    reductionKernelImproved << <dim_grid, dim_block >> > (dev_result, dev_input, SIZE);

    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    cudaEventElapsedTime(&elapsed, start, stop);

    printf("GPU Time (improved): %f ms\n", elapsed);
    printf("acceleration factor: %f ms\n", cpuTime/elapsed);

    cudaDeviceSynchronize();

    cudaMemcpy(result, dev_result, sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(dev_input);
    cudaFree(dev_result);

    return;
}


int main()
{
    int i,j;
    float* input;
    float resultCPU, resultGPU;
    double cpuTime, cpuBandwidth;
    int SIZE;
    int a[] = {1000,10000,100000,1000000,10000000};
 for (j = 0; j < 5; ++j) {
    SIZE = a[j];
    printf("Size : %d \n", SIZE);
    input = (float*)malloc(SIZE * sizeof(float));
    resultCPU = 0.0;
    resultGPU = 0.0;

      
    auto start = std::chrono::high_resolution_clock::now();
    auto end = std::chrono::high_resolution_clock::now();

    for (i = 0; i < SIZE; i++)
        input[i] = rand() % 10 - 5;

    start = std::chrono::high_resolution_clock::now();
    reductionCPU(&resultCPU, input, SIZE);
    end = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> diff = end - start;
    cpuTime = (diff.count() * 1000);
    printf("CPU Time: %f ms\n", cpuTime);

    reductionWithCudaImproved(&resultGPU, input, SIZE, cpuTime);
 }
    return 0;
 
}

Size : 1000 
CPU Time: 0.003420 ms
GPU Time (improved): 0.025344 ms
acceleration factor: 0.134943 ms
Size : 10000 
CPU Time: 0.030199 ms
GPU Time (improved): 0.030720 ms
acceleration factor: 0.983040 ms
Size : 100000 
CPU Time: 0.312485 ms
GPU Time (improved): 0.148512 ms
acceleration factor: 2.104106 ms
Size : 1000000 
CPU Time: 3.084768 ms
GPU Time (improved): 0.816480 ms
acceleration factor: 3.778131 ms
Size : 10000000 
CPU Time: 30.065839 ms
GPU Time (improved): 7.006240 ms
acceleration factor: 4.291295 ms



In [9]:
params = {
    'SIZE':            [1000, 10000, 100000, 1000000, 10000000],
    'acceleration factor':[0.134943, 0.983040, 2.104106, 3.778131, 4.291295],
    'cpu_time':     [0.003420, 0.030199, 0.312485, 3.084768, 30.065839],
    'gpu_time':     [0.025344, 0.030720, 0.148512, 0.816480, 7.006240],
}

exp_count = len(params["SIZE"])
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
data = []
for k in ['cpu_time', 'gpu_time']:
      plot = go.Scatter(x=params['SIZE'],
                          y=params[k],
                          mode="lines+markers",
                          name=k
                          )
      data.append(plot)
layout = go.Layout(title="Зависимость времени от размера вектора",
                   xaxis= dict(title= 'Размер вектора',ticklen= 20,zeroline= False), 
                   yaxis= dict(title= 'Время, миллисекунды',ticklen= 20,zeroline= False))
fig1 = go.Figure(data=data, layout = layout)
fig1.show()

data = []
plot = go.Scatter(x=params['SIZE'],
                      y=params['acceleration factor'],
                      mode="lines+markers"
                      )
layout = go.Layout(title="График зависимости ускорения от размера вектора",
                   xaxis= dict(title= 'Размер вектора',ticklen= 20,zeroline= False), 
                   yaxis= dict(title= 'Ускорение',ticklen= 20,zeroline= False))
data.append(plot)
fig2 = go.Figure(data = data, layout = layout)
fig2.show()