# Вариант 6

In [None]:
%%writefile vector_square.cu
#include <stdio.h>
#include <stdlib.h>

/*
 * CSC(call): evaluates `call`, which must yield a cudaError_t. If the result
 * is not cudaSuccess, prints the source file, line and the
 * cudaGetErrorString() text to stderr and terminates the process with a
 * non-zero status, so that a failed run is not reported as success.
 *
 * The do { ... } while (0) wrapper makes the macro behave as one statement,
 * and the argument is parenthesized so any expression can be passed safely.
 */
#define CSC(call)       \
do {                    \
    cudaError_t status = (call);        \
    if (status != cudaSuccess) {        \
        fprintf(stderr, "ERROR in %s:%d. Message: %s\n", __FILE__, __LINE__, cudaGetErrorString(status));   \
        exit(EXIT_FAILURE);             \
    }                                   \
} while (0)

/*
 * Squares every element of `arr` in place: arr[i] = arr[i] * arr[i]
 * for each i in [0, n).
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
 * index (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total
 * number of launched threads (blockDim.x * gridDim.x), so any grid size
 * covers the whole array.
 *
 * Parameters:
 *   arr - pointer dereferenced by device code; must hold at least n doubles.
 *   n   - number of elements to process; nothing is done when n <= 0.
 */
__global__ void squareKernel(double *arr, int n) {
    // 64-bit arithmetic: with 32-bit ints, `idx + stride` can exceed INT_MAX
    // for large n, wrap to a negative value that still satisfies `idx < n`,
    // and cause an out-of-bounds access.
    const long long stride = (long long)blockDim.x * gridDim.x;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < n;
         idx += stride) {
        const double value = arr[idx];
        arr[idx] = value * value;
    }
}

/*
 * Reads an element count n followed by n doubles from stdin, squares the
 * values on the GPU with squareKernel, and prints the results to stdout in
 * "%.10e " format followed by a newline.
 *
 * Returns 0 on success and 1 when the input is malformed or host memory
 * cannot be allocated. CUDA runtime failures are handled by the CSC macro.
 */
int main() {
    int n;

    // Without this check a failed read leaves n uninitialized.
    if (scanf("%d", &n) != 1) {
        fprintf(stderr, "ERROR: failed to read the number of elements\n");
        return 1;
    }
    if (n < 0) {
        // A negative count would turn into a huge size_t in the size math below.
        fprintf(stderr, "ERROR: the number of elements must be non-negative, got %d\n", n);
        return 1;
    }
    if (n == 0) {
        // Nothing to square; a launch with zero blocks is an invalid
        // configuration, so emit the (empty) result line directly.
        printf("\n");
        return 0;
    }

    const size_t bytes = sizeof(double) * (size_t)n;

    double *arr = (double *)malloc(bytes);
    if (arr == NULL) {
        fprintf(stderr, "ERROR: failed to allocate %zu bytes of host memory\n", bytes);
        return 1;
    }
    for (int i = 0; i < n; i++) {
        if (scanf("%lf", &arr[i]) != 1) {
            fprintf(stderr, "ERROR: failed to read element %d of %d\n", i, n);
            free(arr);
            return 1;
        }
    }

    double *dev_arr;
    CSC(cudaMalloc(&dev_arr, bytes));
    CSC(cudaMemcpy(dev_arr, arr, bytes, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CSC(cudaEventCreate(&start));
    CSC(cudaEventCreate(&stop));
    CSC(cudaEventRecord(start));

    int blockSize = 512;
    // Ceil-division written so that the intermediate value cannot overflow
    // int even when n is close to INT_MAX.
    int numBlocks = n / blockSize + (n % blockSize != 0 ? 1 : 0);
    squareKernel<<<numBlocks, blockSize>>>(dev_arr, n);

    // Launch-configuration errors are reported by cudaGetLastError();
    // errors raised while the kernel runs surface at the synchronization.
    CSC(cudaGetLastError());
    CSC(cudaDeviceSynchronize());

    CSC(cudaEventRecord(stop));
    CSC(cudaEventSynchronize(stop));
    float elapsedTime;
    CSC(cudaEventElapsedTime(&elapsedTime, start, stop));
    CSC(cudaEventDestroy(start));
    CSC(cudaEventDestroy(stop));

    CSC(cudaMemcpy(arr, dev_arr, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < n; i++) {
        printf("%.10e ", arr[i]);
    }
    printf("\n");

    free(arr);
    CSC(cudaFree(dev_arr));

    // Timing output is intentionally disabled; enable for benchmarking.
    //printf("%f\n", elapsedTime);

    return 0;
}

Overwriting vector_square.cu


In [None]:
!nvcc -o vector_square vector_square.cu

In [None]:
!./vector_square

3
1 5 3
1.0000000000e+00 2.5000000000e+01 9.0000000000e+00 
